
Commit

api update fixes
emcf committed Sep 30, 2024
1 parent 29d3716 commit 5183396
Showing 5 changed files with 120 additions and 33 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@ def read_git_requirements(file):

setup(
name='thepipe_api',
version='1.3.7',
version='1.3.9',
author='Emmett McFarlane',
author_email='[email protected]',
description='AI-native extractor, powered by multimodal LLMs.',
86 changes: 86 additions & 0 deletions tests/test_api.py
@@ -0,0 +1,86 @@
"""
import unittest
import os

from thepipe.extract import extract_from_file, extract_from_url


class TestAPI(unittest.TestCase):
    def setUp(self):
        self.files_directory = os.path.join(os.path.dirname(__file__), 'files')
        # Example schema for extraction
        self.schema = {
            "document_topic": "string",
            "document_sentiment": "float",
        }

    def test_extract_from_file_with_multiple_extractions(self):
        # Path to the file you want to test
        file_path = os.path.join(self.files_directory, 'example.pdf')
        # Call the real API for extracting data from the file
        try:
            result = extract_from_file(
                file_path=file_path,
                schema=self.schema,
                ai_model="gpt-4o-mini",
                multiple_extractions=True,
                text_only=True,
                ai_extraction=False,
                host_images=False
            )
            print("Extract from file result:", result)
            # Basic assertions to ensure extraction happened
            self.assertIsInstance(result, list)
            self.assertGreater(len(result), 0)
            # Check if the extracted data matches the schema.
            # Since multiple extractions is enabled, we have the 'extractions' key for each chunk,
            # containing all the extractions. The result looks like:
            # [{'chunk_index': 0, 'source': 'example.pdf', 'extraction': [{'document_topic': 'Density PDFs in Supersonic Turbulence', 'document_sentiment': None}]}]
            for item in result:
                self.assertIsInstance(item, dict)
                if 'extractions' in item:
                    for extraction in item['extractions']:
                        self.assertIsInstance(extraction, dict)
                        for key in self.schema:
                            self.assertIn(key, extraction)
        except Exception as e:
            self.fail(f"test_extract_from_file failed with error: {e}")

    def test_extract_from_url_with_one_extraction(self):
        # URL you want to extract information from
        url = 'https://thepi.pe/'  # Update this with your actual URL
        # Call the real API for extracting data from the URL
        try:
            result = extract_from_url(
                url=url,
                schema=self.schema,
                ai_model="gpt-4o-mini",
                multiple_extractions=False,
                text_only=True,
                ai_extraction=False,
                host_images=False
            )
            print("Extract from URL result:", result)
            # Basic assertions to ensure extraction happened
            self.assertIsInstance(result, list)
            self.assertGreater(len(result), 0)
            # Check if the extracted data matches the schema.
            # Since multiple extractions is disabled, we don't have the 'extractions' key for each chunk:
            # [{'chunk_index': 0, 'source': 'https://thepi.pe/', 'document_topic': 'AI document extraction and data processing', 'document_sentiment': 0.8}]
            for item in result:
                self.assertIsInstance(item, dict)
                for key in self.schema:
                    self.assertIn(key, item)
        except Exception as e:
            self.fail(f"test_extract_from_url failed with error: {e}")


if __name__ == '__main__':
    unittest.main()
"""
29 changes: 16 additions & 13 deletions thepipe/core.py
@@ -72,22 +72,25 @@ def to_json(self, host_images: bool = False) -> Dict:
    @staticmethod
    def from_json(data: Dict, host_images: bool = False) -> 'Chunk':
        images = []
-        for image_str in data['images']:
-            if host_images:
-                image_data = requests.get(image_str).content
-                image = Image.open(BytesIO(image_data))
-                images.append(image)
-            else:
-                remove_prefix = image_str.replace("data:image/jpeg;base64,", "")
-                image_data = base64.b64decode(remove_prefix)
-                image = Image.open(BytesIO(image_data))
-                images.append(image)
+        if 'images' in data:
+            for image_str in data['images']:
+                if host_images:
+                    image_data = requests.get(image_str).content
+                    image = Image.open(BytesIO(image_data))
+                    images.append(image)
+                else:
+                    remove_prefix = image_str.replace("data:image/jpeg;base64,", "")
+                    image_data = base64.b64decode(remove_prefix)
+                    image = Image.open(BytesIO(image_data))
+                    images.append(image)
+        if 'texts' in data:
+            texts = [text.strip() for text in data['texts']]
        return Chunk(
            path=data['path'],
-            texts=[text.strip() for text in data['texts']],
+            texts=texts,
            images=images,
-            audios=data['audios'],
-            videos=data['videos'],
+            #audios=data['audios'],
+            #videos=data['videos'],
        )

def make_image_url(image: Image.Image, host_images: bool = False, max_resolution: Optional[int] = None) -> str:
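The net effect of the from_json change is that payloads missing an 'images' key (and lacking the now-commented 'audios'/'videos' fields) deserialize instead of raising KeyError. A minimal sketch, assuming Chunk is importable from thepipe.core and stores its constructor keywords as same-named attributes:

from thepipe.core import Chunk

# No 'images', 'audios', or 'videos' keys: the new 'images' guard simply skips image decoding.
payload = {
    "path": "example.pdf",
    "texts": ["  Density PDFs in Supersonic Turbulence  "],
}

chunk = Chunk.from_json(payload)
print(chunk.path, chunk.texts)  # texts come back stripped of surrounding whitespace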
32 changes: 15 additions & 17 deletions thepipe/extract.py
@@ -9,26 +9,24 @@
import os
from openai import OpenAI

DEFAULT_EXTRACTION_PROMPT = "Extract structured information from the given document according to the following schema: {schema}. Immediately return valid JSON formatted data. If there is missing data, you may use null, but use your reasoning to always fill in every column as best you can. Always immediately return valid JSON."
DEFAULT_EXTRACTION_PROMPT = "Extract all the information from the given document according to the following schema: {schema}. Immediately return valid JSON formatted data. If there is missing data, you may use null, but always fill in every column as best you can. Always immediately return valid JSON. You must extract ALL the information available in the entire document."
DEFAULT_AI_MODEL = os.getenv("DEFAULT_AI_MODEL", "gpt-4o-mini")

def extract_json_from_response(llm_response: str) -> Union[Dict, List[Dict], None]:
def clean_response_text(llm_response: str) -> str:
return llm_response.encode('utf-8', 'ignore').decode('utf-8').strip()

# try to match inside of code block
code_block_pattern = r'^```(?:json)?\s*([\s\S]*?)\s*```$'
match = re.match(code_block_pattern, llm_response, re.MULTILINE | re.DOTALL)
if match:
llm_response = match.group(1)
llm_response = clean_response_text(llm_response)

# parse json by matching curly braces
try:
parsed_json = json.loads(llm_response)
return parsed_json
except json.JSONDecodeError:
json_pattern = r'(\[[\s\S]*\]|\{[\s\S]*\})'
json_pattern = r'(\[[\s\S]*\]|\{[\s\S]*\})'
match = re.search(json_pattern, llm_response)
if match:
try:
@@ -78,7 +76,7 @@ def extract_from_chunk(chunk: Chunk, chunk_index: int, schema: str, ai_model: st
model=ai_model,
messages=messages,
response_format={"type": "json_object"},
temperature=0.1,
temperature=0,
)
llm_response = response.choices[0].message.content
input_tokens = calculate_tokens([chunk])
@@ -162,10 +160,10 @@ def extract_from_url(
verbose: bool = False,
chunking_method: Optional[Callable[[List[Chunk]], List[Chunk]]] = chunk_by_page,
local: bool = False
) -> List[Dict]: #Tuple[List[Dict], int]:
) -> List[Dict]:
if local:
chunks = scrape_url(url, text_only=text_only, ai_extraction=ai_extraction, verbose=verbose, local=local, chunking_method=chunking_method)
return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)
return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)[0]
else:
headers = {
"Authorization": f"Bearer {THEPIPE_API_KEY}"
@@ -181,15 +179,16 @@
'ai_extraction': str(ai_extraction).lower(),
'chunking_method': chunking_method.__name__
}
response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data)
response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data, stream=True)
if response.status_code != 200:
raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

results = []
total_tokens_used = 0
for line in response.iter_lines(decode_unicode=True):
if line:
data = json.loads(line)
if 'extraction_complete' in data:
break
result = data['result']
if 'error' in result:
results.append(result)
Expand All @@ -207,9 +206,8 @@ def extract_from_url(
if key not in extracted_data:
extracted_data[key] = None
results.append(extracted_data)
total_tokens_used += data['tokens_used']

return results#, total_tokens_used
return results

def extract_from_file(
file_path: str,
@@ -223,10 +221,10 @@
verbose: bool = False,
chunking_method: Optional[Callable[[List[Chunk]], List[Chunk]]] = chunk_by_page,
local: bool = False
) -> List[Dict]: #Tuple[List[Dict], int]:
) -> List[Dict]:
if local:
chunks = scrape_file(file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=local, chunking_method=chunking_method)
return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)
return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)[0]
else:
headers = {
"Authorization": f"Bearer {THEPIPE_API_KEY}"
@@ -243,15 +241,16 @@
}
files = {'files': (os.path.basename(file_path), open(file_path, 'rb'))}

response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data, files=files)
response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data, files=files, stream=True)
if response.status_code != 200:
raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

results = []
total_tokens_used = 0
for line in response.iter_lines(decode_unicode=True):
if line:
data = json.loads(line)
if 'extraction_complete' in data:
break
result = data['result']
if 'error' in result:
results.append(result)
@@ -269,6 +268,5 @@
if key not in extracted_data:
extracted_data[key] = None
results.append(extracted_data)
total_tokens_used += data['tokens_used']

return results#, total_tokens_used
return results
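For context on the parsing these extraction calls depend on: extract_json_from_response strips a Markdown code fence if the model wraps its JSON in one, normalizes the text with the clean_response_text helper, and only falls back to a bracket-matching regex when json.loads fails. A minimal sketch of the happy path, assuming the function is importable from thepipe.extract as defined above:

from thepipe.extract import extract_json_from_response

# Model output wrapped in a Markdown code fence, as JSON-mode models often return it.
llm_response = "```json\n{\"document_topic\": \"AI document extraction\", \"document_sentiment\": 0.8}\n```"

parsed = extract_json_from_response(llm_response)
print(parsed)  # {'document_topic': 'AI document extraction', 'document_sentiment': 0.8}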
4 changes: 2 additions & 2 deletions thepipe/scraper.py
@@ -206,7 +206,7 @@ def process_page(page_num):
response = openrouter_client.chat.completions.create(
model=ai_model,
messages=messages,
temperature=0.1
temperature=0
)
try:
llm_response = response.choices[0].message.content.strip()
@@ -377,7 +377,7 @@ def ai_extract_webpage_content(url: str, text_only: Optional[bool] = False, verb
response = openrouter_client.chat.completions.create(
model=ai_model,
messages=messages,
temperature=0.1
temperature=0
)
llm_response = response.choices[0].message.content
chunk = Chunk(path=url, texts=[llm_response], images=[stacked_image])

