diff --git a/setup.py b/setup.py
index a696f37..851521f 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ def read_git_requirements(file):
 setup(
     name='thepipe_api',
-    version='1.3.7',
+    version='1.3.9',
     author='Emmett McFarlane',
     author_email='emmett@thepi.pe',
     description='AI-native extractor, powered by multimodal LLMs.',
diff --git a/tests/test_api.py b/tests/test_api.py
new file mode 100644
index 0000000..5035dd6
--- /dev/null
+++ b/tests/test_api.py
@@ -0,0 +1,86 @@
+"""
+import unittest
+import os
+from thepipe.extract import extract_from_file, extract_from_url
+
+class TestAPI(unittest.TestCase):
+
+    def setUp(self):
+        self.files_directory = os.path.join(os.path.dirname(__file__), 'files')
+        # Example schema for extraction
+        self.schema = {
+            "document_topic": "string",
+            "document_sentiment": "float",
+        }
+
+    def test_extract_from_file_with_multiple_extractions(self):
+        # Path to the file you want to test
+        file_path = os.path.join(self.files_directory, 'example.pdf')
+
+        # Call the real API for extracting data from the file
+        try:
+            result = extract_from_file(
+                file_path=file_path,
+                schema=self.schema,
+                ai_model="gpt-4o-mini",
+                multiple_extractions=True,
+                text_only=True,
+                ai_extraction=False,
+                host_images=False
+            )
+            print("Extract from file result:", result)
+
+            # Basic assertions to ensure extraction happened
+            self.assertIsInstance(result, list)
+            self.assertGreater(len(result), 0)
+            # Check if the extracted data matches the schema
+            # since multiple extractions is enabled, we have the 'extractions' key for each chunk
+            # containing all the extractions.
+            # the result looks like: [{'chunk_index': 0, 'source': 'example.pdf', 'extraction': [{'document_topic': 'Density PDFs in Supersonic Turbulence', 'document_sentiment': None}]}]
+            for item in result:
+                self.assertIsInstance(item, dict)
+                if 'extractions' in item:
+                    for extraction in item['extractions']:
+                        self.assertIsInstance(extraction, dict)
+                        for key in self.schema:
+                            self.assertIn(key, extraction)
+
+        except Exception as e:
+            self.fail(f"test_extract_from_file failed with error: {e}")
+
+    def test_extract_from_url_with_one_extraction(self):
+        # URL you want to extract information from
+        url = 'https://thepi.pe/' # Update this with your actual URL
+
+        # Call the real API for extracting data from the URL
+        try:
+            result = extract_from_url(
+                url=url,
+                schema=self.schema,
+                ai_model="gpt-4o-mini",
+                multiple_extractions=False,
+                text_only=True,
+                ai_extraction=False,
+                host_images=False
+            )
+            print("Extract from URL result:", result)
+
+            # Basic assertions to ensure extraction happened
+            self.assertIsInstance(result, list)
+            self.assertGreater(len(result), 0)
+
+            # Check if the extracted data matches the schema
+            # since multiple extractions is disabled, we don't have the 'extractions' key for each chunk
+            # [{'chunk_index': 0, 'source': 'https://thepi.pe/', 'document_topic': 'AI document extraction and data processing', 'document_sentiment': 0.8}]
+            for item in result:
+                self.assertIsInstance(item, dict)
+                for key in self.schema:
+                    self.assertIn(key, item)
+
+        except Exception as e:
+            self.fail(f"test_extract_from_url failed with error: {e}")
+
+
+if __name__ == '__main__':
+    unittest.main()
+"""
\ No newline at end of file
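The new tests above call the live extraction endpoints, so they need network access, the API credentials (for example `THEPIPE_API_KEY`) in the environment, and the sample `tests/files/example.pdf` checked in. A minimal sketch of driving them programmatically with nothing beyond the standard library's `unittest`:

```python
# Sketch: discover and run the new API tests from the repository root.
# Assumes the thepipe_api package is installed and the required API keys are exported.
import unittest

suite = unittest.defaultTestLoader.discover("tests", pattern="test_api.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```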
diff --git a/thepipe/core.py b/thepipe/core.py
index ddfc1fe..360ba23 100644
--- a/thepipe/core.py
+++ b/thepipe/core.py
@@ -72,22 +72,25 @@ def to_json(self, host_images: bool = False) -> Dict:
     @staticmethod
     def from_json(data: Dict, host_images: bool = False) -> 'Chunk':
         images = []
-        for image_str in data['images']:
-            if host_images:
-                image_data = requests.get(image_str).content
-                image = Image.open(BytesIO(image_data))
-                images.append(image)
-            else:
-                remove_prefix = image_str.replace("data:image/jpeg;base64,", "")
-                image_data = base64.b64decode(remove_prefix)
-                image = Image.open(BytesIO(image_data))
-                images.append(image)
+        if 'images' in data:
+            for image_str in data['images']:
+                if host_images:
+                    image_data = requests.get(image_str).content
+                    image = Image.open(BytesIO(image_data))
+                    images.append(image)
+                else:
+                    remove_prefix = image_str.replace("data:image/jpeg;base64,", "")
+                    image_data = base64.b64decode(remove_prefix)
+                    image = Image.open(BytesIO(image_data))
+                    images.append(image)
+        if 'texts' in data:
+            texts = [text.strip() for text in data['texts']]
         return Chunk(
             path=data['path'],
-            texts=[text.strip() for text in data['texts']],
+            texts=texts,
             images=images,
-            audios=data['audios'],
-            videos=data['videos'],
+            #audios=data['audios'],
+            #videos=data['videos'],
         )
 
 def make_image_url(image: Image.Image, host_images: bool = False, max_resolution: Optional[int] = None) -> str:
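One caveat in the `from_json` change above: `texts` is only bound inside the `if 'texts' in data:` guard, so a payload without a `'texts'` key would hit a `NameError` at the `return Chunk(...)` line. A key-tolerant sketch of the same deserialization (an illustration, not the committed code; image decoding is omitted):

```python
# Sketch: same idea as Chunk.from_json above, but with defaults so missing
# 'texts'/'images' keys cannot leave variables unbound. Illustration only.
def from_json_tolerant(data: dict) -> 'Chunk':
    texts = [text.strip() for text in data.get('texts', [])]
    images = []  # decoding of hosted URLs / base64 data URIs omitted in this sketch
    return Chunk(
        path=data['path'],
        texts=texts,
        images=images,
    )
```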
diff --git a/thepipe/extract.py b/thepipe/extract.py
index dfa3433..e460e1f 100644
--- a/thepipe/extract.py
+++ b/thepipe/extract.py
@@ -9,26 +9,24 @@
 import os
 from openai import OpenAI
 
-DEFAULT_EXTRACTION_PROMPT = "Extract structured information from the given document according to the following schema: {schema}. Immediately return valid JSON formatted data. If there is missing data, you may use null, but use your reasoning to always fill in every column as best you can. Always immediately return valid JSON."
+DEFAULT_EXTRACTION_PROMPT = "Extract all the information from the given document according to the following schema: {schema}. Immediately return valid JSON formatted data. If there is missing data, you may use null, but always fill in every column as best you can. Always immediately return valid JSON. You must extract ALL the information available in the entire document."
 DEFAULT_AI_MODEL = os.getenv("DEFAULT_AI_MODEL", "gpt-4o-mini")
 
 def extract_json_from_response(llm_response: str) -> Union[Dict, List[Dict], None]:
     def clean_response_text(llm_response: str) -> str:
         return llm_response.encode('utf-8', 'ignore').decode('utf-8').strip()
 
-    # try to match inside of code block
     code_block_pattern = r'^```(?:json)?\s*([\s\S]*?)\s*```$'
     match = re.match(code_block_pattern, llm_response, re.MULTILINE | re.DOTALL)
     if match:
         llm_response = match.group(1)
         llm_response = clean_response_text(llm_response)
 
-    # parse json by matching curly braces
     try:
         parsed_json = json.loads(llm_response)
         return parsed_json
     except json.JSONDecodeError:
-        json_pattern = r'(\[[\s\S]*\]|\{[\s\S]*\})'
+        json_pattern = r'(\[[\s\S]*\]|\{[\s\S]*\})'
         match = re.search(json_pattern, llm_response)
         if match:
             try:
@@ -78,7 +76,7 @@ def extract_from_chunk(chunk: Chunk, chunk_index: int, schema: str, ai_model: st
         model=ai_model,
         messages=messages,
         response_format={"type": "json_object"},
-        temperature=0.1,
+        temperature=0,
     )
     llm_response = response.choices[0].message.content
     input_tokens = calculate_tokens([chunk])
@@ -162,10 +160,10 @@ def extract_from_url(
     verbose: bool = False,
     chunking_method: Optional[Callable[[List[Chunk]], List[Chunk]]] = chunk_by_page,
     local: bool = False
-) -> List[Dict]: #Tuple[List[Dict], int]:
+) -> List[Dict]:
     if local:
         chunks = scrape_url(url, text_only=text_only, ai_extraction=ai_extraction, verbose=verbose, local=local, chunking_method=chunking_method)
-        return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)
+        return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)[0]
     else:
         headers = {
             "Authorization": f"Bearer {THEPIPE_API_KEY}"
@@ -181,15 +179,16 @@ def extract_from_url(
             'ai_extraction': str(ai_extraction).lower(),
             'chunking_method': chunking_method.__name__
         }
-        response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data)
+        response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data, stream=True)
         if response.status_code != 200:
             raise Exception(f"API request failed with status code {response.status_code}: {response.text}")
 
         results = []
-        total_tokens_used = 0
         for line in response.iter_lines(decode_unicode=True):
             if line:
                 data = json.loads(line)
+                if 'extraction_complete' in data:
+                    break
                 result = data['result']
                 if 'error' in result:
                     results.append(result)
@@ -207,9 +206,8 @@ def extract_from_url(
                         if key not in extracted_data:
                             extracted_data[key] = None
                     results.append(extracted_data)
-            total_tokens_used += data['tokens_used']
 
-    return results#, total_tokens_used
+    return results
 
 def extract_from_file(
     file_path: str,
@@ -223,10 +221,10 @@ def extract_from_file(
     verbose: bool = False,
     chunking_method: Optional[Callable[[List[Chunk]], List[Chunk]]] = chunk_by_page,
     local: bool = False
-) -> List[Dict]: #Tuple[List[Dict], int]:
+) -> List[Dict]:
     if local:
         chunks = scrape_file(file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=local, chunking_method=chunking_method)
-        return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)
+        return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)[0]
     else:
         headers = {
             "Authorization": f"Bearer {THEPIPE_API_KEY}"
@@ -243,15 +241,16 @@ def extract_from_file(
         }
         files = {'files': (os.path.basename(file_path), open(file_path, 'rb'))}
 
-        response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data, files=files)
+        response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data, files=files, stream=True)
         if response.status_code != 200:
             raise Exception(f"API request failed with status code {response.status_code}: {response.text}")
 
         results = []
-        total_tokens_used = 0
         for line in response.iter_lines(decode_unicode=True):
             if line:
                 data = json.loads(line)
+                if 'extraction_complete' in data:
+                    break
                 result = data['result']
                 if 'error' in result:
                     results.append(result)
@@ -269,6 +268,5 @@ def extract_from_file(
                         if key not in extracted_data:
                             extracted_data[key] = None
                     results.append(extracted_data)
-            total_tokens_used += data['tokens_used']
 
-    return results#, total_tokens_used
\ No newline at end of file
+    return results
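Both remote branches above now expect the `/extract` endpoint to stream newline-delimited JSON: every non-empty line is a JSON object carrying a `result` for one chunk, and a final line containing an `extraction_complete` key ends the stream. A condensed sketch of that consumption loop, using only what the client code above already assumes about the payload:

```python
import json
import requests

# Sketch of the streaming read used by extract_from_url/extract_from_file above.
# `response` is assumed to come from a requests.post(..., stream=True) call like theirs.
def read_extraction_stream(response: requests.Response) -> list:
    results = []
    for line in response.iter_lines(decode_unicode=True):
        if not line:
            continue
        payload = json.loads(line)
        if 'extraction_complete' in payload:  # sentinel line: server is done
            break
        results.append(payload['result'])  # one result object per streamed chunk
    return results
```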
diff --git a/thepipe/scraper.py b/thepipe/scraper.py
index 402d8a9..d720110 100644
--- a/thepipe/scraper.py
+++ b/thepipe/scraper.py
@@ -206,7 +206,7 @@ def process_page(page_num):
         response = openrouter_client.chat.completions.create(
             model=ai_model,
             messages=messages,
-            temperature=0.1
+            temperature=0
         )
         try:
             llm_response = response.choices[0].message.content.strip()
@@ -377,7 +377,7 @@ def ai_extract_webpage_content(url: str, text_only: Optional[bool] = False, verb
         response = openrouter_client.chat.completions.create(
             model=ai_model,
             messages=messages,
-            temperature=0.1
+            temperature=0
         )
         llm_response = response.choices[0].message.content
         chunk = Chunk(path=url, texts=[llm_response], images=[stacked_image])
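Taken together, the extract.py changes also simplify the local path: `extract()` appears to return a `(results, tokens)`-style pair (the removed `#Tuple[List[Dict], int]` annotation and the new `[0]` indexing both point that way), so `extract_from_file(..., local=True)` now hands back just the list of per-chunk dicts. A hedged usage sketch, reusing the schema and sample file from the new tests:

```python
# Usage sketch only -- assumes a local install of thepipe_api and the relevant
# model credentials in the environment; the file path comes from the new tests.
from thepipe.extract import extract_from_file

schema = {"document_topic": "string", "document_sentiment": "float"}
results = extract_from_file(
    file_path="tests/files/example.pdf",
    schema=schema,
    ai_model="gpt-4o-mini",
    local=True,  # run scraping + extraction locally instead of calling the hosted API
)
for item in results:
    print(item.get("document_topic"), item.get("document_sentiment"))
```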