
Commit

api update fixes
emcf committed Sep 30, 2024
1 parent 29d3716 commit 5183396
Showing 5 changed files with 120 additions and 33 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@ def read_git_requirements(file):

setup(
name='thepipe_api',
version='1.3.7',
version='1.3.9',
author='Emmett McFarlane',
author_email='[email protected]',
description='AI-native extractor, powered by multimodal LLMs.',
86 changes: 86 additions & 0 deletions tests/test_api.py
@@ -0,0 +1,86 @@
"""
import unittest
import os

from thepipe.extract import extract_from_file, extract_from_url


class TestAPI(unittest.TestCase):
    def setUp(self):
        self.files_directory = os.path.join(os.path.dirname(__file__), 'files')
        # Example schema for extraction
        self.schema = {
            "document_topic": "string",
            "document_sentiment": "float",
        }

    def test_extract_from_file_with_multiple_extractions(self):
        # Path to the file you want to test
        file_path = os.path.join(self.files_directory, 'example.pdf')
        # Call the real API for extracting data from the file
        try:
            result = extract_from_file(
                file_path=file_path,
                schema=self.schema,
                ai_model="gpt-4o-mini",
                multiple_extractions=True,
                text_only=True,
                ai_extraction=False,
                host_images=False
            )
            print("Extract from file result:", result)
            # Basic assertions to ensure extraction happened
            self.assertIsInstance(result, list)
            self.assertGreater(len(result), 0)
            # Check if the extracted data matches the schema.
            # Since multiple extractions is enabled, we have the 'extractions' key for each chunk,
            # containing all the extractions. The result looks like:
            # [{'chunk_index': 0, 'source': 'example.pdf', 'extraction': [{'document_topic': 'Density PDFs in Supersonic Turbulence', 'document_sentiment': None}]}]
            for item in result:
                self.assertIsInstance(item, dict)
                if 'extractions' in item:
                    for extraction in item['extractions']:
                        self.assertIsInstance(extraction, dict)
                        for key in self.schema:
                            self.assertIn(key, extraction)
        except Exception as e:
            self.fail(f"test_extract_from_file failed with error: {e}")

    def test_extract_from_url_with_one_extraction(self):
        # URL you want to extract information from
        url = 'https://thepi.pe/'  # Update this with your actual URL
        # Call the real API for extracting data from the URL
        try:
            result = extract_from_url(
                url=url,
                schema=self.schema,
                ai_model="gpt-4o-mini",
                multiple_extractions=False,
                text_only=True,
                ai_extraction=False,
                host_images=False
            )
            print("Extract from URL result:", result)
            # Basic assertions to ensure extraction happened
            self.assertIsInstance(result, list)
            self.assertGreater(len(result), 0)
            # Check if the extracted data matches the schema.
            # Since multiple extractions is disabled, we don't have the 'extractions' key for each chunk:
            # [{'chunk_index': 0, 'source': 'https://thepi.pe/', 'document_topic': 'AI document extraction and data processing', 'document_sentiment': 0.8}]
            for item in result:
                self.assertIsInstance(item, dict)
                for key in self.schema:
                    self.assertIn(key, item)
        except Exception as e:
            self.fail(f"test_extract_from_url failed with error: {e}")


if __name__ == '__main__':
    unittest.main()
"""
29 changes: 16 additions & 13 deletions thepipe/core.py
@@ -72,22 +72,25 @@ def to_json(self, host_images: bool = False) -> Dict:
    @staticmethod
    def from_json(data: Dict, host_images: bool = False) -> 'Chunk':
        images = []
-        for image_str in data['images']:
-            if host_images:
-                image_data = requests.get(image_str).content
-                image = Image.open(BytesIO(image_data))
-                images.append(image)
-            else:
-                remove_prefix = image_str.replace("data:image/jpeg;base64,", "")
-                image_data = base64.b64decode(remove_prefix)
-                image = Image.open(BytesIO(image_data))
-                images.append(image)
+        if 'images' in data:
+            for image_str in data['images']:
+                if host_images:
+                    image_data = requests.get(image_str).content
+                    image = Image.open(BytesIO(image_data))
+                    images.append(image)
+                else:
+                    remove_prefix = image_str.replace("data:image/jpeg;base64,", "")
+                    image_data = base64.b64decode(remove_prefix)
+                    image = Image.open(BytesIO(image_data))
+                    images.append(image)
+        if 'texts' in data:
+            texts = [text.strip() for text in data['texts']]
        return Chunk(
            path=data['path'],
-            texts=[text.strip() for text in data['texts']],
+            texts=texts,
            images=images,
-            audios=data['audios'],
-            videos=data['videos'],
+            #audios=data['audios'],
+            #videos=data['videos'],
        )

def make_image_url(image: Image.Image, host_images: bool = False, max_resolution: Optional[int] = None) -> str:
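The net effect of the from_json change is that payloads missing an 'images' key (and lacking the now-commented 'audios'/'videos' fields) deserialize instead of raising KeyError. A minimal sketch, assuming Chunk is importable from thepipe.core and stores its constructor keywords as same-named attributes:

from thepipe.core import Chunk

# No 'images', 'audios', or 'videos' keys: the new 'images' guard simply skips image decoding.
payload = {
    "path": "example.pdf",
    "texts": ["  Density PDFs in Supersonic Turbulence  "],
}

chunk = Chunk.from_json(payload)
print(chunk.path, chunk.texts)  # texts come back stripped of surrounding whitespace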
32 changes: 15 additions & 17 deletions thepipe/extract.py
@@ -9,26 +9,24 @@
import os
from openai import OpenAI

DEFAULT_EXTRACTION_PROMPT = "Extract structured information from the given document according to the following schema: {schema}. Immediately return valid JSON formatted data. If there is missing data, you may use null, but use your reasoning to always fill in every column as best you can. Always immediately return valid JSON."
DEFAULT_EXTRACTION_PROMPT = "Extract all the information from the given document according to the following schema: {schema}. Immediately return valid JSON formatted data. If there is missing data, you may use null, but always fill in every column as best you can. Always immediately return valid JSON. You must extract ALL the information available in the entire document."
DEFAULT_AI_MODEL = os.getenv("DEFAULT_AI_MODEL", "gpt-4o-mini")

def extract_json_from_response(llm_response: str) -> Union[Dict, List[Dict], None]:
def clean_response_text(llm_response: str) -> str:
return llm_response.encode('utf-8', 'ignore').decode('utf-8').strip()

# try to match inside of code block
code_block_pattern = r'^```(?:json)?\s*([\s\S]*?)\s*```$'
match = re.match(code_block_pattern, llm_response, re.MULTILINE | re.DOTALL)
if match:
llm_response = match.group(1)
llm_response = clean_response_text(llm_response)

# parse json by matching curly braces
try:
parsed_json = json.loads(llm_response)
return parsed_json
except json.JSONDecodeError:
json_pattern = r'(\[[\s\S]*\]|\{[\s\S]*\})'
json_pattern = r'(\[[\s\S]*\]|\{[\s\S]*\})'
match = re.search(json_pattern, llm_response)
if match:
try:
@@ -78,7 +76,7 @@ def extract_from_chunk(chunk: Chunk, chunk_index: int, schema: str, ai_model: st
model=ai_model,
messages=messages,
response_format={"type": "json_object"},
temperature=0.1,
temperature=0,
)
llm_response = response.choices[0].message.content
input_tokens = calculate_tokens([chunk])
@@ -162,10 +160,10 @@ def extract_from_url(
verbose: bool = False,
chunking_method: Optional[Callable[[List[Chunk]], List[Chunk]]] = chunk_by_page,
local: bool = False
) -> List[Dict]: #Tuple[List[Dict], int]:
) -> List[Dict]:
if local:
chunks = scrape_url(url, text_only=text_only, ai_extraction=ai_extraction, verbose=verbose, local=local, chunking_method=chunking_method)
return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)
return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)[0]
else:
headers = {
"Authorization": f"Bearer {THEPIPE_API_KEY}"
@@ -181,15 +179,16 @@
'ai_extraction': str(ai_extraction).lower(),
'chunking_method': chunking_method.__name__
}
response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data)
response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data, stream=True)
if response.status_code != 200:
raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

results = []
total_tokens_used = 0
for line in response.iter_lines(decode_unicode=True):
if line:
data = json.loads(line)
if 'extraction_complete' in data:
break
result = data['result']
if 'error' in result:
results.append(result)
Expand All @@ -207,9 +206,8 @@ def extract_from_url(
if key not in extracted_data:
extracted_data[key] = None
results.append(extracted_data)
total_tokens_used += data['tokens_used']

return results#, total_tokens_used
return results

def extract_from_file(
file_path: str,
@@ -223,10 +221,10 @@
verbose: bool = False,
chunking_method: Optional[Callable[[List[Chunk]], List[Chunk]]] = chunk_by_page,
local: bool = False
) -> List[Dict]: #Tuple[List[Dict], int]:
) -> List[Dict]:
if local:
chunks = scrape_file(file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=local, chunking_method=chunking_method)
return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)
return extract(chunks=chunks, schema=schema, ai_model=ai_model, multiple_extractions=multiple_extractions, extraction_prompt=extraction_prompt, host_images=host_images)[0]
else:
headers = {
"Authorization": f"Bearer {THEPIPE_API_KEY}"
@@ -243,15 +241,16 @@
}
files = {'files': (os.path.basename(file_path), open(file_path, 'rb'))}

response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data, files=files)
response = requests.post(f"{HOST_URL}/extract", headers=headers, data=data, files=files, stream=True)
if response.status_code != 200:
raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

results = []
total_tokens_used = 0
for line in response.iter_lines(decode_unicode=True):
if line:
data = json.loads(line)
if 'extraction_complete' in data:
break
result = data['result']
if 'error' in result:
results.append(result)
@@ -269,6 +268,5 @@
if key not in extracted_data:
extracted_data[key] = None
results.append(extracted_data)
total_tokens_used += data['tokens_used']

return results#, total_tokens_used
return results
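For context on the parsing these extraction calls depend on: extract_json_from_response strips a Markdown code fence if the model wraps its JSON in one, normalizes the text with the clean_response_text helper, and only falls back to a bracket-matching regex when json.loads fails. A minimal sketch of the happy path, assuming the function is importable from thepipe.extract as defined above:

from thepipe.extract import extract_json_from_response

# Model output wrapped in a Markdown code fence, as JSON-mode models often return it.
llm_response = "```json\n{\"document_topic\": \"AI document extraction\", \"document_sentiment\": 0.8}\n```"

parsed = extract_json_from_response(llm_response)
print(parsed)  # {'document_topic': 'AI document extraction', 'document_sentiment': 0.8}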
4 changes: 2 additions & 2 deletions thepipe/scraper.py
@@ -206,7 +206,7 @@ def process_page(page_num):
response = openrouter_client.chat.completions.create(
model=ai_model,
messages=messages,
temperature=0.1
temperature=0
)
try:
llm_response = response.choices[0].message.content.strip()
@@ -377,7 +377,7 @@ def ai_extract_webpage_content(url: str, text_only: Optional[bool] = False, verb
response = openrouter_client.chat.completions.create(
model=ai_model,
messages=messages,
temperature=0.1
temperature=0
)
llm_response = response.choices[0].message.content
chunk = Chunk(path=url, texts=[llm_response], images=[stacked_image])

