From 740591db88e3aef379459aa291832e7d4299b689 Mon Sep 17 00:00:00 2001 From: Emmett McFaralne Date: Wed, 17 Apr 2024 00:48:25 -0400 Subject: [PATCH] fixed failed file --- thepipe_api/extractor.py | 49 +++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/thepipe_api/extractor.py b/thepipe_api/extractor.py index a52f0ed..723b1a7 100644 --- a/thepipe_api/extractor.py +++ b/thepipe_api/extractor.py @@ -46,16 +46,23 @@ def extract_from_source(source: str, match: Optional[str] = None, ignore: Option def extract_from_file(file_path: str, source_type: str, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = True) -> List[Chunk]: if not local: - with open(file_path, 'rb') as f: - response = requests.post( - url=API_URL, - files={'file': (file_path, f)}, - data={'api_key': THEPIPE_API_KEY, 'ai_extraction': ai_extraction, 'text_only': text_only} - ).json() - if 'error' in response: - raise ValueError(f"{response['error']}") - chunks = create_chunks_from_messages(response['messages']) - return chunks + try: + with open(file_path, 'rb') as f: + response = requests.post( + url=API_URL, + files={'file': (file_path, f)}, + data={'api_key': THEPIPE_API_KEY, 'ai_extraction': ai_extraction, 'text_only': text_only} + ) + except Exception as e: + raise ValueError(f"Failed to extract from file. This may mean our backend couldn't handle this request. Exception: {e}.") + try: + response = response.json() + except json.JSONDecodeError: + raise ValueError(f"Our backend likely couldn't handle this request. This can happen with large content, very large files, or unsupported files") + if 'error' in response: + raise ValueError(f"{response['error']}") + chunks = create_chunks_from_messages(response['messages']) + return chunks try: if source_type == SourceTypes.PDF: extraction = extract_pdf(file_path=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose) @@ -176,11 +183,14 @@ def extract_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = F files={'file': (file_path, f)}, data={'api_key': THEPIPE_API_KEY, 'ai_extraction': ai_extraction, 'text_only': text_only} ) + try: response_json = response.json() - if 'error' in response_json: - raise ValueError(f"{response_json['error']}") - messages = response_json['messages'] - chunks = create_chunks_from_messages(messages) + except json.JSONDecodeError: + raise ValueError(f"Our backend likely couldn't handle this request. This can happen with large content such as videos, streams, or very large files/websites. See emmett@thepi.pe for help.") + if 'error' in response_json: + raise ValueError(f"{response_json['error']}") + messages = response_json['messages'] + chunks = create_chunks_from_messages(messages) else: import fitz # extract text and images of each page from the PDF @@ -234,10 +244,13 @@ def extract_spreadsheet(file_path: str) -> Chunk: def extract_url(url: str, text_only: bool = False, local: bool = True) -> List[Chunk]: if not local: - response = requests.post( - url=API_URL, - data={'url': url, 'api_key': THEPIPE_API_KEY, 'text_only': text_only} - ).json() + try: + response = requests.post( + url=API_URL, + data={'url': url, 'api_key': THEPIPE_API_KEY, 'text_only': text_only} + ).json() + except Exception as e: + raise ValueError(f"Failed to extract from URL. This may mean our backend couldn't handle this request. Exception: {e}.") if 'error' in response: raise ValueError(f"{response['error']}") chunks = create_chunks_from_messages(response['messages'])