From 740591db88e3aef379459aa291832e7d4299b689 Mon Sep 17 00:00:00 2001
From: Emmett McFaralne <staminacode@gmail.com>
Date: Wed, 17 Apr 2024 00:48:25 -0400
Subject: [PATCH] fixed failed file

---
 thepipe_api/extractor.py | 49 +++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/thepipe_api/extractor.py b/thepipe_api/extractor.py
index a52f0ed..723b1a7 100644
--- a/thepipe_api/extractor.py
+++ b/thepipe_api/extractor.py
@@ -46,16 +46,23 @@ def extract_from_source(source: str, match: Optional[str] = None, ignore: Option
 
 def extract_from_file(file_path: str, source_type: str, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = True) -> List[Chunk]:
     if not local:
-        with open(file_path, 'rb') as f:
-            response = requests.post(
-                url=API_URL,
-                files={'file': (file_path, f)},
-                data={'api_key': THEPIPE_API_KEY, 'ai_extraction': ai_extraction, 'text_only': text_only}
-            ).json()
-            if 'error' in response:
-                raise ValueError(f"{response['error']}")
-            chunks = create_chunks_from_messages(response['messages'])
-            return chunks
+        try:
+            with open(file_path, 'rb') as f:
+                response = requests.post(
+                    url=API_URL,
+                    files={'file': (file_path, f)},
+                    data={'api_key': THEPIPE_API_KEY, 'ai_extraction': ai_extraction, 'text_only': text_only}
+                )
+        except Exception as e:
+            raise ValueError(f"Failed to extract from file. This may mean our backend couldn't handle this request. Exception: {e}.")
+        try:
+            response = response.json()
+        except json.JSONDecodeError:
+            raise ValueError(f"Our backend likely couldn't handle this request. This can happen with large content, very large files, or unsupported files")
+        if 'error' in response:
+            raise ValueError(f"{response['error']}")
+        chunks = create_chunks_from_messages(response['messages'])
+        return chunks
     try:    
         if source_type == SourceTypes.PDF:
             extraction = extract_pdf(file_path=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose)
@@ -176,11 +183,14 @@ def extract_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = F
                 files={'file': (file_path, f)},
                 data={'api_key': THEPIPE_API_KEY, 'ai_extraction': ai_extraction, 'text_only': text_only}
             )
+        try:
             response_json = response.json()
-            if 'error' in response_json:
-                raise ValueError(f"{response_json['error']}")
-            messages = response_json['messages']
-            chunks = create_chunks_from_messages(messages)
+        except json.JSONDecodeError:
+            raise ValueError(f"Our backend likely couldn't handle this request. This can happen with large content such as videos, streams, or very large files/websites. See emmett@thepi.pe for help.")
+        if 'error' in response_json:
+            raise ValueError(f"{response_json['error']}")
+        messages = response_json['messages']
+        chunks = create_chunks_from_messages(messages)
     else:
         import fitz
         # extract text and images of each page from the PDF
@@ -234,10 +244,13 @@ def extract_spreadsheet(file_path: str) -> Chunk:
     
 def extract_url(url: str, text_only: bool = False, local: bool = True) -> List[Chunk]:
     if not local:
-        response = requests.post(
-            url=API_URL,
-            data={'url': url, 'api_key': THEPIPE_API_KEY, 'text_only': text_only}
-        ).json()
+        try:
+            response = requests.post(
+                url=API_URL,
+                data={'url': url, 'api_key': THEPIPE_API_KEY, 'text_only': text_only}
+            ).json()
+        except Exception as e:
+            raise ValueError(f"Failed to extract from URL. This may mean our backend couldn't handle this request. Exception: {e}.")
         if 'error' in response:
             raise ValueError(f"{response['error']}")
         chunks = create_chunks_from_messages(response['messages'])