
Commit

fixed failed file
emcf committed Apr 17, 2024
1 parent dc19331 commit 740591d
Showing 1 changed file with 31 additions and 18 deletions.
49 changes: 31 additions & 18 deletions thepipe_api/extractor.py
@@ -46,16 +46,23 @@ def extract_from_source(source: str, match: Optional[str] = None, ignore: Option

 def extract_from_file(file_path: str, source_type: str, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = True) -> List[Chunk]:
     if not local:
-        with open(file_path, 'rb') as f:
-            response = requests.post(
-                url=API_URL,
-                files={'file': (file_path, f)},
-                data={'api_key': THEPIPE_API_KEY, 'ai_extraction': ai_extraction, 'text_only': text_only}
-            ).json()
-        if 'error' in response:
-            raise ValueError(f"{response['error']}")
-        chunks = create_chunks_from_messages(response['messages'])
-        return chunks
+        try:
+            with open(file_path, 'rb') as f:
+                response = requests.post(
+                    url=API_URL,
+                    files={'file': (file_path, f)},
+                    data={'api_key': THEPIPE_API_KEY, 'ai_extraction': ai_extraction, 'text_only': text_only}
+                )
+        except Exception as e:
+            raise ValueError(f"Failed to extract from file. This may mean our backend couldn't handle this request. Exception: {e}.")
+        try:
+            response = response.json()
+        except json.JSONDecodeError:
+            raise ValueError(f"Our backend likely couldn't handle this request. This can happen with large content, very large files, or unsupported files")
+        if 'error' in response:
+            raise ValueError(f"{response['error']}")
+        chunks = create_chunks_from_messages(response['messages'])
+        return chunks
     try:
         if source_type == SourceTypes.PDF:
             extraction = extract_pdf(file_path=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose)
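
For orientation, a hypothetical caller sketch, not part of this commit: after this change, connection failures, non-JSON responses, and API-reported errors in the remote path all surface as ValueError, so a caller needs only one except clause. The import paths and the home of SourceTypes are assumptions, not confirmed by the diff.

# Hypothetical usage sketch; module paths are assumed.
from thepipe_api.extractor import extract_from_file
from thepipe_api.core import SourceTypes  # assumed location of SourceTypes

try:
    chunks = extract_from_file(
        file_path="report.pdf",        # hypothetical input
        source_type=SourceTypes.PDF,
        local=False,                   # route through the hosted API
    )
except ValueError as e:
    # After this commit, transport errors, non-JSON bodies, and
    # backend-reported errors all arrive here.
    print(f"Extraction failed: {e}")
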
@@ -176,11 +183,14 @@ def extract_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = F
                 files={'file': (file_path, f)},
                 data={'api_key': THEPIPE_API_KEY, 'ai_extraction': ai_extraction, 'text_only': text_only}
             )
-        response_json = response.json()
-        if 'error' in response_json:
-            raise ValueError(f"{response_json['error']}")
-        messages = response_json['messages']
-        chunks = create_chunks_from_messages(messages)
+        try:
+            response_json = response.json()
+        except json.JSONDecodeError:
+            raise ValueError(f"Our backend likely couldn't handle this request. This can happen with large content such as videos, streams, or very large files/websites. See [email protected] for help.")
+        if 'error' in response_json:
+            raise ValueError(f"{response_json['error']}")
+        messages = response_json['messages']
+        chunks = create_chunks_from_messages(messages)
     else:
         import fitz
         # extract text and images of each page from the PDF
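
A note on the except clause above, with a minimal standalone sketch: requests.Response.json() raises requests.exceptions.JSONDecodeError (requests 2.27 and later), which subclasses the standard library's json.JSONDecodeError when the stdlib json backend is in use, so catching json.JSONDecodeError works here. If simplejson is installed, requests decodes with it and raises a different type, which is worth keeping in mind. The helper name below is hypothetical.

# Standalone sketch of the decode-guard pattern from the diff
# (assumes requests >= 2.27 with the stdlib json backend).
import json
import requests

def parse_backend_response(response: requests.Response) -> dict:
    try:
        return response.json()
    except json.JSONDecodeError:
        # Raised when the backend returns a non-JSON body (e.g. an HTML error page)
        raise ValueError("Backend returned a non-JSON response body")
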
@@ -234,10 +244,13 @@ def extract_spreadsheet(file_path: str) -> Chunk:

 def extract_url(url: str, text_only: bool = False, local: bool = True) -> List[Chunk]:
     if not local:
-        response = requests.post(
-            url=API_URL,
-            data={'url': url, 'api_key': THEPIPE_API_KEY, 'text_only': text_only}
-        ).json()
+        try:
+            response = requests.post(
+                url=API_URL,
+                data={'url': url, 'api_key': THEPIPE_API_KEY, 'text_only': text_only}
+            ).json()
+        except Exception as e:
+            raise ValueError(f"Failed to extract from URL. This may mean our backend couldn't handle this request. Exception: {e}.")
         if 'error' in response:
             raise ValueError(f"{response['error']}")
         chunks = create_chunks_from_messages(response['messages'])
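
One hedged design note, not a change made in this commit: the bare except Exception in the new try blocks also converts programming errors (say, a NameError) into the generic ValueError. A narrower guard that still covers DNS failures, timeouts, and connection resets is requests.exceptions.RequestException, sketched below as an alternative with a hypothetical helper name.

import requests

def post_with_guard(url: str, data: dict, timeout: float = 60.0) -> requests.Response:
    # RequestException is the base class for requests' transport errors
    # (ConnectionError, Timeout, TooManyRedirects, ...), so this catches
    # network failures without masking unrelated bugs.
    try:
        return requests.post(url, data=data, timeout=timeout)
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Failed to reach the backend: {e}")
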
