From 3fdda829b2bd82e53926dc0d5bf8fc363e668fa8 Mon Sep 17 00:00:00 2001 From: Emmett McFaralne Date: Thu, 18 Apr 2024 14:13:04 -0400 Subject: [PATCH 1/2] added dotenv read --- requirements.txt | 3 ++- requirements_local.txt | 3 ++- thepipe_api/extractor.py | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 21d2fa0..8745692 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ requests pillow cssutils beautifulsoup4 -magika \ No newline at end of file +magika +python-dotenv \ No newline at end of file diff --git a/requirements_local.txt b/requirements_local.txt index 9962995..e42bb9d 100644 --- a/requirements_local.txt +++ b/requirements_local.txt @@ -19,4 +19,5 @@ torch llmlingua PyMuPDF pdf2image -python-magic \ No newline at end of file +python-magic +python-dotenv \ No newline at end of file diff --git a/thepipe_api/extractor.py b/thepipe_api/extractor.py index 57758ef..b88a8e1 100644 --- a/thepipe_api/extractor.py +++ b/thepipe_api/extractor.py @@ -14,6 +14,8 @@ from .core import Chunk, print_status, SourceTypes, create_chunks_from_messages, API_URL import tempfile import mimetypes +import dotenv +dotenv.load_dotenv() FILES_TO_IGNORE = {'package-lock.json', '.gitignore', '.bin', '.pyc', '.pyo', '.exe', '.bat', '.dll', '.obj', '.o', '.a', '.lib', '.so', '.dylib', '.ncb', '.sdf', '.suo', '.pdb', '.idb', '.pyd', '.ipynb_checkpoints', '.npy', '.pth'} # Files to ignore, please feel free to customize! CODE_EXTENSIONS = {'.h', '.json', '.js', '.jsx', '.ts', '.tsx', '.cs', '.java', '.html', '.css', '.ini', '.xml', '.yaml', '.xaml', '.sh'} # Plaintext files that should not be compressed with LLMLingua From 5d950897968f2590a400de796599ca2da84ab3d1 Mon Sep 17 00:00:00 2001 From: Emmett McFaralne Date: Fri, 19 Apr 2024 10:45:28 -0400 Subject: [PATCH 2/2] fixed output prompt format --- thepipe_api/extractor.py | 3 +++ thepipe_api/thepipe.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/thepipe_api/extractor.py b/thepipe_api/extractor.py index b88a8e1..8d6da50 100644 --- a/thepipe_api/extractor.py +++ b/thepipe_api/extractor.py @@ -64,6 +64,9 @@ def extract_from_file(file_path: str, source_type: str, verbose: bool = False, a if 'error' in response: raise ValueError(f"{response['error']}") chunks = create_chunks_from_messages(response['messages']) + for c in chunks: + c.path = file_path + c.source_type = source_type return chunks try: if source_type == SourceTypes.PDF: diff --git a/thepipe_api/thepipe.py b/thepipe_api/thepipe.py index c2cc926..28703e5 100644 --- a/thepipe_api/thepipe.py +++ b/thepipe_api/thepipe.py @@ -15,7 +15,7 @@ def save_outputs(chunks: List[Chunk], verbose: bool = False, text_only: bool = F if chunk is None: continue if chunk.text is not None: - text += f"""```\n{chunk.text}\n```\n\n""" + text += chunk.text if (chunk.image is not None) and (not text_only): if chunk.path is None: clean_path = f"image"