Updated compression method: each pass now compresses the largest chunk first

emcf · May 17, 2024 · 061c626
1 parent 87bcf1c
commit 061c626
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 15 deletions.
diff --git a/tests/test_thepipe.py b/tests/test_thepipe.py
@@ -148,15 +148,7 @@ def test_extract_pdf_with_ai_extraction(self):
             elif chunks[i].source_type == core.SourceTypes.IMAGE:
                 # verify extraction contains image
                 self.assertIsNotNone(chunks[i].image)
-
-    def test_compress_spreadsheet(self):
-        chunks = extractor.extract_from_source(source=self.files_directory+"/example.xlsx")
-        new_chunks = compressor.compress_chunks(chunks=chunks, limit=30)
-        # verify that the compressed text is shorter than the original
-        all_text = ''.join([chunk.text for chunk in chunks if chunk.text])
-        all_new_text = ''.join([chunk.text for chunk in new_chunks if chunk.text])
-        self.assertLess(len(all_new_text), len(all_text))
-
+
     def test_compress_with_llmlingua(self):
         chunks = extractor.extract_from_source(source=self.files_directory+"/example.md")
         new_chunks = compressor.compress_chunks(chunks=chunks, limit=30)

diff --git a/thepipe_api/compressor.py b/thepipe_api/compressor.py
@@ -2,15 +2,15 @@
 import shutil
 import subprocess
 import tempfile
-from typing import List
+from typing import List, Optional
 import os
 from .core import Chunk, SourceTypes, print_status, count_tokens
 from .thepipe import count_tokens
 
 CTAGS_EXECUTABLE_PATH = "C:\ctags.exe" if os.name == 'nt' else "ctags-universal"
 CTAGS_LANGUAGES = {'py': "Python", "cpp": "C++", "c": "C"}
 CTAGS_OUTPUT_FILE = 'ctags_output.json'
-MAX_COMPRESSION_ATTEMPTS = 3
+MAX_COMPRESSION_ATTEMPTS = 10
 
 def compress_with_ctags(chunk: Chunk, extension: str) -> Chunk:
     if chunk.text is None:
@@ -68,24 +68,27 @@ def compress_with_llmlingua(chunk: Chunk) -> Chunk:
     new_chunk = Chunk(path=chunk.path, text=new_chunk_text, image=chunk.image, source_type=chunk.source_type)
     return new_chunk
 
-def compress_chunks(chunks: List[Chunk], verbose: bool = False, limit: int = 1e5) -> List[Chunk]:
+def compress_chunks(chunks: List[Chunk], verbose: bool = False, limit: Optional[int] = None) -> List[Chunk]:
     new_chunks = chunks
     for _ in range(MAX_COMPRESSION_ATTEMPTS):
         if count_tokens(new_chunks) <= limit:
             break
         if verbose: print_status(f"Compressing prompt ({count_tokens(chunks)} tokens / {limit} limit)", status='info')
         new_chunks = []
+        longest_chunk = max(chunks, key=lambda x: len(x.text) if x.text is not None else 0)
         for chunk in chunks:
+            # if not longest, skip
+            if chunk != longest_chunk:
+                new_chunks.append(chunk)
+                continue
             new_chunk = None
-            if chunk is None or  chunk.text is None:
+            if chunk is None or chunk.text is None:
                 new_chunk = chunk
             elif chunk.source_type == SourceTypes.COMPRESSIBLE_CODE:
                 extension = chunk.path.split('.')[-1]
                 new_chunk = compress_with_ctags(chunk, extension=extension)
             elif chunk.source_type in {SourceTypes.PLAINTEXT, SourceTypes.PDF, SourceTypes.DOCX, SourceTypes.PPTX, SourceTypes.URL}:
                 new_chunk = compress_with_llmlingua(chunk)
-            elif chunk.source_type == SourceTypes.SPREADSHEET:
-                new_chunk = compress_with_llmlingua(chunk)
             else:
                 # if the chunk is not compressible, keep the original text
                 new_chunk = chunk