Skip to content

Commit

Permalink
Updated compression method, now doing largest chunks first
Browse files Browse the repository at this point in the history
  • Loading branch information
emcf committed May 17, 2024
1 parent 87bcf1c commit 061c626
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 15 deletions.
10 changes: 1 addition & 9 deletions tests/test_thepipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,15 +148,7 @@ def test_extract_pdf_with_ai_extraction(self):
elif chunks[i].source_type == core.SourceTypes.IMAGE:
# verify extraction contains image
self.assertIsNotNone(chunks[i].image)

def test_compress_spreadsheet(self):
    """Check that compressing the example spreadsheet yields less text."""
    spreadsheet_path = self.files_directory+"/example.xlsx"
    original_chunks = extractor.extract_from_source(source=spreadsheet_path)
    compressed_chunks = compressor.compress_chunks(chunks=original_chunks, limit=30)
    # Join only the chunks that actually carry text, before and after compression.
    text_before = ''.join(piece.text for piece in original_chunks if piece.text)
    text_after = ''.join(piece.text for piece in compressed_chunks if piece.text)
    # The compressed text must be strictly shorter than the original.
    self.assertLess(len(text_after), len(text_before))


def test_compress_with_llmlingua(self):
chunks = extractor.extract_from_source(source=self.files_directory+"/example.md")
new_chunks = compressor.compress_chunks(chunks=chunks, limit=30)
Expand Down
15 changes: 9 additions & 6 deletions thepipe_api/compressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
import shutil
import subprocess
import tempfile
from typing import List
from typing import List, Optional
import os
from .core import Chunk, SourceTypes, print_status, count_tokens
from .thepipe import count_tokens

# ctags executable: a fixed path on Windows (os.name == 'nt'), otherwise the
# `ctags-universal` command name. The raw string keeps the backslash literal
# instead of relying on the invalid "\c" escape, which newer Python versions
# warn about.
CTAGS_EXECUTABLE_PATH = r"C:\ctags.exe" if os.name == 'nt' else "ctags-universal"
# Keys look like source-file extensions mapped to language names
# (presumably the names handed to ctags; confirm against compress_with_ctags).
CTAGS_LANGUAGES = {'py': "Python", "cpp": "C++", "c": "C"}
# File name for ctags output (presumably JSON, judging by the suffix; confirm).
CTAGS_OUTPUT_FILE = 'ctags_output.json'
MAX_COMPRESSION_ATTEMPTS = 3
MAX_COMPRESSION_ATTEMPTS = 10

def compress_with_ctags(chunk: Chunk, extension: str) -> Chunk:
if chunk.text is None:
Expand Down Expand Up @@ -68,24 +68,27 @@ def compress_with_llmlingua(chunk: Chunk) -> Chunk:
new_chunk = Chunk(path=chunk.path, text=new_chunk_text, image=chunk.image, source_type=chunk.source_type)
return new_chunk

def compress_chunks(chunks: List[Chunk], verbose: bool = False, limit: int = 1e5) -> List[Chunk]:
def compress_chunks(chunks: List[Chunk], verbose: bool = False, limit: Optional[int] = None) -> List[Chunk]:
new_chunks = chunks
for _ in range(MAX_COMPRESSION_ATTEMPTS):
if count_tokens(new_chunks) <= limit:
break
if verbose: print_status(f"Compressing prompt ({count_tokens(chunks)} tokens / {limit} limit)", status='info')
new_chunks = []
longest_chunk = max(chunks, key=lambda x: len(x.text) if x.text is not None else 0)
for chunk in chunks:
# if not longest, skip
if chunk != longest_chunk:
new_chunks.append(chunk)
continue
new_chunk = None
if chunk is None or chunk.text is None:
if chunk is None or chunk.text is None:
new_chunk = chunk
elif chunk.source_type == SourceTypes.COMPRESSIBLE_CODE:
extension = chunk.path.split('.')[-1]
new_chunk = compress_with_ctags(chunk, extension=extension)
elif chunk.source_type in {SourceTypes.PLAINTEXT, SourceTypes.PDF, SourceTypes.DOCX, SourceTypes.PPTX, SourceTypes.URL}:
new_chunk = compress_with_llmlingua(chunk)
elif chunk.source_type == SourceTypes.SPREADSHEET:
new_chunk = compress_with_llmlingua(chunk)
else:
# if the chunk is not compressible, keep the original text
new_chunk = chunk
Expand Down

0 comments on commit 061c626

Please sign in to comment.