Commit 9fa148b

fixed to_message, fixed by_document chunker

emcf committed Jul 5, 2024
1 parent dab1555 commit 9fa148b

Showing 9 changed files with 244 additions and 123 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -35,7 +35,7 @@ thepi.pe is an AI-native scraping engine that generates LLM-ready markdown and v
 
 ## Get started in 5 minutes 🚀
 
-thepi.pe can read a wide range of filetypes and web sources, so it requires a few dependencies. It also requires a strong machine (16GB+ VRAM for optimal response times) for AI extraction features. For these reasons, we host a REST API that works out-of-the-box at [thepi.pe](https://thepi.pe).
+thepi.pe can read a wide range of filetypes and web sources, so it requires a few dependencies. It also requires a strong machine (16GB+ VRAM for optimal PDF & video response times) for AI extraction features. For these reasons, we host a REST API that works out-of-the-box at [thepi.pe](https://thepi.pe).
 
 ### Hosted API (Python)
 
@@ -49,7 +49,7 @@ import thepipe_api as tp
 from openai import OpenAI
 
 # scrape markdown + images
-chunks = tp.scrape_file(
+chunks = tp.scrape(
     source="example.pdf",
     ai_extraction=True
 )
@@ -58,7 +58,7 @@ chunks = tp.scrape_file(
 client = OpenAI()
 response = client.chat.completions.create(
     model="gpt-4o",
-    messages=tp.to_messages(chunks),
+    messages=tp.chunks_to_messages(chunks),
 )
 ```
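
For orientation, here is the updated quickstart assembled end-to-end as this commit leaves it. Note that `__init__.py` in this commit exports `scrape_file`, `scrape_url`, and `scrape_directory` but not `scrape`, so this sketch calls `tp.scrape_file`; it also assumes a local `example.pdf` and an `OPENAI_API_KEY` in the environment.

```python
import thepipe_api as tp
from openai import OpenAI

# scrape a PDF into LLM-ready chunks of markdown + images
# (ai_extraction=True needs the hosted API or a strong GPU)
chunks = tp.scrape_file(
    source="example.pdf",
    ai_extraction=True
)

# convert chunks to multimodal user messages and query the model
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages=tp.chunks_to_messages(chunks),
)
print(response.choices[0].message.content)
```
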
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='thepipe_api',
-    version='1.0.0',
+    version='1.0.1',
     author='Emmett McFarlane',
     author_email='[email protected]',
     description='AI-native scraper for multimodal LLMs.',
21 changes: 16 additions & 5 deletions tests/test_core.py
@@ -38,9 +38,9 @@ def test_image_to_base64(self):
         decoded_image = Image.open(BytesIO(image_data))
         self.assertEqual(image.size, decoded_image.size)
 
-    def test_to_messages(self):
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.md")
-        messages = core.to_messages(chunks)
+    def test_chunks_to_messages(self):
+        chunks = scraper.scrape_file(source=self.files_directory+"/example.md", local=True)
+        messages = core.chunks_to_messages(chunks)
         self.assertEqual(type(messages), list)
         for message in messages:
             self.assertEqual(type(message), dict)
@@ -55,12 +55,23 @@ def test_save_outputs(self):
             text = file.read()
         self.assertIn('Hello, World!', text)
         # verify with images
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.jpg")
+        chunks = scraper.scrape_file(source=self.files_directory+"/example.jpg", local=True)
         thepipe.save_outputs(chunks)
         self.assertTrue(any('.jpg' in f for f in os.listdir(self.outputs_directory)))
 
     def test_parse_arguments(self):
         args = thepipe.parse_arguments()
         self.assertEqual(type(args), argparse.Namespace)
         self.assertIn('source', vars(args))
-        self.assertIn('include_regex', vars(args))
+        self.assertIn('include_regex', vars(args))
+
+    def test_calculate_tokens(self):
+        text = "Hello, World!"
+        tokens = core.calculate_tokens([core.Chunk(texts=[text])])
+        self.assertAlmostEqual(tokens, 3.25, places=0)
+
+    def test_calculate_image_tokens(self):
+        image = Image.open(os.path.join(self.files_directory, 'example.jpg'))
+        image.load()  # needed to close the underlying file
+        tokens = core.calculate_image_tokens(image)
+        self.assertAlmostEqual(tokens, 85, places=0)
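
The expected values in these new tests follow from simple heuristics: "Hello, World!" is 13 characters and 13 / 4 = 3.25, implying a ~4-characters-per-token text estimate, while 85 is the flat base cost for a low-detail image in the OpenAI vision pricing that `core.py` cites. A sketch of that arithmetic under those assumptions (the helpers below are illustrative, not the library's code):

```python
# Illustrative re-derivation of the expected test values (assumptions noted above).

def approx_text_tokens(text: str) -> float:
    # ~4 characters per token, the heuristic implied by the 3.25 assertion
    return len(text) / 4

def approx_low_detail_image_tokens() -> int:
    # OpenAI's vision pricing charges a flat 85-token base for low-detail images
    return 85

assert approx_text_tokens("Hello, World!") == 3.25  # 13 chars / 4
assert approx_low_detail_image_tokens() == 85
```
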
26 changes: 13 additions & 13 deletions tests/test_scraper.py
@@ -20,7 +20,7 @@ def tearDown(self):
         os.rmdir(self.outputs_directory)
 
     def test_scrape_zip(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True, local=True)
         # verify it scraped the zip file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -31,7 +31,7 @@ def test_scrape_zip(self):
         self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks))
 
     def test_scrape_ipynb(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.ipynb", verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.ipynb", verbose=True, local=True)
         # verify it scraped the ipynb file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -42,7 +42,7 @@ def test_scrape_ipynb(self):
         self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks))
 
     def test_scrape_pdf_with_ai_extraction(self):
-        chunks = scraper.scrape_file("tests/files/example.pdf", ai_extraction=True, verbose=True)
+        chunks = scraper.scrape_file("tests/files/example.pdf", ai_extraction=True, verbose=True, local=True)
         # verify it scraped the pdf file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -52,7 +52,7 @@ def test_scrape_pdf_with_ai_extraction(self):
             self.assertIsNotNone(chunk.texts or chunk.images)
 
     def test_scrape_docx(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.docx", verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.docx", verbose=True, local=True)
         # verify it scraped the docx file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -63,7 +63,7 @@ def test_scrape_docx(self):
         self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks))
 
     def test_extract_pdf_without_ai_extraction(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.pdf", ai_extraction=False, verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.pdf", ai_extraction=False, verbose=True, local=True)
         # verify it scraped the pdf file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -74,7 +74,7 @@ def test_extract_pdf_without_ai_extraction(self):
         self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks))
 
     def test_scrape_audio(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.mp3", verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.mp3", verbose=True, local=True)
         # verify it scraped the audio file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -85,7 +85,7 @@ def test_scrape_audio(self):
         self.assertTrue(any('citizens' in chunk.texts[0].lower() for chunk in chunks if chunk.texts is not None))
 
     def test_scrape_video(self):
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.mp4", verbose=True)
+        chunks = scraper.scrape_file(source=self.files_directory+"/example.mp4", verbose=True, local=True)
         # verify it scraped the video file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -98,7 +98,7 @@ def test_scrape_video(self):
         self.assertTrue(any('citizens' in chunk.texts[0].lower() for chunk in chunks if chunk.texts is not None))
 
     def test_scrape_pptx(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.pptx", verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.pptx", verbose=True, local=True)
         # verify it scraped the pptx file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -110,7 +110,7 @@ def test_scrape_pptx(self):
 
     def test_scrape_tweet(self):
         tweet_url = "https://x.com/ylecun/status/1796734866156843480"
-        chunks = scraper.scrape_url(tweet_url)
+        chunks = scraper.scrape_url(tweet_url, local=True)
         # verify it returned chunks representing the tweet
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -120,7 +120,7 @@ def test_scrape_tweet(self):
         self.assertTrue(len(chunks[0].images) > 0)
 
     def test_scrape_youtube(self):
-        chunks = scraper.scrape_url("https://www.youtube.com/watch?v=So7TNRhIYJ8")
+        chunks = scraper.scrape_url("https://www.youtube.com/watch?v=So7TNRhIYJ8", local=True)
        # verify it scraped the youtube video into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -134,7 +134,7 @@ def test_scrape_youtube(self):
 
     def test_scrape_url(self):
         # verify web page scrape result
-        chunks = scraper.scrape_url('https://en.wikipedia.org/wiki/Piping')
+        chunks = scraper.scrape_url('https://en.wikipedia.org/wiki/Piping', local=True)
         for chunk in chunks:
             self.assertEqual(type(chunk), core.Chunk)
             self.assertEqual(chunk.path, 'https://en.wikipedia.org/wiki/Piping')
@@ -144,12 +144,12 @@ def test_scrape_url(self):
         # verify if at least one image was scraped
         self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks))
         # verify file url scrape result
-        chunks = scraper.scrape_url('https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf')
+        chunks = scraper.scrape_url('https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf', local=True)
         self.assertEqual(len(chunks), 1)
 
     @unittest.skipUnless(os.environ.get('GITHUB_TOKEN'), "requires GITHUB_TOKEN")
     def test_scrape_github(self):
-        chunks = scraper.scrape_url('https://github.com/emcf/thepipe')
+        chunks = scraper.scrape_url('https://github.com/emcf/thepipe', local=True)
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0) # should have some repo contents
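
Every scrape call in this file gains `local=True`. A plausible reading, though the diff itself does not show the flag's implementation, is that it forces in-process scraping rather than routing through the hosted thepi.pe API, letting the suite run without an API key. A representative call under that assumption:

```python
from thepipe_api import scraper

# assumed semantics: local=True scrapes in-process rather than via the hosted API
chunks = scraper.scrape_file("tests/files/example.pdf", ai_extraction=False, verbose=True, local=True)
```
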
4 changes: 3 additions & 1 deletion thepipe_api/__init__.py
@@ -1,2 +1,4 @@
 from .scraper import scrape_file, scrape_url, scrape_directory
-from .chunker import chunk_by_page, chunk_by_section, chunk_semantic
+from .chunker import chunk_by_page, chunk_by_section, chunk_semantic
+from .core import Chunk, calculate_tokens, chunks_to_messages
+from .thepipe import extract # deprecated
26 changes: 24 additions & 2 deletions thepipe_api/chunker.py
@@ -3,6 +3,24 @@
 from .core import Chunk, calculate_tokens
 from sklearn.metrics.pairwise import cosine_similarity
 
+def chunk_by_document(chunks: List[Chunk]) -> List[Chunk]:
+    chunks_by_doc = {}
+    new_chunks = []
+    for chunk in chunks:
+        if not chunk.path:
+            raise ValueError("Document chunking requires the path attribute to determine the document boundaries")
+        if chunk.path not in chunks_by_doc:
+            chunks_by_doc[chunk.path] = []
+        chunks_by_doc[chunk.path].append(chunk)
+    for doc_chunks in chunks_by_doc.values():
+        doc_texts = []
+        doc_images = []
+        for chunk in doc_chunks:
+            doc_texts.extend(chunk.texts)
+            doc_images.extend(chunk.images)
+        new_chunks.append(Chunk(path=doc_chunks[0].path, texts=doc_texts, images=doc_images))
+    return new_chunks
+
 def chunk_by_page(chunks: List[Chunk]) -> List[Chunk]:
     # by-page chunking is default behavior
     return chunks
@@ -28,7 +46,7 @@ def chunk_by_section(chunks: List[Chunk]) -> List[Chunk]:
         section_chunks.append(Chunk(texts=[current_chunk_text], images=current_chunk_images))
     return section_chunks
 
-def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', buffer_size: int = 2, similarity_threshold: float = 0.5) -> List[Chunk]:
+def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', buffer_size: int = 3, similarity_threshold: float = 0.1) -> List[Chunk]:
     from sentence_transformers import SentenceTransformer
     model = SentenceTransformer(model_name)
     # Flatten the chunks into sentences
@@ -69,8 +87,12 @@ def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers
     for group in grouped_sentences:
         group_texts = [sentences[i] for i in group]
         group_images = []
+        seen_images = []
         for i in group:
-            group_images.extend(sentence_chunk_map[i].images)
+            for image in sentence_chunk_map[i].images:
+                if image not in seen_images:
+                    group_images.append(image)
+                    seen_images.append(image)
         new_chunks.append(Chunk(texts=group_texts, images=group_images))
 
     return new_chunks
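
The new `chunk_by_document` merges all chunks sharing a `path` into a single document-level chunk, preserving encounter order. A minimal usage sketch (the paths and in-memory `Chunk` values are illustrative; `chunk_by_document` is imported from the module directly, since this commit does not add it to `__init__.py`):

```python
from thepipe_api.core import Chunk
from thepipe_api.chunker import chunk_by_document

# two page-level chunks from one document, plus one from another
pages = [
    Chunk(path="docs/a.pdf", texts=["page 1 text"], images=[]),
    Chunk(path="docs/a.pdf", texts=["page 2 text"], images=[]),
    Chunk(path="docs/b.pdf", texts=["page 1 text"], images=[]),
]

docs = chunk_by_document(pages)
assert len(docs) == 2  # one merged chunk per path
assert docs[0].texts == ["page 1 text", "page 2 text"]  # a.pdf pages combined
```
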
30 changes: 14 additions & 16 deletions thepipe_api/core.py
@@ -19,7 +19,18 @@ def to_llamaindex(self) -> List[Union[Document, ImageDocument]]:
             return [ImageDocument(text=document_text, image=image) for image in self.images]
         else:
             return [Document(text=document_text)]
+
+    def to_message(self) -> Dict:
+        content = []
+        if self.texts:
+            for text in self.texts:
+                content.append({"type": "text", "text": {"content": text}})
+        if self.images:
+            for image in self.images:
+                base64_image = image_to_base64(image)
+                content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}})
+        return {"role": "user", "content": content}
 
 # uses https://platform.openai.com/docs/guides/vision
 def calculate_image_tokens(image: Image.Image, detail: str = "auto") -> int:
     width, height = image.size
@@ -56,18 +67,5 @@ def image_to_base64(image: Image.Image) -> str:
     image.save(buffered, format="JPEG")
     return base64.b64encode(buffered.getvalue()).decode()
 
-def to_messages(chunks: List[Chunk]) -> List[Dict]:
-    # audio and video are not yet supported as they
-    # are not common in SOTA multimodel LLMs (June 2024)
-    messages = []
-    for chunk in chunks:
-        content = []
-        if chunk.texts:
-            for text in chunk.texts:
-                content.append({"type": "text", "text": {"content": text}})
-        if chunk.images:
-            for image in chunk.images:
-                base64_image = image_to_base64(image)
-                content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}})
-        messages.append({"role": "user", "content": content})
-    return messages
+def chunks_to_messages(chunks: List[Chunk]) -> List[Dict]:
+    return [chunk.to_message() for chunk in chunks]
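
After this change each `Chunk` serializes itself, and `chunks_to_messages` simply maps `to_message` over the list, yielding one user message per chunk. For reference, the dict a chunk holding one text and one image produces looks like this (base64 payload abbreviated):

```python
# Shape emitted by Chunk.to_message, as the code above constructs it.
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": {"content": "# Example markdown"}},
        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQ..."}},
    ],
}
```
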