Commit c219ddd: passing host_images to json converters

emcf committed Jul 6, 2024 (1 parent: 56094a2)

Showing 5 changed files with 31 additions and 42 deletions.

README.md (10 changes: 5 additions & 5 deletions)

@@ -27,11 +27,11 @@ thepi.pe is an AI-native scraping engine that generates LLM-ready markdown and v
 
 ## Features 🌟
 
-- Extract markdown, tables, and images from any document or web page 📊
-- Output works out-of-the-box with all multimodal LLMs and RAG frameworks 🖼️📚
-- AI filetype detection for missing file extensions and unknown web data 💾
-- Quick-start integrations for Twitter, YouTube, GitHub, and more 🌐
-- GPU-accelerated ⚡️
+- Extract markdown, images, and structured data from any document or web page
+- Output works out-of-the-box with all multimodal LLMs and RAG frameworks
+- AI filetype detection for missing file extensions and unknown web data
+- Quick-start integrations for Twitter, YouTube, GitHub, and more
+- GPU-accelerated
 
 ## Get started in 5 minutes 🚀

tests/test_core.py (25 changes: 15 additions & 10 deletions)

@@ -27,16 +27,6 @@ def test_chunk_to_llamaindex(self):
         self.assertEqual(type(llama_index), list)
         self.assertEqual(len(llama_index), 1)
 
-    def test_image_to_base64(self):
-        image = Image.open(os.path.join(self.files_directory, 'example.jpg'))
-        image.load() # needed to close the file
-        base64_string = core.image_to_base64(image)
-        self.assertEqual(type(base64_string), str)
-        # converting back should be the same
-        image_data = base64.b64decode(base64_string)
-        decoded_image = Image.open(BytesIO(image_data))
-        self.assertEqual(image.size, decoded_image.size)
-
     def test_chunks_to_messages(self):
         chunks = scraper.scrape_file(source=self.files_directory+"/example.md", local=True)
         messages = core.chunks_to_messages(chunks)
@@ -58,6 +48,21 @@ def test_save_outputs(self):
         core.save_outputs(chunks)
         self.assertTrue(any('.jpg' in f for f in os.listdir(self.outputs_directory)))
 
+    def test_chunk_json(self):
+        chunk = core.Chunk(path="example.md", texts=["Hello, World!"])
+        # convert to json
+        chunk_json = chunk.to_json()
+        # verify it is a dictionary with the expected items
+        self.assertEqual(type(chunk_json), dict)
+        self.assertIn('texts', chunk_json)
+        self.assertIn('path', chunk_json)
+        # convert back
+        chunk = core.Chunk.from_json(chunk_json)
+        # verify it is the correct Chunk object
+        self.assertEqual(type(chunk), core.Chunk)
+        self.assertEqual(chunk.path, "example.md")
+        self.assertEqual(chunk.texts, ["Hello, World!"])
+
     def test_parse_arguments(self):
         args = core.parse_arguments()
         self.assertEqual(type(args), argparse.Namespace)

tests/test_scraper.py (15 changes: 0 additions & 15 deletions)

@@ -16,21 +16,6 @@ def tearDown(self):
         for file in os.listdir(self.outputs_directory):
             os.remove(os.path.join(self.outputs_directory, file))
         os.rmdir(self.outputs_directory)
 
-    def test_chunk_json(self):
-        chunk = core.Chunk(path="example.md", texts=["Hello, World!"])
-        # convert to json
-        chunk_json = chunk.to_json()
-        # verify it is a dictionary with the expected items
-        self.assertEqual(type(chunk_json), dict)
-        self.assertIn('texts', chunk_json)
-        self.assertIn('path', chunk_json)
-        # convert back
-        chunk = core.Chunk.from_json(chunk_json)
-        # verify it is the correct Chunk object
-        self.assertEqual(type(chunk), core.Chunk)
-        self.assertEqual(chunk.path, "example.md")
-        self.assertEqual(chunk.texts, ["Hello, World!"])
-
     def test_scrape_zip(self):
         chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True, local=True)

thepipe/core.py (21 changes: 10 additions & 11 deletions)

@@ -5,7 +5,7 @@
 import os
 import time
 from typing import Dict, List, Optional, Union
-from urllib import request
+import requests
 from PIL import Image
 from llama_index.core.schema import Document, ImageDocument
 
@@ -36,30 +36,29 @@ def to_message(self, host_images: bool = False, max_resolution : Optional[int] =
             message["content"].append({"type": "image_url", "image_url": image_url})
         return message
 
-    def to_json(self) -> str:
+    def to_json(self, host_images: bool = False) -> str:
         data = {
             'path': self.path,
             'texts': self.texts,
-            'images': [self.image_to_base64(image) for image in self.images],
+            'images': [make_image_url(image=image, host_images=host_images) for image in self.images],
             'audios': self.audios,
             'videos': self.videos,
         }
         return json.dumps(data)
 
     @staticmethod
-    def from_json(json_str: str) -> 'Chunk':
+    def from_json(json_str: str, host_images: bool = False) -> 'Chunk':
         data = json.loads(json_str)
         images = []
         for image_str in data['images']:
-            # Try to decode the image from base64
-            # if that fails, try to download it
-            try:
-                image_data = base64.b64decode(image_str)
+            if host_images:
+                image_data = requests.get(image_str).content
                 image = Image.open(BytesIO(image_data))
                 images.append(image)
-            except:
-                response = request.get(image_str)
-                image = Image.open(BytesIO(response.content))
+            else:
+                remove_prefix = image_str.replace("data:image/jpeg;base64,", "")
+                image_data = base64.b64decode(remove_prefix)
+                image = Image.open(BytesIO(image_data))
                 images.append(image)
         return Chunk(
             path=data['path'],
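
For context on how the new flag is meant to be used, here is a minimal round-trip sketch. It assumes a local example.jpg exists, that the Chunk constructor accepts an images list of PIL images, and that make_image_url (defined elsewhere in thepipe/core.py, not shown in this commit) returns a base64 data URL when host_images=False and a hosted image URL when host_images=True:

```python
from PIL import Image
from thepipe.core import Chunk

# Build a chunk with one text and one image
# (images kwarg assumed from the self.images attribute used above)
chunk = Chunk(path="example.md", texts=["Hello, World!"], images=[Image.open("example.jpg")])

# Default host_images=False: each image is serialized inline as a
# "data:image/jpeg;base64,..." URL (assumed) inside the JSON string
json_str = chunk.to_json()

# from_json(host_images=False) strips the data-URL prefix and
# base64-decodes the payload back into a PIL image
restored = Chunk.from_json(json_str)

# With host_images=True, images are serialized as hosted URLs instead,
# and from_json re-downloads each one with requests.get
hosted_json = chunk.to_json(host_images=True)
restored_hosted = Chunk.from_json(hosted_json, host_images=True)
```

Note that to_json as committed returns a JSON string via json.dumps, so from_json's json.loads round-trips it directly.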

thepipe/scraper.py (2 changes: 1 addition & 1 deletion)

@@ -60,7 +60,7 @@ def detect_source_type(source: str) -> str:
 def scrape_file(source: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False, local: bool = False) -> List[Chunk]:
     if not local:
         with open(source, 'rb') as f:
-            response = request.post(
+            response = requests.post(
                 url=f"{HOST_URL}/scrape",
                 files={'file': (source, f)},
                 data={'ai_extraction': ai_extraction, 'text_only': text_only}
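
These one-word fixes matter more than they look: from urllib import request binds Python's urllib.request module, which exposes urlopen() and Request but no post() or get() functions, so the previous request.post(...) call here (and request.get(...) in core.py's old from_json) would have raised AttributeError at runtime instead of making an HTTP call. A quick illustration:

```python
from urllib import request

# The old code's calls do not exist on this module:
request.post  # AttributeError: module 'urllib.request' has no attribute 'post'
request.get   # AttributeError: module 'urllib.request' has no attribute 'get'
```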
