Commit c219ddd: passing host_images to json converters

emcf committed Jul 6, 2024 (1 parent: 56094a2)

Showing 5 changed files with 31 additions and 42 deletions.

README.md (10 changes: 5 additions & 5 deletions)

@@ -27,11 +27,11 @@ thepi.pe is an AI-native scraping engine that generates LLM-ready markdown and v
 
 ## Features 🌟
 
-- Extract markdown, tables, and images from any document or web page 📊
-- Output works out-of-the-box with all multimodal LLMs and RAG frameworks 🖼️📚
-- AI filetype detection for missing file extensions and unknown web data 💾
-- Quick-start integrations for Twitter, YouTube, GitHub, and more 🌐
-- GPU-accelerated ⚡️
+- Extract markdown, images, and structured data from any document or web page
+- Output works out-of-the-box with all multimodal LLMs and RAG frameworks
+- AI filetype detection for missing file extensions and unknown web data
+- Quick-start integrations for Twitter, YouTube, GitHub, and more
+- GPU-accelerated
 
 ## Get started in 5 minutes 🚀

tests/test_core.py (25 changes: 15 additions & 10 deletions)

@@ -27,16 +27,6 @@ def test_chunk_to_llamaindex(self):
         self.assertEqual(type(llama_index), list)
         self.assertEqual(len(llama_index), 1)
 
-    def test_image_to_base64(self):
-        image = Image.open(os.path.join(self.files_directory, 'example.jpg'))
-        image.load() # needed to close the file
-        base64_string = core.image_to_base64(image)
-        self.assertEqual(type(base64_string), str)
-        # converting back should be the same
-        image_data = base64.b64decode(base64_string)
-        decoded_image = Image.open(BytesIO(image_data))
-        self.assertEqual(image.size, decoded_image.size)
-
     def test_chunks_to_messages(self):
         chunks = scraper.scrape_file(source=self.files_directory+"/example.md", local=True)
         messages = core.chunks_to_messages(chunks)
@@ -58,6 +48,21 @@ def test_save_outputs(self):
         core.save_outputs(chunks)
         self.assertTrue(any('.jpg' in f for f in os.listdir(self.outputs_directory)))
 
+    def test_chunk_json(self):
+        chunk = core.Chunk(path="example.md", texts=["Hello, World!"])
+        # convert to json
+        chunk_json = chunk.to_json()
+        # verify it is a dictionary with the expected items
+        self.assertEqual(type(chunk_json), dict)
+        self.assertIn('texts', chunk_json)
+        self.assertIn('path', chunk_json)
+        # convert back
+        chunk = core.Chunk.from_json(chunk_json)
+        # verify it is the correct Chunk object
+        self.assertEqual(type(chunk), core.Chunk)
+        self.assertEqual(chunk.path, "example.md")
+        self.assertEqual(chunk.texts, ["Hello, World!"])
+
     def test_parse_arguments(self):
         args = core.parse_arguments()
         self.assertEqual(type(args), argparse.Namespace)

tests/test_scraper.py (15 changes: 0 additions & 15 deletions)

@@ -16,21 +16,6 @@ def tearDown(self):
         for file in os.listdir(self.outputs_directory):
             os.remove(os.path.join(self.outputs_directory, file))
         os.rmdir(self.outputs_directory)
 
-    def test_chunk_json(self):
-        chunk = core.Chunk(path="example.md", texts=["Hello, World!"])
-        # convert to json
-        chunk_json = chunk.to_json()
-        # verify it is a dictionary with the expected items
-        self.assertEqual(type(chunk_json), dict)
-        self.assertIn('texts', chunk_json)
-        self.assertIn('path', chunk_json)
-        # convert back
-        chunk = core.Chunk.from_json(chunk_json)
-        # verify it is the correct Chunk object
-        self.assertEqual(type(chunk), core.Chunk)
-        self.assertEqual(chunk.path, "example.md")
-        self.assertEqual(chunk.texts, ["Hello, World!"])
-
     def test_scrape_zip(self):
         chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True, local=True)

thepipe/core.py (21 changes: 10 additions & 11 deletions)

@@ -5,7 +5,7 @@
 import os
 import time
 from typing import Dict, List, Optional, Union
-from urllib import request
+import requests
 from PIL import Image
 from llama_index.core.schema import Document, ImageDocument
 
@@ -36,30 +36,29 @@ def to_message(self, host_images: bool = False, max_resolution : Optional[int] =
             message["content"].append({"type": "image_url", "image_url": image_url})
         return message
 
-    def to_json(self) -> str:
+    def to_json(self, host_images: bool = False) -> str:
         data = {
             'path': self.path,
             'texts': self.texts,
-            'images': [self.image_to_base64(image) for image in self.images],
+            'images': [make_image_url(image=image, host_images=host_images) for image in self.images],
             'audios': self.audios,
             'videos': self.videos,
         }
         return json.dumps(data)
 
     @staticmethod
-    def from_json(json_str: str) -> 'Chunk':
+    def from_json(json_str: str, host_images: bool = False) -> 'Chunk':
         data = json.loads(json_str)
         images = []
         for image_str in data['images']:
-            # Try to decode the image from base64
-            # if that fails, try to download it
-            try:
-                image_data = base64.b64decode(image_str)
+            if host_images:
+                image_data = requests.get(image_str).content
                 image = Image.open(BytesIO(image_data))
                 images.append(image)
-            except:
-                response = request.get(image_str)
-                image = Image.open(BytesIO(response.content))
+            else:
+                remove_prefix = image_str.replace("data:image/jpeg;base64,", "")
+                image_data = base64.b64decode(remove_prefix)
+                image = Image.open(BytesIO(image_data))
                 images.append(image)
         return Chunk(
             path=data['path'],
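
For context on how the new flag is meant to be used, here is a minimal round-trip sketch. It assumes a local example.jpg exists, that the Chunk constructor accepts an images list of PIL images, and that make_image_url (defined elsewhere in thepipe/core.py, not shown in this commit) returns a base64 data URL when host_images=False and a hosted image URL when host_images=True:

```python
from PIL import Image
from thepipe.core import Chunk

# Build a chunk with one text and one image
# (images kwarg assumed from the self.images attribute used above)
chunk = Chunk(path="example.md", texts=["Hello, World!"], images=[Image.open("example.jpg")])

# Default host_images=False: each image is serialized inline as a
# "data:image/jpeg;base64,..." URL (assumed) inside the JSON string
json_str = chunk.to_json()

# from_json(host_images=False) strips the data-URL prefix and
# base64-decodes the payload back into a PIL image
restored = Chunk.from_json(json_str)

# With host_images=True, images are serialized as hosted URLs instead,
# and from_json re-downloads each one with requests.get
hosted_json = chunk.to_json(host_images=True)
restored_hosted = Chunk.from_json(hosted_json, host_images=True)
```

Note that to_json as committed returns a JSON string via json.dumps, so from_json's json.loads round-trips it directly.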

thepipe/scraper.py (2 changes: 1 addition & 1 deletion)

@@ -60,7 +60,7 @@ def detect_source_type(source: str) -> str:
 def scrape_file(source: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False, local: bool = False) -> List[Chunk]:
     if not local:
         with open(source, 'rb') as f:
-            response = request.post(
+            response = requests.post(
                 url=f"{HOST_URL}/scrape",
                 files={'file': (source, f)},
                 data={'ai_extraction': ai_extraction, 'text_only': text_only}
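
These one-word fixes matter more than they look: from urllib import request binds Python's urllib.request module, which exposes urlopen() and Request but no post() or get() functions, so the previous request.post(...) call here (and request.get(...) in core.py's old from_json) would have raised AttributeError at runtime instead of making an HTTP call. A quick illustration:

```python
from urllib import request

# The old code's calls do not exist on this module:
request.post  # AttributeError: module 'urllib.request' has no attribute 'post'
request.get   # AttributeError: module 'urllib.request' has no attribute 'get'
```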
