diff --git a/README.md b/README.md
index 377ebdc..60afd7c 100644
--- a/README.md
+++ b/README.md
@@ -27,11 +27,11 @@ thepi.pe is an AI-native scraping engine that generates LLM-ready markdown and v
 
 ## Features 🌟
 
-- Extract markdown, tables, and images from any document or web page 📊
-- Output works out-of-the-box with all multimodal LLMs and RAG frameworks 🖼️📚
-- AI filetype detection for missing file extensions and unknown web data 💾
-- Quick-start integrations for Twitter, YouTube, GitHub, and more 🌐
-- GPU-accelerated ⚡️
+- Extract markdown, images, and structured data from any document or web page
+- Output works out-of-the-box with all multimodal LLMs and RAG frameworks
+- AI filetype detection for missing file extensions and unknown web data
+- Quick-start integrations for Twitter, YouTube, GitHub, and more
+- GPU-accelerated
 
 ## Get started in 5 minutes 🚀
 
diff --git a/tests/test_core.py b/tests/test_core.py
index c189c79..df3e2b4 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -27,16 +27,6 @@ def test_chunk_to_llamaindex(self):
         self.assertEqual(type(llama_index), list)
         self.assertEqual(len(llama_index), 1)
 
-    def test_image_to_base64(self):
-        image = Image.open(os.path.join(self.files_directory, 'example.jpg'))
-        image.load() # needed to close the file
-        base64_string = core.image_to_base64(image)
-        self.assertEqual(type(base64_string), str)
-        # converting back should be the same
-        image_data = base64.b64decode(base64_string)
-        decoded_image = Image.open(BytesIO(image_data))
-        self.assertEqual(image.size, decoded_image.size)
-
     def test_chunks_to_messages(self):
         chunks = scraper.scrape_file(source=self.files_directory+"/example.md", local=True)
         messages = core.chunks_to_messages(chunks)
@@ -58,6 +48,21 @@ def test_save_outputs(self):
         core.save_outputs(chunks)
         self.assertTrue(any('.jpg' in f for f in os.listdir(self.outputs_directory)))
 
+    def test_chunk_json(self):
+        chunk = core.Chunk(path="example.md", texts=["Hello, World!"])
+        # convert to json
+        chunk_json = chunk.to_json()
+        # verify it is a dictionary with the expected items
+        self.assertEqual(type(chunk_json), dict)
+        self.assertIn('texts', chunk_json)
+        self.assertIn('path', chunk_json)
+        # convert back
+        chunk = core.Chunk.from_json(chunk_json)
+        # verify it is the correct Chunk object
+        self.assertEqual(type(chunk), core.Chunk)
+        self.assertEqual(chunk.path, "example.md")
+        self.assertEqual(chunk.texts, ["Hello, World!"])
+
     def test_parse_arguments(self):
         args = core.parse_arguments()
         self.assertEqual(type(args), argparse.Namespace)
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
index 161f899..5c8afdc 100644
--- a/tests/test_scraper.py
+++ b/tests/test_scraper.py
@@ -16,21 +16,6 @@ def tearDown(self):
         for file in os.listdir(self.outputs_directory):
             os.remove(os.path.join(self.outputs_directory, file))
         os.rmdir(self.outputs_directory)
-
-    def test_chunk_json(self):
-        chunk = core.Chunk(path="example.md", texts=["Hello, World!"])
-        # convert to json
-        chunk_json = chunk.to_json()
-        # verify it is a dictionary with the expected items
-        self.assertEqual(type(chunk_json), dict)
-        self.assertIn('texts', chunk_json)
-        self.assertIn('path', chunk_json)
-        # convert back
-        chunk = core.Chunk.from_json(chunk_json)
-        # verify it is the correct Chunk object
-        self.assertEqual(type(chunk), core.Chunk)
-        self.assertEqual(chunk.path, "example.md")
-        self.assertEqual(chunk.texts, ["Hello, World!"])
 
     def test_scrape_zip(self):
         chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True, local=True)
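For reference, the end-to-end flow the retained tests exercise: a minimal sketch, assuming the package's `core` and `scraper` modules are importable as they are in the test suite and that a local markdown file exists (the path below is illustrative).

```python
from thepipe import core, scraper

# Scrape a local markdown file into Chunk objects, mirroring test_chunks_to_messages.
chunks = scraper.scrape_file(source="files/example.md", local=True)

# Convert the chunks into multimodal LLM-ready messages.
messages = core.chunks_to_messages(chunks)
for message in messages:
    print(message["content"])
```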
diff --git a/thepipe/core.py b/thepipe/core.py
index f48b0cb..463f2ee 100644
--- a/thepipe/core.py
+++ b/thepipe/core.py
@@ -5,7 +5,7 @@
 import os
 import time
 from typing import Dict, List, Optional, Union
-from urllib import request
+import requests
 from PIL import Image
 from llama_index.core.schema import Document, ImageDocument
 
@@ -36,30 +36,29 @@ def to_message(self, host_images: bool = False, max_resolution : Optional[int] =
             message["content"].append({"type": "image_url", "image_url": image_url})
         return message
 
-    def to_json(self) -> str:
+    def to_json(self, host_images: bool = False) -> str:
         data = {
             'path': self.path,
             'texts': self.texts,
-            'images': [self.image_to_base64(image) for image in self.images],
+            'images': [make_image_url(image=image, host_images=host_images) for image in self.images],
             'audios': self.audios,
             'videos': self.videos,
         }
         return json.dumps(data)
 
     @staticmethod
-    def from_json(json_str: str) -> 'Chunk':
+    def from_json(json_str: str, host_images: bool = False) -> 'Chunk':
         data = json.loads(json_str)
         images = []
         for image_str in data['images']:
-            # Try to decode the image from base64
-            # if that fails, try to download it
-            try:
-                image_data = base64.b64decode(image_str)
+            if host_images:
+                image_data = requests.get(image_str).content
                 image = Image.open(BytesIO(image_data))
                 images.append(image)
-            except:
-                response = request.get(image_str)
-                image = Image.open(BytesIO(response.content))
+            else:
+                remove_prefix = image_str.replace("data:image/jpeg;base64,", "")
+                image_data = base64.b64decode(remove_prefix)
+                image = Image.open(BytesIO(image_data))
                 images.append(image)
         return Chunk(
             path=data['path'],
diff --git a/thepipe/scraper.py b/thepipe/scraper.py
index 66f467b..9b254bc 100644
--- a/thepipe/scraper.py
+++ b/thepipe/scraper.py
@@ -60,7 +60,7 @@ def detect_source_type(source: str) -> str:
 def scrape_file(source: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False, local: bool = False) -> List[Chunk]:
     if not local:
         with open(source, 'rb') as f:
-            response = requests.post(
+            response = requests.post(
                 url=f"{HOST_URL}/scrape",
                 files={'file': (source, f)},
                 data={'ai_extraction': ai_extraction, 'text_only': text_only}
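To illustrate the serialization changes above: a minimal sketch of a `to_json`/`from_json` round trip, using the same fixture as the relocated `test_chunk_json` (a text-only chunk, so neither image branch is exercised) and assuming `thepipe.core` is importable.

```python
from thepipe import core

# Build a text-only chunk, mirroring the test fixture.
chunk = core.Chunk(path="example.md", texts=["Hello, World!"])

# Serialize: to_json returns a JSON string via json.dumps; with
# host_images=False, images are embedded via make_image_url as data URLs.
payload = chunk.to_json(host_images=False)

# Deserialize: with host_images=False, from_json strips the
# "data:image/jpeg;base64," prefix and base64-decodes each image.
restored = core.Chunk.from_json(payload, host_images=False)
assert restored.path == "example.md"
assert restored.texts == ["Hello, World!"]
```

With `host_images=True`, the images would instead be serialized as hosted URLs and fetched back with `requests.get` on deserialization.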