Commit 9fa148b

fixed to_message, fixed by_document chunker

emcf committed Jul 5, 2024
1 parent dab1555 commit 9fa148b

Showing 9 changed files with 244 additions and 123 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -35,7 +35,7 @@ thepi.pe is an AI-native scraping engine that generates LLM-ready markdown and v
 
 ## Get started in 5 minutes 🚀
 
-thepi.pe can read a wide range of filetypes and web sources, so it requires a few dependencies. It also requires a strong machine (16GB+ VRAM for optimal response times) for AI extraction features. For these reasons, we host a REST API that works out-of-the-box at [thepi.pe](https://thepi.pe).
+thepi.pe can read a wide range of filetypes and web sources, so it requires a few dependencies. It also requires a strong machine (16GB+ VRAM for optimal PDF & video response times) for AI extraction features. For these reasons, we host a REST API that works out-of-the-box at [thepi.pe](https://thepi.pe).
 
 ### Hosted API (Python)
 
@@ -49,7 +49,7 @@ import thepipe_api as tp
 from openai import OpenAI
 
 # scrape markdown + images
-chunks = tp.scrape_file(
+chunks = tp.scrape(
     source="example.pdf",
     ai_extraction=True
 )
@@ -58,7 +58,7 @@ chunks = tp.scrape_file(
 client = OpenAI()
 response = client.chat.completions.create(
     model="gpt-4o",
-    messages=tp.to_messages(chunks),
+    messages=tp.chunks_to_messages(chunks),
 )
 ```
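
For orientation, here is the updated quickstart assembled end-to-end as this commit leaves it. Note that `__init__.py` in this commit exports `scrape_file`, `scrape_url`, and `scrape_directory` but not `scrape`, so this sketch calls `tp.scrape_file`; it also assumes a local `example.pdf` and an `OPENAI_API_KEY` in the environment.

```python
import thepipe_api as tp
from openai import OpenAI

# scrape a PDF into LLM-ready chunks of markdown + images
# (ai_extraction=True needs the hosted API or a strong GPU)
chunks = tp.scrape_file(
    source="example.pdf",
    ai_extraction=True
)

# convert chunks to multimodal user messages and query the model
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages=tp.chunks_to_messages(chunks),
)
print(response.choices[0].message.content)
```
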
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='thepipe_api',
-    version='1.0.0',
+    version='1.0.1',
     author='Emmett McFarlane',
     author_email='[email protected]',
     description='AI-native scraper for multimodal LLMs.',
21 changes: 16 additions & 5 deletions tests/test_core.py
@@ -38,9 +38,9 @@ def test_image_to_base64(self):
         decoded_image = Image.open(BytesIO(image_data))
         self.assertEqual(image.size, decoded_image.size)
 
-    def test_to_messages(self):
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.md")
-        messages = core.to_messages(chunks)
+    def test_chunks_to_messages(self):
+        chunks = scraper.scrape_file(source=self.files_directory+"/example.md", local=True)
+        messages = core.chunks_to_messages(chunks)
         self.assertEqual(type(messages), list)
         for message in messages:
             self.assertEqual(type(message), dict)
@@ -55,12 +55,23 @@ def test_save_outputs(self):
             text = file.read()
         self.assertIn('Hello, World!', text)
         # verify with images
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.jpg")
+        chunks = scraper.scrape_file(source=self.files_directory+"/example.jpg", local=True)
         thepipe.save_outputs(chunks)
         self.assertTrue(any('.jpg' in f for f in os.listdir(self.outputs_directory)))
 
     def test_parse_arguments(self):
         args = thepipe.parse_arguments()
         self.assertEqual(type(args), argparse.Namespace)
         self.assertIn('source', vars(args))
-        self.assertIn('include_regex', vars(args))
+        self.assertIn('include_regex', vars(args))
+
+    def test_calculate_tokens(self):
+        text = "Hello, World!"
+        tokens = core.calculate_tokens([core.Chunk(texts=[text])])
+        self.assertAlmostEqual(tokens, 3.25, places=0)
+
+    def test_calculate_image_tokens(self):
+        image = Image.open(os.path.join(self.files_directory, 'example.jpg'))
+        image.load()  # needed to close the underlying file
+        tokens = core.calculate_image_tokens(image)
+        self.assertAlmostEqual(tokens, 85, places=0)
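
The expected values in these new tests follow from simple heuristics: "Hello, World!" is 13 characters and 13 / 4 = 3.25, implying a ~4-characters-per-token text estimate, while 85 is the flat base cost for a low-detail image in the OpenAI vision pricing that `core.py` cites. A sketch of that arithmetic under those assumptions (the helpers below are illustrative, not the library's code):

```python
# Illustrative re-derivation of the expected test values (assumptions noted above).

def approx_text_tokens(text: str) -> float:
    # ~4 characters per token, the heuristic implied by the 3.25 assertion
    return len(text) / 4

def approx_low_detail_image_tokens() -> int:
    # OpenAI's vision pricing charges a flat 85-token base for low-detail images
    return 85

assert approx_text_tokens("Hello, World!") == 3.25  # 13 chars / 4
assert approx_low_detail_image_tokens() == 85
```
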
26 changes: 13 additions & 13 deletions tests/test_scraper.py
@@ -20,7 +20,7 @@ def tearDown(self):
         os.rmdir(self.outputs_directory)
 
     def test_scrape_zip(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True, local=True)
         # verify it scraped the zip file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -31,7 +31,7 @@ def test_scrape_zip(self):
         self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks))
 
     def test_scrape_ipynb(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.ipynb", verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.ipynb", verbose=True, local=True)
         # verify it scraped the ipynb file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -42,7 +42,7 @@ def test_scrape_ipynb(self):
         self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks))
 
     def test_scrape_pdf_with_ai_extraction(self):
-        chunks = scraper.scrape_file("tests/files/example.pdf", ai_extraction=True, verbose=True)
+        chunks = scraper.scrape_file("tests/files/example.pdf", ai_extraction=True, verbose=True, local=True)
         # verify it scraped the pdf file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -52,7 +52,7 @@ def test_scrape_pdf_with_ai_extraction(self):
             self.assertIsNotNone(chunk.texts or chunk.images)
 
     def test_scrape_docx(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.docx", verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.docx", verbose=True, local=True)
         # verify it scraped the docx file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -63,7 +63,7 @@ def test_scrape_docx(self):
         self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks))
 
     def test_extract_pdf_without_ai_extraction(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.pdf", ai_extraction=False, verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.pdf", ai_extraction=False, verbose=True, local=True)
         # verify it scraped the pdf file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -74,7 +74,7 @@ def test_extract_pdf_without_ai_extraction(self):
         self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks))
 
     def test_scrape_audio(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.mp3", verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.mp3", verbose=True, local=True)
         # verify it scraped the audio file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -85,7 +85,7 @@ def test_scrape_audio(self):
         self.assertTrue(any('citizens' in chunk.texts[0].lower() for chunk in chunks if chunk.texts is not None))
 
     def test_scrape_video(self):
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.mp4", verbose=True)
+        chunks = scraper.scrape_file(source=self.files_directory+"/example.mp4", verbose=True, local=True)
         # verify it scraped the video file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -98,7 +98,7 @@ def test_scrape_video(self):
         self.assertTrue(any('citizens' in chunk.texts[0].lower() for chunk in chunks if chunk.texts is not None))
 
     def test_scrape_pptx(self):
-        chunks = scraper.scrape_file(self.files_directory+"/example.pptx", verbose=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.pptx", verbose=True, local=True)
         # verify it scraped the pptx file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -110,7 +110,7 @@ def test_scrape_pptx(self):
 
     def test_scrape_tweet(self):
         tweet_url = "https://x.com/ylecun/status/1796734866156843480"
-        chunks = scraper.scrape_url(tweet_url)
+        chunks = scraper.scrape_url(tweet_url, local=True)
         # verify it returned chunks representing the tweet
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -120,7 +120,7 @@ def test_scrape_tweet(self):
         self.assertTrue(len(chunks[0].images) > 0)
 
     def test_scrape_youtube(self):
-        chunks = scraper.scrape_url("https://www.youtube.com/watch?v=So7TNRhIYJ8")
+        chunks = scraper.scrape_url("https://www.youtube.com/watch?v=So7TNRhIYJ8", local=True)
        # verify it scraped the youtube video into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
@@ -134,7 +134,7 @@ def test_scrape_youtube(self):
 
     def test_scrape_url(self):
         # verify web page scrape result
-        chunks = scraper.scrape_url('https://en.wikipedia.org/wiki/Piping')
+        chunks = scraper.scrape_url('https://en.wikipedia.org/wiki/Piping', local=True)
         for chunk in chunks:
             self.assertEqual(type(chunk), core.Chunk)
             self.assertEqual(chunk.path, 'https://en.wikipedia.org/wiki/Piping')
@@ -144,12 +144,12 @@ def test_scrape_url(self):
         # verify if at least one image was scraped
         self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks))
         # verify file url scrape result
-        chunks = scraper.scrape_url('https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf')
+        chunks = scraper.scrape_url('https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf', local=True)
         self.assertEqual(len(chunks), 1)
 
     @unittest.skipUnless(os.environ.get('GITHUB_TOKEN'), "requires GITHUB_TOKEN")
     def test_scrape_github(self):
-        chunks = scraper.scrape_url('https://github.com/emcf/thepipe')
+        chunks = scraper.scrape_url('https://github.com/emcf/thepipe', local=True)
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0) # should have some repo contents
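
Every scrape call in this file gains `local=True`. A plausible reading, though the diff itself does not show the flag's implementation, is that it forces in-process scraping rather than routing through the hosted thepi.pe API, letting the suite run without an API key. A representative call under that assumption:

```python
from thepipe_api import scraper

# assumed semantics: local=True scrapes in-process rather than via the hosted API
chunks = scraper.scrape_file("tests/files/example.pdf", ai_extraction=False, verbose=True, local=True)
```
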
4 changes: 3 additions & 1 deletion thepipe_api/__init__.py
@@ -1,2 +1,4 @@
 from .scraper import scrape_file, scrape_url, scrape_directory
-from .chunker import chunk_by_page, chunk_by_section, chunk_semantic
+from .chunker import chunk_by_page, chunk_by_section, chunk_semantic
+from .core import Chunk, calculate_tokens, chunks_to_messages
+from .thepipe import extract # deprecated
26 changes: 24 additions & 2 deletions thepipe_api/chunker.py
@@ -3,6 +3,24 @@
 from .core import Chunk, calculate_tokens
 from sklearn.metrics.pairwise import cosine_similarity
 
+def chunk_by_document(chunks: List[Chunk]) -> List[Chunk]:
+    chunks_by_doc = {}
+    new_chunks = []
+    for chunk in chunks:
+        if not chunk.path:
+            raise ValueError("Document chunking requires the path attribute to determine the document boundaries")
+        if chunk.path not in chunks_by_doc:
+            chunks_by_doc[chunk.path] = []
+        chunks_by_doc[chunk.path].append(chunk)
+    for doc_chunks in chunks_by_doc.values():
+        doc_texts = []
+        doc_images = []
+        for chunk in doc_chunks:
+            doc_texts.extend(chunk.texts)
+            doc_images.extend(chunk.images)
+        new_chunks.append(Chunk(path=doc_chunks[0].path, texts=doc_texts, images=doc_images))
+    return new_chunks
+
 def chunk_by_page(chunks: List[Chunk]) -> List[Chunk]:
     # by-page chunking is default behavior
     return chunks
@@ -28,7 +46,7 @@ def chunk_by_section(chunks: List[Chunk]) -> List[Chunk]:
         section_chunks.append(Chunk(texts=[current_chunk_text], images=current_chunk_images))
     return section_chunks
 
-def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', buffer_size: int = 2, similarity_threshold: float = 0.5) -> List[Chunk]:
+def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', buffer_size: int = 3, similarity_threshold: float = 0.1) -> List[Chunk]:
     from sentence_transformers import SentenceTransformer
     model = SentenceTransformer(model_name)
     # Flatten the chunks into sentences
@@ -69,8 +87,12 @@ def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers
     for group in grouped_sentences:
         group_texts = [sentences[i] for i in group]
         group_images = []
+        seen_images = []
         for i in group:
-            group_images.extend(sentence_chunk_map[i].images)
+            for image in sentence_chunk_map[i].images:
+                if image not in seen_images:
+                    group_images.append(image)
+                    seen_images.append(image)
         new_chunks.append(Chunk(texts=group_texts, images=group_images))
 
     return new_chunks
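
The new `chunk_by_document` merges all chunks sharing a `path` into a single document-level chunk, preserving encounter order. A minimal usage sketch (the paths and in-memory `Chunk` values are illustrative; `chunk_by_document` is imported from the module directly, since this commit does not add it to `__init__.py`):

```python
from thepipe_api.core import Chunk
from thepipe_api.chunker import chunk_by_document

# two page-level chunks from one document, plus one from another
pages = [
    Chunk(path="docs/a.pdf", texts=["page 1 text"], images=[]),
    Chunk(path="docs/a.pdf", texts=["page 2 text"], images=[]),
    Chunk(path="docs/b.pdf", texts=["page 1 text"], images=[]),
]

docs = chunk_by_document(pages)
assert len(docs) == 2  # one merged chunk per path
assert docs[0].texts == ["page 1 text", "page 2 text"]  # a.pdf pages combined
```
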
30 changes: 14 additions & 16 deletions thepipe_api/core.py
@@ -19,7 +19,18 @@ def to_llamaindex(self) -> List[Union[Document, ImageDocument]]:
             return [ImageDocument(text=document_text, image=image) for image in self.images]
         else:
             return [Document(text=document_text)]
+
+    def to_message(self) -> Dict:
+        content = []
+        if self.texts:
+            for text in self.texts:
+                content.append({"type": "text", "text": {"content": text}})
+        if self.images:
+            for image in self.images:
+                base64_image = image_to_base64(image)
+                content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}})
+        return {"role": "user", "content": content}
 
 # uses https://platform.openai.com/docs/guides/vision
 def calculate_image_tokens(image: Image.Image, detail: str = "auto") -> int:
     width, height = image.size
@@ -56,18 +67,5 @@ def image_to_base64(image: Image.Image) -> str:
     image.save(buffered, format="JPEG")
     return base64.b64encode(buffered.getvalue()).decode()
 
-def to_messages(chunks: List[Chunk]) -> List[Dict]:
-    # audio and video are not yet supported as they
-    # are not common in SOTA multimodel LLMs (June 2024)
-    messages = []
-    for chunk in chunks:
-        content = []
-        if chunk.texts:
-            for text in chunk.texts:
-                content.append({"type": "text", "text": {"content": text}})
-        if chunk.images:
-            for image in chunk.images:
-                base64_image = image_to_base64(image)
-                content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}})
-        messages.append({"role": "user", "content": content})
-    return messages
+def chunks_to_messages(chunks: List[Chunk]) -> List[Dict]:
+    return [chunk.to_message() for chunk in chunks]
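
After this change each `Chunk` serializes itself, and `chunks_to_messages` simply maps `to_message` over the list, yielding one user message per chunk. For reference, the dict a chunk holding one text and one image produces looks like this (base64 payload abbreviated):

```python
# Shape emitted by Chunk.to_message, as the code above constructs it.
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": {"content": "# Example markdown"}},
        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQ..."}},
    ],
}
```
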