From fa9e3ff49d84f808261926959cc7b17c9f862c66 Mon Sep 17 00:00:00 2001
From: Emmett McFaralne
Date: Sun, 7 Jul 2024 23:12:01 -0400
Subject: [PATCH] specified docs status while switching backend services

---
 README.md             |  49 +++----
 requirements.txt      |   3 +-
 tests/test_chunker.py |   2 +-
 tests/test_core.py    |   8 +-
 tests/test_scraper.py |   6 +-
 thepipe/__init__.py   |   3 +-
 thepipe/chunker.py    |   4 +-
 thepipe/core.py       |  39 ++++--
 thepipe/scraper.py    | 316 ++++++++++++++++++++++++++----------------
 9 files changed, 265 insertions(+), 165 deletions(-)

diff --git a/README.md b/README.md
index 7e824bf..d18708b 100644
--- a/README.md
+++ b/README.md
@@ -20,18 +20,17 @@
-### Extract markdown and visuals from PDFs URLs, docs, slides, videos, and more, ready for multimodal LLMs. ⚡
-
-thepi.pe is an AI-native scraping engine that generates LLM-ready markdown and visuals from any document, media, or web page. It is built for multimodal language models such as GPT-4o, and works out-of-the-box with any LLM or vector database. thepi.pe is available as a [hosted API](https://thepi.pe), or it can be self-hosted.
+### Extract markdown and visuals from PDFs, URLs, slides, videos, and more, ready for multimodal LLMs. ⚡
+
+thepi.pe is an API that can scrape multimodal data via `thepipe.scrape` or extract structured data via `thepipe.extract` from a wide range of sources. It is built to interface with LLMs such as GPT-4o, and works out-of-the-box with any LLM or vector database. thepi.pe can be used right away with a [hosted GPU cloud](https://thepi.pe), or it can be self-hosted.

 ## Features 🌟

-- Extract clean markdown, tables, and images from any document or web page
-- Output works out-of-the-box with all multimodal LLMs and RAG frameworks
-- GPU-accelerated AI layout analysis, chunking, and structured data extraction
-- Quick-start integrations for web data like Twitter, YouTube, GitHub, and more
-- Self-hosted or hosted API options available
+- Extract markdown, tables, and images from any document or webpage
+- Extract complex structured data from any document or webpage
+- Works out-of-the-box with all LLMs and RAG frameworks
+- AI-native filetype detection, layout analysis, and structured data extraction
+- Multimodal scraping for video, audio, and image sources

 ## Get started in 5 minutes 🚀

@@ -39,19 +38,23 @@ thepi.pe can read a wide range of filetypes and web sources, so it requires a fe

 ### Hosted API (Python)

+> ⚠️ **Warning.**
+The docs and functionality in this repo differ significantly from the current working version on pip. To use a working version, please refer to the [API docs](https://thepi.pe/docs), and not these docs.
+
 ```bash
 pip install thepipe-api
 setx THEPIPE_API_KEY your_api_key
+setx OPENAI_API_KEY your_openai_key
 ```

 ```python
-import thepipe
+from thepipe.scraper import scrape_file
 from openai import OpenAI

-# scrape markdown + images
-chunks = thepipe.scrape(source="example.pdf")
+# scrape markdown, tables, visuals
+chunks = scrape_file(filepath="paper.pdf")

-# call LLM
+# call LLM with clean, comprehensive data
 client = OpenAI()
 response = client.chat.completions.create(
     model="gpt-4o",
@@ -59,19 +62,18 @@ response = client.chat.completions.create(
 )
 ```

-### Local Installation
+### Local Installation (Python)
+
+For a local installation, you can use the following command:

 ```bash
 pip install thepipe-api[local]
 ```

-```python
-import thepipe
-from openai import OpenAI
+Then append `local=True` to your API calls:

-# scrape markdown + images
-chunks = thepipe.scrape_file(source="example.pdf", local=True)
+```python
+from thepipe.scraper import scrape_url
+chunks = scrape_url(url="https://example.com", local=True)
 ```

 You can also use The Pipe from the command line:
 ```bash
 thepipe path/to/folder --include_regex .*\.tsx
 ```
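For reference, here is a rough end-to-end local sketch (assuming the in-repo module layout, where scraping lives in `thepipe.scraper` and message conversion in `thepipe.core`; `example.pdf` is a placeholder path):

```python
from thepipe.scraper import scrape_file
from thepipe.core import chunks_to_messages
from openai import OpenAI

# scrape locally, then convert the chunks to an LLM-ready list of messages
chunks = scrape_file(filepath="example.pdf", local=True)
messages = chunks_to_messages(chunks)

# the result is a standard list of multimodal chat messages, so any
# OpenAI-compatible client can consume it directly
client = OpenAI()
response = client.chat.completions.create(model="gpt-4o", messages=messages)
print(response.choices[0].message.content)
```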
-
 ## Supported File Types 📚

 | Source Type              | Input types                                                     | Multimodal Scraping | Notes                |
 |--------------------------|-----------------------------------------------------------------|---------------------|----------------------|
-| Webpage                  | URLs starting with `http`, `https`, `ftp`                       | ✔️                  | Scrapes markdown, images, and tables from web pages |
-| PDF                      | `.pdf`                                                           | ✔️                  | Extracts page markdown and page images. Opt-in `ai_extraction` for advanced layout analysis (extracts markdown, LaTeX equations, tables, and figures) |
+| Webpage                  | URLs starting with `http`, `https`, `ftp`                       | ✔️                  | Scrapes markdown, images, and tables from web pages. `ai_extraction` available for AI layout analysis |
+| PDF                      | `.pdf`                                                           | ✔️                  | Extracts page markdown and page images. `ai_extraction` available for AI layout analysis |
 | Word Document            | `.docx`                                                          | ✔️                  | Extracts text, tables, and images |
 | PowerPoint               | `.pptx`                                                          | ✔️                  | Extracts text and images from slides |
 | Video                    | `.mp4`, `.mov`, `.wmv`                                           | ✔️                  | Uses Whisper for transcription and extracts frames |

@@ -102,7 +103,7 @@

 ## How it works 🛠️

-thepi.pe uses computer vision models and heuristics to extract clean content from the source and process it for downstream use with [language models](https://en.wikipedia.org/wiki/Large_language_model), or [vision transformers](https://en.wikipedia.org/wiki/Vision_transformer). The output from thepi.pe is a prompt (a list of messages) containing all content from the source document. The messages returned should look like this:
+thepi.pe uses computer vision models and heuristics to extract clean content from the source and process it for downstream use with [language models](https://en.wikipedia.org/wiki/Large_language_model) or [vision transformers](https://en.wikipedia.org/wiki/Vision_transformer). The output from thepi.pe is a list of chunks containing all content within the source document. These chunks can easily be converted to a prompt format compatible with any LLM or multimodal model via `thepipe.chunks_to_messages`, which gives the following format:
 ```json
 [
   {
     "role": "user",
     "content": [
       {
         "type": "text",
         "text": "..."
       },
       {
         "type": "image_url",
         "image_url": "data:image/jpeg;base64,..."
       }
     ]
   }
 ]
 ```

-You can feed these messages directly into the model, or you can use `thepipe_api.chunk_by_page`, `thepipe_api.chunk_by_section`, `thepipe_api.chunk_semantic` to chunk these messages for a vector database such as ChromaDB or a RAG framework (a chunk can be converted to LlamaIndex Document/ImageDocument with `.to_llamaindex`).
+You can feed these messages directly into the model, or you can use `thepipe_api.chunk_by_document`, `thepipe_api.chunk_by_page`, `thepipe_api.chunk_by_section`, or `thepipe_api.chunk_semantic` to chunk these messages for a vector database such as ChromaDB or a RAG framework, as sketched below. A chunk can be converted to a LlamaIndex Document/ImageDocument with `.to_llamaindex`.

 > ⚠️ **It is important to be mindful of your model's token limit.**
-GPT-4o does not work with too many images in the prompt (see discussion [here](https://community.openai.com/t/gpt-4-vision-maximum-amount-of-images/573110/6)). Large documents should be extracted with `text_only=True` to avoid this issue, or alternatively they can be chunked and saved into a vector database or RAG framework.
+GPT-4o does not work with too many images in the prompt (see discussion [here](https://community.openai.com/t/gpt-4-vision-maximum-amount-of-images/573110/6)). To remedy this, either use an LLM with a larger context window, extract large documents with `text_only=True`, or embed the chunks into a vector database.
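For instance, a minimal chunking sketch (assuming the in-repo module layout, where these helpers live in `thepipe.chunker`, and a hypothetical `example.pdf`):

```python
from thepipe.scraper import scrape_file
from thepipe.chunker import chunk_by_page
from thepipe.core import chunks_to_messages

chunks = scrape_file(filepath="example.pdf", local=True)

# re-chunk page by page so no single message carries too many images
pages = chunk_by_page(chunks)

# option 1: prompt-ready messages, one user message per page
messages = chunks_to_messages(pages)

# option 2: LlamaIndex documents for a RAG pipeline
documents = [doc for page in pages for doc in page.to_llamaindex()]
```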

 # Sponsors

diff --git a/requirements.txt b/requirements.txt
index 888256d..35bf909 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ charset-normalizer
 colorama
 requests
 pillow
-pydantic
\ No newline at end of file
+pydantic
+supabase
\ No newline at end of file
diff --git a/tests/test_chunker.py b/tests/test_chunker.py
index 3fe142a..db57fa6 100644
--- a/tests/test_chunker.py
+++ b/tests/test_chunker.py
@@ -3,7 +3,7 @@
 import sys
 from typing import List
 sys.path.append('..')
-from thepipe import chunker
+import thepipe.chunker as chunker
 from thepipe.core import Chunk

 class test_chunker(unittest.TestCase):
diff --git a/tests/test_core.py b/tests/test_core.py
index 46d01d5..3579354 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -4,8 +4,8 @@
 import os
 import sys
 sys.path.append('..')
-from thepipe import core
-from thepipe import scraper
+import thepipe.core as core
+import thepipe.scraper as scraper
 from PIL import Image
 from io import BytesIO

@@ -28,7 +28,7 @@ def test_chunk_to_llamaindex(self):
         self.assertEqual(len(llama_index), 1)

     def test_chunks_to_messages(self):
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.md", local=True)
+        chunks = scraper.scrape_file(filepath=self.files_directory+"/example.md", local=True)
         messages = core.chunks_to_messages(chunks)
         self.assertEqual(type(messages), list)
         for message in messages:
@@ -44,7 +44,7 @@ def test_save_outputs(self):
             text = file.read()
         self.assertIn('Hello, World!', text)
         # verify with images
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.jpg", local=True)
+        chunks = scraper.scrape_file(filepath=self.files_directory+"/example.jpg", local=True)
         core.save_outputs(chunks)
         self.assertTrue(any('.jpg' in f for f in os.listdir(self.outputs_directory)))
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
index 5c8afdc..9a1eed1 100644
--- a/tests/test_scraper.py
+++ b/tests/test_scraper.py
@@ -2,8 +2,8 @@
 import os
 import sys
 sys.path.append('..')
-from thepipe import core
-from thepipe import scraper
+import thepipe.core as core
+import thepipe.scraper as scraper

 class test_scraper(unittest.TestCase):
     def setUp(self):
@@ -83,7 +83,7 @@ def test_scrape_audio(self):
         self.assertTrue(any('citizens' in chunk.texts[0].lower() for chunk in chunks if chunk.texts is not None))

     def test_scrape_video(self):
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.mp4", verbose=True, local=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.mp4", verbose=True, local=True)
         # verify it scraped the video file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
diff --git a/thepipe/__init__.py b/thepipe/__init__.py
index 37c0669..ea34f91 100644
--- a/thepipe/__init__.py
+++ b/thepipe/__init__.py
@@ -1,7 +1,6 @@
 import os
 from .scraper import scrape_file, scrape_url, scrape_directory
-from .chunker import chunk_by_document, chunk_by_page, chunk_by_section, chunk_semantic
-from .core import Chunk, calculate_tokens, chunks_to_messages, parse_arguments, save_outputs
+from .core import parse_arguments, save_outputs

 def main() -> None:
     args = parse_arguments()
diff --git a/thepipe/chunker.py b/thepipe/chunker.py
index 27cd3ad..1c0ea4d 100644
--- a/thepipe/chunker.py
+++ b/thepipe/chunker.py
@@ -1,6 +1,6 @@
 import re
-from typing import Dict, List, Optional, Tuple
-from .core import Chunk, calculate_tokens
+from typing import List
+from .core import Chunk
 from sklearn.metrics.pairwise import cosine_similarity

 def chunk_by_document(chunks: List[Chunk]) -> List[Chunk]:
diff --git a/thepipe/core.py b/thepipe/core.py
index 047f852..51ccbdd 100644
--- a/thepipe/core.py
+++ b/thepipe/core.py
@@ -1,8 +1,8 @@
 import argparse
 import base64
 from io import BytesIO
-import json
 import os
+import re
 import time
 from typing import Dict, List, Optional, Union
 import requests
@@ -26,20 +26,43 @@ def to_llamaindex(self) -> List[Union[Document, ImageDocument]]:
         else:
             return [Document(text=document_text)]

-    def to_message(self, host_images: bool = False, max_resolution : Optional[int] = None) -> Dict:
+    def to_message(self, host_images: bool = False, max_resolution: Optional[int] = None) -> Dict:
         message = {"role": "user", "content": []}
+        image_urls = [make_image_url(image, host_images, max_resolution) for image in self.images]
+        img_index = 0
+
         if self.texts:
-            prompt = "\n```\n" + '\n'.join(self.texts) + "\n```\n"
-            message["content"].append({"type": "text", "text": prompt})
-        for image in self.images:
-            image_url = make_image_url(image, host_images, max_resolution)
+            message_text = "\n\n"
+
+            for text in self.texts:
+                if host_images:
+                    def replace_image(match):
+                        nonlocal img_index
+                        if img_index < len(image_urls):
+                            url = image_urls[img_index]
+                            img_index += 1
+                            return f"![image]({url})"
+                        return match.group(0) # If we run out of images, leave the original text
+
+                    # Replace markdown image references with hosted URLs
+                    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', replace_image, text)
+
+                message_text += text + "\n\n"
+
+            # clean up, add to message
+            message_text = re.sub(r'\n{3,}', '\n\n', message_text).strip()
+            message["content"].append({"type": "text", "text": message_text})
+
+        # Add only the remaining images that weren't already inlined into the text
+        for image_url in image_urls[img_index:]:
             message["content"].append({"type": "image_url", "image_url": image_url})
+
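+        # Example (sketch): for texts=["See ![fig](fig1.png)"] with one image and
+        # host_images=True, the reference is rewritten to "![image](<hosted url>)"
+        # above, and the loop then appends only the images never referenced in text.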
         return message

     def to_json(self, host_images: bool = False) -> Dict:
         data = {
             'path': self.path,
-            'texts': self.texts,
+            'texts': [text.strip() for text in self.texts],
             'images': [make_image_url(image=image, host_images=host_images) for image in self.images],
             'audios': self.audios,
             'videos': self.videos,
         }
@@ -61,7 +84,7 @@ def from_json(data: Dict, host_images: bool = False) -> 'Chunk':
             images.append(image)
         return Chunk(
             path=data['path'],
-            texts=data['texts'],
+            texts=[text.strip() for text in data['texts']],
             images=images,
             audios=data['audios'],
             videos=data['videos'],
diff --git a/thepipe/scraper.py b/thepipe/scraper.py
index af2cfd1..2d00f7f 100644
--- a/thepipe/scraper.py
+++ b/thepipe/scraper.py
@@ -1,9 +1,10 @@
 import base64
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
+import io
 import math
 import re
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional
 import glob
 import os
 import tempfile
@@ -18,8 +19,11 @@ import dotenv
 import shutil
 from magika import Magika
+from .core import make_image_url, Chunk

 dotenv.load_dotenv()

 FOLDERS_TO_IGNORE = ['*node_modules.*', '.*venv.*', '.*\.git.*', '.*\.vscode.*', '.*pycache.*']
 FILES_TO_IGNORE = ['package-lock.json', '.gitignore', '.*\.bin', '.*\.pyc', '.*\.pyo', '.*\.exe', '.*\.dll', '.*\.ipynb_checkpoints']
 GITHUB_TOKEN: str = os.getenv("GITHUB_TOKEN", None)
@@ -29,6 +33,9 @@ TWITTER_DOMAINS = ['https://twitter.com', 'https://www.twitter.com', 'https://x.com', 'https://www.x.com']
 YOUTUBE_DOMAINS = ['https://www.youtube.com', 'https://youtube.com']
 GITHUB_DOMAINS = ['https://github.com', 'https://www.github.com']
+EXTRACTION_PROMPT = """Output the entire extracted text from the document in detailed markdown format.
+Be sure to correctly format markdown for headers, paragraphs, lists, tables, menus, full text contents, etc.
+Always reply immediately with only markdown. Do not output anything else."""

 def detect_source_type(source: str) -> str:
     # otherwise, try to detect the file type by its extension
     mimetype = result.output.mime_type
     return mimetype

-def scrape_file(source: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False, local: bool = False) -> List[Chunk]:
+def scrape_file(filepath: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False, local: bool = False) -> List[Chunk]:
     if not local:
-        with open(source, 'rb') as f:
+        with open(filepath, 'rb') as f:
             response = requests.post(
                 url=f"{HOST_URL}/scrape",
-                files={'file': (source, f)},
+                files={'file': (filepath, f)},
                 data={'ai_extraction': ai_extraction, 'text_only': text_only}
             )
         response_json = response.json()
         chunks = [Chunk.from_json(chunk_json) for chunk_json in response_json['chunks']]
         return chunks
     # returns chunks of scraped content from any source (file, URL, etc.)
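+    # local scraping path: detect the source's MIME type, then dispatch to the matching scraper below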
extraction = [] - source_type = detect_source_type(source) + source_type = detect_source_type(filepath) if source_type is None: if verbose: - print(f"[thepipe] Unsupported source type: {source}") + print(f"[thepipe] Unsupported source type: {filepath}") return extraction if verbose: - print(f"[thepipe] Scraping {source_type}: {source}...") + print(f"[thepipe] Scraping {source_type}: {filepath}...") if source_type == 'application/pdf': - extraction = scrape_pdf(file_path=source, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose) + extraction = scrape_pdf(file_path=filepath, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose) elif source_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': - extraction = scrape_docx(file_path=source, verbose=verbose, text_only=text_only) + extraction = scrape_docx(file_path=filepath, verbose=verbose, text_only=text_only) elif source_type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation': - extraction = scrape_pptx(file_path=source, verbose=verbose, text_only=text_only) + extraction = scrape_pptx(file_path=filepath, verbose=verbose, text_only=text_only) elif source_type.startswith('image/'): - extraction = scrape_image(file_path=source, text_only=text_only) + extraction = scrape_image(file_path=filepath, text_only=text_only) elif source_type.startswith('application/vnd.ms-excel') or source_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': - extraction = scrape_spreadsheet(file_path=source) + extraction = scrape_spreadsheet(file_path=filepath, source_type=source_type) elif source_type == 'application/x-ipynb+json': - extraction = scrape_ipynb(file_path=source, verbose=verbose, text_only=text_only) + extraction = scrape_ipynb(file_path=filepath, verbose=verbose, text_only=text_only) elif source_type == 'application/zip' or source_type == 'application/x-zip-compressed': - extraction = scrape_zip(file_path=source, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, local=local) + extraction = scrape_zip(file_path=filepath, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, local=local) elif source_type.startswith('video/'): - extraction = scrape_video(file_path=source, verbose=verbose, text_only=text_only) + extraction = scrape_video(file_path=filepath, verbose=verbose, text_only=text_only) elif source_type.startswith('audio/'): - extraction = scrape_audio(file_path=source, verbose=verbose) + extraction = scrape_audio(file_path=filepath, verbose=verbose) elif source_type.startswith('text/'): - extraction = scrape_plaintext(file_path=source) + extraction = scrape_plaintext(file_path=filepath) else: try: - extraction = scrape_plaintext(file_path=source) + extraction = scrape_plaintext(file_path=filepath) except Exception as e: if verbose: - print(f"[thepipe] Error extracting from {source}: {e}") + print(f"[thepipe] Error extracting from {filepath}: {e}") if verbose: if extraction: - print(f"[thepipe] Extracted from {source}") + print(f"[thepipe] Extracted from {filepath}") else: - print(f"[thepipe] No content extracted from {source}") + print(f"[thepipe] No content extracted from {filepath}") return extraction def scrape_plaintext(file_path: str) -> List[Chunk]: @@ -113,7 +120,7 @@ def scrape_directory(dir_path: str, include_regex: Optional[str] = None, verbose if include_regex: all_files = [file for file in all_files if re.search(include_regex, file, re.IGNORECASE)] with ThreadPoolExecutor() as executor: - results = 
executor.map(lambda file_path: scrape_file(source=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=local), all_files)
+        results = executor.map(lambda file_path: scrape_file(filepath=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=local), all_files)
     for result in results:
         extraction += result
     return extraction

@@ -129,46 +136,38 @@ def scrape_zip(file_path: str, include_regex: Optional[str] = None, verbose: boo
 def scrape_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False) -> List[Chunk]:
     chunks = []
     if ai_extraction:
-        # ai_extraction uses layout analysis AI to extract markdown, equations, tables, and images from the PDF
-        MD_FOLDER = 'mdoutput'
-        if not os.path.exists(MD_FOLDER):
-            os.makedirs(MD_FOLDER)
-        else:
-            shutil.rmtree(MD_FOLDER)
-            os.makedirs(MD_FOLDER)
-        os.system(f"marker_single {file_path} {MD_FOLDER} --batch_multiplier 4 --max_pages 1000 --langs English")
-        # Find the .md file and read its content
-        for output_file in glob.glob(f'{MD_FOLDER}/*/*', recursive=True):
-            if output_file.endswith('.md'):
-                with open(output_file, 'r') as f:
-                    markdown = f.read()
-                break
-        if not markdown:
-            if verbose: print(f"[thepipe] No markdown extracted from {file_path} (AI extraction likely failed).")
-            raise ValueError("AI extraction failed.")
-        if text_only:
-            chunks.append(Chunk(path=file_path, texts=[markdown]))
-            return chunks
-        # split the markdown into text and images, so we can return them in the correct order
-        content_pattern = re.compile(r'(\!\[.*?\]\(.*?\)|[^!\[]+)')
-        content_matches = content_pattern.findall(markdown)
-        for content in content_matches:
-            if content.startswith('!['):
-                # matched an image
-                if text_only:
-                    continue
-                image_url = os.path.join(MD_FOLDER, re.search(r'\((.*?)\)', content).group(1))
-                try:
-                    image = Image.open(image_url) # the image url is a local path
-                    chunks.append(Chunk(path=file_path, images=[image]))
-                except Exception as e:
-                    if verbose: print(f"[thepipe] Error loading image {image_url}: {e}")
-            else:
-                # matched text
-                chunks.append(Chunk(path=file_path, texts=[content.strip()]))
-        # remove the output folder
-        shutil.rmtree(MD_FOLDER)
-        if verbose: print(f"[thepipe] AI extracted from {file_path}")
+        # if using AI extraction, for each page, generate markdown and cropped figures
+        import fitz
+        import modal
+        with open(file_path, "rb") as f:
+            pdf_bytes = f.read()
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        images = []
+
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            pix = page.get_pixmap()
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            img_byte_arr = io.BytesIO()
+            image.save(img_byte_arr, format='PNG')
+            images.append(img_byte_arr.getvalue())
+
+        app_name = "scrape-pdf"
+        function_name = "get_nougat_and_layout_preds_per_page"
+        fn = modal.Function.lookup(app_name, function_name)
+        results = fn.remote(images)
+
+        chunks = []
+        for result in results:
+            # nougat often emits excessive newlines, so clean up each page's text
+            texts = [re.sub(r'\n{3,}', '\n\n', text).strip() for text in result['texts']]
+            figures = result['figures']
+            chunks.append(Chunk(path=file_path, texts=texts, images=figures))
+        return chunks
     else:
         # if not using AI extraction, for each page, extract markdown and (optionally) full page images
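This branch now assumes a deployed Modal app named `scrape-pdf` exposing `get_nougat_and_layout_preds_per_page`, which receives one PNG per page and returns, per page, nougat markdown strings (`texts`) and cropped figure images (`figures`). A rough usage sketch under those assumptions (`paper.pdf` is a placeholder):

```python
from thepipe.scraper import scrape_pdf

# one chunk per page: nougat markdown plus any cropped figures
chunks = scrape_pdf(file_path="paper.pdf", ai_extraction=True, verbose=True)
for chunk in chunks:
    print(f"{len(chunk.texts)} text blocks, {len(chunk.images)} figures")
```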
@@ -179,6 +178,9 @@ def scrape_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = Fa
     md_reader = pymupdf4llm.helpers.pymupdf_rag.to_markdown(doc, page_chunks=True)
     for i, page in enumerate(doc):
         text = md_reader[i]["text"]
+        # remove excessive newlines
+        text = re.sub(r'\n{3,}', '\n\n', text)
+        text = text.strip()
         if text_only:
             chunks.append(Chunk(path=file_path, texts=[text]))
         else:
@@ -225,11 +227,11 @@ def scrape_image(file_path: str, text_only: bool = False) -> List[Chunk]:
         chunks.append(Chunk(path=file_path, images=[img]))
     return chunks

-def scrape_spreadsheet(file_path: str) -> List[Chunk]:
+def scrape_spreadsheet(file_path: str, source_type: str) -> List[Chunk]:
     import pandas as pd
-    if file_path.endswith(".csv"):
+    if source_type == 'application/vnd.ms-excel':
         df = pd.read_csv(file_path)
-    elif file_path.endswith(".xls") or file_path.endswith(".xlsx"):
+    elif source_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
         df = pd.read_excel(file_path)
     else:
         raise ValueError("Unsupported file format")
@@ -242,7 +244,81 @@
         chunks.append(Chunk(path=file_path, texts=[item_json]))
     return chunks

-def extract_page_content(url: str, text_only: bool = False, verbose: bool = False) -> Tuple[str, List[str]]:
+def ai_extract_page_content(url: str, text_only: bool = False, verbose: bool = False) -> Chunk:
+    from playwright.sync_api import sync_playwright
+    import modal
+    from openai import OpenAI
+
+    app_name = "scrape-ui"
+    function_name = "get_ui_layout_preds"
+    fn = modal.Function.lookup(app_name, function_name)
+
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        context = browser.new_context(user_agent=USER_AGENT_STRING)
+        page = context.new_page()
+        page.goto(url, wait_until='domcontentloaded')
+
+        viewport_height = page.viewport_size['height']
+        total_height = page.evaluate("document.body.scrollHeight")
+        current_scroll_position = 0
+        scrolldowns, max_scrolldowns = 0, 3
+        images = []
+
+        while current_scroll_position < total_height and scrolldowns < max_scrolldowns:
+            page.wait_for_timeout(1000)
+            screenshot = page.screenshot(full_page=False)
+            img = Image.open(io.BytesIO(screenshot))
+            images.append(img)
+
+            current_scroll_position += viewport_height
+            page.evaluate(f"window.scrollTo(0, {current_scroll_position})")
+            scrolldowns += 1
+            total_height = page.evaluate("document.body.scrollHeight")
+
+        browser.close()
+
+    if images:
+        # Vertically stack the images
+        total_height = sum(img.height for img in images)
+        max_width = max(img.width for img in images)
+        stacked_image = Image.new('RGB', (max_width, total_height))
+        y_offset = 0
+        for img in images:
+            stacked_image.paste(img, (0, y_offset))
+            y_offset += img.height
+
+        # Process the stacked image with the UI model
+        figures = fn.remote(stacked_image)
+
+        # Process the stacked image with the VLM
+        openrouter_client = OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=os.environ["OPENROUTER_API_KEY"],
+        )
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": make_image_url(stacked_image)},
+                    {"type": "text", "text": EXTRACTION_PROMPT},
+                ]
+            },
+        ]
+        response = openrouter_client.chat.completions.create(
+            model="google/gemini-flash-1.5",
+            messages=messages,
+            temperature=0.2
+        )
+        llm_response = response.choices[0].message.content
+        # honor text_only by dropping the cropped figures from the chunk
+        chunk = Chunk(path=url, texts=[llm_response], images=[] if text_only else figures)
+    else:
+        raise ValueError("Model received 0 images from webpage")
+
+    return chunk
+
+def extract_page_content(url: str, text_only: bool = False, verbose: bool = False) -> Chunk:
     from urllib.parse import urlparse
     import markdownify
     from bs4 import BeautifulSoup
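Like the PDF path, `ai_extract_page_content` assumes externally provisioned services: a deployed Modal app named `scrape-ui` exposing `get_ui_layout_preds`, plus an `OPENROUTER_API_KEY` in the environment for the vision-language pass. A rough usage sketch under those assumptions:

```python
import os
from thepipe.scraper import scrape_url

# ai_extraction screenshots the page, crops figures with the UI layout model,
# and asks the VLM for the page's full markdown
assert "OPENROUTER_API_KEY" in os.environ
chunks = scrape_url(url="https://example.com", ai_extraction=True, local=True)
print(chunks[0].texts[0])
```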
@@ -250,6 +326,9 @@ def extract_page_content(url: str, text_only: bool = False, verbose: bool = Fals
     import base64
     import requests

+    texts = []
+    images = []
+
     with sync_playwright() as p:
         browser = p.chromium.launch()
         context = browser.new_context(user_agent=USER_AGENT_STRING)
         page = context.new_page()
         page.goto(url, wait_until='domcontentloaded')

         viewport_height = page.viewport_size['height']
         total_height = page.evaluate("document.body.scrollHeight")
         current_scroll_position = 0
         scrolldowns, max_scrolldowns = 0, 20 # Finite to prevent infinite scroll

         while current_scroll_position < total_height and scrolldowns < max_scrolldowns:
-            page.wait_for_timeout(100) # Wait for dynamic content to load
+            page.wait_for_timeout(1000) # Wait for dynamic content to load
             current_scroll_position += viewport_height
             page.evaluate(f"window.scrollTo(0, {current_scroll_position})")
             scrolldowns += 1
@@ -277,56 +356,53 @@ def extract_page_content(url: str, text_only: bool = False, verbose: bool = Fals
         markdown_content = markdownify.markdownify(str(soup), heading_style="ATX")

         # Remove excessive newlines in the markdown
-        while '\n\n\n' in markdown_content:
-            markdown_content = markdown_content.replace('\n\n\n', '\n\n')
-
-        if text_only:
-            browser.close()
-            return markdown_content, []
+        markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content)
+        markdown_content = markdown_content.strip()

-        # Extract images from the page using heuristics
-        # to adaptively read image URLs
-        images = []
-        for img in page.query_selector_all('img'):
-            img_path = img.get_attribute('src')
-            if not img_path:
-                continue
-            if img_path.startswith('data:image'):
-                # save base64 image to PIL Image
-                decoded_data = base64.b64decode(img_path.split(',')[1])
-                try:
-                    image = Image.open(BytesIO(decoded_data))
-                    images.append(image)
-                except Exception as e:
-                    if verbose: print(f"[thepipe] Ignoring error loading image {img_path}: {e}")
-                    continue # Ignore incompatible image extractions
-            else:
-                try:
-                    image = Image.open(requests.get(img_path, stream=True).raw)
-                    images.append(image)
-                except:
-                    if 'https://' not in img_path and 'http://' not in img_path:
-                        try:
-                            while img_path.startswith('/'):
-                                img_path = img_path[1:]
-                            path_with_schema = urlparse(url).scheme + "://" + img_path
-                            image = Image.open(requests.get(path_with_schema, stream=True).raw)
-                            images.append(image)
-                        except:
+        texts.append(markdown_content)
+
+        if not text_only:
+            # Extract images from the page using heuristics
+            for img in page.query_selector_all('img'):
+                img_path = img.get_attribute('src')
+                if not img_path:
+                    continue
+                if img_path.startswith('data:image'):
+                    # Save base64 image to PIL Image
+                    decoded_data = base64.b64decode(img_path.split(',')[1])
+                    try:
+                        image = Image.open(BytesIO(decoded_data))
+                        images.append(image)
+                    except Exception as e:
+                        if verbose: print(f"[thepipe] Ignoring error loading image {img_path}: {e}")
+                        continue # Ignore incompatible image extractions
+                else:
+                    try:
+                        image = Image.open(requests.get(img_path, stream=True).raw)
+                        images.append(image)
+                    except:
+                        if 'https://' not in img_path and 'http://' not in img_path:
+                            try:
+                                while img_path.startswith('/'):
+                                    img_path = img_path[1:]
+                                path_with_schema = urlparse(url).scheme + "://" + img_path
+                                image = Image.open(requests.get(path_with_schema, stream=True).raw)
+                                images.append(image)
+                            except:
-                            try:
-                                path_with_schema_and_netloc = urlparse(url).scheme + "://" + urlparse(url).netloc + "/" + img_path
-                                image = Image.open(requests.get(path_with_schema_and_netloc, stream=True).raw)
-                                images.append(image)
-                            except:
-                                if verbose: print(f"[thepipe] Ignoring error loading image
{img_path}") - continue # Ignore incompatible image extractions - else: - if verbose: print(f"[thepipe] Ignoring error loading image {img_path}") - continue # Ignore incompatible image extractions + try: + path_with_schema_and_netloc = urlparse(url).scheme + "://" + urlparse(url).netloc + "/" + img_path + image = Image.open(requests.get(path_with_schema_and_netloc, stream=True).raw) + images.append(image) + except: + if verbose: print(f"[thepipe] Ignoring error loading image {img_path}") + continue # Ignore incompatible image extractions + else: + if verbose: print(f"[thepipe] Ignoring error loading image {img_path}") + continue # Ignore incompatible image extractions browser.close() - print("N_IMAGES", len(images)) - return markdown_content, images + + return Chunk(path=url, texts=texts, images=images) def parse_html_to_markdown(html_content): from bs4 import BeautifulSoup, NavigableString, Tag @@ -380,20 +456,19 @@ def scrape_url(url: str, text_only: bool = False, ai_extraction: bool = False, v response = requests.get(url) with open(file_path, 'wb') as file: file.write(response.content) - chunks = scrape_file(source=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=True) + chunks = scrape_file(filepath=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=True) for chunk in chunks: all_texts.extend(chunk.texts) all_images.extend(chunk.images) + return [Chunk(path=url, texts=all_texts, images=all_images)] else: # if url leads to web content, scrape it directly - markdown_content, images = extract_page_content(url, text_only=text_only, verbose=verbose) - all_texts.append(markdown_content) - if not text_only: - all_images.extend(images) - if not all_texts and not all_images: - raise ValueError("No content extracted from URL.") - return [Chunk(path=url, texts=all_texts, images=all_images)] - + if ai_extraction: + chunk = ai_extract_page_content(url=url, text_only=text_only, verbose=verbose) + else: + chunk = extract_page_content(url=url, text_only=text_only, verbose=verbose) + return [chunk] + def format_timestamp(seconds, chunk_index, chunk_duration): # helper function to format the timestamp. total_seconds = chunk_index * chunk_duration + seconds @@ -556,6 +631,7 @@ def read_docx_tables(tab): image_part = document.part.related_parts[embed_attr] image_data = io.BytesIO(image_part._blob) image = Image.open(image_data) + image.load() block_images.append(image) image_counter += 1 elif block.__class__.__name__ == 'Table':