From fa9e3ff49d84f808261926959cc7b17c9f862c66 Mon Sep 17 00:00:00 2001
From: Emmett McFaralne
Date: Sun, 7 Jul 2024 23:12:01 -0400
Subject: [PATCH] specified docs status while switching backend services

---
 README.md             |  49 +++----
 requirements.txt      |   3 +-
 tests/test_chunker.py |   2 +-
 tests/test_core.py    |   8 +-
 tests/test_scraper.py |   6 +-
 thepipe/__init__.py   |   3 +-
 thepipe/chunker.py    |   4 +-
 thepipe/core.py       |  39 ++++--
 thepipe/scraper.py    | 316 ++++++++++++++++++++++++++----------------
 9 files changed, 265 insertions(+), 165 deletions(-)

diff --git a/README.md b/README.md
index 7e824bf..d18708b 100644
--- a/README.md
+++ b/README.md
@@ -20,18 +20,17 @@
-### Extract markdown and visuals from PDFs URLs, docs, slides, videos, and more, ready for multimodal LLMs. ⚡
-
-thepi.pe is an AI-native scraping engine that generates LLM-ready markdown and visuals from any document, media, or web page. It is built for multimodal language models such as GPT-4o, and works out-of-the-box with any LLM or vector database. thepi.pe is available as a [hosted API](https://thepi.pe), or it can be self-hosted.
+### Extract markdown and visuals from PDFs, URLs, slides, videos, and more, ready for multimodal LLMs. ⚡
+
+thepi.pe is an API that can scrape multimodal data via `thepipe.scrape` or extract structured data via `thepipe.extract` from a wide range of sources. It is built to interface with LLMs such as GPT-4o, and works out-of-the-box with any LLM or vector database. thepi.pe can be used right away with a [hosted GPU cloud](https://thepi.pe), or it can be self-hosted.

 ## Features 🌟

-- Extract clean markdown, tables, and images from any document or web page
-- Output works out-of-the-box with all multimodal LLMs and RAG frameworks
-- GPU-accelerated AI layout analysis, chunking, and structured data extraction
-- Quick-start integrations for web data like Twitter, YouTube, GitHub, and more
-- Self-hosted or hosted API options available
+- Extract markdown, tables, and images from any document or webpage
+- Extract complex structured data from any document or webpage
+- Works out-of-the-box with all LLMs and RAG frameworks
+- AI-native filetype detection, layout analysis, and structured data extraction
+- Multimodal scraping for video, audio, and image sources

 ## Get started in 5 minutes 🚀

@@ -39,19 +38,23 @@ thepi.pe can read a wide range of filetypes and web sources, so it requires a fe

 ### Hosted API (Python)

+> ⚠️ **Warning.**
+The docs and functionality in this repo differ significantly from the current working version on pip. To use a working version, please refer to the [API docs](https://thepi.pe/docs), and not these docs.
+
 ```bash
 pip install thepipe-api
 setx THEPIPE_API_KEY your_api_key
+setx OPENAI_API_KEY your_openai_key
 ```

 ```python
-import thepipe
+from thepipe.scraper import scrape_file
 from openai import OpenAI

-# scrape markdown + images
-chunks = thepipe.scrape(source="example.pdf")
+# scrape markdown, tables, visuals
+chunks = scrape_file(filepath="paper.pdf")

-# call LLM
+# call LLM with clean, comprehensive data
 client = OpenAI()
 response = client.chat.completions.create(
     model="gpt-4o",
@@ -59,19 +62,18 @@ response = client.chat.completions.create(
 )
 ```

-### Local Installation
+### Local Installation (Python)
+
+For a local installation, you can use the following command:

 ```bash
 pip install thepipe-api[local]
 ```

-```python
-import thepipe
-from openai import OpenAI
+Then append `local=True` to your API calls:

-# scrape markdown + images
-chunks = thepipe.scrape_file(source="example.pdf", local=True)
+```python
+from thepipe.scraper import scrape_url
+chunks = scrape_url(url="https://example.com", local=True)
 ```

 You can also use The Pipe from the command line:
 ```bash
 thepipe path/to/folder --include_regex .*\.tsx
 ```
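For reference, here is a rough end-to-end local sketch (assuming the in-repo module layout, where scraping lives in `thepipe.scraper` and message conversion in `thepipe.core`; `example.pdf` is a placeholder path):

```python
from thepipe.scraper import scrape_file
from thepipe.core import chunks_to_messages
from openai import OpenAI

# scrape locally, then convert the chunks to an LLM-ready list of messages
chunks = scrape_file(filepath="example.pdf", local=True)
messages = chunks_to_messages(chunks)

# the result is a standard list of multimodal chat messages, so any
# OpenAI-compatible client can consume it directly
client = OpenAI()
response = client.chat.completions.create(model="gpt-4o", messages=messages)
print(response.choices[0].message.content)
```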
-
 ## Supported File Types 📚

 | Source Type              | Input types                                                     | Multimodal Scraping | Notes                |
 |--------------------------|-----------------------------------------------------------------|---------------------|----------------------|
-| Webpage                  | URLs starting with `http`, `https`, `ftp`                       | ✔️                  | Scrapes markdown, images, and tables from web pages |
-| PDF                      | `.pdf`                                                           | ✔️                  | Extracts page markdown and page images. Opt-in `ai_extraction` for advanced layout analysis (extracts markdown, LaTeX equations, tables, and figures) |
+| Webpage                  | URLs starting with `http`, `https`, `ftp`                       | ✔️                  | Scrapes markdown, images, and tables from web pages. `ai_extraction` available for AI layout analysis |
+| PDF                      | `.pdf`                                                           | ✔️                  | Extracts page markdown and page images. `ai_extraction` available for AI layout analysis |
 | Word Document            | `.docx`                                                          | ✔️                  | Extracts text, tables, and images |
 | PowerPoint               | `.pptx`                                                          | ✔️                  | Extracts text and images from slides |
 | Video                    | `.mp4`, `.mov`, `.wmv`                                           | ✔️                  | Uses Whisper for transcription and extracts frames |

@@ -102,7 +103,7 @@

 ## How it works 🛠️

-thepi.pe uses computer vision models and heuristics to extract clean content from the source and process it for downstream use with [language models](https://en.wikipedia.org/wiki/Large_language_model), or [vision transformers](https://en.wikipedia.org/wiki/Vision_transformer). The output from thepi.pe is a prompt (a list of messages) containing all content from the source document. The messages returned should look like this:
+thepi.pe uses computer vision models and heuristics to extract clean content from the source and process it for downstream use with [language models](https://en.wikipedia.org/wiki/Large_language_model) or [vision transformers](https://en.wikipedia.org/wiki/Vision_transformer). The output from thepi.pe is a list of chunks containing all content within the source document. These chunks can easily be converted to a prompt format compatible with any LLM or multimodal model via `thepipe.chunks_to_messages`, which gives the following format:
 ```json
 [
   {
     "role": "user",
     "content": [
       {
         "type": "text",
         "text": "..."
       },
       {
         "type": "image_url",
         "image_url": "data:image/jpeg;base64,..."
       }
     ]
   }
 ]
 ```

-You can feed these messages directly into the model, or you can use `thepipe_api.chunk_by_page`, `thepipe_api.chunk_by_section`, `thepipe_api.chunk_semantic` to chunk these messages for a vector database such as ChromaDB or a RAG framework (a chunk can be converted to LlamaIndex Document/ImageDocument with `.to_llamaindex`).
+You can feed these messages directly into the model, or you can use `thepipe_api.chunk_by_document`, `thepipe_api.chunk_by_page`, `thepipe_api.chunk_by_section`, or `thepipe_api.chunk_semantic` to chunk these messages for a vector database such as ChromaDB or a RAG framework, as sketched below. A chunk can be converted to a LlamaIndex Document/ImageDocument with `.to_llamaindex`.

 > ⚠️ **It is important to be mindful of your model's token limit.**
-GPT-4o does not work with too many images in the prompt (see discussion [here](https://community.openai.com/t/gpt-4-vision-maximum-amount-of-images/573110/6)). Large documents should be extracted with `text_only=True` to avoid this issue, or alternatively they can be chunked and saved into a vector database or RAG framework.
+GPT-4o does not work with too many images in the prompt (see discussion [here](https://community.openai.com/t/gpt-4-vision-maximum-amount-of-images/573110/6)). To remedy this, either use an LLM with a larger context window, extract large documents with `text_only=True`, or embed the chunks into a vector database.
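For instance, a minimal chunking sketch (assuming the in-repo module layout, where these helpers live in `thepipe.chunker`, and a hypothetical `example.pdf`):

```python
from thepipe.scraper import scrape_file
from thepipe.chunker import chunk_by_page
from thepipe.core import chunks_to_messages

chunks = scrape_file(filepath="example.pdf", local=True)

# re-chunk page by page so no single message carries too many images
pages = chunk_by_page(chunks)

# option 1: prompt-ready messages, one user message per page
messages = chunks_to_messages(pages)

# option 2: LlamaIndex documents for a RAG pipeline
documents = [doc for page in pages for doc in page.to_llamaindex()]
```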

 # Sponsors

diff --git a/requirements.txt b/requirements.txt
index 888256d..35bf909 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ charset-normalizer
 colorama
 requests
 pillow
-pydantic
\ No newline at end of file
+pydantic
+supabase
\ No newline at end of file
diff --git a/tests/test_chunker.py b/tests/test_chunker.py
index 3fe142a..db57fa6 100644
--- a/tests/test_chunker.py
+++ b/tests/test_chunker.py
@@ -3,7 +3,7 @@
 import sys
 from typing import List
 sys.path.append('..')
-from thepipe import chunker
+import thepipe.chunker as chunker
 from thepipe.core import Chunk

 class test_chunker(unittest.TestCase):
diff --git a/tests/test_core.py b/tests/test_core.py
index 46d01d5..3579354 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -4,8 +4,8 @@
 import os
 import sys
 sys.path.append('..')
-from thepipe import core
-from thepipe import scraper
+import thepipe.core as core
+import thepipe.scraper as scraper
 from PIL import Image
 from io import BytesIO

@@ -28,7 +28,7 @@ def test_chunk_to_llamaindex(self):
         self.assertEqual(len(llama_index), 1)

     def test_chunks_to_messages(self):
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.md", local=True)
+        chunks = scraper.scrape_file(filepath=self.files_directory+"/example.md", local=True)
         messages = core.chunks_to_messages(chunks)
         self.assertEqual(type(messages), list)
         for message in messages:
@@ -44,7 +44,7 @@ def test_save_outputs(self):
             text = file.read()
         self.assertIn('Hello, World!', text)
         # verify with images
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.jpg", local=True)
+        chunks = scraper.scrape_file(filepath=self.files_directory+"/example.jpg", local=True)
         core.save_outputs(chunks)
         self.assertTrue(any('.jpg' in f for f in os.listdir(self.outputs_directory)))
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
index 5c8afdc..9a1eed1 100644
--- a/tests/test_scraper.py
+++ b/tests/test_scraper.py
@@ -2,8 +2,8 @@
 import os
 import sys
 sys.path.append('..')
-from thepipe import core
-from thepipe import scraper
+import thepipe.core as core
+import thepipe.scraper as scraper

 class test_scraper(unittest.TestCase):
     def setUp(self):
@@ -83,7 +83,7 @@ def test_scrape_audio(self):
         self.assertTrue(any('citizens' in chunk.texts[0].lower() for chunk in chunks if chunk.texts is not None))

     def test_scrape_video(self):
-        chunks = scraper.scrape_file(source=self.files_directory+"/example.mp4", verbose=True, local=True)
+        chunks = scraper.scrape_file(self.files_directory+"/example.mp4", verbose=True, local=True)
         # verify it scraped the video file into chunks
         self.assertEqual(type(chunks), list)
         self.assertNotEqual(len(chunks), 0)
diff --git a/thepipe/__init__.py b/thepipe/__init__.py
index 37c0669..ea34f91 100644
--- a/thepipe/__init__.py
+++ b/thepipe/__init__.py
@@ -1,7 +1,6 @@
 import os
 from .scraper import scrape_file, scrape_url, scrape_directory
-from .chunker import chunk_by_document, chunk_by_page, chunk_by_section, chunk_semantic
-from .core import Chunk, calculate_tokens, chunks_to_messages, parse_arguments, save_outputs
+from .core import parse_arguments, save_outputs

 def main() -> None:
     args = parse_arguments()
diff --git a/thepipe/chunker.py b/thepipe/chunker.py
index 27cd3ad..1c0ea4d 100644
--- a/thepipe/chunker.py
+++ b/thepipe/chunker.py
@@ -1,6 +1,6 @@
 import re
-from typing import Dict, List, Optional, Tuple
-from .core import Chunk, calculate_tokens
+from typing import List
+from .core import Chunk
 from sklearn.metrics.pairwise import cosine_similarity

 def chunk_by_document(chunks: List[Chunk]) -> List[Chunk]:
diff --git a/thepipe/core.py b/thepipe/core.py
index 047f852..51ccbdd 100644
--- a/thepipe/core.py
+++ b/thepipe/core.py
@@ -1,8 +1,8 @@
 import argparse
 import base64
 from io import BytesIO
-import json
 import os
+import re
 import time
 from typing import Dict, List, Optional, Union
 import requests
@@ -26,20 +26,43 @@ def to_llamaindex(self) -> List[Union[Document, ImageDocument]]:
         else:
             return [Document(text=document_text)]

-    def to_message(self, host_images: bool = False, max_resolution : Optional[int] = None) -> Dict:
+    def to_message(self, host_images: bool = False, max_resolution: Optional[int] = None) -> Dict:
         message = {"role": "user", "content": []}
+        image_urls = [make_image_url(image, host_images, max_resolution) for image in self.images]
+        img_index = 0
+
         if self.texts:
-            prompt = "\n```\n" + '\n'.join(self.texts) + "\n```\n"
-            message["content"].append({"type": "text", "text": prompt})
-        for image in self.images:
-            image_url = make_image_url(image, host_images, max_resolution)
+            message_text = "\n\n"
+
+            for text in self.texts:
+                if host_images:
+                    def replace_image(match):
+                        nonlocal img_index
+                        if img_index < len(image_urls):
+                            url = image_urls[img_index]
+                            img_index += 1
+                            return f"![image]({url})"
+                        return match.group(0) # If we run out of images, leave the original text
+
+                    # Replace markdown image references with hosted URLs
+                    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', replace_image, text)
+
+                message_text += text + "\n\n"
+
+            # clean up, add to message
+            message_text = re.sub(r'\n{3,}', '\n\n', message_text).strip()
+            message["content"].append({"type": "text", "text": message_text})
+
+        # Add only the remaining images that weren't already inlined into the text
+        for image_url in image_urls[img_index:]:
             message["content"].append({"type": "image_url", "image_url": image_url})
+
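+        # Example (sketch): for texts=["See ![fig](fig1.png)"] with one image and
+        # host_images=True, the reference is rewritten to "![image](<hosted url>)"
+        # above, and the loop then appends only the images never referenced in text.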
         return message

     def to_json(self, host_images: bool = False) -> Dict:
         data = {
             'path': self.path,
-            'texts': self.texts,
+            'texts': [text.strip() for text in self.texts],
             'images': [make_image_url(image=image, host_images=host_images) for image in self.images],
             'audios': self.audios,
             'videos': self.videos,
         }
@@ -61,7 +84,7 @@ def from_json(data: Dict, host_images: bool = False) -> 'Chunk':
             images.append(image)
         return Chunk(
             path=data['path'],
-            texts=data['texts'],
+            texts=[text.strip() for text in data['texts']],
             images=images,
             audios=data['audios'],
             videos=data['videos'],
diff --git a/thepipe/scraper.py b/thepipe/scraper.py
index af2cfd1..2d00f7f 100644
--- a/thepipe/scraper.py
+++ b/thepipe/scraper.py
@@ -1,9 +1,10 @@
 import base64
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
+import io
 import math
 import re
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional
 import glob
 import os
 import tempfile
@@ -18,8 +19,11 @@ import dotenv
 import shutil
 from magika import Magika
+from .core import make_image_url, Chunk

 dotenv.load_dotenv()

 FOLDERS_TO_IGNORE = ['*node_modules.*', '.*venv.*', '.*\.git.*', '.*\.vscode.*', '.*pycache.*']
 FILES_TO_IGNORE = ['package-lock.json', '.gitignore', '.*\.bin', '.*\.pyc', '.*\.pyo', '.*\.exe', '.*\.dll', '.*\.ipynb_checkpoints']
 GITHUB_TOKEN: str = os.getenv("GITHUB_TOKEN", None)
@@ -29,6 +33,9 @@ TWITTER_DOMAINS = ['https://twitter.com', 'https://www.twitter.com', 'https://x.com', 'https://www.x.com']
 YOUTUBE_DOMAINS = ['https://www.youtube.com', 'https://youtube.com']
 GITHUB_DOMAINS = ['https://github.com', 'https://www.github.com']
+EXTRACTION_PROMPT = """Output the entire extracted text from the document in detailed markdown format.
+Be sure to correctly format markdown for headers, paragraphs, lists, tables, menus, full text contents, etc.
+Always reply immediately with only markdown. Do not output anything else."""

 def detect_source_type(source: str) -> str:
     # otherwise, try to detect the file type by its extension
     mimetype = result.output.mime_type
     return mimetype

-def scrape_file(source: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False, local: bool = False) -> List[Chunk]:
+def scrape_file(filepath: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False, local: bool = False) -> List[Chunk]:
     if not local:
-        with open(source, 'rb') as f:
+        with open(filepath, 'rb') as f:
             response = requests.post(
                 url=f"{HOST_URL}/scrape",
-                files={'file': (source, f)},
+                files={'file': (filepath, f)},
                 data={'ai_extraction': ai_extraction, 'text_only': text_only}
             )
         response_json = response.json()
         chunks = [Chunk.from_json(chunk_json) for chunk_json in response_json['chunks']]
         return chunks
     # returns chunks of scraped content from any source (file, URL, etc.)
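+    # local scraping path: detect the source's MIME type, then dispatch to the matching scraper below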
extraction = [] - source_type = detect_source_type(source) + source_type = detect_source_type(filepath) if source_type is None: if verbose: - print(f"[thepipe] Unsupported source type: {source}") + print(f"[thepipe] Unsupported source type: {filepath}") return extraction if verbose: - print(f"[thepipe] Scraping {source_type}: {source}...") + print(f"[thepipe] Scraping {source_type}: {filepath}...") if source_type == 'application/pdf': - extraction = scrape_pdf(file_path=source, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose) + extraction = scrape_pdf(file_path=filepath, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose) elif source_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': - extraction = scrape_docx(file_path=source, verbose=verbose, text_only=text_only) + extraction = scrape_docx(file_path=filepath, verbose=verbose, text_only=text_only) elif source_type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation': - extraction = scrape_pptx(file_path=source, verbose=verbose, text_only=text_only) + extraction = scrape_pptx(file_path=filepath, verbose=verbose, text_only=text_only) elif source_type.startswith('image/'): - extraction = scrape_image(file_path=source, text_only=text_only) + extraction = scrape_image(file_path=filepath, text_only=text_only) elif source_type.startswith('application/vnd.ms-excel') or source_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': - extraction = scrape_spreadsheet(file_path=source) + extraction = scrape_spreadsheet(file_path=filepath, source_type=source_type) elif source_type == 'application/x-ipynb+json': - extraction = scrape_ipynb(file_path=source, verbose=verbose, text_only=text_only) + extraction = scrape_ipynb(file_path=filepath, verbose=verbose, text_only=text_only) elif source_type == 'application/zip' or source_type == 'application/x-zip-compressed': - extraction = scrape_zip(file_path=source, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, local=local) + extraction = scrape_zip(file_path=filepath, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, local=local) elif source_type.startswith('video/'): - extraction = scrape_video(file_path=source, verbose=verbose, text_only=text_only) + extraction = scrape_video(file_path=filepath, verbose=verbose, text_only=text_only) elif source_type.startswith('audio/'): - extraction = scrape_audio(file_path=source, verbose=verbose) + extraction = scrape_audio(file_path=filepath, verbose=verbose) elif source_type.startswith('text/'): - extraction = scrape_plaintext(file_path=source) + extraction = scrape_plaintext(file_path=filepath) else: try: - extraction = scrape_plaintext(file_path=source) + extraction = scrape_plaintext(file_path=filepath) except Exception as e: if verbose: - print(f"[thepipe] Error extracting from {source}: {e}") + print(f"[thepipe] Error extracting from {filepath}: {e}") if verbose: if extraction: - print(f"[thepipe] Extracted from {source}") + print(f"[thepipe] Extracted from {filepath}") else: - print(f"[thepipe] No content extracted from {source}") + print(f"[thepipe] No content extracted from {filepath}") return extraction def scrape_plaintext(file_path: str) -> List[Chunk]: @@ -113,7 +120,7 @@ def scrape_directory(dir_path: str, include_regex: Optional[str] = None, verbose if include_regex: all_files = [file for file in all_files if re.search(include_regex, file, re.IGNORECASE)] with ThreadPoolExecutor() as executor: - results = 
executor.map(lambda file_path: scrape_file(source=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=local), all_files)
+        results = executor.map(lambda file_path: scrape_file(filepath=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=local), all_files)
     for result in results:
         extraction += result
     return extraction

@@ -129,46 +136,38 @@ def scrape_zip(file_path: str, include_regex: Optional[str] = None, verbose: boo
 def scrape_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False) -> List[Chunk]:
     chunks = []
     if ai_extraction:
-        # ai_extraction uses layout analysis AI to extract markdown, equations, tables, and images from the PDF
-        MD_FOLDER = 'mdoutput'
-        if not os.path.exists(MD_FOLDER):
-            os.makedirs(MD_FOLDER)
-        else:
-            shutil.rmtree(MD_FOLDER)
-            os.makedirs(MD_FOLDER)
-        os.system(f"marker_single {file_path} {MD_FOLDER} --batch_multiplier 4 --max_pages 1000 --langs English")
-        # Find the .md file and read its content
-        for output_file in glob.glob(f'{MD_FOLDER}/*/*', recursive=True):
-            if output_file.endswith('.md'):
-                with open(output_file, 'r') as f:
-                    markdown = f.read()
-                break
-        if not markdown:
-            if verbose: print(f"[thepipe] No markdown extracted from {file_path} (AI extraction likely failed).")
-            raise ValueError("AI extraction failed.")
-        if text_only:
-            chunks.append(Chunk(path=file_path, texts=[markdown]))
-            return chunks
-        # split the markdown into text and images, so we can return them in the correct order
-        content_pattern = re.compile(r'(\!\[.*?\]\(.*?\)|[^!\[]+)')
-        content_matches = content_pattern.findall(markdown)
-        for content in content_matches:
-            if content.startswith('!['):
-                # matched an image
-                if text_only:
-                    continue
-                image_url = os.path.join(MD_FOLDER, re.search(r'\((.*?)\)', content).group(1))
-                try:
-                    image = Image.open(image_url) # the image url is a local path
-                    chunks.append(Chunk(path=file_path, images=[image]))
-                except Exception as e:
-                    if verbose: print(f"[thepipe] Error loading image {image_url}: {e}")
-            else:
-                # matched text
-                chunks.append(Chunk(path=file_path, texts=[content.strip()]))
-        # remove the output folder
-        shutil.rmtree(MD_FOLDER)
-        if verbose: print(f"[thepipe] AI extracted from {file_path}")
+        # if using AI extraction, for each page, generate markdown and cropped figures
+        import fitz
+        import modal
+        with open(file_path, "rb") as f:
+            pdf_bytes = f.read()
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        images = []
+
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            pix = page.get_pixmap()
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            img_byte_arr = io.BytesIO()
+            image.save(img_byte_arr, format='PNG')
+            images.append(img_byte_arr.getvalue())
+
+        app_name = "scrape-pdf"
+        function_name = "get_nougat_and_layout_preds_per_page"
+        fn = modal.Function.lookup(app_name, function_name)
+        results = fn.remote(images)
+
+        chunks = []
+        for result in results:
+            # nougat often emits excessive newlines, so clean up each page's text
+            texts = [re.sub(r'\n{3,}', '\n\n', text).strip() for text in result['texts']]
+            figures = result['figures']
+            chunks.append(Chunk(path=file_path, texts=texts, images=figures))
+        return chunks
     else:
         # if not using AI extraction, for each page, extract markdown and (optionally) full page images
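This branch now assumes a deployed Modal app named `scrape-pdf` exposing `get_nougat_and_layout_preds_per_page`, which receives one PNG per page and returns, per page, nougat markdown strings (`texts`) and cropped figure images (`figures`). A rough usage sketch under those assumptions (`paper.pdf` is a placeholder):

```python
from thepipe.scraper import scrape_pdf

# one chunk per page: nougat markdown plus any cropped figures
chunks = scrape_pdf(file_path="paper.pdf", ai_extraction=True, verbose=True)
for chunk in chunks:
    print(f"{len(chunk.texts)} text blocks, {len(chunk.images)} figures")
```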
@@ -179,6 +178,9 @@ def scrape_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = Fa
     md_reader = pymupdf4llm.helpers.pymupdf_rag.to_markdown(doc, page_chunks=True)
     for i, page in enumerate(doc):
         text = md_reader[i]["text"]
+        # remove excessive newlines
+        text = re.sub(r'\n{3,}', '\n\n', text)
+        text = text.strip()
         if text_only:
             chunks.append(Chunk(path=file_path, texts=[text]))
         else:
@@ -225,11 +227,11 @@ def scrape_image(file_path: str, text_only: bool = False) -> List[Chunk]:
         chunks.append(Chunk(path=file_path, images=[img]))
     return chunks

-def scrape_spreadsheet(file_path: str) -> List[Chunk]:
+def scrape_spreadsheet(file_path: str, source_type: str) -> List[Chunk]:
     import pandas as pd
-    if file_path.endswith(".csv"):
+    if source_type == 'application/vnd.ms-excel':
         df = pd.read_csv(file_path)
-    elif file_path.endswith(".xls") or file_path.endswith(".xlsx"):
+    elif source_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
         df = pd.read_excel(file_path)
     else:
         raise ValueError("Unsupported file format")
@@ -242,7 +244,81 @@
         chunks.append(Chunk(path=file_path, texts=[item_json]))
     return chunks

-def extract_page_content(url: str, text_only: bool = False, verbose: bool = False) -> Tuple[str, List[str]]:
+def ai_extract_page_content(url: str, text_only: bool = False, verbose: bool = False) -> Chunk:
+    from playwright.sync_api import sync_playwright
+    import modal
+    from openai import OpenAI
+
+    app_name = "scrape-ui"
+    function_name = "get_ui_layout_preds"
+    fn = modal.Function.lookup(app_name, function_name)
+
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        context = browser.new_context(user_agent=USER_AGENT_STRING)
+        page = context.new_page()
+        page.goto(url, wait_until='domcontentloaded')
+
+        viewport_height = page.viewport_size['height']
+        total_height = page.evaluate("document.body.scrollHeight")
+        current_scroll_position = 0
+        scrolldowns, max_scrolldowns = 0, 3
+        images = []
+
+        while current_scroll_position < total_height and scrolldowns < max_scrolldowns:
+            page.wait_for_timeout(1000)
+            screenshot = page.screenshot(full_page=False)
+            img = Image.open(io.BytesIO(screenshot))
+            images.append(img)
+
+            current_scroll_position += viewport_height
+            page.evaluate(f"window.scrollTo(0, {current_scroll_position})")
+            scrolldowns += 1
+            total_height = page.evaluate("document.body.scrollHeight")
+
+        browser.close()
+
+    if images:
+        # Vertically stack the images
+        total_height = sum(img.height for img in images)
+        max_width = max(img.width for img in images)
+        stacked_image = Image.new('RGB', (max_width, total_height))
+        y_offset = 0
+        for img in images:
+            stacked_image.paste(img, (0, y_offset))
+            y_offset += img.height
+
+        # Process the stacked image with the UI model
+        figures = fn.remote(stacked_image)
+
+        # Process the stacked image with the VLM
+        openrouter_client = OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=os.environ["OPENROUTER_API_KEY"],
+        )
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": make_image_url(stacked_image)},
+                    {"type": "text", "text": EXTRACTION_PROMPT},
+                ]
+            },
+        ]
+        response = openrouter_client.chat.completions.create(
+            model="google/gemini-flash-1.5",
+            messages=messages,
+            temperature=0.2
+        )
+        llm_response = response.choices[0].message.content
+        # honor text_only by dropping the cropped figures from the chunk
+        chunk = Chunk(path=url, texts=[llm_response], images=[] if text_only else figures)
+    else:
+        raise ValueError("Model received 0 images from webpage")
+
+    return chunk
+
+def extract_page_content(url: str, text_only: bool = False, verbose: bool = False) -> Chunk:
     from urllib.parse import urlparse
     import markdownify
     from bs4 import BeautifulSoup
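Like the PDF path, `ai_extract_page_content` assumes externally provisioned services: a deployed Modal app named `scrape-ui` exposing `get_ui_layout_preds`, plus an `OPENROUTER_API_KEY` in the environment for the vision-language pass. A rough usage sketch under those assumptions:

```python
import os
from thepipe.scraper import scrape_url

# ai_extraction screenshots the page, crops figures with the UI layout model,
# and asks the VLM for the page's full markdown
assert "OPENROUTER_API_KEY" in os.environ
chunks = scrape_url(url="https://example.com", ai_extraction=True, local=True)
print(chunks[0].texts[0])
```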
@@ -250,6 +326,9 @@ def extract_page_content(url: str, text_only: bool = False, verbose: bool = Fals
     import base64
     import requests

+    texts = []
+    images = []
+
     with sync_playwright() as p:
         browser = p.chromium.launch()
         context = browser.new_context(user_agent=USER_AGENT_STRING)
         page = context.new_page()
         page.goto(url, wait_until='domcontentloaded')

         viewport_height = page.viewport_size['height']
         total_height = page.evaluate("document.body.scrollHeight")
         current_scroll_position = 0
         scrolldowns, max_scrolldowns = 0, 20 # Finite to prevent infinite scroll

         while current_scroll_position < total_height and scrolldowns < max_scrolldowns:
-            page.wait_for_timeout(100) # Wait for dynamic content to load
+            page.wait_for_timeout(1000) # Wait for dynamic content to load
             current_scroll_position += viewport_height
             page.evaluate(f"window.scrollTo(0, {current_scroll_position})")
             scrolldowns += 1
@@ -277,56 +356,53 @@ def extract_page_content(url: str, text_only: bool = False, verbose: bool = Fals
         markdown_content = markdownify.markdownify(str(soup), heading_style="ATX")

         # Remove excessive newlines in the markdown
-        while '\n\n\n' in markdown_content:
-            markdown_content = markdown_content.replace('\n\n\n', '\n\n')
-
-        if text_only:
-            browser.close()
-            return markdown_content, []
+        markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content)
+        markdown_content = markdown_content.strip()

-        # Extract images from the page using heuristics
-        # to adaptively read image URLs
-        images = []
-        for img in page.query_selector_all('img'):
-            img_path = img.get_attribute('src')
-            if not img_path:
-                continue
-            if img_path.startswith('data:image'):
-                # save base64 image to PIL Image
-                decoded_data = base64.b64decode(img_path.split(',')[1])
-                try:
-                    image = Image.open(BytesIO(decoded_data))
-                    images.append(image)
-                except Exception as e:
-                    if verbose: print(f"[thepipe] Ignoring error loading image {img_path}: {e}")
-                    continue # Ignore incompatible image extractions
-            else:
-                try:
-                    image = Image.open(requests.get(img_path, stream=True).raw)
-                    images.append(image)
-                except:
-                    if 'https://' not in img_path and 'http://' not in img_path:
-                        try:
-                            while img_path.startswith('/'):
-                                img_path = img_path[1:]
-                            path_with_schema = urlparse(url).scheme + "://" + img_path
-                            image = Image.open(requests.get(path_with_schema, stream=True).raw)
-                            images.append(image)
-                        except:
+        texts.append(markdown_content)
+
+        if not text_only:
+            # Extract images from the page using heuristics
+            for img in page.query_selector_all('img'):
+                img_path = img.get_attribute('src')
+                if not img_path:
+                    continue
+                if img_path.startswith('data:image'):
+                    # Save base64 image to PIL Image
+                    decoded_data = base64.b64decode(img_path.split(',')[1])
+                    try:
+                        image = Image.open(BytesIO(decoded_data))
+                        images.append(image)
+                    except Exception as e:
+                        if verbose: print(f"[thepipe] Ignoring error loading image {img_path}: {e}")
+                        continue # Ignore incompatible image extractions
+                else:
+                    try:
+                        image = Image.open(requests.get(img_path, stream=True).raw)
+                        images.append(image)
+                    except:
+                        if 'https://' not in img_path and 'http://' not in img_path:
+                            try:
+                                while img_path.startswith('/'):
+                                    img_path = img_path[1:]
+                                path_with_schema = urlparse(url).scheme + "://" + img_path
+                                image = Image.open(requests.get(path_with_schema, stream=True).raw)
+                                images.append(image)
+                            except:
-                            try:
-                                path_with_schema_and_netloc = urlparse(url).scheme + "://" + urlparse(url).netloc + "/" + img_path
-                                image = Image.open(requests.get(path_with_schema_and_netloc, stream=True).raw)
-                                images.append(image)
-                            except:
-                                if verbose: print(f"[thepipe] Ignoring error loading image
{img_path}") - continue # Ignore incompatible image extractions - else: - if verbose: print(f"[thepipe] Ignoring error loading image {img_path}") - continue # Ignore incompatible image extractions + try: + path_with_schema_and_netloc = urlparse(url).scheme + "://" + urlparse(url).netloc + "/" + img_path + image = Image.open(requests.get(path_with_schema_and_netloc, stream=True).raw) + images.append(image) + except: + if verbose: print(f"[thepipe] Ignoring error loading image {img_path}") + continue # Ignore incompatible image extractions + else: + if verbose: print(f"[thepipe] Ignoring error loading image {img_path}") + continue # Ignore incompatible image extractions browser.close() - print("N_IMAGES", len(images)) - return markdown_content, images + + return Chunk(path=url, texts=texts, images=images) def parse_html_to_markdown(html_content): from bs4 import BeautifulSoup, NavigableString, Tag @@ -380,20 +456,19 @@ def scrape_url(url: str, text_only: bool = False, ai_extraction: bool = False, v response = requests.get(url) with open(file_path, 'wb') as file: file.write(response.content) - chunks = scrape_file(source=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=True) + chunks = scrape_file(filepath=file_path, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=True) for chunk in chunks: all_texts.extend(chunk.texts) all_images.extend(chunk.images) + return [Chunk(path=url, texts=all_texts, images=all_images)] else: # if url leads to web content, scrape it directly - markdown_content, images = extract_page_content(url, text_only=text_only, verbose=verbose) - all_texts.append(markdown_content) - if not text_only: - all_images.extend(images) - if not all_texts and not all_images: - raise ValueError("No content extracted from URL.") - return [Chunk(path=url, texts=all_texts, images=all_images)] - + if ai_extraction: + chunk = ai_extract_page_content(url=url, text_only=text_only, verbose=verbose) + else: + chunk = extract_page_content(url=url, text_only=text_only, verbose=verbose) + return [chunk] + def format_timestamp(seconds, chunk_index, chunk_duration): # helper function to format the timestamp. total_seconds = chunk_index * chunk_duration + seconds @@ -556,6 +631,7 @@ def read_docx_tables(tab): image_part = document.part.related_parts[embed_attr] image_data = io.BytesIO(image_part._blob) image = Image.open(image_data) + image.load() block_images.append(image) image_counter += 1 elif block.__class__.__name__ == 'Table':