From 9fa148b9cc6b1a8922741a79335bf0d2e3c5e74b Mon Sep 17 00:00:00 2001 From: Emmett McFaralne Date: Fri, 5 Jul 2024 18:42:08 -0400 Subject: [PATCH] fixed to_message, fixed by_document chunker --- README.md | 6 +- setup.py | 2 +- tests/test_core.py | 21 +++-- tests/test_scraper.py | 26 +++---- thepipe_api/__init__.py | 4 +- thepipe_api/chunker.py | 26 ++++++- thepipe_api/core.py | 30 ++++--- thepipe_api/scraper.py | 169 +++++++++++++++++++++++----------------- thepipe_api/thepipe.py | 83 +++++++++++++++++--- 9 files changed, 244 insertions(+), 123 deletions(-) diff --git a/README.md b/README.md index b3a2b06..991ecac 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ thepi.pe is an AI-native scraping engine that generates LLM-ready markdown and v ## Get started in 5 minutes 🚀 -thepi.pe can read a wide range of filetypes and web sources, so it requires a few dependencies. It also requires a strong machine (16GB+ VRAM for optimal response times) for AI extraction features. For these reasons, we host a REST API that works out-of-the-box at [thepi.pe](https://thepi.pe). +thepi.pe can read a wide range of filetypes and web sources, so it requires a few dependencies. It also requires a strong machine (16GB+ VRAM for optimal PDF & video response times) for AI extraction features. For these reasons, we host a REST API that works out-of-the-box at [thepi.pe](https://thepi.pe). ### Hosted API (Python) @@ -49,7 +49,7 @@ import thepipe_api as tp from openai import OpenAI # scrape markdown + images -chunks = tp.scrape_file( +chunks = tp.scrape( source="example.pdf", ai_extraction=True ) @@ -58,7 +58,7 @@ chunks = tp.scrape_file( client = OpenAI() response = client.chat.completions.create( model="gpt-4o", - messages=tp.to_messages(chunks), + messages=tp.chunks_to_messages(chunks), ) ``` diff --git a/setup.py b/setup.py index 681645a..9e05d53 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='thepipe_api', - version='1.0.0', + version='1.0.1', author='Emmett McFarlane', author_email='emmett@thepi.pe', description='AI-native scraper for multimodal LLMs.', diff --git a/tests/test_core.py b/tests/test_core.py index 5e147da..013f713 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -38,9 +38,9 @@ def test_image_to_base64(self): decoded_image = Image.open(BytesIO(image_data)) self.assertEqual(image.size, decoded_image.size) - def test_to_messages(self): - chunks = scraper.scrape_file(source=self.files_directory+"/example.md") - messages = core.to_messages(chunks) + def test_chunks_to_messages(self): + chunks = scraper.scrape_file(source=self.files_directory+"/example.md", local=True) + messages = core.chunks_to_messsages(chunks) self.assertEqual(type(messages), list) for message in messages: self.assertEqual(type(message), dict) @@ -55,7 +55,7 @@ def test_save_outputs(self): text = file.read() self.assertIn('Hello, World!', text) # verify with images - chunks = scraper.scrape_file(source=self.files_directory+"/example.jpg") + chunks = scraper.scrape_file(source=self.files_directory+"/example.jpg", local=True) thepipe.save_outputs(chunks) self.assertTrue(any('.jpg' in f for f in os.listdir(self.outputs_directory))) @@ -63,4 +63,15 @@ def test_parse_arguments(self): args = thepipe.parse_arguments() self.assertEqual(type(args), argparse.Namespace) self.assertIn('source', vars(args)) - self.assertIn('include_regex', vars(args)) \ No newline at end of file + self.assertIn('include_regex', vars(args)) + + def test_calculate_tokens(self): + text = "Hello, World!" 
+ tokens = core.calculate_tokens([core.Chunk(texts=[text])]) + self.assertAlmostEqual(tokens, 3.25, places=0) + + def test_calculate_image_tokens(self): + image = Image.open(os.path.join(self.files_directory, 'example.jpg')) + image.load() # needed to close the file + tokens = core.calculate_image_tokens(image) + self.assertAlmostEqual(tokens, 85, places=0) \ No newline at end of file diff --git a/tests/test_scraper.py b/tests/test_scraper.py index 77261ed..673c5bc 100644 --- a/tests/test_scraper.py +++ b/tests/test_scraper.py @@ -20,7 +20,7 @@ def tearDown(self): os.rmdir(self.outputs_directory) def test_scrape_zip(self): - chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True, local=True) # verify it scraped the zip file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -31,7 +31,7 @@ def test_scrape_zip(self): self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks)) def test_scrape_ipynb(self): - chunks = scraper.scrape_file(self.files_directory+"/example.ipynb", verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.ipynb", verbose=True, local=True) # verify it scraped the ipynb file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -42,7 +42,7 @@ def test_scrape_ipynb(self): self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks)) def test_scrape_pdf_with_ai_extraction(self): - chunks = scraper.scrape_file("tests/files/example.pdf", ai_extraction=True, verbose=True) + chunks = scraper.scrape_file("tests/files/example.pdf", ai_extraction=True, verbose=True, local=True) # verify it scraped the pdf file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -52,7 +52,7 @@ def test_scrape_pdf_with_ai_extraction(self): self.assertIsNotNone(chunk.texts or chunk.images) def test_scrape_docx(self): - chunks = scraper.scrape_file(self.files_directory+"/example.docx", verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.docx", verbose=True, local=True) # verify it scraped the docx file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -63,7 +63,7 @@ def test_scrape_docx(self): self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks)) def test_extract_pdf_without_ai_extraction(self): - chunks = scraper.scrape_file(self.files_directory+"/example.pdf", ai_extraction=False, verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.pdf", ai_extraction=False, verbose=True, local=True) # verify it scraped the pdf file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -74,7 +74,7 @@ def test_extract_pdf_without_ai_extraction(self): self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks)) def test_scrape_audio(self): - chunks = scraper.scrape_file(self.files_directory+"/example.mp3", verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.mp3", verbose=True, local=True) # verify it scraped the audio file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -85,7 +85,7 @@ def test_scrape_audio(self): self.assertTrue(any('citizens' in chunk.texts[0].lower() for chunk in chunks if chunk.texts is not None)) def test_scrape_video(self): - chunks = scraper.scrape_file(source=self.files_directory+"/example.mp4", verbose=True) + chunks = 
scraper.scrape_file(source=self.files_directory+"/example.mp4", verbose=True, local=True) # verify it scraped the video file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -98,7 +98,7 @@ def test_scrape_video(self): self.assertTrue(any('citizens' in chunk.texts[0].lower() for chunk in chunks if chunk.texts is not None)) def test_scrape_pptx(self): - chunks = scraper.scrape_file(self.files_directory+"/example.pptx", verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.pptx", verbose=True, local=True) # verify it scraped the pptx file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -110,7 +110,7 @@ def test_scrape_pptx(self): def test_scrape_tweet(self): tweet_url = "https://x.com/ylecun/status/1796734866156843480" - chunks = scraper.scrape_url(tweet_url) + chunks = scraper.scrape_url(tweet_url, local=True) # verify it returned chunks representing the tweet self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -120,7 +120,7 @@ def test_scrape_tweet(self): self.assertTrue(len(chunks[0].images) > 0) def test_scrape_youtube(self): - chunks = scraper.scrape_url("https://www.youtube.com/watch?v=So7TNRhIYJ8") + chunks = scraper.scrape_url("https://www.youtube.com/watch?v=So7TNRhIYJ8", local=True) # verify it scraped the youtube video into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -134,7 +134,7 @@ def test_scrape_youtube(self): def test_scrape_url(self): # verify web page scrape result - chunks = scraper.scrape_url('https://en.wikipedia.org/wiki/Piping') + chunks = scraper.scrape_url('https://en.wikipedia.org/wiki/Piping', local=True) for chunk in chunks: self.assertEqual(type(chunk), core.Chunk) self.assertEqual(chunk.path, 'https://en.wikipedia.org/wiki/Piping') @@ -144,12 +144,12 @@ def test_scrape_url(self): # verify if at least one image was scraped self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks)) # verify file url scrape result - chunks = scraper.scrape_url('https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf') + chunks = scraper.scrape_url('https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf', local=True) self.assertEqual(len(chunks), 1) @unittest.skipUnless(os.environ.get('GITHUB_TOKEN'), "requires GITHUB_TOKEN") def test_scrape_github(self): - chunks = scraper.scrape_url('https://github.com/emcf/thepipe') + chunks = scraper.scrape_url('https://github.com/emcf/thepipe', local=True) self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) # should have some repo contents diff --git a/thepipe_api/__init__.py b/thepipe_api/__init__.py index f822200..e29f7c6 100644 --- a/thepipe_api/__init__.py +++ b/thepipe_api/__init__.py @@ -1,2 +1,4 @@ from .scraper import scrape_file, scrape_url, scrape_directory -from .chunker import chunk_by_page, chunk_by_section, chunk_semantic \ No newline at end of file +from .chunker import chunk_by_page, chunk_by_section, chunk_semantic +from .core import Chunk, calculate_tokens, chunks_to_messsages +from .thepipe import extract # deprecated \ No newline at end of file diff --git a/thepipe_api/chunker.py b/thepipe_api/chunker.py index 505e460..01257dd 100644 --- a/thepipe_api/chunker.py +++ b/thepipe_api/chunker.py @@ -3,6 +3,24 @@ from .core import Chunk, calculate_tokens from sklearn.metrics.pairwise import cosine_similarity +def chunk_by_document(chunks: List[Chunk]) -> List[Chunk]: + chunks_by_doc = {} + new_chunks = [] + for 
chunk in chunks: + if not chunk.path: + raise ValueError("Document chunking requires the path attribute to determine the document boundaries") + if chunk.path not in chunks_by_doc: + chunks_by_doc[chunk.path] = [] + chunks_by_doc[chunk.path].append(chunk) + for doc_chunks in chunks_by_doc.values(): + doc_texts = [] + doc_images = [] + for chunk in doc_chunks: + doc_texts.extend(chunk.texts) + doc_images.extend(chunk.images) + new_chunks.append(Chunk(path=doc_chunks[0].path, texts=doc_texts, images=doc_images)) + return new_chunks + def chunk_by_page(chunks: List[Chunk]) -> List[Chunk]: # by-page chunking is default behavior return chunks @@ -28,7 +46,7 @@ def chunk_by_section(chunks: List[Chunk]) -> List[Chunk]: section_chunks.append(Chunk(texts=[current_chunk_text], images=current_chunk_images)) return section_chunks -def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', buffer_size: int = 2, similarity_threshold: float = 0.5) -> List[Chunk]: +def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', buffer_size: int = 3, similarity_threshold: float = 0.1) -> List[Chunk]: from sentence_transformers import SentenceTransformer model = SentenceTransformer(model_name) # Flatten the chunks into sentences @@ -69,8 +87,12 @@ def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers for group in grouped_sentences: group_texts = [sentences[i] for i in group] group_images = [] + seen_images = [] for i in group: - group_images.extend(sentence_chunk_map[i].images) + for image in sentence_chunk_map[i].images: + if image not in seen_images: + group_images.append(image) + seen_images.append(image) new_chunks.append(Chunk(texts=group_texts, images=group_images)) return new_chunks \ No newline at end of file diff --git a/thepipe_api/core.py b/thepipe_api/core.py index 4dbf11d..a262eed 100644 --- a/thepipe_api/core.py +++ b/thepipe_api/core.py @@ -19,7 +19,18 @@ def to_llamaindex(self) -> List[Union[Document, ImageDocument]]: return [ImageDocument(text=document_text, image=image) for image in self.images] else: return [Document(text=document_text)] - + + def to_message(self) -> Dict: + content = [] + if self.texts: + for text in self.texts: + content.append({"type": "text", "text": {"content": text}}) + if self.images: + for image in self.images: + base64_image = image_to_base64(image) + content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}) + return {"role": "user", "content": content} + # uses https://platform.openai.com/docs/guides/vision def calculate_image_tokens(image: Image.Image, detail: str = "auto") -> int: width, height = image.size @@ -56,18 +67,5 @@ def image_to_base64(image: Image.Image) -> str: image.save(buffered, format="JPEG") return base64.b64encode(buffered.getvalue()).decode() -def to_messages(chunks: List[Chunk]) -> List[Dict]: - # audio and video are not yet supported as they - # are not common in SOTA multimodel LLMs (June 2024) - messages = [] - for chunk in chunks: - content = [] - if chunk.texts: - for text in chunk.texts: - content.append({"type": "text", "text": {"content": text}}) - if chunk.images: - for image in chunk.images: - base64_image = image_to_base64(image) - content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}) - messages.append({"role": "user", "content": content}) - return messages +def chunks_to_messsages(chunks: List[Chunk]) -> List[Dict]: + return 
[chunk.to_message() for chunk in chunks] \ No newline at end of file diff --git a/thepipe_api/scraper.py b/thepipe_api/scraper.py index 21f4f52..bb19b4c 100644 --- a/thepipe_api/scraper.py +++ b/thepipe_api/scraper.py @@ -38,6 +38,7 @@ def default(self, obj): TWITTER_DOMAINS = ['https://twitter.com', 'https://www.twitter.com', 'https://x.com', 'https://www.x.com'] YOUTUBE_DOMAINS = ['https://www.youtube.com', 'https://youtube.com'] GITHUB_DOMAINS = ['https://github.com', 'https://www.github.com'] +API_URL_V2 = "https://localhost:5000/scrape" def detect_source_type(source: str) -> str: # otherwise, try to detect the file type by its extension @@ -56,48 +57,60 @@ def detect_source_type(source: str) -> str: mimetype = result.output.mime_type return mimetype -def scrape_file(source: str, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False) -> List[Chunk]: - # returns chunks of scraped content from any source (file, URL, etc.) - extraction = [] - source_type = detect_source_type(source) - if source_type is None: - if verbose: - print(f"[thepipe] Unsupported source type: {source}") - return extraction - if verbose: - print(f"[thepipe] Scraping {source_type}: {source}...") - if source_type == 'application/pdf': - extraction = scrape_pdf(file_path=source, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose) - elif source_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': - extraction = scrape_docx(file_path=source, verbose=verbose, text_only=text_only) - elif source_type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation': - extraction = scrape_pptx(file_path=source, verbose=verbose, text_only=text_only) - elif source_type.startswith('image/'): - extraction = scrape_image(file_path=source, text_only=text_only) - elif source_type.startswith('application/vnd.ms-excel') or source_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': - extraction = scrape_spreadsheet(file_path=source) - elif source_type == 'application/x-ipynb+json': - extraction = scrape_ipynb(file_path=source, verbose=verbose, text_only=text_only) - elif source_type == 'application/zip' or source_type == 'application/x-zip-compressed': - extraction = scrape_zip(file_path=source, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only) - elif source_type.startswith('video/'): - extraction = scrape_video(file_path=source, verbose=verbose, text_only=text_only) - elif source_type.startswith('audio/'): - extraction = scrape_audio(file_path=source, verbose=verbose) - elif source_type.startswith('text/'): - extraction = scrape_plaintext(file_path=source) - else: - try: +def scrape_file(source: str, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = False) -> List[Chunk]: + if local: + extraction = [] + source_type = detect_source_type(source) + if source_type is None: + if verbose: + print(f"[thepipe] Unsupported source type: {source}") + return extraction + if verbose: + print(f"[thepipe] Scraping {source_type}: {source}...") + if source_type == 'application/pdf': + extraction = scrape_pdf(file_path=source, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose) + elif source_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + extraction = scrape_docx(file_path=source, verbose=verbose, text_only=text_only) + elif source_type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation': + extraction = 
scrape_pptx(file_path=source, verbose=verbose, text_only=text_only)
+        elif source_type.startswith('image/'):
+            extraction = scrape_image(file_path=source, text_only=text_only)
+        elif source_type.startswith('application/vnd.ms-excel') or source_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
+            extraction = scrape_spreadsheet(file_path=source)
+        elif source_type == 'application/x-ipynb+json':
+            extraction = scrape_ipynb(file_path=source, verbose=verbose, text_only=text_only)
+        elif source_type == 'application/zip' or source_type == 'application/x-zip-compressed':
+            extraction = scrape_zip(file_path=source, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only)
+        elif source_type.startswith('video/'):
+            extraction = scrape_video(file_path=source, verbose=verbose, text_only=text_only)
+        elif source_type.startswith('audio/'):
+            extraction = scrape_audio(file_path=source, verbose=verbose)
+        elif source_type.startswith('text/'):
             extraction = scrape_plaintext(file_path=source)
-        except Exception as e:
-            if verbose:
-                print(f"[thepipe] Error extracting from {source}: {e}")
+        else:
+            try:
+                extraction = scrape_plaintext(file_path=source)
+            except Exception as e:
+                if verbose:
+                    print(f"[thepipe] Error extracting from {source}: {e}")
+    else:
+        with open(source, 'rb') as f:
+            response = requests.post(
+                url=API_URL_V2,
+                files={'file': (source, f)},
+                data={'text_only': text_only, 'ai_extraction': ai_extraction},
+                headers={"Authorization": f"Bearer {THEPIPE_API_KEY}"}
+            )
+            response_json = response.json()
+            if 'error' in response_json:
+                raise ValueError(f"{response_json['error']}")
+            chunks_json = response_json['chunks']
+            extraction = [Chunk.from_json(chunk_json) for chunk_json in chunks_json]
     if verbose:
         if extraction:
             print(f"[thepipe] Extracted from {source}")
         else:
             print(f"[thepipe] No content extracted from {source}")
     return extraction
 
 def scrape_plaintext(file_path: str) -> List[Chunk]:
@@ -126,7 +139,6 @@ def scrape_zip(file_path: str, include_regex: Optional[str] = None, verbose: boo
 
 def scrape_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False) -> List[Chunk]:
     chunks = []
-
     if ai_extraction:
         # ai_extraction uses layout analysis AI to extract markdown, equations, tables, and images from the PDF
         MD_FOLDER = 'mdoutput'
@@ -137,11 +149,15 @@ def scrape_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = Fa
             os.makedirs(MD_FOLDER)
         os.system(f"marker_single {file_path} {MD_FOLDER} --batch_multiplier 4 --max_pages 1000 --langs English")
         # Find the .md file and read its content
+        markdown = None
         for output_file in glob.glob(f'{MD_FOLDER}/*/*', recursive=True):
             if output_file.endswith('.md'):
                 with open(output_file, 'r') as f:
                     markdown = f.read()
                 break
+        if markdown is None:
+            if verbose: print(f"[thepipe] No markdown file found in {MD_FOLDER}. 
(AI extraction likely crashed)") + raise ValueError("AI extraction failed.") if text_only: chunks.append(Chunk(path=file_path, texts=[markdown])) return chunks @@ -252,7 +268,7 @@ def extract_page_content(url: str, verbose: bool = False) -> Tuple[str, List[str viewport_height = page.viewport_size['height'] total_height = page.evaluate("document.body.scrollHeight") current_scroll_position = 0 - scrolldowns, max_scrolldowns = 0, 20 # Finite to prevent infinite scroll + scrolldowns, max_scrolldowns = 0, 10 # Finite to prevent infinite scroll while current_scroll_position < total_height and scrolldowns < max_scrolldowns: page.wait_for_timeout(100) # Wait for dynamic content to load current_scroll_position += viewport_height @@ -306,39 +322,52 @@ def traverse_and_extract(element): traverse_and_extract(body) return ''.join(markdown_content) -def scrape_url(url: str, text_only: bool = False, ai_extraction: bool = False) -> List[Chunk]: - if any(url.startswith(domain) for domain in TWITTER_DOMAINS): - extraction = scrape_tweet(url=url, text_only=text_only) - return extraction - elif any(url.startswith(domain) for domain in YOUTUBE_DOMAINS): - extraction = scrape_youtube(youtube_url=url, text_only=text_only) - return extraction - elif any(url.startswith(domain) for domain in GITHUB_DOMAINS): - extraction = scrape_github(github_url=url, text_only=text_only, ai_extraction=ai_extraction) - return extraction - _, extension = os.path.splitext(urlparse(url).path) - all_texts = [] - all_images = [] - if extension and extension not in {'.html', '.htm', '.php', '.asp', '.aspx'}: - # if url leads to a file, attempt to download it and scrape it - with tempfile.TemporaryDirectory() as temp_dir: - file_path = os.path.join(temp_dir, os.path.basename(url)) - response = requests.get(url) - with open(file_path, 'wb') as file: - file.write(response.content) - chunks = scrape_file(source=file_path, ai_extraction=ai_extraction, text_only=text_only) - for chunk in chunks: - all_texts.extend(chunk.texts) - all_images.extend(chunk.images) +def scrape_url(url: str, text_only: bool = False, ai_extraction: bool = False, local: bool = False) -> List[Chunk]: + if local: + if any(url.startswith(domain) for domain in TWITTER_DOMAINS): + extraction = scrape_tweet(url=url, text_only=text_only) + return extraction + elif any(url.startswith(domain) for domain in YOUTUBE_DOMAINS): + extraction = scrape_youtube(youtube_url=url, text_only=text_only) + return extraction + elif any(url.startswith(domain) for domain in GITHUB_DOMAINS): + extraction = scrape_github(github_url=url, text_only=text_only, ai_extraction=ai_extraction) + return extraction + _, extension = os.path.splitext(urlparse(url).path) + all_texts = [] + all_images = [] + if extension and extension not in {'.html', '.htm', '.php', '.asp', '.aspx'}: + # if url leads to a file, attempt to download it and scrape it + with tempfile.TemporaryDirectory() as temp_dir: + file_path = os.path.join(temp_dir, os.path.basename(url)) + response = requests.get(url) + with open(file_path, 'wb') as file: + file.write(response.content) + chunks = scrape_file(source=file_path, ai_extraction=ai_extraction, text_only=text_only, local=local) + for chunk in chunks: + all_texts.extend(chunk.texts) + all_images.extend(chunk.images) + else: + # if url leads to web content, scrape it directly + markdown_content, images = extract_page_content(url) + all_texts.append(markdown_content) + if not text_only: + all_images.extend(images) + if not all_texts and not all_images: + raise ValueError("No content 
extracted from URL.") + return [Chunk(path=url, texts=all_texts, images=all_images)] else: - # if url leads to web content, scrape it directly - markdown_content, images = extract_page_content(url) - all_texts.append(markdown_content) - if not text_only: - all_images.extend(images) - if not all_texts and not all_images: - raise ValueError("No content extracted from URL.") - return [Chunk(path=url, texts=all_texts, images=all_images)] + response = requests.post( + url=API_URL_V2, + data={'url': url, 'text_only': text_only, 'ai_extraction': ai_extraction}, + headers={"Authorization": f"Bearer {THEPIPE_API_KEY}"} + ) + response_json = response.json() + if 'error' in response_json: + raise ValueError(f"{response_json['error']}") + chunks_json = response_json['chunks'] + chunks = [Chunk.from_json(chunk_json) for chunk_json in chunks_json] + return chunks def format_timestamp(seconds, chunk_index, chunk_duration): # helper function to format the timestamp. diff --git a/thepipe_api/thepipe.py b/thepipe_api/thepipe.py index 584bd31..ee0d780 100644 --- a/thepipe_api/thepipe.py +++ b/thepipe_api/thepipe.py @@ -1,22 +1,81 @@ from typing import List, Optional import argparse import os -from .core import Chunk, calculate_tokens +import warnings +from .core import Chunk, calculate_tokens, chunks_to_messsages from . import scraper from . import chunker +import requests -def extract(source: str, match: Optional[List[str]] = None, ignore: str = None, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False, local: bool = False) -> List[Chunk]: - raise DeprecationWarning("This function is deprecated. Please use scraper.scrape_file or scraper.scrape_url instead.") - # if its a url, return the url source type +API_URL_V1 = "https://thepipe.up.railway.app/extract" + +def extract(source: str, match: Optional[str] = None, ignore: Optional[str] = None, ai_extraction: Optional[bool] = False, text_only: Optional[bool] = False, verbose: Optional[bool] = False, local: Optional[bool] = False) -> List[Chunk]: + warnings.warn("This function is deprecated. 
Please use scraper.scrape_file or scraper.scrape_url instead.", DeprecationWarning, stacklevel=2)
+    chunks = None
     if source.startswith('http'):
-        return scraper.scrape_url(url=source, match=match, ignore=ignore, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose)
-    # if it's a directory, return the directory source type
-    if os.path.isdir(source) or source in ('.', './'):
-        if source in ('.', './'):
-            source = os.getcwd()
-        return scraper.scrape_directory(dir_path=source, include_regex=match, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=False)
-    # if it's a file, return the file source type
-    return scraper.scrape_file(source=source, match=match, ignore=ignore, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=False)
+        if local:
+            chunks = scraper.scrape_url(url=source, ai_extraction=ai_extraction, text_only=text_only, local=True)
+            messages = chunks_to_messsages(chunks)
+        else:
+            response = requests.post(
+                url=API_URL_V1,
+                data={'url': source, 'api_key': os.getenv("THEPIPE_API_KEY", None), 'text_only': text_only}
+            )
+            response_json = response.json()
+            if 'error' in response_json:
+                raise ValueError(f"{response_json['error']}")
+            messages = response_json['messages']
+    else:
+        if local:
+            # scrape the file locally into chunks
+            chunks = scraper.scrape_file(source=source, ai_extraction=ai_extraction, text_only=text_only, local=True)
+            messages = chunks_to_messsages(chunks)
+        else:
+            with open(source, 'rb') as f:
+                response = requests.post(
+                    url=API_URL_V1,
+                    files={'file': (source, f)},
+                    data={'api_key': os.getenv("THEPIPE_API_KEY", None), 'text_only': text_only}
+                )
+            response_json = response.json()
+            if 'error' in response_json:
+                raise ValueError(f"{response_json['error']}")
+            messages = response_json['messages']
+    return messages
+
+def scrape(source: str, match: Optional[str] = None, ai_extraction: Optional[bool] = False, text_only: Optional[bool] = False, verbose: Optional[bool] = False, local: Optional[bool] = False) -> List[Chunk]:
+    chunks = None
+    if source.startswith('http'):
+        if local:
+            chunks = scraper.scrape_url(url=source, ai_extraction=ai_extraction, text_only=text_only, local=True)
+            messages = chunks_to_messsages(chunks)
+        else:
+            response = requests.post(
+                url=API_URL_V1,
+                data={'url': source, 'api_key': os.getenv("THEPIPE_API_KEY", None), 'text_only': text_only}
+            )
+            response_json = response.json()
+            if 'error' in response_json:
+                raise ValueError(f"{response_json['error']}")
+            messages = response_json['messages']
+    else:
+        if local:
+            # scrape the file locally into chunks
+            chunks = scraper.scrape_file(source=source, ai_extraction=ai_extraction, text_only=text_only, local=True)
+            messages = chunks_to_messsages(chunks)
+        else:
+            with open(source, 'rb') as f:
+                response = requests.post(
+                    url=API_URL_V1,
+                    files={'file': (source, f)},
+                    data={'api_key': os.getenv("THEPIPE_API_KEY", None), 'text_only': text_only}
+                )
+            response_json = response.json()
+            if 'error' in response_json:
+                raise ValueError(f"{response_json['error']}")
+            messages = response_json['messages']
+    return messages
 
 def save_outputs(chunks: List[Chunk], verbose: bool = False, text_only: bool = False) -> None:
     if not os.path.exists('outputs'):
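
A minimal usage sketch of the API surface added in this patch, based on the tests above. The input path is illustrative (it mirrors the test fixtures), and note that the helper is spelled `chunks_to_messsages` in `core.py` even though the README example calls it `chunks_to_messages`:

```python
from thepipe_api import scraper, chunker, core

# scrape a local file without calling the hosted API (local=True, as in the tests)
chunks = scraper.scrape_file(source="tests/files/example.pdf", verbose=True, local=True)

# merge the per-page chunks into one chunk per source document
doc_chunks = chunker.chunk_by_document(chunks)

# rough prompt-size estimate, then OpenAI-style multimodal messages
num_tokens = core.calculate_tokens(doc_chunks)
messages = core.chunks_to_messsages(doc_chunks)
```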