From 9fa148b9cc6b1a8922741a79335bf0d2e3c5e74b Mon Sep 17 00:00:00 2001 From: Emmett McFaralne Date: Fri, 5 Jul 2024 18:42:08 -0400 Subject: [PATCH] fixed to_message, fixed by_document chunker --- README.md | 6 +- setup.py | 2 +- tests/test_core.py | 21 +++-- tests/test_scraper.py | 26 +++---- thepipe_api/__init__.py | 4 +- thepipe_api/chunker.py | 26 ++++++- thepipe_api/core.py | 30 ++++--- thepipe_api/scraper.py | 169 +++++++++++++++++++++++----------------- thepipe_api/thepipe.py | 83 +++++++++++++++++--- 9 files changed, 244 insertions(+), 123 deletions(-) diff --git a/README.md b/README.md index b3a2b06..991ecac 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ thepi.pe is an AI-native scraping engine that generates LLM-ready markdown and v ## Get started in 5 minutes 🚀 -thepi.pe can read a wide range of filetypes and web sources, so it requires a few dependencies. It also requires a strong machine (16GB+ VRAM for optimal response times) for AI extraction features. For these reasons, we host a REST API that works out-of-the-box at [thepi.pe](https://thepi.pe). +thepi.pe can read a wide range of filetypes and web sources, so it requires a few dependencies. It also requires a strong machine (16GB+ VRAM for optimal PDF & video response times) for AI extraction features. For these reasons, we host a REST API that works out-of-the-box at [thepi.pe](https://thepi.pe). ### Hosted API (Python) @@ -49,7 +49,7 @@ import thepipe_api as tp from openai import OpenAI # scrape markdown + images -chunks = tp.scrape_file( +chunks = tp.scrape( source="example.pdf", ai_extraction=True ) @@ -58,7 +58,7 @@ chunks = tp.scrape_file( client = OpenAI() response = client.chat.completions.create( model="gpt-4o", - messages=tp.to_messages(chunks), + messages=tp.chunks_to_messages(chunks), ) ``` diff --git a/setup.py b/setup.py index 681645a..9e05d53 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='thepipe_api', - version='1.0.0', + version='1.0.1', author='Emmett McFarlane', author_email='emmett@thepi.pe', description='AI-native scraper for multimodal LLMs.', diff --git a/tests/test_core.py b/tests/test_core.py index 5e147da..013f713 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -38,9 +38,9 @@ def test_image_to_base64(self): decoded_image = Image.open(BytesIO(image_data)) self.assertEqual(image.size, decoded_image.size) - def test_to_messages(self): - chunks = scraper.scrape_file(source=self.files_directory+"/example.md") - messages = core.to_messages(chunks) + def test_chunks_to_messages(self): + chunks = scraper.scrape_file(source=self.files_directory+"/example.md", local=True) + messages = core.chunks_to_messsages(chunks) self.assertEqual(type(messages), list) for message in messages: self.assertEqual(type(message), dict) @@ -55,7 +55,7 @@ def test_save_outputs(self): text = file.read() self.assertIn('Hello, World!', text) # verify with images - chunks = scraper.scrape_file(source=self.files_directory+"/example.jpg") + chunks = scraper.scrape_file(source=self.files_directory+"/example.jpg", local=True) thepipe.save_outputs(chunks) self.assertTrue(any('.jpg' in f for f in os.listdir(self.outputs_directory))) @@ -63,4 +63,15 @@ def test_parse_arguments(self): args = thepipe.parse_arguments() self.assertEqual(type(args), argparse.Namespace) self.assertIn('source', vars(args)) - self.assertIn('include_regex', vars(args)) \ No newline at end of file + self.assertIn('include_regex', vars(args)) + + def test_calculate_tokens(self): + text = "Hello, World!" 
+ tokens = core.calculate_tokens([core.Chunk(texts=[text])]) + self.assertAlmostEqual(tokens, 3.25, places=0) + + def test_calculate_image_tokens(self): + image = Image.open(os.path.join(self.files_directory, 'example.jpg')) + image.load() # needed to close the file + tokens = core.calculate_image_tokens(image) + self.assertAlmostEqual(tokens, 85, places=0) \ No newline at end of file diff --git a/tests/test_scraper.py b/tests/test_scraper.py index 77261ed..673c5bc 100644 --- a/tests/test_scraper.py +++ b/tests/test_scraper.py @@ -20,7 +20,7 @@ def tearDown(self): os.rmdir(self.outputs_directory) def test_scrape_zip(self): - chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.zip", verbose=True, local=True) # verify it scraped the zip file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -31,7 +31,7 @@ def test_scrape_zip(self): self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks)) def test_scrape_ipynb(self): - chunks = scraper.scrape_file(self.files_directory+"/example.ipynb", verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.ipynb", verbose=True, local=True) # verify it scraped the ipynb file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -42,7 +42,7 @@ def test_scrape_ipynb(self): self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks)) def test_scrape_pdf_with_ai_extraction(self): - chunks = scraper.scrape_file("tests/files/example.pdf", ai_extraction=True, verbose=True) + chunks = scraper.scrape_file("tests/files/example.pdf", ai_extraction=True, verbose=True, local=True) # verify it scraped the pdf file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -52,7 +52,7 @@ def test_scrape_pdf_with_ai_extraction(self): self.assertIsNotNone(chunk.texts or chunk.images) def test_scrape_docx(self): - chunks = scraper.scrape_file(self.files_directory+"/example.docx", verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.docx", verbose=True, local=True) # verify it scraped the docx file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -63,7 +63,7 @@ def test_scrape_docx(self): self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks)) def test_extract_pdf_without_ai_extraction(self): - chunks = scraper.scrape_file(self.files_directory+"/example.pdf", ai_extraction=False, verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.pdf", ai_extraction=False, verbose=True, local=True) # verify it scraped the pdf file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -74,7 +74,7 @@ def test_extract_pdf_without_ai_extraction(self): self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks)) def test_scrape_audio(self): - chunks = scraper.scrape_file(self.files_directory+"/example.mp3", verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.mp3", verbose=True, local=True) # verify it scraped the audio file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -85,7 +85,7 @@ def test_scrape_audio(self): self.assertTrue(any('citizens' in chunk.texts[0].lower() for chunk in chunks if chunk.texts is not None)) def test_scrape_video(self): - chunks = scraper.scrape_file(source=self.files_directory+"/example.mp4", verbose=True) + chunks = 
scraper.scrape_file(source=self.files_directory+"/example.mp4", verbose=True, local=True) # verify it scraped the video file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -98,7 +98,7 @@ def test_scrape_video(self): self.assertTrue(any('citizens' in chunk.texts[0].lower() for chunk in chunks if chunk.texts is not None)) def test_scrape_pptx(self): - chunks = scraper.scrape_file(self.files_directory+"/example.pptx", verbose=True) + chunks = scraper.scrape_file(self.files_directory+"/example.pptx", verbose=True, local=True) # verify it scraped the pptx file into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -110,7 +110,7 @@ def test_scrape_pptx(self): def test_scrape_tweet(self): tweet_url = "https://x.com/ylecun/status/1796734866156843480" - chunks = scraper.scrape_url(tweet_url) + chunks = scraper.scrape_url(tweet_url, local=True) # verify it returned chunks representing the tweet self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -120,7 +120,7 @@ def test_scrape_tweet(self): self.assertTrue(len(chunks[0].images) > 0) def test_scrape_youtube(self): - chunks = scraper.scrape_url("https://www.youtube.com/watch?v=So7TNRhIYJ8") + chunks = scraper.scrape_url("https://www.youtube.com/watch?v=So7TNRhIYJ8", local=True) # verify it scraped the youtube video into chunks self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) @@ -134,7 +134,7 @@ def test_scrape_youtube(self): def test_scrape_url(self): # verify web page scrape result - chunks = scraper.scrape_url('https://en.wikipedia.org/wiki/Piping') + chunks = scraper.scrape_url('https://en.wikipedia.org/wiki/Piping', local=True) for chunk in chunks: self.assertEqual(type(chunk), core.Chunk) self.assertEqual(chunk.path, 'https://en.wikipedia.org/wiki/Piping') @@ -144,12 +144,12 @@ def test_scrape_url(self): # verify if at least one image was scraped self.assertTrue(any(len(chunk.images) > 0 for chunk in chunks)) # verify file url scrape result - chunks = scraper.scrape_url('https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf') + chunks = scraper.scrape_url('https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf', local=True) self.assertEqual(len(chunks), 1) @unittest.skipUnless(os.environ.get('GITHUB_TOKEN'), "requires GITHUB_TOKEN") def test_scrape_github(self): - chunks = scraper.scrape_url('https://github.com/emcf/thepipe') + chunks = scraper.scrape_url('https://github.com/emcf/thepipe', local=True) self.assertEqual(type(chunks), list) self.assertNotEqual(len(chunks), 0) # should have some repo contents diff --git a/thepipe_api/__init__.py b/thepipe_api/__init__.py index f822200..e29f7c6 100644 --- a/thepipe_api/__init__.py +++ b/thepipe_api/__init__.py @@ -1,2 +1,4 @@ from .scraper import scrape_file, scrape_url, scrape_directory -from .chunker import chunk_by_page, chunk_by_section, chunk_semantic \ No newline at end of file +from .chunker import chunk_by_page, chunk_by_section, chunk_semantic +from .core import Chunk, calculate_tokens, chunks_to_messsages +from .thepipe import extract # deprecated \ No newline at end of file diff --git a/thepipe_api/chunker.py b/thepipe_api/chunker.py index 505e460..01257dd 100644 --- a/thepipe_api/chunker.py +++ b/thepipe_api/chunker.py @@ -3,6 +3,24 @@ from .core import Chunk, calculate_tokens from sklearn.metrics.pairwise import cosine_similarity +def chunk_by_document(chunks: List[Chunk]) -> List[Chunk]: + chunks_by_doc = {} + new_chunks = [] + for 
chunk in chunks: + if not chunk.path: + raise ValueError("Document chunking requires the path attribute to determine the document boundaries") + if chunk.path not in chunks_by_doc: + chunks_by_doc[chunk.path] = [] + chunks_by_doc[chunk.path].append(chunk) + for doc_chunks in chunks_by_doc.values(): + doc_texts = [] + doc_images = [] + for chunk in doc_chunks: + doc_texts.extend(chunk.texts) + doc_images.extend(chunk.images) + new_chunks.append(Chunk(path=doc_chunks[0].path, texts=doc_texts, images=doc_images)) + return new_chunks + def chunk_by_page(chunks: List[Chunk]) -> List[Chunk]: # by-page chunking is default behavior return chunks @@ -28,7 +46,7 @@ def chunk_by_section(chunks: List[Chunk]) -> List[Chunk]: section_chunks.append(Chunk(texts=[current_chunk_text], images=current_chunk_images)) return section_chunks -def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', buffer_size: int = 2, similarity_threshold: float = 0.5) -> List[Chunk]: +def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', buffer_size: int = 3, similarity_threshold: float = 0.1) -> List[Chunk]: from sentence_transformers import SentenceTransformer model = SentenceTransformer(model_name) # Flatten the chunks into sentences @@ -69,8 +87,12 @@ def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers for group in grouped_sentences: group_texts = [sentences[i] for i in group] group_images = [] + seen_images = [] for i in group: - group_images.extend(sentence_chunk_map[i].images) + for image in sentence_chunk_map[i].images: + if image not in seen_images: + group_images.append(image) + seen_images.append(image) new_chunks.append(Chunk(texts=group_texts, images=group_images)) return new_chunks \ No newline at end of file diff --git a/thepipe_api/core.py b/thepipe_api/core.py index 4dbf11d..a262eed 100644 --- a/thepipe_api/core.py +++ b/thepipe_api/core.py @@ -19,7 +19,18 @@ def to_llamaindex(self) -> List[Union[Document, ImageDocument]]: return [ImageDocument(text=document_text, image=image) for image in self.images] else: return [Document(text=document_text)] - + + def to_message(self) -> Dict: + content = [] + if self.texts: + for text in self.texts: + content.append({"type": "text", "text": {"content": text}}) + if self.images: + for image in self.images: + base64_image = image_to_base64(image) + content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}) + return {"role": "user", "content": content} + # uses https://platform.openai.com/docs/guides/vision def calculate_image_tokens(image: Image.Image, detail: str = "auto") -> int: width, height = image.size @@ -56,18 +67,5 @@ def image_to_base64(image: Image.Image) -> str: image.save(buffered, format="JPEG") return base64.b64encode(buffered.getvalue()).decode() -def to_messages(chunks: List[Chunk]) -> List[Dict]: - # audio and video are not yet supported as they - # are not common in SOTA multimodel LLMs (June 2024) - messages = [] - for chunk in chunks: - content = [] - if chunk.texts: - for text in chunk.texts: - content.append({"type": "text", "text": {"content": text}}) - if chunk.images: - for image in chunk.images: - base64_image = image_to_base64(image) - content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}) - messages.append({"role": "user", "content": content}) - return messages +def chunks_to_messsages(chunks: List[Chunk]) -> List[Dict]: + return 
[chunk.to_message() for chunk in chunks] \ No newline at end of file diff --git a/thepipe_api/scraper.py b/thepipe_api/scraper.py index 21f4f52..bb19b4c 100644 --- a/thepipe_api/scraper.py +++ b/thepipe_api/scraper.py @@ -38,6 +38,7 @@ def default(self, obj): TWITTER_DOMAINS = ['https://twitter.com', 'https://www.twitter.com', 'https://x.com', 'https://www.x.com'] YOUTUBE_DOMAINS = ['https://www.youtube.com', 'https://youtube.com'] GITHUB_DOMAINS = ['https://github.com', 'https://www.github.com'] +API_URL_V2 = "https://localhost:5000/scrape" def detect_source_type(source: str) -> str: # otherwise, try to detect the file type by its extension @@ -56,48 +57,60 @@ def detect_source_type(source: str) -> str: mimetype = result.output.mime_type return mimetype -def scrape_file(source: str, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False) -> List[Chunk]: - # returns chunks of scraped content from any source (file, URL, etc.) - extraction = [] - source_type = detect_source_type(source) - if source_type is None: - if verbose: - print(f"[thepipe] Unsupported source type: {source}") - return extraction - if verbose: - print(f"[thepipe] Scraping {source_type}: {source}...") - if source_type == 'application/pdf': - extraction = scrape_pdf(file_path=source, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose) - elif source_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': - extraction = scrape_docx(file_path=source, verbose=verbose, text_only=text_only) - elif source_type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation': - extraction = scrape_pptx(file_path=source, verbose=verbose, text_only=text_only) - elif source_type.startswith('image/'): - extraction = scrape_image(file_path=source, text_only=text_only) - elif source_type.startswith('application/vnd.ms-excel') or source_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': - extraction = scrape_spreadsheet(file_path=source) - elif source_type == 'application/x-ipynb+json': - extraction = scrape_ipynb(file_path=source, verbose=verbose, text_only=text_only) - elif source_type == 'application/zip' or source_type == 'application/x-zip-compressed': - extraction = scrape_zip(file_path=source, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only) - elif source_type.startswith('video/'): - extraction = scrape_video(file_path=source, verbose=verbose, text_only=text_only) - elif source_type.startswith('audio/'): - extraction = scrape_audio(file_path=source, verbose=verbose) - elif source_type.startswith('text/'): - extraction = scrape_plaintext(file_path=source) - else: - try: +def scrape_file(source: str, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = False) -> List[Chunk]: + if local: + extraction = [] + source_type = detect_source_type(source) + if source_type is None: + if verbose: + print(f"[thepipe] Unsupported source type: {source}") + return extraction + if verbose: + print(f"[thepipe] Scraping {source_type}: {source}...") + if source_type == 'application/pdf': + extraction = scrape_pdf(file_path=source, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose) + elif source_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + extraction = scrape_docx(file_path=source, verbose=verbose, text_only=text_only) + elif source_type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation': + extraction = 
scrape_pptx(file_path=source, verbose=verbose, text_only=text_only)
+        elif source_type.startswith('image/'):
+            extraction = scrape_image(file_path=source, text_only=text_only)
+        elif source_type.startswith('application/vnd.ms-excel') or source_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
+            extraction = scrape_spreadsheet(file_path=source)
+        elif source_type == 'application/x-ipynb+json':
+            extraction = scrape_ipynb(file_path=source, verbose=verbose, text_only=text_only)
+        elif source_type == 'application/zip' or source_type == 'application/x-zip-compressed':
+            extraction = scrape_zip(file_path=source, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only)
+        elif source_type.startswith('video/'):
+            extraction = scrape_video(file_path=source, verbose=verbose, text_only=text_only)
+        elif source_type.startswith('audio/'):
+            extraction = scrape_audio(file_path=source, verbose=verbose)
+        elif source_type.startswith('text/'):
             extraction = scrape_plaintext(file_path=source)
-        except Exception as e:
-            if verbose:
-                print(f"[thepipe] Error extracting from {source}: {e}")
+        else:
+            try:
+                extraction = scrape_plaintext(file_path=source)
+            except Exception as e:
+                if verbose:
+                    print(f"[thepipe] Error extracting from {source}: {e}")
+    else:
+        with open(source, 'rb') as f:
+            response = requests.post(
+                url=API_URL_V2,
+                files={'file': (source, f)},
+                data={'text_only': text_only, 'ai_extraction': ai_extraction},
+                headers={"Authorization": f"Bearer {THEPIPE_API_KEY}"}
+            )
+            response_json = response.json()
+            if 'error' in response_json:
+                raise ValueError(f"{response_json['error']}")
+            chunks_json = response_json['chunks']
+            extraction = [Chunk.from_json(chunk_json) for chunk_json in chunks_json]
     if verbose:
         if extraction:
             print(f"[thepipe] Extracted from {source}")
         else:
             print(f"[thepipe] No content extracted from {source}")
     return extraction
 
 def scrape_plaintext(file_path: str) -> List[Chunk]:
@@ -126,7 +139,6 @@ def scrape_zip(file_path: str, include_regex: Optional[str] = None, verbose: boo
 
 def scrape_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False) -> List[Chunk]:
     chunks = []
-
     if ai_extraction:
         # ai_extraction uses layout analysis AI to extract markdown, equations, tables, and images from the PDF
         MD_FOLDER = 'mdoutput'
@@ -137,11 +149,15 @@ def scrape_pdf(file_path: str, ai_extraction: bool = False, text_only: bool = Fa
             os.makedirs(MD_FOLDER)
         os.system(f"marker_single {file_path} {MD_FOLDER} --batch_multiplier 4 --max_pages 1000 --langs English")
         # Find the .md file and read its content
+        markdown = None
         for output_file in glob.glob(f'{MD_FOLDER}/*/*', recursive=True):
             if output_file.endswith('.md'):
                 with open(output_file, 'r') as f:
                     markdown = f.read()
                 break
+        if markdown is None:
+            if verbose: print(f"[thepipe] No markdown file found in {MD_FOLDER}. 
(AI extraction likely crashed)") + raise ValueError("AI extraction failed.") if text_only: chunks.append(Chunk(path=file_path, texts=[markdown])) return chunks @@ -252,7 +268,7 @@ def extract_page_content(url: str, verbose: bool = False) -> Tuple[str, List[str viewport_height = page.viewport_size['height'] total_height = page.evaluate("document.body.scrollHeight") current_scroll_position = 0 - scrolldowns, max_scrolldowns = 0, 20 # Finite to prevent infinite scroll + scrolldowns, max_scrolldowns = 0, 10 # Finite to prevent infinite scroll while current_scroll_position < total_height and scrolldowns < max_scrolldowns: page.wait_for_timeout(100) # Wait for dynamic content to load current_scroll_position += viewport_height @@ -306,39 +322,52 @@ def traverse_and_extract(element): traverse_and_extract(body) return ''.join(markdown_content) -def scrape_url(url: str, text_only: bool = False, ai_extraction: bool = False) -> List[Chunk]: - if any(url.startswith(domain) for domain in TWITTER_DOMAINS): - extraction = scrape_tweet(url=url, text_only=text_only) - return extraction - elif any(url.startswith(domain) for domain in YOUTUBE_DOMAINS): - extraction = scrape_youtube(youtube_url=url, text_only=text_only) - return extraction - elif any(url.startswith(domain) for domain in GITHUB_DOMAINS): - extraction = scrape_github(github_url=url, text_only=text_only, ai_extraction=ai_extraction) - return extraction - _, extension = os.path.splitext(urlparse(url).path) - all_texts = [] - all_images = [] - if extension and extension not in {'.html', '.htm', '.php', '.asp', '.aspx'}: - # if url leads to a file, attempt to download it and scrape it - with tempfile.TemporaryDirectory() as temp_dir: - file_path = os.path.join(temp_dir, os.path.basename(url)) - response = requests.get(url) - with open(file_path, 'wb') as file: - file.write(response.content) - chunks = scrape_file(source=file_path, ai_extraction=ai_extraction, text_only=text_only) - for chunk in chunks: - all_texts.extend(chunk.texts) - all_images.extend(chunk.images) +def scrape_url(url: str, text_only: bool = False, ai_extraction: bool = False, local: bool = False) -> List[Chunk]: + if local: + if any(url.startswith(domain) for domain in TWITTER_DOMAINS): + extraction = scrape_tweet(url=url, text_only=text_only) + return extraction + elif any(url.startswith(domain) for domain in YOUTUBE_DOMAINS): + extraction = scrape_youtube(youtube_url=url, text_only=text_only) + return extraction + elif any(url.startswith(domain) for domain in GITHUB_DOMAINS): + extraction = scrape_github(github_url=url, text_only=text_only, ai_extraction=ai_extraction) + return extraction + _, extension = os.path.splitext(urlparse(url).path) + all_texts = [] + all_images = [] + if extension and extension not in {'.html', '.htm', '.php', '.asp', '.aspx'}: + # if url leads to a file, attempt to download it and scrape it + with tempfile.TemporaryDirectory() as temp_dir: + file_path = os.path.join(temp_dir, os.path.basename(url)) + response = requests.get(url) + with open(file_path, 'wb') as file: + file.write(response.content) + chunks = scrape_file(source=file_path, ai_extraction=ai_extraction, text_only=text_only, local=local) + for chunk in chunks: + all_texts.extend(chunk.texts) + all_images.extend(chunk.images) + else: + # if url leads to web content, scrape it directly + markdown_content, images = extract_page_content(url) + all_texts.append(markdown_content) + if not text_only: + all_images.extend(images) + if not all_texts and not all_images: + raise ValueError("No content 
extracted from URL.") + return [Chunk(path=url, texts=all_texts, images=all_images)] else: - # if url leads to web content, scrape it directly - markdown_content, images = extract_page_content(url) - all_texts.append(markdown_content) - if not text_only: - all_images.extend(images) - if not all_texts and not all_images: - raise ValueError("No content extracted from URL.") - return [Chunk(path=url, texts=all_texts, images=all_images)] + response = requests.post( + url=API_URL_V2, + data={'url': url, 'text_only': text_only, 'ai_extraction': ai_extraction}, + headers={"Authorization": f"Bearer {THEPIPE_API_KEY}"} + ) + response_json = response.json() + if 'error' in response_json: + raise ValueError(f"{response_json['error']}") + chunks_json = response_json['chunks'] + chunks = [Chunk.from_json(chunk_json) for chunk_json in chunks_json] + return chunks def format_timestamp(seconds, chunk_index, chunk_duration): # helper function to format the timestamp. diff --git a/thepipe_api/thepipe.py b/thepipe_api/thepipe.py index 584bd31..ee0d780 100644 --- a/thepipe_api/thepipe.py +++ b/thepipe_api/thepipe.py @@ -1,22 +1,81 @@ from typing import List, Optional import argparse import os -from .core import Chunk, calculate_tokens +import warnings +from .core import Chunk, calculate_tokens, chunks_to_messsages from . import scraper from . import chunker +import requests -def extract(source: str, match: Optional[List[str]] = None, ignore: str = None, ai_extraction: bool = False, text_only: bool = False, verbose: bool = False, local: bool = False) -> List[Chunk]: - raise DeprecationWarning("This function is deprecated. Please use scraper.scrape_file or scraper.scrape_url instead.") - # if its a url, return the url source type +API_URL_V1 = "https://thepipe.up.railway.app/extract" + +def extract(source: str, match: Optional[str] = None, ignore: Optional[str] = None, ai_extraction: Optional[bool] = False, text_only: Optional[bool] = False, verbose: Optional[bool] = False, local: Optional[bool] = False) -> List[Chunk]: + warnings.warn("This function is deprecated. 
Please use scraper.scrape_file or scraper.scrape_url instead.", DeprecationWarning, stacklevel=2)
+    chunks = None
     if source.startswith('http'):
-        return scraper.scrape_url(url=source, match=match, ignore=ignore, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose)
-    # if it's a directory, return the directory source type
-    if os.path.isdir(source) or source in ('.', './'):
-        if source in ('.', './'):
-            source = os.getcwd()
-        return scraper.scrape_directory(dir_path=source, include_regex=match, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=False)
-    # if it's a file, return the file source type
-    return scraper.scrape_file(source=source, match=match, ignore=ignore, ai_extraction=ai_extraction, text_only=text_only, verbose=verbose, local=False)
+        if local:
+            chunks = scraper.scrape_url(url=source, ai_extraction=ai_extraction, text_only=text_only, local=True)
+            messages = chunks_to_messsages(chunks)
+        else:
+            response = requests.post(
+                url=API_URL_V1,
+                data={'url': source, 'api_key': os.getenv("THEPIPE_API_KEY", None), 'text_only': text_only}
+            )
+            response_json = response.json()
+            if 'error' in response_json:
+                raise ValueError(f"{response_json['error']}")
+            messages = response_json['messages']
+    else:
+        if local:
+            # scrape the file locally into chunks
+            chunks = scraper.scrape_file(source=source, ai_extraction=ai_extraction, text_only=text_only, local=True)
+            messages = chunks_to_messsages(chunks)
+        else:
+            with open(source, 'rb') as f:
+                response = requests.post(
+                    url=API_URL_V1,
+                    files={'file': (source, f)},
+                    data={'api_key': os.getenv("THEPIPE_API_KEY", None), 'text_only': text_only}
+                )
+            response_json = response.json()
+            if 'error' in response_json:
+                raise ValueError(f"{response_json['error']}")
+            messages = response_json['messages']
+    return messages
+
+def scrape(source: str, match: Optional[str] = None, ai_extraction: Optional[bool] = False, text_only: Optional[bool] = False, verbose: Optional[bool] = False, local: Optional[bool] = False) -> List[Chunk]:
+    chunks = None
+    if source.startswith('http'):
+        if local:
+            chunks = scraper.scrape_url(url=source, ai_extraction=ai_extraction, text_only=text_only, local=True)
+            messages = chunks_to_messsages(chunks)
+        else:
+            response = requests.post(
+                url=API_URL_V1,
+                data={'url': source, 'api_key': os.getenv("THEPIPE_API_KEY", None), 'text_only': text_only}
+            )
+            response_json = response.json()
+            if 'error' in response_json:
+                raise ValueError(f"{response_json['error']}")
+            messages = response_json['messages']
+    else:
+        if local:
+            # scrape the file locally into chunks
+            chunks = scraper.scrape_file(source=source, ai_extraction=ai_extraction, text_only=text_only, local=True)
+            messages = chunks_to_messsages(chunks)
+        else:
+            with open(source, 'rb') as f:
+                response = requests.post(
+                    url=API_URL_V1,
+                    files={'file': (source, f)},
+                    data={'api_key': os.getenv("THEPIPE_API_KEY", None), 'text_only': text_only}
+                )
+            response_json = response.json()
+            if 'error' in response_json:
+                raise ValueError(f"{response_json['error']}")
+            messages = response_json['messages']
+    return messages
 
 def save_outputs(chunks: List[Chunk], verbose: bool = False, text_only: bool = False) -> None:
     if not os.path.exists('outputs'):
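
A minimal usage sketch of the API surface added in this patch, based on the tests above. The input path is illustrative (it mirrors the test fixtures), and note that the helper is spelled `chunks_to_messsages` in `core.py` even though the README example calls it `chunks_to_messages`:

```python
from thepipe_api import scraper, chunker, core

# scrape a local file without calling the hosted API (local=True, as in the tests)
chunks = scraper.scrape_file(source="tests/files/example.pdf", verbose=True, local=True)

# merge the per-page chunks into one chunk per source document
doc_chunks = chunker.chunk_by_document(chunks)

# rough prompt-size estimate, then OpenAI-style multimodal messages
num_tokens = core.calculate_tokens(doc_chunks)
messages = core.chunks_to_messsages(doc_chunks)
```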