
Implemented video + audio extraction #12

Merged 5 commits on Apr 28, 2024
3 changes: 2 additions & 1 deletion .gitignore
@@ -10,4 +10,5 @@ thepipeapi/ctags.exe
thepipe.bat
*.egg-info/
build/
dist/
dist/
youtube_temp/
9 changes: 6 additions & 3 deletions README.md
@@ -6,7 +6,7 @@

[![codecov](https://codecov.io/gh/emcf/thepipe/graph/badge.svg?token=OE7CUEFUL9)](https://codecov.io/gh/emcf/thepipe) ![python-gh-action](https://github.com/emcf/thepipe/actions/workflows/python-ci.yml/badge.svg) <a href="https://thepi.pe/">![Website](https://img.shields.io/website?url=https%3A%2F%2Fthepipe.up.railway.app%2F&label=API%20status)</a> <a href="https://thepi.pe/">![get API](https://img.shields.io/badge/API-access-blue)</a>

### Feed PDFs, word docs, slides, web pages and more into Vision-LLMs with one line of code ⚡
### Feed PDFs, web pages, word docs, slides, videos, CSV, and more into Vision-LLMs with one line of code ⚡

The Pipe is a multimodal-first tool for feeding files and web pages into vision-language models such as GPT-4V. It is best for LLM and RAG applications that require a deep understanding of tricky data sources. The Pipe is available as a hosted API at [thepi.pe](https://thepi.pe), or it can be set up locally.

@@ -17,7 +17,7 @@ The Pipe is a multimodal-first tool for feeding files and web pages into vision-

- Extracts text and visuals from files or web pages 📚
- Outputs chunks optimized for multimodal LLMs and RAG frameworks 🖼️
- Interpret complex PDFs, web pages, docs, slides, CSVs, and more 🧠
- Interpret complex PDFs, web pages, docs, videos, data, and more 🧠
- Auto-compress prompts exceeding your chosen token limit 📦
- Works even with missing file extensions and in-memory data streams 💾
- Works with codebases, git repos, and custom integrations 🌐
@@ -39,7 +39,7 @@ Now you can extract comprehensive text and visuals from any file:
from thepipe_api import thepipe
messages = thepipe.extract("example.pdf")
```
Or any website:
Or websites:
```python
messages = thepipe.extract("https://example.com")
```
@@ -72,8 +72,11 @@ thepipe path/to/folder --match tsx --ignore tests
| Jupyter Notebook | `.ipynb` | ❌ | ✔️ | Extracts code, markdown, and images from Jupyter notebooks |
| Microsoft Word Document | `.docx` | ✔️ | ✔️ | Extracts text and images from Word documents |
| Microsoft PowerPoint Presentation | `.pptx` | ✔️ | ✔️ | Extracts text and images from PowerPoint presentations |
| Video | `.mp4`, `.avi`, `.mov`, `.mkv` | ✔️ | ✔️ | Extracts a frame and a transcription of the audio for each minute of video |
| Audio | `.mp3`, `.wav` | ✔️ | ❌ | Extracts text from audio files; supports speech-to-text conversion |
| Website | URLs (inputs containing `http`, `https`, `ftp`) | ✔️ | ✔️ | Extracts text from web page along with image (or images if scrollable); text-only extraction available |
| GitHub Repository | GitHub repo URLs | ✔️ | ✔️ | Extracts from GitHub repositories; supports branch specification |
| YouTube Video | YouTube video URLs | ✔️ | ✔️ | Downloads YouTube videos and extracts frames and transcribed audio |
| ZIP File | `.zip` | ✔️ | ✔️ | Extracts contents of ZIP files; supports nested directory extraction |

## How it works 🛠️
6 changes: 5 additions & 1 deletion requirements_local.txt
@@ -20,4 +20,8 @@ llmlingua
PyMuPDF
pdf2image
python-magic
python-dotenv
python-dotenv
git+https://github.com/openai/whisper.git
moviepy
openpyxl
pytube
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

setup(
name='thepipe_api',
version='0.3.2',
version='0.3.3',
author='Emmett McFarlane',
author_email='[email protected]',
description='Automate information extraction for multimodal LLMs.',
Binary file added tests/files/example.mp3
Binary file not shown.
Binary file added tests/files/example.mp4
Binary file not shown.
37 changes: 37 additions & 0 deletions tests/test_thepipe.py
@@ -23,6 +23,43 @@ def tearDown(self):
os.remove(os.path.join(self.outputs_directory, file))
os.rmdir(self.outputs_directory)

def test_extract_video(self):
chunks = extractor.extract_from_source(source=self.files_directory+"/example.mp4")
# verify it extracted the video file into chunks
self.assertEqual(type(chunks), list)
self.assertNotEqual(len(chunks), 0)
self.assertEqual(type(chunks[0]), core.Chunk)
# verify it extracted visual data
self.assertTrue(any(chunk.image for chunk in chunks))
# verify it extracted audio data
self.assertTrue(any(chunk.text for chunk in chunks))
# verify it transcribed the audio correctly, i.e., 'citizens' is in the extracted text
self.assertTrue(any('citizens' in chunk.text.lower() for chunk in chunks if chunk.text is not None))

def test_extract_audio(self):
chunks = extractor.extract_from_source(source=self.files_directory+"/example.mp3")
# verify it extracted the audio file into chunks
self.assertEqual(type(chunks), list)
self.assertNotEqual(len(chunks), 0)
self.assertEqual(type(chunks[0]), core.Chunk)
# verify it extracted audio data
self.assertTrue(any(chunk.text for chunk in chunks))
# verify it transcribed the audio correctly, i.e., 'citizens' is in the extracted text
self.assertTrue(any('citizens' in chunk.text.lower() for chunk in chunks if chunk.text is not None))

def test_extract_youtube(self):
chunks = extractor.extract_from_source("https://www.youtube.com/watch?v=wUEr7TayrmU")
# verify it extracted the youtube video into chunks
self.assertEqual(type(chunks), list)
self.assertNotEqual(len(chunks), 0)
self.assertEqual(type(chunks[0]), core.Chunk)
# verify it extracted visual data
self.assertTrue(any(chunk.image for chunk in chunks))
# verify it extracted audio data
self.assertTrue(any(chunk.text for chunk in chunks))
# verify it transcribed the audio correctly, i.e., 'eliminated' is in the extracted text
self.assertTrue(any('eliminated' in chunk.text.lower() for chunk in chunks if chunk.text is not None))

def test_image_to_base64(self):
image = Image.open(os.path.join(self.files_directory, 'example.jpg'))
image.load() # needed to close the file
3 changes: 3 additions & 0 deletions thepipe_api/core.py
@@ -21,6 +21,9 @@ class SourceTypes(Enum):
URL = "website"
GITHUB = "github repository"
ZIP = "zip"
VIDEO = "video"
AUDIO = "audio"
YOUTUBE_VIDEO = "youtube video"

class Chunk:
def __init__(self, path: str, text: Optional[str] = None, image: Optional[Image.Image] = None, source_type: Optional[SourceTypes] = None):
70 changes: 68 additions & 2 deletions thepipe_api/extractor.py
@@ -1,6 +1,7 @@
import base64
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
import math
import re
from typing import Dict, List, Optional
import glob
@@ -32,6 +33,8 @@
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png'}
SPREADSHEET_EXTENSIONS = {'.csv', '.xls', '.xlsx'}
DOCUMENT_EXTENSIONS = {'.pdf', '.docx', '.pptx'}
VIDEO_EXTENSIONS = {'.mp4', '.avi', '.mov', '.mkv'}
AUDIO_EXTENSIONS = {'.mp3', '.wav'}
OTHER_EXTENSIONS = {'.zip', '.ipynb'}
KNOWN_EXTENSIONS = CODE_EXTENSIONS.union(CTAGS_CODE_EXTENSIONS).union(PLAINTEXT_EXTENSIONS).union(IMAGE_EXTENSIONS).union(SPREADSHEET_EXTENSIONS).union(DOCUMENT_EXTENSIONS).union(VIDEO_EXTENSIONS).union(AUDIO_EXTENSIONS).union(OTHER_EXTENSIONS)
GITHUB_TOKEN: str = os.getenv("GITHUB_TOKEN")
@@ -48,10 +51,10 @@
return extract_from_directory(dir_path=source, match=match, ignore=ignore, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, limit=limit, local=local)
elif source_type == SourceTypes.GITHUB:
return extract_github(github_url=source, file_path='', match=match, ignore=ignore, text_only=text_only, verbose=verbose, ai_extraction=ai_extraction, branch='master')
elif source_type == SourceTypes.YOUTUBE_VIDEO:
return extract_youtube(youtube_url=source, text_only=text_only, verbose=verbose)
elif source_type == SourceTypes.URL:
return extract_url(url=source, text_only=text_only, local=local)
elif source_type == SourceTypes.ZIP:
return extract_zip(file_path=source, match=match, ignore=ignore, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only)
return extract_from_file(file_path=source, source_type=source_type, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only, local=local)

def extract_from_file(file_path: str, source_type: str, verbose: bool = False, ai_extraction: bool = False, text_only: bool = False, local: bool = True, limit: int = None) -> List[Chunk]:
@@ -97,6 +100,12 @@
extraction = [Chunk(path=e.path, text=e.text, image=None, source_type=SourceTypes.COMPRESSIBLE_CODE) for e in extraction]
elif source_type == SourceTypes.IPYNB:
extraction = extract_from_ipynb(file_path=file_path, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only)
elif source_type == SourceTypes.ZIP:
extraction = extract_zip(file_path=file_path, verbose=verbose, ai_extraction=ai_extraction, text_only=text_only)
elif source_type == SourceTypes.VIDEO:
extraction = extract_video(file_path=file_path, verbose=verbose, text_only=text_only)
elif source_type == SourceTypes.AUDIO:
extraction = extract_audio(file_path=file_path, verbose=verbose)
else:
extraction = [extract_plaintext(file_path=file_path)]
if verbose: print_status(f"Extracted from {file_path}", status='success')
@@ -106,6 +115,8 @@
return [Chunk(path=file_path)]

def detect_type(source: str) -> Optional[SourceTypes]:
if source.startswith("https://www.youtube.com"):
return SourceTypes.YOUTUBE_VIDEO
if source.startswith("https://github.com"):
return SourceTypes.GITHUB
elif source.startswith("http") or source.startswith("ftp."):
@@ -146,6 +157,12 @@
return SourceTypes.DOCX
elif extension == '.pptx':
return SourceTypes.PPTX
elif extension == '.zip':
return SourceTypes.ZIP

# Codecov / codecov/patch warning: added line thepipe_api/extractor.py#L161 was not covered by tests
elif extension in VIDEO_EXTENSIONS:
return SourceTypes.VIDEO
elif extension in AUDIO_EXTENSIONS:
return SourceTypes.AUDIO
elif extension in PLAINTEXT_EXTENSIONS:
return SourceTypes.PLAINTEXT
return None
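The `detect_type` additions above reduce to a simple extension-dispatch table. A minimal self-contained sketch of the same idea (the function name and string labels are illustrative stand-ins for the `SourceTypes` enum, not part of the PR; the extension sets mirror the diff):

```python
import os
from typing import Optional

# Extension sets mirror the diff; the string labels stand in for
# the SourceTypes enum members.
VIDEO_EXTENSIONS = {'.mp4', '.avi', '.mov', '.mkv'}
AUDIO_EXTENSIONS = {'.mp3', '.wav'}

def detect_media_type(source: str) -> Optional[str]:
    # Lower-case the extension so "CLIP.MP4" is still recognized.
    extension = os.path.splitext(source)[1].lower()
    if extension in VIDEO_EXTENSIONS:
        return "video"
    if extension in AUDIO_EXTENSIONS:
        return "audio"
    return None
```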
@@ -315,6 +332,55 @@
raise ValueError("No content extracted from URL.")
return chunks

def extract_video(file_path: str, verbose: bool = False, text_only: bool = False) -> List[Chunk]:
from moviepy.editor import VideoFileClip # import only if needed
import whisper # import only if needed
model = whisper.load_model("base")
video = VideoFileClip(file_path)
chunk_duration = 60
num_chunks = math.ceil(video.duration / chunk_duration)
chunks = []
for i in range(num_chunks):
# calculate start and end time for the current chunk
start_time = i * chunk_duration
end_time = start_time + chunk_duration
if end_time > video.duration:
end_time = video.duration
# extract frame at the middle of the chunk
frame_time = (start_time + end_time) / 2
frame = video.get_frame(frame_time)
image = Image.fromarray(frame)
# extract and transcribe audio for the current chunk
audio_path = os.path.join(tempfile.gettempdir(), f"temp_audio_{i}.wav")
video.subclip(start_time, end_time).audio.write_audiofile(audio_path, codec='pcm_s16le')
result = model.transcribe(audio_path, verbose=verbose)
transcription = result['text']
# add chunk
if not text_only:
chunks.append(Chunk(path=file_path, text=transcription, image=image, source_type=SourceTypes.VIDEO))
else:
chunks.append(Chunk(path=file_path, text=transcription, image=None, source_type=SourceTypes.VIDEO))
os.remove(audio_path)
return chunks
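The per-minute chunking in `extract_video` can be sanity-checked in isolation. This sketch (`chunk_boundaries` is a hypothetical helper, not part of the PR) reproduces the same start/end/midpoint arithmetic:

```python
import math
from typing import Iterator, Tuple

def chunk_boundaries(duration: float, chunk_duration: float = 60.0) -> Iterator[Tuple[float, float, float]]:
    # Mirrors extract_video: the final chunk is clipped to the video's
    # duration, and the sampled frame sits at each chunk's midpoint.
    num_chunks = math.ceil(duration / chunk_duration)
    for i in range(num_chunks):
        start = i * chunk_duration
        end = min(start + chunk_duration, duration)
        yield start, end, (start + end) / 2
```

For a 150-second clip this yields (0, 60, 30), (60, 120, 90), and (120, 150, 135): three chunks, with the last frame sampled at 135 s rather than past the end of the video.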

def extract_youtube(youtube_url: str, text_only: bool = False, verbose: bool = False) -> List[Chunk]:
from pytube import YouTube # import only if needed
temp_dir = "youtube_temp"
filename = "temp_video.mp4"
yt = YouTube(youtube_url)
stream = yt.streams.filter(progressive=True, file_extension='mp4').first()
stream.download(temp_dir, filename=filename)
video_path = os.path.join(temp_dir, filename)
chunks = extract_video(file_path=video_path, verbose=verbose, text_only=text_only)
return chunks

def extract_audio(file_path: str, verbose: bool = False) -> List[Chunk]:
import whisper # import only if needed
model = whisper.load_model("base")
result = model.transcribe(file_path, verbose=verbose)
transcription = result['text']
return [Chunk(path=file_path, text=transcription, image=None, source_type=SourceTypes.AUDIO)]
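`extract_video`, `extract_youtube`, and `extract_audio` all defer their heavy imports (whisper, moviepy, pytube) until the function actually runs, so the base install stays light. That deferral can be generalized with the standard library; `lazy_import` here is a hypothetical helper, not part of the PR:

```python
import importlib

def lazy_import(module_name: str):
    # Import an optional heavy dependency only when first needed, and
    # raise a clearer error if it is missing from the environment.
    try:
        return importlib.import_module(module_name)
    except ImportError as e:
        raise ImportError(
            f"Optional dependency '{module_name}' is required for this "
            f"extractor; install it first.") from e
```

Calling, say, `whisper = lazy_import("whisper")` at the top of `extract_audio` would keep the error message actionable without importing whisper at module load time.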

def extract_github(github_url: str, file_path: str = '', match: Optional[str] = None, ignore: Optional[str] = None, text_only: bool = False, ai_extraction: bool = False, branch: str = 'main', verbose: bool = False) -> List[Chunk]:
files_contents = []
if not GITHUB_TOKEN: