Added NoneType check for videos, made URL checks more general

emcf · Apr 29, 2024 · c10d08f · c10d08f
1 parent 9276bbc
commit c10d08f
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
   <a href="https://github.com/emcf/thepipe/blob/main/README.md">English</a> | <a href="https://github.com/emcf/thepipe/blob/main/README_cn.md">中文</a>
 </p>
 
-[![codecov](https://codecov.io/gh/emcf/thepipe/graph/badge.svg?token=OE7CUEFUL9)](https://codecov.io/gh/emcf/thepipe) ![python-gh-action](https://github.com/emcf/thepipe/actions/workflows/python-ci.yml/badge.svg) <a href="https://thepi.pe/">![Website](https://img.shields.io/website?url=https%3A%2F%2Fthepipe.up.railway.app%2F&label=API%20status)</a> <a href="https://thepi.pe/">![get API](https://img.shields.io/badge/API-access-blue)</a>
+[![codecov](https://codecov.io/gh/emcf/thepipe/graph/badge.svg?token=OE7CUEFUL9)](https://codecov.io/gh/emcf/thepipe) ![python-gh-action](https://github.com/emcf/thepipe/actions/workflows/python-ci.yml/badge.svg) <a href="https://thepi.pe/">![Website](https://img.shields.io/website?url=https%3A%2F%2Fthepipe.up.railway.app%2F&label=API%20status)</a> <a href="https://thepi.pe/">![get API](https://img.shields.io/badge/API-access-blue)</a> <a href="https://discord.gg/bXfKeGs5qV">![Join discord](https://img.shields.io/discord/1227806200478044274?color=4f69ef&label=Discord&logo=discord&logoColor=ffffff)</a>
 
 ### Feed PDFs, web pages, word docs, slides, videos, CSV, and more into Vision-LLMs with one line of code ⚡
 

diff --git a/thepipe_api/extractor.py b/thepipe_api/extractor.py
@@ -115,9 +115,9 @@ def extract_from_file(file_path: str, source_type: str, verbose: bool = False, a
         return [Chunk(path=file_path)]
 
 def detect_type(source: str) -> Optional[SourceTypes]:
-    if source.startswith("https://www.youtube.com"):
+    if source.startswith("https://www.youtube.com") or source.startswith("https://youtube.com"):
         return SourceTypes.YOUTUBE_VIDEO
-    if source.startswith("https://github.com"):
+    elif source.startswith("https://github.com") or source.startswith("https://www.github.com"):
         return SourceTypes.GITHUB
     elif source.startswith("http") or source.startswith("ftp."):
         return SourceTypes.URL
@@ -352,9 +352,13 @@ def extract_video(file_path: str, verbose: bool = False, text_only: bool = False
         image = Image.fromarray(frame)
         # extract and transcribe audio for the current chunk
         audio_path = os.path.join(tempfile.gettempdir(), f"temp_audio_{i}.wav")
-        video.subclip(start_time, end_time).audio.write_audiofile(audio_path, codec='pcm_s16le')
-        result = model.transcribe(audio_path, verbose=verbose)
-        transcription = result['text']
+        audio = video.subclip(start_time, end_time).audio
+        if audio is None:
+            transcription = None
+        else:
+            audio.write_audiofile(audio_path, codec='pcm_s16le')
+            result = model.transcribe(audio_path, verbose=verbose)
+            transcription = result['text']
         # add chunk
         if not text_only:
             chunks.append(Chunk(path=file_path, text=transcription, image=image, source_type=SourceTypes.VIDEO))