Merge branch 'master' into feature/correct-some-type-hints
Equipo45 authored Nov 22, 2024
2 parents 05cc0a4 + 97a4785 commit 68f40db
Showing 10 changed files with 324 additions and 251 deletions.
70 changes: 30 additions & 40 deletions README.md
@@ -12,63 +12,53 @@ This implementation is up to 4 times faster than [openai/whisper](https://github.com/openai/whisper)

For reference, here's the time and memory usage that are required to transcribe [**13 minutes**](https://www.youtube.com/watch?v=0u7tTptBo9I) of audio using different implementations:

-* [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258)
-* [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362)
-* [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e)
+* [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/tree/v20240930)
+* [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[v1.7.2](https://github.com/ggerganov/whisper.cpp/tree/v1.7.2)
+* [transformers](https://github.com/huggingface/transformers)@[v4.46.3](https://github.com/huggingface/transformers/tree/v4.46.3)
+* [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.1.0](https://github.com/SYSTRAN/faster-whisper/tree/v1.1.0)

### Large-v2 model on GPU

-| Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
-| --- | --- | --- | --- | --- | --- |
-| openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
-| faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
-| faster-whisper | int8 | 5 | 59s | 3091MB | 3117MB |

-*Executed with CUDA 11.7.1 on a NVIDIA Tesla V100S.*
+| Implementation | Precision | Beam size | Time | VRAM Usage |
+| --- | --- | --- | --- | --- |
+| openai/whisper | fp16 | 5 | 2m23s | 4708MB |
+| whisper.cpp (Flash Attention) | fp16 | 5 | 1m05s | 4127MB |
+| transformers (SDPA)[^1] | fp16 | 5 | 1m52s | 4960MB |
+| faster-whisper | fp16 | 5 | 1m03s | 4525MB |
+| faster-whisper (`batch_size=8`) | fp16 | 5 | 17s | 6090MB |
+| faster-whisper | int8 | 5 | 59s | 2926MB |
+| faster-whisper (`batch_size=8`) | int8 | 5 | 16s | 4500MB |

-### Small model on CPU
+### distil-whisper-large-v3 model on GPU

-| Implementation | Precision | Beam size | Time | Max. memory |
+| Implementation | Precision | Beam size | Time | YT Commons WER |
| --- | --- | --- | --- | --- |
-| openai/whisper | fp32 | 5 | 10m31s | 3101MB |
-| whisper.cpp | fp32 | 5 | 17m42s | 1581MB |
-| whisper.cpp | fp16 | 5 | 12m39s | 873MB |
-| faster-whisper | fp32 | 5 | 2m44s | 1675MB |
-| faster-whisper | int8 | 5 | 2m04s | 995MB |

-*Executed with 8 threads on a Intel(R) Xeon(R) Gold 6226R.*
+| transformers (SDPA) (`batch_size=16`) | fp16 | 5 | 46m12s | 14.801 |
+| faster-whisper (`batch_size=16`) | fp16 | 5 | 25m50s | 13.527 |

+*GPU benchmarks were executed with CUDA 12.4 on an NVIDIA RTX 3070 Ti 8GB.*
+[^1]: transformers OOM for any batch size > 1

-### Distil-whisper
+### Small model on CPU

-| Implementation | Precision | Beam size | Time | Gigaspeech WER |
+| Implementation | Precision | Beam size | Time | RAM Usage |
| --- | --- | --- | --- | --- |
-| distil-whisper/distil-large-v2 | fp16 | 4 | - | 10.36 |
-| [faster-distil-large-v2](https://huggingface.co/Systran/faster-distil-whisper-large-v2) | fp16 | 5 | - | 10.28 |
-| distil-whisper/distil-medium.en | fp16 | 4 | - | 11.21 |
-| [faster-distil-medium.en](https://huggingface.co/Systran/faster-distil-whisper-medium.en) | fp16 | 5 | - | 11.21 |

-*Executed with CUDA 11.4 on a NVIDIA 3090.*

-<details>
-<summary>testing details (click to expand)</summary>
+| openai/whisper | fp32 | 5 | 6m58s | 2335MB |
+| whisper.cpp | fp32 | 5 | 2m05s | 1049MB |
+| whisper.cpp (OpenVINO) | fp32 | 5 | 1m45s | 1642MB |
+| faster-whisper | fp32 | 5 | 2m37s | 2257MB |
+| faster-whisper (`batch_size=8`) | fp32 | 5 | 1m06s | 4230MB |
+| faster-whisper | int8 | 5 | 1m42s | 1477MB |
+| faster-whisper (`batch_size=8`) | int8 | 5 | 51s | 3608MB |

-For `distil-whisper/distil-large-v2`, the WER is tested with the code sample from [link](https://huggingface.co/distil-whisper/distil-large-v2#evaluation). For `faster-distil-whisper`, the WER is tested with this setting:
-```python
-from faster_whisper import WhisperModel
+*Executed with 8 threads on an Intel Core i7-12700K.*

-model_size = "distil-large-v2"
-# model_size = "distil-medium.en"
-# Run on GPU with FP16
-model = WhisperModel(model_size, device="cuda", compute_type="float16")
-segments, info = model.transcribe("audio.mp3", beam_size=5, language="en")
-```
-</details>

## Requirements

* Python 3.8 or greater

Unlike openai-whisper, FFmpeg does **not** need to be installed on the system. The audio is decoded with the Python library [PyAV](https://github.com/PyAV-Org/PyAV) which bundles the FFmpeg libraries in its package.
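A minimal sketch of this decode path, assuming `decode_audio` accepts a file path or file-like object as it does in `benchmark/evaluate_yt_commons.py` below (the audio file name is hypothetical):

```python
# Sketch: decode audio with faster-whisper's PyAV-backed helper, so no
# system-wide FFmpeg install is needed, then transcribe the waveform.
from faster_whisper import WhisperModel, decode_audio

audio = decode_audio("audio.mp3")  # decoded and resampled waveform
model = WhisperModel("small", device="cpu", compute_type="int8")
segments, info = model.transcribe(audio, beam_size=5)
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```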

### GPU

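The `batch_size` rows in the GPU tables above come from the batched pipeline. A minimal sketch of such a run, mirroring the `BatchedInferencePipeline` usage in `benchmark/evaluate_yt_commons.py` below (the audio file name is hypothetical):

```python
# Sketch of a batched run, mirroring the benchmark script's pipeline setup;
# batch_size trades GPU memory for throughput (the batch_size=8 rows above).
from faster_whisper import BatchedInferencePipeline, WhisperModel

model = WhisperModel("large-v2", device="cuda", compute_type="float16")
pipeline = BatchedInferencePipeline(model, device="cuda")
segments, info = pipeline.transcribe("audio.mp3", beam_size=5, batch_size=8)
for segment in segments:
    print(segment.text)
```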
43 changes: 20 additions & 23 deletions benchmark/evaluate_yt_commons.py
@@ -5,9 +5,9 @@
from io import BytesIO

from datasets import load_dataset
-from evaluate import load
+from jiwer import wer
from pytubefix import YouTube
-from torch.utils.data import DataLoader
+from pytubefix.exceptions import VideoUnavailable
from tqdm import tqdm
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer

@@ -17,15 +17,19 @@
def url_to_audio(row):
    buffer = BytesIO()
    yt = YouTube(row["link"])
-    video = (
-        yt.streams.filter(only_audio=True, mime_type="audio/mp4")
-        .order_by("bitrate")
-        .desc()
-        .first()
-    )
-    video.stream_to_buffer(buffer)
-    buffer.seek(0)
-    row["audio"] = decode_audio(buffer)
+    try:
+        video = (
+            yt.streams.filter(only_audio=True, mime_type="audio/mp4")
+            .order_by("bitrate")
+            .desc()
+            .last()
+        )
+        video.stream_to_buffer(buffer)
+        buffer.seek(0)
+        row["audio"] = decode_audio(buffer)
+    except VideoUnavailable:
+        print(f'Failed to download: {row["link"]}')
+        row["audio"] = []
    return row
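With this change, a video that fails to download yields an empty `audio` list instead of raising, and the evaluation loop below skips such rows (`if not row["audio"]: continue`), so a single unavailable video no longer aborts the whole benchmark.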


@@ -39,27 +43,22 @@ def url_to_audio(row):
)
args = parser.parse_args()

-# define the evaluation metric
-wer_metric = load("wer")

with open(os.path.join(os.path.dirname(__file__), "normalizer.json"), "r") as f:
    normalizer = EnglishTextNormalizer(json.load(f))

dataset = load_dataset("mobiuslabsgmbh/youtube-commons-asr-eval", streaming=True).map(
    url_to_audio
)
-dataset = iter(
-    DataLoader(dataset["test"], batch_size=1, prefetch_factor=4, num_workers=2)
-)

model = WhisperModel("large-v3", device="cuda")
pipeline = BatchedInferencePipeline(model, device="cuda")


all_transcriptions = []
all_references = []
# iterate over the dataset and run inference
-for i, row in tqdm(enumerate(dataset), desc="Evaluating..."):
+for i, row in tqdm(enumerate(dataset["test"]), desc="Evaluating..."):
+    if not row["audio"]:
+        continue
    result, info = pipeline.transcribe(
        row["audio"][0],
        batch_size=8,
@@ -77,7 +76,5 @@ def url_to_audio(row):
all_references = [normalizer(reference) for reference in all_references]

# compute the WER metric
-wer = 100 * wer_metric.compute(
-    predictions=all_transcriptions, references=all_references
-)
-print("WER: %.3f" % wer)
+word_error_rate = 100 * wer(hypothesis=all_transcriptions, reference=all_references)
+print("WER: %.3f" % word_error_rate)
1 change: 0 additions & 1 deletion benchmark/requirements.benchmark.txt
@@ -1,6 +1,5 @@
transformers
jiwer
-evaluate
datasets
memory_profiler
py3nvml
11 changes: 3 additions & 8 deletions benchmark/wer_benchmark.py
@@ -3,7 +3,7 @@
import os

from datasets import load_dataset
-from evaluate import load
+from jiwer import wer
from tqdm import tqdm
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer

@@ -25,9 +25,6 @@
# load the dataset with streaming mode
dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)

-# define the evaluation metric
-wer_metric = load("wer")

with open(os.path.join(os.path.dirname(__file__), "normalizer.json"), "r") as f:
    normalizer = EnglishTextNormalizer(json.load(f))

@@ -58,7 +55,5 @@ def inference(batch):
all_references = [normalizer(reference) for reference in all_references]

# compute the WER metric
-wer = 100 * wer_metric.compute(
-    predictions=all_transcriptions, references=all_references
-)
-print("WER: %.3f" % wer)
+word_error_rate = 100 * wer(hypothesis=all_transcriptions, reference=all_references)
+print("WER: %.3f" % word_error_rate)
