diff --git a/examples/stream/README.md b/examples/stream/README.md
index f07cfb8915c..e60fd8bd6da 100644
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@@ -1,51 +1,41 @@
-# whisper.cpp/examples/stream
-
-This is a naive example of performing real-time inference on audio from your microphone.
-The `whisper-stream` tool samples the audio every half a second and runs the transcription continously.
-More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
-
-```bash
-./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
-```
-
-https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
-
-## Sliding window mode with VAD
-
-Setting the `--step` argument to `0` enables the sliding window mode:
-
-```bash
- ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
-```
-
-In this mode, the tool will transcribe only after some speech activity is detected. A very
-basic VAD detector is used, but in theory a more sophisticated approach can be added. The
-`-vth` argument determines the VAD threshold - higher values will make it detect silence more often.
-It's best to tune it to the specific use case, but a value around `0.6` should be OK in general.
-When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
-a transcription block that is suitable for parsing.
-
-## Building
-
-The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
-
-```bash
-# Install SDL2
-# On Debian based linux distributions:
-sudo apt-get install libsdl2-dev
-
-# On Fedora Linux:
-sudo dnf install SDL2 SDL2-devel
-
-# Install SDL2 on Mac OS
-brew install sdl2
-
-cmake -B build -DWHISPER_SDL2=ON
-cmake --build build --config Release
-
-./build/bin/whisper-stream
-```
-
-## Web version
-
-This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
+# whisper.cpp/examples/stream
+
+This is a naive example of performing real-time inference on audio from your microphone.
+The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
+More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
+
+```bash
+./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
+```
+
+https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
+
+## VAD support
+
+VAD support can be enabled with the `--vad` flag and, optionally, a `--vad-model` path (by default
+`models/for-tests-silero-v5.1.2-ggml.bin` will be used).
+
+## Building
+
+The `whisper-stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
+
+```bash
+# Install SDL2
+# On Debian-based Linux distributions:
+sudo apt-get install libsdl2-dev
+
+# On Fedora Linux:
+sudo dnf install SDL2 SDL2-devel
+
+# On macOS:
+brew install sdl2
+
+cmake -B build -DWHISPER_SDL2=ON
+cmake --build build --config Release
+
+./build/bin/whisper-stream
+```
+
+## Web version
+
+This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
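Once built as described in the new README's Building section, a VAD run can be started with something like `./build/bin/whisper-stream -m ./models/ggml-base.en.bin --vad --vad-model models/for-tests-silero-v5.1.2-ggml.bin` (an illustrative invocation; both flags are introduced by the stream.cpp changes below).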
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index bc6f13fb267..12bd285a96a 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -37,10 +37,21 @@ struct whisper_params {
     bool save_audio    = false; // save audio to wav file
     bool use_gpu       = true;
     bool flash_attn    = false;
+    bool no_prints     = false;
 
     std::string language  = "en";
     std::string model     = "models/ggml-base.en.bin";
     std::string fname_out;
+
+    // Voice Activity Detection (VAD) parameters
+    bool        vad                         = false;
+    std::string vad_model                   = "models/for-tests-silero-v5.1.2-ggml.bin";
+    float       vad_threshold               = 0.5f;
+    int         vad_min_speech_duration_ms  = 250;
+    int         vad_min_silence_duration_ms = 100;
+    float       vad_max_speech_duration_s   = FLT_MAX;
+    int         vad_speech_pad_ms           = 30;
+    float       vad_samples_overlap         = 0.1f;
 };
 
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -61,8 +72,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-mt"   || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
         else if (arg == "-ac"   || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
         else if (arg == "-bs"   || arg == "--beam-size")     { params.beam_size     = std::stoi(argv[++i]); }
-        else if (arg == "-vth"  || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
-        else if (arg == "-fth"  || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
         else if (arg == "-tr"   || arg == "--translate")     { params.translate     = true; }
         else if (arg == "-nf"   || arg == "--no-fallback")   { params.no_fallback   = true; }
         else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
@@ -74,7 +83,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-sa"   || arg == "--save-audio")    { params.save_audio    = true; }
         else if (arg == "-ng"   || arg == "--no-gpu")        { params.use_gpu       = false; }
         else if (arg == "-fa"   || arg == "--flash-attn")    { params.flash_attn    = true; }
-
+        else if (arg == "-np"   || arg == "--no-prints")     { params.no_prints     = true; }
+        // Voice Activity Detection (VAD)
+        else if (                  arg == "--vad")                         { params.vad                         = true; }
+        else if (arg == "-vm"   || arg == "--vad-model")                   { params.vad_model                   = argv[++i]; }
+        else if (arg == "-vt"   || arg == "--vad-threshold")               { params.vad_threshold               = std::stof(argv[++i]); }
+        else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms")  { params.vad_min_speech_duration_ms  = std::stoi(argv[++i]); }
+        else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms") { params.vad_min_silence_duration_ms = std::stoi(argv[++i]); }
+        else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s")   { params.vad_max_speech_duration_s   = std::stof(argv[++i]); }
+        else if (arg == "-vp"   || arg == "--vad-speech-pad-ms")           { params.vad_speech_pad_ms           = std::stoi(argv[++i]); }
+        else if (arg == "-vo"   || arg == "--vad-samples-overlap")         { params.vad_samples_overlap         = std::stof(argv[++i]); }
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             whisper_print_usage(argc, argv, params);
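For illustration, the new options condense to the following standalone sketch of how they land on `whisper_full_params` (a minimal sketch, not part of the patch; the field names and default values are the ones this diff introduces, and the model path is the test model used as the default):

```cpp
#include <cfloat>   // FLT_MAX
#include "whisper.h"

// Mirror of the defaults in whisper_params above, applied to the
// whisper_full_params fields that the patched main() fills in below.
static whisper_full_params make_vad_full_params() {
    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    wparams.vad            = true;
    wparams.vad_model_path = "models/for-tests-silero-v5.1.2-ggml.bin";

    wparams.vad_params.threshold               = 0.5f;    // speech probability cutoff (0.0-1.0)
    wparams.vad_params.min_speech_duration_ms  = 250;     // ignore speech bursts shorter than this
    wparams.vad_params.min_silence_duration_ms = 100;     // silence required to split segments
    wparams.vad_params.max_speech_duration_s   = FLT_MAX; // auto-split only if a segment exceeds this
    wparams.vad_params.speech_pad_ms           = 30;      // padding added around detected segments
    wparams.vad_params.samples_overlap         = 0.1f;    // seconds of audio shared between segments

    return wparams;
}
```

The CLI defaults shown here are the same values as the struct initializers above, so `--vad` alone is equivalent to spelling all of them out.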
@@ -99,8 +117,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
     fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
     fprintf(stderr, "  -bs N,    --beam-size N   [%-7d] beam size for beam search\n",                   params.beam_size);
-    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
-    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
     fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
     fprintf(stderr, "  -nf,      --no-fallback   [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
     fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
@@ -112,6 +128,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "  -sa,      --save-audio    [%-7s] save the recorded audio to a file\n",           params.save_audio ? "true" : "false");
     fprintf(stderr, "  -ng,      --no-gpu        [%-7s] disable GPU inference\n",                       params.use_gpu ? "false" : "true");
     fprintf(stderr, "  -fa,      --flash-attn    [%-7s] flash attention during inference\n",            params.flash_attn ? "true" : "false");
+    fprintf(stderr, "  -np,      --no-prints     [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
+    // Voice Activity Detection (VAD) parameters
+    fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+    fprintf(stderr, "             --vad                          [%-7s] enable Voice Activity Detection (VAD)\n",            params.vad ? "true" : "false");
+    fprintf(stderr, "  -vm FNAME, --vad-model FNAME              [%-7s] VAD model path\n",                                   params.vad_model.c_str());
+    fprintf(stderr, "  -vt N,     --vad-threshold N              [%-7.2f] VAD threshold for speech recognition (0.0-1.0)\n", params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms N [%-7d] VAD min speech duration in ms\n",                    params.vad_min_speech_duration_ms);
+    fprintf(stderr, "  -vsd N,    --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n",    params.vad_min_silence_duration_ms);
+    fprintf(stderr, "  -vmsd N,   --vad-max-speech-duration-s N  [%-7s] VAD max speech duration (auto-split longer)\n",      params.vad_max_speech_duration_s == FLT_MAX ?
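A note on the `-vmsd` help line above: passing `std::string("FLT_MAX").c_str()` and `std::to_string(...).c_str()` to `fprintf` is safe, because the temporaries those pointers refer to live until the end of the full expression, i.e. until the `fprintf` call returns; the pattern only looks fragile.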
+                                                                     std::string("FLT_MAX").c_str() :
+                                                                     std::to_string(params.vad_max_speech_duration_s).c_str());
+    fprintf(stderr, "  -vp N,     --vad-speech-pad-ms N          [%-7d] VAD speech padding (extend segments)\n",             params.vad_speech_pad_ms);
+    fprintf(stderr, "  -vo N,     --vad-samples-overlap N        [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
     fprintf(stderr, "\n");
 }
 
@@ -124,6 +153,17 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.no_prints) {
+        whisper_log_set([](enum ggml_log_level, const char *, void *) { }, NULL);
+    }
+
+    if (params.vad) {
+        // For VAD, ensure at least 500 ms of context
+        params.keep_ms = std::max(params.keep_ms, 500);
+    } else {
+        params.keep_ms = std::min(params.keep_ms, params.step_ms);
+    }
+
     params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
     params.length_ms = std::max(params.length_ms, params.step_ms);
 
@@ -132,7 +172,7 @@ int main(int argc, char ** argv) {
     const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
     const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
 
-    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
+    const bool use_vad = params.vad;
 
     const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
 
@@ -191,12 +231,7 @@ int main(int argc, char ** argv) {
             params.translate ? "translate" : "transcribe",
             params.no_timestamps ? 0 : 1);
 
-        if (!use_vad) {
-            fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
-        } else {
-            fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
-        }
-
+        fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
         fprintf(stderr, "\n");
     }
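To make the sizing in the two hunks above concrete, here is a worked example (editor's sketch; `step_ms = 3000`, `length_ms = 10000`, and `keep_ms = 200` are the tool's defaults, which are not shown in this diff, and `WHISPER_SAMPLE_RATE` is 16000):

```cpp
#include <algorithm>

constexpr int sample_rate = 16000; // WHISPER_SAMPLE_RATE
constexpr int step_ms     = 3000;  // default --step (assumed, not part of this diff)
constexpr int length_ms   = 10000; // default --length (assumed)

// keep_ms: with --vad it becomes max(200, 500) = 500 ms, and the subsequent
// min(500, 3000) leaves it at 500; without --vad it stays min(200, 3000) = 200 ms.
constexpr int n_samples_keep_vad = (500 * sample_rate) / 1000; // 8000 samples
constexpr int n_samples_keep     = (200 * sample_rate) / 1000; // 3200 samples

constexpr int n_samples_step = (step_ms   * sample_rate) / 1000; // 48000 samples
constexpr int n_samples_len  = (length_ms * sample_rate) / 1000; // 160000 samples

// steps between printed newlines in the step-based output:
constexpr int n_new_line = std::max(1, length_ms / step_ms - 1); // max(1, 3 - 1) = 2
```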
@@ -242,96 +277,82 @@ int main(int argc, char ** argv) {
             break;
         }
 
-        // process new audio
+        whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
 
-        if (!use_vad) {
-            while (true) {
-                // handle Ctrl + C
-                is_running = sdl_poll_events();
-                if (!is_running) {
-                    break;
-                }
-                audio.get(params.step_ms, pcmf32_new);
+        wparams.print_progress   = false;
+        wparams.print_special    = params.print_special;
+        wparams.print_realtime   = false;
+        wparams.print_timestamps = !params.no_timestamps;
+        wparams.translate        = params.translate;
+        wparams.single_segment   = !use_vad;
+        wparams.max_tokens       = params.max_tokens;
+        wparams.language         = params.language.c_str();
+        wparams.n_threads        = params.n_threads;
+        wparams.beam_search.beam_size = params.beam_size;
 
-                if ((int) pcmf32_new.size() > 2*n_samples_step) {
-                    fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
-                    audio.clear();
-                    continue;
-                }
+        wparams.audio_ctx        = params.audio_ctx;
 
-                if ((int) pcmf32_new.size() >= n_samples_step) {
-                    audio.clear();
-                    break;
-                }
+        wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
 
-                std::this_thread::sleep_for(std::chrono::milliseconds(1));
-            }
-
-            const int n_samples_new = pcmf32_new.size();
+        // disable temperature fallback
+        //wparams.temperature_inc  = -1.0f;
+        wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
 
-            // take up to params.length_ms audio from previous iteration
-            const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
+        wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
+        wparams.prompt_n_tokens  = params.no_context ? 0 : prompt_tokens.size();
 
+        wparams.vad            = params.vad;
+        wparams.vad_model_path = params.vad_model.c_str();
 
-            //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
+        wparams.vad_params.threshold               = params.vad_threshold;
+        wparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
+        wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+        wparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
+        wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
+        wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
 
-            pcmf32.resize(n_samples_new + n_samples_take);
+        // process new audio
 
-            for (int i = 0; i < n_samples_take; i++) {
-                pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
+        while (true) {
+            // handle Ctrl + C
+            is_running = sdl_poll_events();
+            if (!is_running) {
+                break;
             }
+            audio.get(params.step_ms, pcmf32_new);
 
-            memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
-
-            pcmf32_old = pcmf32;
-        } else {
-            const auto t_now  = std::chrono::high_resolution_clock::now();
-            const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
-
-            if (t_diff < 2000) {
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
+            if ((int) pcmf32_new.size() > 2*n_samples_step) {
+                fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
+                audio.clear();
                 continue;
            }
 
-            audio.get(2000, pcmf32_new);
-
-            if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
-                audio.get(params.length_ms, pcmf32);
-            } else {
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-                continue;
+            if ((int) pcmf32_new.size() >= n_samples_step) {
+                audio.clear();
+                break;
             }
 
-            t_last = t_now;
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
         }
 
-        // run the inference
-        {
-            whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
+        const int n_samples_new = pcmf32_new.size();
 
-            wparams.print_progress   = false;
-            wparams.print_special    = params.print_special;
-            wparams.print_realtime   = false;
-            wparams.print_timestamps = !params.no_timestamps;
-            wparams.translate        = params.translate;
-            wparams.single_segment   = !use_vad;
-            wparams.max_tokens       = params.max_tokens;
-            wparams.language         = params.language.c_str();
-            wparams.n_threads        = params.n_threads;
-            wparams.beam_search.beam_size = params.beam_size;
+        // take up to params.length_ms audio from previous iteration
+        const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
 
-            wparams.audio_ctx        = params.audio_ctx;
+        //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
 
-            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
+        pcmf32.resize(n_samples_new + n_samples_take);
 
-            // disable temperature fallback
-            //wparams.temperature_inc  = -1.0f;
-            wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
+        for (int i = 0; i < n_samples_take; i++) {
+            pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
+        }
 
-            wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
-            wparams.prompt_n_tokens  = params.no_context ? 0 : prompt_tokens.size();
+        memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
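The stitching above is the heart of the streaming loop: the tail of the previous window is re-fed in front of the fresh samples, so words that straddle a step boundary are seen twice rather than cut. A standalone sketch of just that logic (hypothetical helper, same arithmetic as the hunk above):

```cpp
#include <algorithm>
#include <cstring>
#include <vector>

// prev_win plays the role of pcmf32_old, fresh of pcmf32_new;
// n_keep and n_len correspond to n_samples_keep and n_samples_len.
static std::vector<float> stitch_window(const std::vector<float> & prev_win,
                                        const std::vector<float> & fresh,
                                        int n_keep, int n_len) {
    const int n_new  = (int) fresh.size();
    const int n_take = std::min((int) prev_win.size(), std::max(0, n_keep + n_len - n_new));

    std::vector<float> out(n_take + n_new);

    // tail of the previous window first ...
    std::copy(prev_win.end() - n_take, prev_win.end(), out.begin());

    // ... then the freshly captured audio
    if (n_new > 0) {
        std::memcpy(out.data() + n_take, fresh.data(), n_new * sizeof(float));
    }
    return out;
}
```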
+        pcmf32_old = pcmf32;
 
+        // run the inference
+        {
             if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
                 fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                 return 6;
@@ -339,21 +360,12 @@ int main(int argc, char ** argv) {
 
         // print result;
         {
-            if (!use_vad) {
-                printf("\33[2K\r");
+            printf("\33[2K\r");
 
-                // print long empty line to clear the previous line
-                printf("%s", std::string(100, ' ').c_str());
+            // print long empty line to clear the previous line
+            printf("%s", std::string(100, ' ').c_str());
 
-                printf("\33[2K\r");
-            } else {
-                const int64_t t1 = (t_last - t_start).count()/1000000;
-                const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
-
-                printf("\n");
-                printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
-                printf("\n");
-            }
+            printf("\33[2K\r");
 
             const int n_segments = whisper_full_n_segments(ctx);
             for (int i = 0; i < n_segments; ++i) {
@@ -391,16 +403,11 @@ int main(int argc, char ** argv) {
                     fout << std::endl;
                 }
-
-                if (use_vad) {
-                    printf("\n");
-                    printf("### Transcription %d END\n", n_iter);
-                }
             }
 
             ++n_iter;
 
-            if (!use_vad && (n_iter % n_new_line) == 0) {
-                printf("\n");
+            if (n_iter % n_new_line == 0) {
+                printf("\n");
 
                 // keep part of the audio for next iteration to try to mitigate word boundary issues
                 pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
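For reference, `\33[2K` is the ANSI escape that erases the current terminal line and `\r` returns the cursor to column 0; the unified loop now always clears and rewrites the in-progress line, which is what the step-based (non-VAD) path did before this change.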