Skip to content

Commit a0ea27d

Browse files
committed
examples : update stream to use VAD (new)
This commit updates the stream example to use the new Voice Activity Detection (VAD) support instead of the simple_vad implementation which it previously used.
1 parent 2a4d6db commit a0ea27d

File tree

2 files changed

+138
-150
lines changed

2 files changed

+138
-150
lines changed

examples/stream/README.md

Lines changed: 41 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,41 @@
1-
# whisper.cpp/examples/stream
2-
3-
This is a naive example of performing real-time inference on audio from your microphone.
4-
The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
5-
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
6-
7-
```bash
8-
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
9-
```
10-
11-
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
12-
13-
## Sliding window mode with VAD
14-
15-
Setting the `--step` argument to `0` enables the sliding window mode:
16-
17-
```bash
18-
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
19-
```
20-
21-
In this mode, the tool will transcribe only after some speech activity is detected. A very
22-
basic VAD detector is used, but in theory a more sophisticated approach can be added. The
23-
`-vth` argument determines the VAD threshold - higher values will make it detect silence more often.
24-
It's best to tune it to the specific use case, but a value around `0.6` should be OK in general.
25-
When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
26-
a transcription block that is suitable for parsing.
27-
28-
## Building
29-
30-
The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
31-
32-
```bash
33-
# Install SDL2
34-
# On Debian based linux distributions:
35-
sudo apt-get install libsdl2-dev
36-
37-
# On Fedora Linux:
38-
sudo dnf install SDL2 SDL2-devel
39-
40-
# Install SDL2 on Mac OS
41-
brew install sdl2
42-
43-
cmake -B build -DWHISPER_SDL2=ON
44-
cmake --build build --config Release
45-
46-
./build/bin/whisper-stream
47-
```
48-
49-
## Web version
50-
51-
This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
1+
# whisper.cpp/examples/stream
2+
3+
This is a naive example of performing real-time inference on audio from your microphone.
4+
The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
5+
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
6+
7+
```bash
8+
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
9+
```
10+
11+
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
12+
13+
## VAD support
14+
15+
VAD support can be enabled by specifying the `--vad` flag and optionally a `--vad-model` (by default
16+
`models/for-tests-silero-v5.1.2-ggml.bin` will be used).
17+
18+
## Building
19+
20+
The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
21+
22+
```bash
23+
# Install SDL2
24+
# On Debian based linux distributions:
25+
sudo apt-get install libsdl2-dev
26+
27+
# On Fedora Linux:
28+
sudo dnf install SDL2 SDL2-devel
29+
30+
# Install SDL2 on Mac OS
31+
brew install sdl2
32+
33+
cmake -B build -DWHISPER_SDL2=ON
34+
cmake --build build --config Release
35+
36+
./build/bin/whisper-stream
37+
```
38+
39+
## Web version
40+
41+
This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)

examples/stream/stream.cpp

Lines changed: 97 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,21 @@ struct whisper_params {
3737
bool save_audio = false; // save audio to wav file
3838
bool use_gpu = true;
3939
bool flash_attn = false;
40+
bool no_prints = false;
4041

4142
std::string language = "en";
4243
std::string model = "models/ggml-base.en.bin";
4344
std::string fname_out;
45+
46+
// Voice Activity Detection (VAD) parameters
47+
bool vad = false;
48+
std::string vad_model = "models/for-tests-silero-v5.1.2-ggml.bin";
49+
float vad_threshold = 0.5f;
50+
int vad_min_speech_duration_ms = 250;
51+
int vad_min_silence_duration_ms = 100;
52+
float vad_max_speech_duration_s = FLT_MAX;
53+
int vad_speech_pad_ms = 30;
54+
float vad_samples_overlap = 0.1f;
4455
};
4556

4657
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -61,8 +72,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
6172
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
6273
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
6374
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
64-
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
65-
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
6675
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
6776
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
6877
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
@@ -74,7 +83,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
7483
else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
7584
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
7685
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
77-
86+
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
87+
// Voice Activity Detection (VAD)
88+
else if ( arg == "--vad") { params.vad = true; }
89+
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
90+
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); }
91+
else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
92+
else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
93+
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
94+
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
95+
else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); }
7896
else {
7997
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
8098
whisper_print_usage(argc, argv, params);
@@ -99,8 +117,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
99117
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
100118
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
101119
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
102-
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
103-
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
104120
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
105121
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
106122
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
@@ -112,6 +128,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
112128
fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
113129
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true");
114130
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n", params.flash_attn ? "true" : "false");
131+
fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
132+
// Voice Activity Detection (VAD) parameters
133+
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
134+
fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
135+
fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
136+
fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
137+
fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n", params.vad_min_speech_duration_ms);
138+
fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
139+
fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
140+
std::string("FLT_MAX").c_str() :
141+
std::to_string(params.vad_max_speech_duration_s).c_str());
142+
fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
143+
fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
115144
fprintf(stderr, "\n");
116145
}
117146

@@ -124,6 +153,17 @@ int main(int argc, char ** argv) {
124153
return 1;
125154
}
126155

156+
if (params.no_prints) {
157+
whisper_log_set([](enum ggml_log_level, const char*, void*) { }, NULL);
158+
}
159+
160+
if (params.vad) {
161+
// For VAD, ensure at least 500 ms of context
162+
params.keep_ms = std::max(params.keep_ms, 500);
163+
} else {
164+
params.keep_ms = std::min(params.keep_ms, params.step_ms);
165+
}
166+
127167
params.keep_ms = std::min(params.keep_ms, params.step_ms);
128168
params.length_ms = std::max(params.length_ms, params.step_ms);
129169

@@ -132,7 +172,7 @@ int main(int argc, char ** argv) {
132172
const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE;
133173
const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE;
134174

135-
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
175+
const bool use_vad = params.vad;
136176

137177
const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
138178

@@ -191,12 +231,7 @@ int main(int argc, char ** argv) {
191231
params.translate ? "translate" : "transcribe",
192232
params.no_timestamps ? 0 : 1);
193233

194-
if (!use_vad) {
195-
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
196-
} else {
197-
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
198-
}
199-
234+
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
200235
fprintf(stderr, "\n");
201236
}
202237

@@ -242,118 +277,86 @@ int main(int argc, char ** argv) {
242277
break;
243278
}
244279

245-
// process new audio
246-
247-
if (!use_vad) {
248-
while (true) {
249-
// handle Ctrl + C
250-
is_running = sdl_poll_events();
251-
if (!is_running) {
252-
break;
253-
}
254-
audio.get(params.step_ms, pcmf32_new);
255-
256-
if ((int) pcmf32_new.size() > 2*n_samples_step) {
257-
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
258-
audio.clear();
259-
continue;
260-
}
280+
whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
261281

262-
if ((int) pcmf32_new.size() >= n_samples_step) {
263-
audio.clear();
264-
break;
265-
}
282+
wparams.print_progress = false;
283+
wparams.print_special = params.print_special;
284+
wparams.print_realtime = false;
285+
wparams.print_timestamps = !params.no_timestamps;
286+
wparams.translate = params.translate;
287+
wparams.single_segment = !use_vad;
288+
wparams.max_tokens = params.max_tokens;
289+
wparams.language = params.language.c_str();
290+
wparams.n_threads = params.n_threads;
291+
wparams.beam_search.beam_size = params.beam_size;
266292

267-
std::this_thread::sleep_for(std::chrono::milliseconds(1));
268-
}
293+
wparams.audio_ctx = params.audio_ctx;
269294

270-
const int n_samples_new = pcmf32_new.size();
295+
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
271296

272-
// take up to params.length_ms audio from previous iteration
273-
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
297+
// disable temperature fallback
298+
//wparams.temperature_inc = -1.0f;
299+
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
274300

275-
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
301+
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
302+
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
276303

277-
pcmf32.resize(n_samples_new + n_samples_take);
304+
// process new audio
278305

279-
for (int i = 0; i < n_samples_take; i++) {
280-
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
306+
while (true) {
307+
// handle Ctrl + C
308+
is_running = sdl_poll_events();
309+
if (!is_running) {
310+
break;
281311
}
312+
audio.get(params.step_ms, pcmf32_new);
282313

283-
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
284-
285-
pcmf32_old = pcmf32;
286-
} else {
287-
const auto t_now = std::chrono::high_resolution_clock::now();
288-
const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
289-
290-
if (t_diff < 2000) {
291-
std::this_thread::sleep_for(std::chrono::milliseconds(100));
292-
314+
if ((int) pcmf32_new.size() > 2*n_samples_step) {
315+
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
316+
audio.clear();
293317
continue;
294318
}
295319

296-
audio.get(2000, pcmf32_new);
297-
298-
if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
299-
audio.get(params.length_ms, pcmf32);
300-
} else {
301-
std::this_thread::sleep_for(std::chrono::milliseconds(100));
302-
303-
continue;
320+
if ((int) pcmf32_new.size() >= n_samples_step) {
321+
audio.clear();
322+
break;
304323
}
305324

306-
t_last = t_now;
325+
std::this_thread::sleep_for(std::chrono::milliseconds(1));
307326
}
308327

309-
// run the inference
310-
{
311-
whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
328+
const int n_samples_new = pcmf32_new.size();
312329

313-
wparams.print_progress = false;
314-
wparams.print_special = params.print_special;
315-
wparams.print_realtime = false;
316-
wparams.print_timestamps = !params.no_timestamps;
317-
wparams.translate = params.translate;
318-
wparams.single_segment = !use_vad;
319-
wparams.max_tokens = params.max_tokens;
320-
wparams.language = params.language.c_str();
321-
wparams.n_threads = params.n_threads;
322-
wparams.beam_search.beam_size = params.beam_size;
330+
// take up to params.length_ms audio from previous iteration
331+
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
323332

324-
wparams.audio_ctx = params.audio_ctx;
333+
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
325334

326-
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
335+
pcmf32.resize(n_samples_new + n_samples_take);
327336

328-
// disable temperature fallback
329-
//wparams.temperature_inc = -1.0f;
330-
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
337+
for (int i = 0; i < n_samples_take; i++) {
338+
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
339+
}
340+
341+
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
331342

332-
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
333-
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
343+
pcmf32_old = pcmf32;
334344

345+
// run the inference
346+
{
335347
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
336348
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
337349
return 6;
338350
}
339351

340352
// print result;
341353
{
342-
if (!use_vad) {
343-
printf("\33[2K\r");
354+
printf("\33[2K\r");
344355

345-
// print long empty line to clear the previous line
346-
printf("%s", std::string(100, ' ').c_str());
356+
// print long empty line to clear the previous line
357+
printf("%s", std::string(100, ' ').c_str());
347358

348-
printf("\33[2K\r");
349-
} else {
350-
const int64_t t1 = (t_last - t_start).count()/1000000;
351-
const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
352-
353-
printf("\n");
354-
printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
355-
printf("\n");
356-
}
359+
printf("\33[2K\r");
357360

358361
const int n_segments = whisper_full_n_segments(ctx);
359362
for (int i = 0; i < n_segments; ++i) {
@@ -391,16 +394,11 @@ int main(int argc, char ** argv) {
391394
fout << std::endl;
392395
}
393396

394-
if (use_vad) {
395-
printf("\n");
396-
printf("### Transcription %d END\n", n_iter);
397-
}
398397
}
399398

400399
++n_iter;
401400

402-
if (!use_vad && (n_iter % n_new_line) == 0) {
403-
printf("\n");
401+
if (n_iter % n_new_line == 0) {
404402

405403
// keep part of the audio for next iteration to try to mitigate word boundary issues
406404
pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());

0 commit comments

Comments
 (0)