@@ -37,10 +37,21 @@ struct whisper_params {
37
37
bool save_audio = false ; // save audio to wav file
38
38
bool use_gpu = true ;
39
39
bool flash_attn = false ;
40
+ bool no_prints = false ;
40
41
41
42
std::string language = " en" ;
42
43
std::string model = " models/ggml-base.en.bin" ;
43
44
std::string fname_out;
45
+
46
+ // Voice Activity Detection (VAD) parameters
47
+ bool vad = false ;
48
+ std::string vad_model = " models/for-tests-silero-v5.1.2-ggml.bin" ;
49
+ float vad_threshold = 0 .5f ;
50
+ int vad_min_speech_duration_ms = 250 ;
51
+ int vad_min_silence_duration_ms = 100 ;
52
+ float vad_max_speech_duration_s = FLT_MAX;
53
+ int vad_speech_pad_ms = 30 ;
54
+ float vad_samples_overlap = 0 .1f ;
44
55
};
45
56
46
57
void whisper_print_usage (int argc, char ** argv, const whisper_params & params);
@@ -61,8 +72,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
61
72
else if (arg == " -mt" || arg == " --max-tokens" ) { params.max_tokens = std::stoi (argv[++i]); }
62
73
else if (arg == " -ac" || arg == " --audio-ctx" ) { params.audio_ctx = std::stoi (argv[++i]); }
63
74
else if (arg == " -bs" || arg == " --beam-size" ) { params.beam_size = std::stoi (argv[++i]); }
64
- else if (arg == " -vth" || arg == " --vad-thold" ) { params.vad_thold = std::stof (argv[++i]); }
65
- else if (arg == " -fth" || arg == " --freq-thold" ) { params.freq_thold = std::stof (argv[++i]); }
66
75
else if (arg == " -tr" || arg == " --translate" ) { params.translate = true ; }
67
76
else if (arg == " -nf" || arg == " --no-fallback" ) { params.no_fallback = true ; }
68
77
else if (arg == " -ps" || arg == " --print-special" ) { params.print_special = true ; }
@@ -74,7 +83,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
74
83
else if (arg == " -sa" || arg == " --save-audio" ) { params.save_audio = true ; }
75
84
else if (arg == " -ng" || arg == " --no-gpu" ) { params.use_gpu = false ; }
76
85
else if (arg == " -fa" || arg == " --flash-attn" ) { params.flash_attn = true ; }
77
-
86
+ else if (arg == " -np" || arg == " --no-prints" ) { params.no_prints = true ; }
87
+ // Voice Activity Detection (VAD)
88
+ else if ( arg == " --vad" ) { params.vad = true ; }
89
+ else if (arg == " -vm" || arg == " --vad-model" ) { params.vad_model = argv[++i]; }
90
+ else if (arg == " -vt" || arg == " --vad-threshold" ) { params.vad_threshold = std::stof (argv[++i]); }
91
+ else if (arg == " -vspd" || arg == " --vad-min-speech-duration-ms" ) { params.vad_min_speech_duration_ms = std::stoi (argv[++i]); }
92
+ else if (arg == " -vsd" || arg == " --vad-min-silence-duration-ms" ) { params.vad_min_silence_duration_ms = std::stoi (argv[++i]); }
93
+ else if (arg == " -vmsd" || arg == " --vad-max-speech-duration-s" ) { params.vad_max_speech_duration_s = std::stof (argv[++i]); }
94
+ else if (arg == " -vp" || arg == " --vad-speech-pad-ms" ) { params.vad_speech_pad_ms = std::stoi (argv[++i]); }
95
+ else if (arg == " -vo" || arg == " --vad-samples-overlap" ) { params.vad_samples_overlap = std::stof (argv[++i]); }
78
96
else {
79
97
fprintf (stderr, " error: unknown argument: %s\n " , arg.c_str ());
80
98
whisper_print_usage (argc, argv, params);
@@ -99,8 +117,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
99
117
fprintf (stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n " , params.max_tokens );
100
118
fprintf (stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n " , params.audio_ctx );
101
119
fprintf (stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n " , params.beam_size );
102
- fprintf (stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n " , params.vad_thold );
103
- fprintf (stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n " , params.freq_thold );
104
120
fprintf (stderr, " -tr, --translate [%-7s] translate from source language to english\n " , params.translate ? " true" : " false" );
105
121
fprintf (stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n " , params.no_fallback ? " true" : " false" );
106
122
fprintf (stderr, " -ps, --print-special [%-7s] print special tokens\n " , params.print_special ? " true" : " false" );
@@ -112,6 +128,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
112
128
fprintf (stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n " , params.save_audio ? " true" : " false" );
113
129
fprintf (stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n " , params.use_gpu ? " false" : " true" );
114
130
fprintf (stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n " , params.flash_attn ? " true" : " false" );
131
+ fprintf (stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n " , params.no_prints ? " true" : " false" );
132
+ // Voice Activity Detection (VAD) parameters
133
+ fprintf (stderr, " \n Voice Activity Detection (VAD) options:\n " );
134
+ fprintf (stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n " , params.vad ? " true" : " false" );
135
+ fprintf (stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n " , params.vad_model .c_str ());
136
+ fprintf (stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n " , params.vad_threshold );
137
+ fprintf (stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (ms)\n " , params.vad_min_speech_duration_ms );
138
+ fprintf (stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n " , params.vad_min_silence_duration_ms );
139
+ fprintf (stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n " , params.vad_max_speech_duration_s == FLT_MAX ?
140
+ std::string (" FLT_MAX" ).c_str () :
141
+ std::to_string (params.vad_max_speech_duration_s ).c_str ());
142
+ fprintf (stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n " , params.vad_speech_pad_ms );
143
+ fprintf (stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n " , params.vad_samples_overlap );
115
144
fprintf (stderr, " \n " );
116
145
}
117
146
@@ -124,6 +153,17 @@ int main(int argc, char ** argv) {
124
153
return 1 ;
125
154
}
126
155
156
+ if (params.no_prints ) {
157
+ whisper_log_set ([](enum ggml_log_level, const char *, void *) { }, NULL );
158
+ }
159
+
160
+ if (params.vad ) {
161
+ // For VAD, ensure at least 500 ms of context
162
+ params.keep_ms = std::max (params.keep_ms , 500 );
163
+ } else {
164
+ params.keep_ms = std::min (params.keep_ms , params.step_ms );
165
+ }
166
+
127
167
params.keep_ms = std::min (params.keep_ms , params.step_ms );
128
168
params.length_ms = std::max (params.length_ms , params.step_ms );
129
169
@@ -132,7 +172,7 @@ int main(int argc, char ** argv) {
132
172
const int n_samples_keep = (1e-3 *params.keep_ms )*WHISPER_SAMPLE_RATE;
133
173
const int n_samples_30s = (1e-3 *30000.0 )*WHISPER_SAMPLE_RATE;
134
174
135
- const bool use_vad = n_samples_step <= 0 ; // sliding window mode uses VAD
175
+ const bool use_vad = params. vad ;
136
176
137
177
const int n_new_line = !use_vad ? std::max (1 , params.length_ms / params.step_ms - 1 ) : 1 ; // number of steps to print new line
138
178
@@ -191,12 +231,7 @@ int main(int argc, char ** argv) {
191
231
params.translate ? " translate" : " transcribe" ,
192
232
params.no_timestamps ? 0 : 1 );
193
233
194
- if (!use_vad) {
195
- fprintf (stderr, " %s: n_new_line = %d, no_context = %d\n " , __func__, n_new_line, params.no_context );
196
- } else {
197
- fprintf (stderr, " %s: using VAD, will transcribe on speech activity\n " , __func__);
198
- }
199
-
234
+ fprintf (stderr, " %s: n_new_line = %d, no_context = %d\n " , __func__, n_new_line, params.no_context );
200
235
fprintf (stderr, " \n " );
201
236
}
202
237
@@ -242,118 +277,86 @@ int main(int argc, char ** argv) {
242
277
break ;
243
278
}
244
279
245
- // process new audio
246
-
247
- if (!use_vad) {
248
- while (true ) {
249
- // handle Ctrl + C
250
- is_running = sdl_poll_events ();
251
- if (!is_running) {
252
- break ;
253
- }
254
- audio.get (params.step_ms , pcmf32_new);
255
-
256
- if ((int ) pcmf32_new.size () > 2 *n_samples_step) {
257
- fprintf (stderr, " \n\n %s: WARNING: cannot process audio fast enough, dropping audio ...\n\n " , __func__);
258
- audio.clear ();
259
- continue ;
260
- }
280
+ whisper_full_params wparams = whisper_full_default_params (params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
261
281
262
- if ((int ) pcmf32_new.size () >= n_samples_step) {
263
- audio.clear ();
264
- break ;
265
- }
282
+ wparams.print_progress = false ;
283
+ wparams.print_special = params.print_special ;
284
+ wparams.print_realtime = false ;
285
+ wparams.print_timestamps = !params.no_timestamps ;
286
+ wparams.translate = params.translate ;
287
+ wparams.single_segment = !use_vad;
288
+ wparams.max_tokens = params.max_tokens ;
289
+ wparams.language = params.language .c_str ();
290
+ wparams.n_threads = params.n_threads ;
291
+ wparams.beam_search .beam_size = params.beam_size ;
266
292
267
- std::this_thread::sleep_for (std::chrono::milliseconds (1 ));
268
- }
293
+ wparams.audio_ctx = params.audio_ctx ;
269
294
270
- const int n_samples_new = pcmf32_new. size ();
295
+ wparams. tdrz_enable = params. tinydiarize ; // [TDRZ]
271
296
272
- // take up to params.length_ms audio from previous iteration
273
- const int n_samples_take = std::min ((int ) pcmf32_old.size (), std::max (0 , n_samples_keep + n_samples_len - n_samples_new));
297
+ // disable temperature fallback
298
+ // wparams.temperature_inc = -1.0f;
299
+ wparams.temperature_inc = params.no_fallback ? 0 .0f : wparams.temperature_inc ;
274
300
275
- // printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
301
+ wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data ();
302
+ wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size ();
276
303
277
- pcmf32. resize (n_samples_new + n_samples_take);
304
+ // process new audio
278
305
279
- for (int i = 0 ; i < n_samples_take; i++) {
280
- pcmf32[i] = pcmf32_old[pcmf32_old.size () - n_samples_take + i];
306
+ while (true ) {
307
+ // handle Ctrl + C
308
+ is_running = sdl_poll_events ();
309
+ if (!is_running) {
310
+ break ;
281
311
}
312
+ audio.get (params.step_ms , pcmf32_new);
282
313
283
- memcpy (pcmf32.data () + n_samples_take, pcmf32_new.data (), n_samples_new*sizeof (float ));
284
-
285
- pcmf32_old = pcmf32;
286
- } else {
287
- const auto t_now = std::chrono::high_resolution_clock::now ();
288
- const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count ();
289
-
290
- if (t_diff < 2000 ) {
291
- std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
292
-
314
+ if ((int ) pcmf32_new.size () > 2 *n_samples_step) {
315
+ fprintf (stderr, " \n\n %s: WARNING: cannot process audio fast enough, dropping audio ...\n\n " , __func__);
316
+ audio.clear ();
293
317
continue ;
294
318
}
295
319
296
- audio.get (2000 , pcmf32_new);
297
-
298
- if (::vad_simple (pcmf32_new, WHISPER_SAMPLE_RATE, 1000 , params.vad_thold , params.freq_thold , false )) {
299
- audio.get (params.length_ms , pcmf32);
300
- } else {
301
- std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
302
-
303
- continue ;
320
+ if ((int ) pcmf32_new.size () >= n_samples_step) {
321
+ audio.clear ();
322
+ break ;
304
323
}
305
324
306
- t_last = t_now ;
325
+ std::this_thread::sleep_for ( std::chrono::milliseconds ( 1 )) ;
307
326
}
308
327
309
- // run the inference
310
- {
311
- whisper_full_params wparams = whisper_full_default_params (params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
328
+ const int n_samples_new = pcmf32_new.size ();
312
329
313
- wparams.print_progress = false ;
314
- wparams.print_special = params.print_special ;
315
- wparams.print_realtime = false ;
316
- wparams.print_timestamps = !params.no_timestamps ;
317
- wparams.translate = params.translate ;
318
- wparams.single_segment = !use_vad;
319
- wparams.max_tokens = params.max_tokens ;
320
- wparams.language = params.language .c_str ();
321
- wparams.n_threads = params.n_threads ;
322
- wparams.beam_search .beam_size = params.beam_size ;
330
+ // take up to params.length_ms audio from previous iteration
331
+ const int n_samples_take = std::min ((int ) pcmf32_old.size (), std::max (0 , n_samples_keep + n_samples_len - n_samples_new));
323
332
324
- wparams. audio_ctx = params. audio_ctx ;
333
+ // printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size()) ;
325
334
326
- wparams. tdrz_enable = params. tinydiarize ; // [TDRZ]
335
+ pcmf32. resize (n_samples_new + n_samples_take);
327
336
328
- // disable temperature fallback
329
- // wparams.temperature_inc = -1.0f;
330
- wparams.temperature_inc = params.no_fallback ? 0 .0f : wparams.temperature_inc ;
337
+ for (int i = 0 ; i < n_samples_take; i++) {
338
+ pcmf32[i] = pcmf32_old[pcmf32_old.size () - n_samples_take + i];
339
+ }
340
+
341
+ memcpy (pcmf32.data () + n_samples_take, pcmf32_new.data (), n_samples_new*sizeof (float ));
331
342
332
- wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data ();
333
- wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size ();
343
+ pcmf32_old = pcmf32;
334
344
345
+ // run the inference
346
+ {
335
347
if (whisper_full (ctx, wparams, pcmf32.data (), pcmf32.size ()) != 0 ) {
336
348
fprintf (stderr, " %s: failed to process audio\n " , argv[0 ]);
337
349
return 6 ;
338
350
}
339
351
340
352
// print result;
341
353
{
342
- if (!use_vad) {
343
- printf (" \33 [2K\r " );
354
+ printf (" \33 [2K\r " );
344
355
345
- // print long empty line to clear the previous line
346
- printf (" %s" , std::string (100 , ' ' ).c_str ());
356
+ // print long empty line to clear the previous line
357
+ printf (" %s" , std::string (100 , ' ' ).c_str ());
347
358
348
- printf (" \33 [2K\r " );
349
- } else {
350
- const int64_t t1 = (t_last - t_start).count ()/1000000 ;
351
- const int64_t t0 = std::max (0.0 , t1 - pcmf32.size ()*1000.0 /WHISPER_SAMPLE_RATE);
352
-
353
- printf (" \n " );
354
- printf (" ### Transcription %d START | t0 = %d ms | t1 = %d ms\n " , n_iter, (int ) t0, (int ) t1);
355
- printf (" \n " );
356
- }
359
+ printf (" \33 [2K\r " );
357
360
358
361
const int n_segments = whisper_full_n_segments (ctx);
359
362
for (int i = 0 ; i < n_segments; ++i) {
@@ -391,16 +394,11 @@ int main(int argc, char ** argv) {
391
394
fout << std::endl;
392
395
}
393
396
394
- if (use_vad) {
395
- printf (" \n " );
396
- printf (" ### Transcription %d END\n " , n_iter);
397
- }
398
397
}
399
398
400
399
++n_iter;
401
400
402
- if (!use_vad && (n_iter % n_new_line) == 0 ) {
403
- printf (" \n " );
401
+ if (n_iter % n_new_line == 0 ) {
404
402
405
403
// keep part of the audio for next iteration to try to mitigate word boundary issues
406
404
pcmf32_old = std::vector<float >(pcmf32.end () - n_samples_keep, pcmf32.end ());
0 commit comments