From fb071665dba6315298a2a779cb38f727916431b6 Mon Sep 17 00:00:00 2001
From: Butterscotch!
Date: Sat, 9 Mar 2024 21:07:18 -0500
Subject: [PATCH] Make audio handling generic & start Whisper

---
 ButterSTT/Program.cs             |   2 +-
 ButterSTT/STT/AprilAsr.cs        | 110 +++++++++++++++++++
 ButterSTT/STT/AudioHandler.cs    | 107 +++++++++++++++++++
 ButterSTT/STT/WhisperAsr.cs      |  56 ++++++++++
 ButterSTT/SpeechToTextHandler.cs | 178 -------------------------------
 5 files changed, 274 insertions(+), 179 deletions(-)
 create mode 100644 ButterSTT/STT/AprilAsr.cs
 create mode 100644 ButterSTT/STT/AudioHandler.cs
 create mode 100644 ButterSTT/STT/WhisperAsr.cs
 delete mode 100644 ButterSTT/SpeechToTextHandler.cs

diff --git a/ButterSTT/Program.cs b/ButterSTT/Program.cs
index 7482bb4..8a21181 100644
--- a/ButterSTT/Program.cs
+++ b/ButterSTT/Program.cs
@@ -68,7 +68,7 @@
 };
 oscHandler.StartMessageLoop();
 
-using var speechToTextHandler = new SpeechToTextHandler(
+using var speechToTextHandler = new ButterSTT.STT.AprilAsr(
     modelFile,
     oscHandler.MessageQueue,
     deviceNumber: config.MicrophoneDeviceNumber
diff --git a/ButterSTT/STT/AprilAsr.cs b/ButterSTT/STT/AprilAsr.cs
new file mode 100644
index 0000000..b6b88fa
--- /dev/null
+++ b/ButterSTT/STT/AprilAsr.cs
@@ -0,0 +1,110 @@
+using System.Text;
+using AprilAsr;
+using ButterSTT.MessageSystem;
+using ButterSTT.TextProcessing;
+
+namespace ButterSTT.STT
+{
+    public class AprilAsr : IDisposable
+    {
+        // Audio
+        public readonly AudioHandler AudioHandler;
+
+        // Model
+        public readonly FileInfo ModelFile;
+        private readonly AprilModel _model;
+
+        // Session
+        private readonly AprilSession _session;
+
+        // Output
+        private readonly StringBuilder _consoleOutput = new();
+        private readonly StringBuilder _aprilOutput = new();
+        private readonly MessageQueue _messageQueue;
+
+        public AprilAsr(FileInfo modelFile, MessageQueue messageQueue, int deviceNumber = 0)
+        {
+            _messageQueue = messageQueue;
+
+            // Load model
+            ModelFile = modelFile;
+            _model = new AprilModel(modelFile.FullName);
+
+            Console.WriteLine(
+                $"Model loaded from \"{modelFile.FullName}\":\n > Name: {_model.Name}\n > Description: {_model.Description}\n > Language: {_model.Language}\n > Sample Rate: {_model.SampleRate} Hz"
+            );
+
+            // Initialize session
+            _session = new AprilSession(_model, OnAprilTokens, async: true);
+
+            // Initialize microphone
+            AudioHandler = new(_model.SampleRate, deviceNumber);
+            AudioHandler.OnMicData += OnMicData;
+            AudioHandler.OnMicStop += OnMicStop;
+        }
+
+        private void OnMicData(object? sender, (short[] data, int length) data)
+        {
+            if (data.length <= 0)
+                return;
+
+            _session.FeedPCM16(data.data, data.length);
+        }
+
+        private void OnMicStop(object? sender, EventArgs e)
+        {
+            _session.Flush();
+        }
+
+        private void OnAprilTokens(AprilResultKind result, AprilToken[] tokens)
+        {
+            _consoleOutput.Clear();
+            _aprilOutput.Clear();
+
+            switch (result)
+            {
+                case AprilResultKind.PartialRecognition:
+                    _consoleOutput.Append("- ");
+                    break;
+                case AprilResultKind.FinalRecognition:
+                    _consoleOutput.Append("@ ");
+                    break;
+                default:
+                    _consoleOutput.Append(' ');
+                    break;
+            }
+
+            foreach (var token in tokens)
+            {
+                _aprilOutput.Append(token.Token);
+            }
+
+            var aprilOutputString =
+                tokens.Length > 0
+                    ? EnglishCapitalization.Capitalize(_aprilOutput.ToString().Trim())
+                    : "";
+
+            if (result == AprilResultKind.FinalRecognition)
+            {
+                _messageQueue.CurParagraph = EnglishTextParser.ParseParagraph(
+                    aprilOutputString,
+                    wordRegex: EnglishTextParser.WordKeepUrl()
+                );
+                _messageQueue.FinishCurrentParagraph();
+            }
+            else
+            {
+                _messageQueue.CurParagraph = EnglishTextParser.ParseParagraph(aprilOutputString);
+            }
+
+            _consoleOutput.Append(aprilOutputString);
+            Console.WriteLine(_consoleOutput);
+        }
+
+        public void Dispose()
+        {
+            AudioHandler.Dispose();
+            GC.SuppressFinalize(this);
+        }
+    }
+}
diff --git a/ButterSTT/STT/AudioHandler.cs b/ButterSTT/STT/AudioHandler.cs
new file mode 100644
index 0000000..ab914aa
--- /dev/null
+++ b/ButterSTT/STT/AudioHandler.cs
@@ -0,0 +1,107 @@
+using ButterSTT.Config;
+using NAudio.Wave;
+
+namespace ButterSTT.STT
+{
+    public class AudioHandler : IDisposable
+    {
+        // Audio
+        private readonly WaveInEvent _audioIn;
+        private bool _restartRecordingNextStop = false;
+
+        public int WaveDeviceNumber { get; private set; } =
+            STTConfig.Default.MicrophoneDeviceNumber;
+        public bool IsMicrophoneRecording { get; private set; } = false;
+
+        public event EventHandler? OnMicStart;
+        public event EventHandler? OnMicStop;
+        public event EventHandler<(short[] data, int length)>? OnMicData;
+
+        public AudioHandler(int sampleRate = 16000, int deviceNumber = 0)
+        {
+            // Initialize microphone
+            _audioIn = new WaveInEvent()
+            {
+                DeviceNumber = deviceNumber,
+                WaveFormat = new(sampleRate, 16, 1)
+            };
+            WaveDeviceNumber = deviceNumber;
+
+            // Register microphone events
+            _audioIn.DataAvailable += OnWaveData;
+            _audioIn.RecordingStopped += OnWaveStop;
+        }
+
+        public void StartRecording()
+        {
+            _audioIn.StartRecording();
+            IsMicrophoneRecording = true;
+            OnMicStart?.Invoke(this, EventArgs.Empty);
+        }
+
+        public void StopRecording()
+        {
+            // Tell the recording not to restart
+            _restartRecordingNextStop = false;
+
+            // This keeps recording for a little bit longer, it will call the event when it's done
+            _audioIn.StopRecording();
+        }
+
+        public void SwapMicrophoneDevice(int deviceNumber)
+        {
+            // If it's already using this device, ignore it and continue
+            if (_audioIn.DeviceNumber == deviceNumber)
+                return;
+
+            var wasRecording = IsMicrophoneRecording;
+
+            // Make sure the recording is stopped
+            StopRecording();
+
+            // Swap devices
+            _audioIn.DeviceNumber = deviceNumber;
+            WaveDeviceNumber = deviceNumber;
+
+            // If it's already stopped, restart it immediately
+            // Otherwise, start it again when it's done stopping
+            if (wasRecording && !IsMicrophoneRecording)
+            {
+                StartRecording();
+            }
+            else
+            {
+                _restartRecordingNextStop = true;
+            }
+        }
+
+        private void OnWaveData(object? sender, WaveInEventArgs args)
+        {
+            if (args.BytesRecorded <= 0)
+                return;
+
+            // Convert the bytes to shorts
+            var shorts = new short[args.BytesRecorded / sizeof(short)];
+            Buffer.BlockCopy(args.Buffer, 0, shorts, 0, args.BytesRecorded);
+
+            OnMicData?.Invoke(this, (shorts, shorts.Length));
+        }
+
+        private void OnWaveStop(object? sender, StoppedEventArgs args)
+        {
+            IsMicrophoneRecording = false;
+
+            if (_restartRecordingNextStop)
+                StartRecording();
+            else
+                OnMicStop?.Invoke(this, EventArgs.Empty);
+        }
+
+        public void Dispose()
+        {
+            StopRecording();
+            _audioIn.Dispose();
+            GC.SuppressFinalize(this);
+        }
+    }
+}
diff --git a/ButterSTT/STT/WhisperAsr.cs b/ButterSTT/STT/WhisperAsr.cs
new file mode 100644
index 0000000..41293ac
--- /dev/null
+++ b/ButterSTT/STT/WhisperAsr.cs
@@ -0,0 +1,56 @@
+using Whisper.net;
+
+namespace ButterSTT.STT
+{
+    public class WhisperAsr : IDisposable
+    {
+        // Audio
+        public readonly AudioHandler AudioHandler;
+
+        // Model
+        public readonly FileInfo ModelFile;
+        private readonly WhisperProcessor _processor;
+
+        public WhisperAsr(FileInfo modelFile, int deviceNumber = 0)
+        {
+            // Load model
+            ModelFile = modelFile;
+            using var whisperFactory = WhisperFactory.FromPath(modelFile.FullName);
+            _processor = whisperFactory
+                .CreateBuilder()
+                .WithLanguage("auto")
+                .WithSegmentEventHandler(OnSegmentEvent)
+                .Build();
+
+            Console.WriteLine($"Model loaded from \"{modelFile.FullName}\".");
+
+            // Initialize microphone
+            AudioHandler = new(16000, deviceNumber);
+            AudioHandler.OnMicData += OnMicData;
+        }
+
+        private void OnMicData(object? sender, (short[] data, int length) data)
+        {
+            if (data.length <= 0)
+                return;
+
+            var floats = new float[data.length];
+            for (var i = 0; i < data.length; i++)
+                floats[i] = data.data[i] / 32768f;
+
+            _processor.Process(floats);
+        }
+
+        private void OnSegmentEvent(SegmentData segment)
+        {
+            Console.WriteLine(segment.Text.Trim());
+        }
+
+        public void Dispose()
+        {
+            AudioHandler?.Dispose();
+            _processor.Dispose();
+            GC.SuppressFinalize(this);
+        }
+    }
+}
diff --git a/ButterSTT/SpeechToTextHandler.cs b/ButterSTT/SpeechToTextHandler.cs
deleted file mode 100644
index c518009..0000000
--- a/ButterSTT/SpeechToTextHandler.cs
+++ /dev/null
@@ -1,178 +0,0 @@
-using System.Text;
-using AprilAsr;
-using ButterSTT.Config;
-using ButterSTT.MessageSystem;
-using ButterSTT.TextProcessing;
-using NAudio.Wave;
-
-namespace ButterSTT
-{
-    public class SpeechToTextHandler : IDisposable
-    {
-        // Audio
-        private readonly WaveInEvent _audioIn;
-        private bool _restartRecordingNextStop = false;
-
-        // Model
-        private readonly AprilModel _model;
-        public readonly FileInfo ModelFile;
-
-        // Session
-        private readonly AprilSession _session;
-
-        // Output
-        private readonly StringBuilder _consoleOutput = new();
-        private readonly StringBuilder _aprilOutput = new();
-        private readonly MessageQueue _messageQueue;
-
-        public int WaveDeviceNumber { get; private set; } =
-            STTConfig.Default.MicrophoneDeviceNumber;
-        public bool MicrophoneRecording { get; private set; } = false;
-
-        public SpeechToTextHandler(
-            FileInfo modelFile,
-            MessageQueue messageQueue,
-            int deviceNumber = 0
-        )
-        {
-            _messageQueue = messageQueue;
-
-            // Load model
-            _model = new AprilModel(modelFile.FullName);
-            ModelFile = modelFile;
-
-            Console.WriteLine(
-                $"Model loaded from \"{modelFile.FullName}\":\n > Name: {_model.Name}\n > Description: {_model.Description}\n > Language: {_model.Language}\n > Sample Rate: {_model.SampleRate} Hz"
-            );
-
-            // Initialize session
-            _session = new AprilSession(_model, OnAprilTokens, async: true);
-
-            // Initialize microphone
-            _audioIn = new WaveInEvent()
-            {
-                DeviceNumber = deviceNumber,
-                WaveFormat = new(_model.SampleRate, 16, 1)
-            };
-            WaveDeviceNumber = deviceNumber;
-
-            // Register microphone events
-            _audioIn.DataAvailable += OnMicData;
-            _audioIn.RecordingStopped += OnMicStop;
-        }
-
-        public void StartRecording()
-        {
-            _audioIn.StartRecording();
-            MicrophoneRecording = true;
-        }
-
-        public void StopRecording()
-        {
-            // Tell the recording not to restart
-            _restartRecordingNextStop = false;
-
-            // This keeps recording for a little bit longer, it will call the event when it's done
-            _audioIn.StopRecording();
-        }
-
-        public void SwapMicrophoneDevice(int deviceNumber)
-        {
-            // If it's already using this device, ignore it and continue
-            if (_audioIn.DeviceNumber == deviceNumber)
-                return;
-
-            var wasRecording = MicrophoneRecording;
-
-            // Make sure the recording is stopped
-            StopRecording();
-
-            // Swap devices
-            _audioIn.DeviceNumber = deviceNumber;
-            WaveDeviceNumber = deviceNumber;
-
-            // If it's already stopped, restart it immediately
-            // Otherwise, start it again when it's done stopping
-            if (wasRecording && !MicrophoneRecording)
-            {
-                StartRecording();
-            }
-            else
-            {
-                _restartRecordingNextStop = true;
-            }
-        }
-
-        private void OnMicData(object? sender, WaveInEventArgs args)
-        {
-            if (args.BytesRecorded <= 0)
-                return;
-
-            // Convert the bytes to shorts
-            var shorts = new short[args.BytesRecorded / sizeof(short)];
-            Buffer.BlockCopy(args.Buffer, 0, shorts, 0, args.BytesRecorded);
-            _session.FeedPCM16(shorts, shorts.Length);
-        }
-
-        private void OnMicStop(object? sender, StoppedEventArgs args)
-        {
-            _session.Flush();
-            MicrophoneRecording = false;
-
-            if (_restartRecordingNextStop)
-                StartRecording();
-        }
-
-        private void OnAprilTokens(AprilResultKind result, AprilToken[] tokens)
-        {
-            _consoleOutput.Clear();
-            _aprilOutput.Clear();
-
-            switch (result)
-            {
-                case AprilResultKind.PartialRecognition:
-                    _consoleOutput.Append("- ");
-                    break;
-                case AprilResultKind.FinalRecognition:
-                    _consoleOutput.Append("@ ");
-                    break;
-                default:
-                    _consoleOutput.Append(' ');
-                    break;
-            }
-
-            foreach (AprilToken token in tokens)
-            {
-                _aprilOutput.Append(token.Token);
-            }
-
-            var aprilOutputString =
-                tokens.Length > 0
-                    ? EnglishCapitalization.Capitalize(_aprilOutput.ToString().Trim())
-                    : "";
-
-            if (result == AprilResultKind.FinalRecognition)
-            {
-                _messageQueue.CurParagraph = EnglishTextParser.ParseParagraph(
-                    aprilOutputString,
-                    wordRegex: EnglishTextParser.WordKeepUrl()
-                );
-                _messageQueue.FinishCurrentParagraph();
-            }
-            else
-            {
-                _messageQueue.CurParagraph = EnglishTextParser.ParseParagraph(aprilOutputString);
-            }
-
-            _consoleOutput.Append(aprilOutputString);
-            Console.WriteLine(_consoleOutput);
-        }
-
-        public void Dispose()
-        {
-            StopRecording();
-            _audioIn.Dispose();
-            GC.SuppressFinalize(this);
-        }
-    }
-}
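Note (illustrative, not part of the patch above): a minimal sketch of driving the new event-based AudioHandler on its own, the same way AprilAsr and WhisperAsr consume it in this commit. It assumes a modern .NET console entry point with top-level statements; device number 0 is an arbitrary example, and 16000 Hz is simply AudioHandler's default sample rate.

    using ButterSTT.STT;

    // Capture microphone audio through AudioHandler and report how many
    // 16-bit PCM samples arrive in each callback (hypothetical consumer code).
    using var audio = new AudioHandler(sampleRate: 16000, deviceNumber: 0);
    audio.OnMicData += (_, frame) => Console.WriteLine($"Received {frame.length} samples");
    audio.OnMicStop += (_, _) => Console.WriteLine("Capture stopped");

    audio.StartRecording();
    Console.ReadLine(); // keep recording until Enter is pressed
    audio.StopRecording();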