From fb071665dba6315298a2a779cb38f727916431b6 Mon Sep 17 00:00:00 2001
From: Butterscotch!
Date: Sat, 9 Mar 2024 21:07:18 -0500
Subject: [PATCH] Make audio handling generic & start Whisper

---
 ButterSTT/Program.cs             |   2 +-
 ButterSTT/STT/AprilAsr.cs        | 110 +++++++++++++++++++
 ButterSTT/STT/AudioHandler.cs    | 107 +++++++++++++++++++
 ButterSTT/STT/WhisperAsr.cs      |  56 ++++++++++
 ButterSTT/SpeechToTextHandler.cs | 178 -------------------------------
 5 files changed, 274 insertions(+), 179 deletions(-)
 create mode 100644 ButterSTT/STT/AprilAsr.cs
 create mode 100644 ButterSTT/STT/AudioHandler.cs
 create mode 100644 ButterSTT/STT/WhisperAsr.cs
 delete mode 100644 ButterSTT/SpeechToTextHandler.cs

diff --git a/ButterSTT/Program.cs b/ButterSTT/Program.cs
index 7482bb4..8a21181 100644
--- a/ButterSTT/Program.cs
+++ b/ButterSTT/Program.cs
@@ -68,7 +68,7 @@
 };
 oscHandler.StartMessageLoop();
 
-using var speechToTextHandler = new SpeechToTextHandler(
+using var speechToTextHandler = new ButterSTT.STT.AprilAsr(
     modelFile,
     oscHandler.MessageQueue,
     deviceNumber: config.MicrophoneDeviceNumber
diff --git a/ButterSTT/STT/AprilAsr.cs b/ButterSTT/STT/AprilAsr.cs
new file mode 100644
index 0000000..b6b88fa
--- /dev/null
+++ b/ButterSTT/STT/AprilAsr.cs
@@ -0,0 +1,110 @@
+using System.Text;
+using AprilAsr;
+using ButterSTT.MessageSystem;
+using ButterSTT.TextProcessing;
+
+namespace ButterSTT.STT
+{
+    public class AprilAsr : IDisposable
+    {
+        // Audio
+        public readonly AudioHandler AudioHandler;
+
+        // Model
+        public readonly FileInfo ModelFile;
+        private readonly AprilModel _model;
+
+        // Session
+        private readonly AprilSession _session;
+
+        // Output
+        private readonly StringBuilder _consoleOutput = new();
+        private readonly StringBuilder _aprilOutput = new();
+        private readonly MessageQueue _messageQueue;
+
+        public AprilAsr(FileInfo modelFile, MessageQueue messageQueue, int deviceNumber = 0)
+        {
+            _messageQueue = messageQueue;
+
+            // Load model
+            ModelFile = modelFile;
+            _model = new AprilModel(modelFile.FullName);
+
+            Console.WriteLine(
+                $"Model loaded from \"{modelFile.FullName}\":\n > Name: {_model.Name}\n > Description: {_model.Description}\n > Language: {_model.Language}\n > Sample Rate: {_model.SampleRate} Hz"
+            );
+
+            // Initialize session
+            _session = new AprilSession(_model, OnAprilTokens, async: true);
+
+            // Initialize microphone
+            AudioHandler = new(_model.SampleRate, deviceNumber);
+            AudioHandler.OnMicData += OnMicData;
+            AudioHandler.OnMicStop += OnMicStop;
+        }
+
+        private void OnMicData(object? sender, (short[] data, int length) data)
+        {
+            if (data.length <= 0)
+                return;
+
+            _session.FeedPCM16(data.data, data.length);
+        }
+
+        private void OnMicStop(object? sender, EventArgs e)
+        {
+            _session.Flush();
+        }
+
+        private void OnAprilTokens(AprilResultKind result, AprilToken[] tokens)
+        {
+            _consoleOutput.Clear();
+            _aprilOutput.Clear();
+
+            switch (result)
+            {
+                case AprilResultKind.PartialRecognition:
+                    _consoleOutput.Append("- ");
+                    break;
+                case AprilResultKind.FinalRecognition:
+                    _consoleOutput.Append("@ ");
+                    break;
+                default:
+                    _consoleOutput.Append(' ');
+                    break;
+            }
+
+            foreach (var token in tokens)
+            {
+                _aprilOutput.Append(token.Token);
+            }
+
+            var aprilOutputString =
+                tokens.Length > 0
+                    ? EnglishCapitalization.Capitalize(_aprilOutput.ToString().Trim())
+                    : "";
+
+            if (result == AprilResultKind.FinalRecognition)
+            {
+                _messageQueue.CurParagraph = EnglishTextParser.ParseParagraph(
+                    aprilOutputString,
+                    wordRegex: EnglishTextParser.WordKeepUrl()
+                );
+                _messageQueue.FinishCurrentParagraph();
+            }
+            else
+            {
+                _messageQueue.CurParagraph = EnglishTextParser.ParseParagraph(aprilOutputString);
+            }
+
+            _consoleOutput.Append(aprilOutputString);
+            Console.WriteLine(_consoleOutput);
+        }
+
+        public void Dispose()
+        {
+            AudioHandler.Dispose();
+            GC.SuppressFinalize(this);
+        }
+    }
+}
diff --git a/ButterSTT/STT/AudioHandler.cs b/ButterSTT/STT/AudioHandler.cs
new file mode 100644
index 0000000..ab914aa
--- /dev/null
+++ b/ButterSTT/STT/AudioHandler.cs
@@ -0,0 +1,107 @@
+using ButterSTT.Config;
+using NAudio.Wave;
+
+namespace ButterSTT.STT
+{
+    public class AudioHandler : IDisposable
+    {
+        // Audio
+        private readonly WaveInEvent _audioIn;
+        private bool _restartRecordingNextStop = false;
+
+        public int WaveDeviceNumber { get; private set; } =
+            STTConfig.Default.MicrophoneDeviceNumber;
+        public bool IsMicrophoneRecording { get; private set; } = false;
+
+        public event EventHandler? OnMicStart;
+        public event EventHandler? OnMicStop;
+        public event EventHandler<(short[] data, int length)>? OnMicData;
+
+        public AudioHandler(int sampleRate = 16000, int deviceNumber = 0)
+        {
+            // Initialize microphone
+            _audioIn = new WaveInEvent()
+            {
+                DeviceNumber = deviceNumber,
+                WaveFormat = new(sampleRate, 16, 1)
+            };
+            WaveDeviceNumber = deviceNumber;
+
+            // Register microphone events
+            _audioIn.DataAvailable += OnWaveData;
+            _audioIn.RecordingStopped += OnWaveStop;
+        }
+
+        public void StartRecording()
+        {
+            _audioIn.StartRecording();
+            IsMicrophoneRecording = true;
+            OnMicStart?.Invoke(this, EventArgs.Empty);
+        }
+
+        public void StopRecording()
+        {
+            // Tell the recording not to restart
+            _restartRecordingNextStop = false;
+
+            // This keeps recording for a little bit longer, it will call the event when it's done
+            _audioIn.StopRecording();
+        }
+
+        public void SwapMicrophoneDevice(int deviceNumber)
+        {
+            // If it's already using this device, ignore it and continue
+            if (_audioIn.DeviceNumber == deviceNumber)
+                return;
+
+            var wasRecording = IsMicrophoneRecording;
+
+            // Make sure the recording is stopped
+            StopRecording();
+
+            // Swap devices
+            _audioIn.DeviceNumber = deviceNumber;
+            WaveDeviceNumber = deviceNumber;
+
+            // If it's already stopped, restart it immediately
+            // Otherwise, start it again when it's done stopping
+            if (wasRecording && !IsMicrophoneRecording)
+            {
+                StartRecording();
+            }
+            else
+            {
+                _restartRecordingNextStop = true;
+            }
+        }
+
+        private void OnWaveData(object? sender, WaveInEventArgs args)
+        {
+            if (args.BytesRecorded <= 0)
+                return;
+
+            // Convert the bytes to shorts
+            var shorts = new short[args.BytesRecorded / sizeof(short)];
+            Buffer.BlockCopy(args.Buffer, 0, shorts, 0, args.BytesRecorded);
+
+            OnMicData?.Invoke(this, (shorts, shorts.Length));
+        }
+
+        private void OnWaveStop(object? sender, StoppedEventArgs args)
+        {
+            IsMicrophoneRecording = false;
+
+            if (_restartRecordingNextStop)
+                StartRecording();
+            else
+                OnMicStop?.Invoke(this, EventArgs.Empty);
+        }
+
+        public void Dispose()
+        {
+            StopRecording();
+            _audioIn.Dispose();
+            GC.SuppressFinalize(this);
+        }
+    }
+}
diff --git a/ButterSTT/STT/WhisperAsr.cs b/ButterSTT/STT/WhisperAsr.cs
new file mode 100644
index 0000000..41293ac
--- /dev/null
+++ b/ButterSTT/STT/WhisperAsr.cs
@@ -0,0 +1,56 @@
+using Whisper.net;
+
+namespace ButterSTT.STT
+{
+    public class WhisperAsr : IDisposable
+    {
+        // Audio
+        public readonly AudioHandler AudioHandler;
+
+        // Model
+        public readonly FileInfo ModelFile;
+        private readonly WhisperProcessor _processor;
+
+        public WhisperAsr(FileInfo modelFile, int deviceNumber = 0)
+        {
+            // Load model
+            ModelFile = modelFile;
+            using var whisperFactory = WhisperFactory.FromPath(modelFile.FullName);
+            _processor = whisperFactory
+                .CreateBuilder()
+                .WithLanguage("auto")
+                .WithSegmentEventHandler(OnSegmentEvent)
+                .Build();
+
+            Console.WriteLine($"Model loaded from \"{modelFile.FullName}\".");
+
+            // Initialize microphone
+            AudioHandler = new(16000, deviceNumber);
+            AudioHandler.OnMicData += OnMicData;
+        }
+
+        private void OnMicData(object? sender, (short[] data, int length) data)
+        {
+            if (data.length <= 0)
+                return;
+
+            var floats = new float[data.length];
+            for (var i = 0; i < data.length; i++)
+                floats[i] = data.data[i] / 32768f;
+
+            _processor.Process(floats);
+        }
+
+        private void OnSegmentEvent(SegmentData segment)
+        {
+            Console.WriteLine(segment.Text.Trim());
+        }
+
+        public void Dispose()
+        {
+            AudioHandler?.Dispose();
+            _processor.Dispose();
+            GC.SuppressFinalize(this);
+        }
+    }
+}
diff --git a/ButterSTT/SpeechToTextHandler.cs b/ButterSTT/SpeechToTextHandler.cs
deleted file mode 100644
index c518009..0000000
--- a/ButterSTT/SpeechToTextHandler.cs
+++ /dev/null
@@ -1,178 +0,0 @@
-using System.Text;
-using AprilAsr;
-using ButterSTT.Config;
-using ButterSTT.MessageSystem;
-using ButterSTT.TextProcessing;
-using NAudio.Wave;
-
-namespace ButterSTT
-{
-    public class SpeechToTextHandler : IDisposable
-    {
-        // Audio
-        private readonly WaveInEvent _audioIn;
-        private bool _restartRecordingNextStop = false;
-
-        // Model
-        private readonly AprilModel _model;
-        public readonly FileInfo ModelFile;
-
-        // Session
-        private readonly AprilSession _session;
-
-        // Output
-        private readonly StringBuilder _consoleOutput = new();
-        private readonly StringBuilder _aprilOutput = new();
-        private readonly MessageQueue _messageQueue;
-
-        public int WaveDeviceNumber { get; private set; } =
-            STTConfig.Default.MicrophoneDeviceNumber;
-        public bool MicrophoneRecording { get; private set; } = false;
-
-        public SpeechToTextHandler(
-            FileInfo modelFile,
-            MessageQueue messageQueue,
-            int deviceNumber = 0
-        )
-        {
-            _messageQueue = messageQueue;
-
-            // Load model
-            _model = new AprilModel(modelFile.FullName);
-            ModelFile = modelFile;
-
-            Console.WriteLine(
-                $"Model loaded from \"{modelFile.FullName}\":\n > Name: {_model.Name}\n > Description: {_model.Description}\n > Language: {_model.Language}\n > Sample Rate: {_model.SampleRate} Hz"
-            );
-
-            // Initialize session
-            _session = new AprilSession(_model, OnAprilTokens, async: true);
-
-            // Initialize microphone
-            _audioIn = new WaveInEvent()
-            {
-                DeviceNumber = deviceNumber,
-                WaveFormat = new(_model.SampleRate, 16, 1)
-            };
-            WaveDeviceNumber = deviceNumber;
-
-            // Register microphone events
-            _audioIn.DataAvailable += OnMicData;
-            _audioIn.RecordingStopped += OnMicStop;
-        }
-
-        public void StartRecording()
-        {
-            _audioIn.StartRecording();
-            MicrophoneRecording = true;
-        }
-
-        public void StopRecording()
-        {
-            // Tell the recording not to restart
-            _restartRecordingNextStop = false;
-
-            // This keeps recording for a little bit longer, it will call the event when it's done
-            _audioIn.StopRecording();
-        }
-
-        public void SwapMicrophoneDevice(int deviceNumber)
-        {
-            // If it's already using this device, ignore it and continue
-            if (_audioIn.DeviceNumber == deviceNumber)
-                return;
-
-            var wasRecording = MicrophoneRecording;
-
-            // Make sure the recording is stopped
-            StopRecording();
-
-            // Swap devices
-            _audioIn.DeviceNumber = deviceNumber;
-            WaveDeviceNumber = deviceNumber;
-
-            // If it's already stopped, restart it immediately
-            // Otherwise, start it again when it's done stopping
-            if (wasRecording && !MicrophoneRecording)
-            {
-                StartRecording();
-            }
-            else
-            {
-                _restartRecordingNextStop = true;
-            }
-        }
-
-        private void OnMicData(object? sender, WaveInEventArgs args)
-        {
-            if (args.BytesRecorded <= 0)
-                return;
-
-            // Convert the bytes to shorts
-            var shorts = new short[args.BytesRecorded / sizeof(short)];
-            Buffer.BlockCopy(args.Buffer, 0, shorts, 0, args.BytesRecorded);
-            _session.FeedPCM16(shorts, shorts.Length);
-        }
-
-        private void OnMicStop(object? sender, StoppedEventArgs args)
-        {
-            _session.Flush();
-            MicrophoneRecording = false;
-
-            if (_restartRecordingNextStop)
-                StartRecording();
-        }
-
-        private void OnAprilTokens(AprilResultKind result, AprilToken[] tokens)
-        {
-            _consoleOutput.Clear();
-            _aprilOutput.Clear();
-
-            switch (result)
-            {
-                case AprilResultKind.PartialRecognition:
-                    _consoleOutput.Append("- ");
-                    break;
-                case AprilResultKind.FinalRecognition:
-                    _consoleOutput.Append("@ ");
-                    break;
-                default:
-                    _consoleOutput.Append(' ');
-                    break;
-            }
-
-            foreach (AprilToken token in tokens)
-            {
-                _aprilOutput.Append(token.Token);
-            }
-
-            var aprilOutputString =
-                tokens.Length > 0
-                    ? EnglishCapitalization.Capitalize(_aprilOutput.ToString().Trim())
-                    : "";
-
-            if (result == AprilResultKind.FinalRecognition)
-            {
-                _messageQueue.CurParagraph = EnglishTextParser.ParseParagraph(
-                    aprilOutputString,
-                    wordRegex: EnglishTextParser.WordKeepUrl()
-                );
-                _messageQueue.FinishCurrentParagraph();
-            }
-            else
-            {
-                _messageQueue.CurParagraph = EnglishTextParser.ParseParagraph(aprilOutputString);
-            }
-
-            _consoleOutput.Append(aprilOutputString);
-            Console.WriteLine(_consoleOutput);
-        }
-
-        public void Dispose()
-        {
-            StopRecording();
-            _audioIn.Dispose();
-            GC.SuppressFinalize(this);
-        }
-    }
-}
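Note (illustrative, not part of the patch above): a minimal sketch of driving the new event-based AudioHandler on its own, the same way AprilAsr and WhisperAsr consume it in this commit. It assumes a modern .NET console entry point with top-level statements; device number 0 is an arbitrary example, and 16000 Hz is simply AudioHandler's default sample rate.

    using ButterSTT.STT;

    // Capture microphone audio through AudioHandler and report how many
    // 16-bit PCM samples arrive in each callback (hypothetical consumer code).
    using var audio = new AudioHandler(sampleRate: 16000, deviceNumber: 0);
    audio.OnMicData += (_, frame) => Console.WriteLine($"Received {frame.length} samples");
    audio.OnMicStop += (_, _) => Console.WriteLine("Capture stopped");

    audio.StartRecording();
    Console.ReadLine(); // keep recording until Enter is pressed
    audio.StopRecording();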