TTS to accept raw & STT to use async generators (#419)

* STT to accept an async generator * Allow getting RAW data from TTS * AsyncIterable * Cleanup * Fix test
NabuCasa · Mar 22, 2023 · 0836155 · 0836155
1 parent 261aa6c
commit 0836155
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 9 deletions.
diff --git a/hass_nabucasa/voice.py b/hass_nabucasa/voice.py
@@ -3,11 +3,9 @@
 
 from datetime import datetime
 from enum import Enum
-import logging
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, AsyncIterable
 import xml.etree.ElementTree as ET
 
-import aiohttp
 from aiohttp.hdrs import ACCEPT, AUTHORIZATION, CONTENT_TYPE
 import attr
 
@@ -17,8 +15,6 @@
 if TYPE_CHECKING:
     from . import Cloud
 
-_LOGGER = logging.getLogger(__name__)
-
 
 class VoiceError(Exception):
     """General Voice error."""
@@ -39,6 +35,13 @@ class Gender(str, Enum):
     FEMALE = "female"
 
 
+class AudioOutput(str, Enum):
+    """Audio output format for TTS."""
+
+    MP3 = "mp3"
+    RAW = "raw"
+
+
 MAP_VOICE = {
     ("af-ZA", Gender.FEMALE): "AdriNeural",
     ("af-ZA", Gender.MALE): "WillemNeural",
@@ -339,7 +342,7 @@ async def _update_token(self) -> None:
         self._valid = utc_from_timestamp(float(data["valid"]))
 
     async def process_stt(
-        self, stream: aiohttp.StreamReader, content: str, language: str
+        self, stream: AsyncIterable[bytes], content: str, language: str
     ) -> STTResponse:
         """Stream Audio to Azure cognitive instance."""
         if not self._validate_token():
@@ -366,7 +369,9 @@ async def process_stt(
             data["RecognitionStatus"] == "Success", data.get("DisplayText")
         )
 
-    async def process_tts(self, text: str, language: str, gender: Gender) -> bytes:
+    async def process_tts(
+        self, text: str, language: str, gender: Gender, output: AudioOutput
+    ) -> bytes:
         """Get Speech from text over Azure."""
         if not self._validate_token():
             await self._update_token()
@@ -385,13 +390,18 @@ async def process_tts(self, text: str, language: str, gender: Gender) -> bytes:
         # We can not get here without this being set, but mypy does not know that.
         assert self._endpoint_tts is not None
 
+        if output == AudioOutput.RAW:
+            output_header = "raw-16khz-16bit-mono-pcm"
+        else:
+            output_header = "audio-24khz-48kbitrate-mono-mp3"
+
         # Send request
         async with self.cloud.websession.post(
             self._endpoint_tts,
             headers={
                 CONTENT_TYPE: "application/ssml+xml",
                 AUTHORIZATION: f"Bearer {self._token}",
-                "X-Microsoft-OutputFormat": "audio-24khz-48kbitrate-mono-mp3",
+                "X-Microsoft-OutputFormat": output_header,
             },
             data=ET.tostring(xml_body),
         ) as resp:

diff --git a/tests/test_voice.py b/tests/test_voice.py
@@ -74,7 +74,12 @@ async def test_process_tts(auth_cloud_mock, aioclient_mock):
         content=b"My sound",
     )
     result = await voice_api.process_tts(
-        "Text for Saying", "en-US", voice.Gender.FEMALE
+        "Text for Saying", "en-US", voice.Gender.FEMALE, voice.AudioOutput.MP3
     )
 
     assert result == b"My sound"
+    assert aioclient_mock.mock_calls[1][3] == {
+        "Authorization": "Bearer test-key",
+        "Content-Type": "application/ssml+xml",
+        "X-Microsoft-OutputFormat": "audio-24khz-48kbitrate-mono-mp3",
+    }