Skip to content

Implement LiteLLM Instruction Param #10323

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions litellm/llms/base_llm/speech/transformation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, List, Optional, Union

import httpx

from litellm.llms.base_llm.chat.transformation import BaseConfig
from litellm.types.llms.openai import OpenAISpeechOptionalParams
from litellm.types.utils import FileTypes, ModelResponse

if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj

LiteLLMLoggingObj = _LiteLLMLoggingObj
else:
LiteLLMLoggingObj = Any


class BaseSpeechConfig(BaseConfig, ABC):
    """Base config for text-to-speech (audio speech) providers.

    Subclasses declare which OpenAI speech params they support via
    `get_supported_openai_params` and inherit (or override) the default
    param mapping in `map_openai_params`.
    """

    @abstractmethod
    def get_supported_openai_params(
        self, model: str
    ) -> List[OpenAISpeechOptionalParams]:
        """Return the OpenAI speech params supported for `model`."""
        pass

    def map_openai_params(
        self,
        non_default_params: dict,
        optional_params: dict,
        model: str,
        drop_params: bool,
    ) -> dict:
        """
        Map the OpenAI params to the provider's speech params.

        Default implementation: copy every supported non-default param
        through unchanged into `optional_params`. Not abstract — providers
        that need no renaming can simply inherit this, instead of
        duplicating it (as the gpt/tts configs previously did).
        """
        supported_params = self.get_supported_openai_params(model)
        for k, v in non_default_params.items():
            if k in supported_params:
                optional_params[k] = v
        return optional_params
2 changes: 2 additions & 0 deletions litellm/llms/openai/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -1356,6 +1356,7 @@ def audio_speech(
model: str,
input: str,
voice: str,
instructions: Optional[str],
optional_params: dict,
api_key: Optional[str],
api_base: Optional[str],
Expand Down Expand Up @@ -1393,6 +1394,7 @@ def audio_speech(
response = cast(OpenAI, openai_client).audio.speech.create(
model=model,
voice=voice, # type: ignore
instructions=instructions,
input=input,
**optional_params,
)
Expand Down
35 changes: 35 additions & 0 deletions litellm/llms/openai/speech/gpt_transformation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from typing import List

from litellm.llms.base_llm.speech.transformation import (
BaseSpeechConfig,
)
from litellm.types.llms.openai import OpenAISpeechOptionalParams

class OpenAIGPTSpeechConfig(BaseSpeechConfig):
    """Speech config for OpenAI `gpt-*` TTS models (e.g. gpt-4o-mini-tts)."""

    def get_supported_openai_params(
        self, model: str
    ) -> List[OpenAISpeechOptionalParams]:
        """
        Get the supported OpenAI params for the gpt models.

        Unlike the tts-1 family, gpt TTS models also accept
        `instructions` (voice/style steering).
        """
        return [
            "instructions",
            "response_format",
            "speed",
        ]

    def map_openai_params(
        self,
        non_default_params: dict,
        optional_params: dict,
        model: str,
        drop_params: bool,
    ) -> dict:
        """
        Map the OpenAI params to the Speech params.

        Delegates to the shared base implementation (copies each supported
        non-default param through unchanged) instead of duplicating it.
        """
        return super().map_openai_params(
            non_default_params=non_default_params,
            optional_params=optional_params,
            model=model,
            drop_params=drop_params,
        )
33 changes: 33 additions & 0 deletions litellm/llms/openai/speech/tts_transformation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from typing import List
from litellm.llms.base_llm.speech.transformation import (
BaseSpeechConfig,
)
from litellm.types.llms.openai import OpenAISpeechOptionalParams

class OpenAITTSSpeechConfig(BaseSpeechConfig):
    """Speech config for OpenAI `tts-1` / `tts-1-hd` models."""

    def get_supported_openai_params(
        self, model: str
    ) -> List[OpenAISpeechOptionalParams]:
        """
        Get the supported OpenAI params for the tts models.

        `instructions` is deliberately absent: only gpt-* TTS models
        accept it.
        """
        return [
            "response_format",
            "speed",
        ]

    def map_openai_params(
        self,
        non_default_params: dict,
        optional_params: dict,
        model: str,
        drop_params: bool,
    ) -> dict:
        """
        Map the OpenAI params to the Speech params.

        Delegates to the shared base implementation (copies each supported
        non-default param through unchanged) instead of duplicating it.
        """
        return super().map_openai_params(
            non_default_params=non_default_params,
            optional_params=optional_params,
            model=model,
            drop_params=drop_params,
        )
20 changes: 15 additions & 5 deletions litellm/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@
get_optional_params_embeddings,
get_optional_params_image_gen,
get_optional_params_transcription,
get_optional_params_speech,
get_secret,
mock_completion_streaming_obj,
read_config_args,
Expand Down Expand Up @@ -5302,6 +5303,7 @@ def speech( # noqa: PLR0915
model: str,
input: str,
voice: Optional[Union[str, dict]] = None,
instructions: Optional[str] = None,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
api_version: Optional[str] = None,
Expand All @@ -5328,11 +5330,18 @@ def speech( # noqa: PLR0915
) # type: ignore
kwargs.pop("tags", [])

optional_params = {}
if response_format is not None:
optional_params["response_format"] = response_format
if speed is not None:
optional_params["speed"] = speed # type: ignore
# optional_params = {}
# if response_format is not None:
# optional_params["response_format"] = response_format
# if speed is not None:
# optional_params["speed"] = speed # type: ignore

optional_params = get_optional_params_speech(
model=model,
response_format=response_format,
speed=speed,
instructions=instructions,
)

if timeout is None:
timeout = litellm.request_timeout
Expand Down Expand Up @@ -5401,6 +5410,7 @@ def speech( # noqa: PLR0915
model=model,
input=input,
voice=voice,
instructions=instructions,
optional_params=optional_params,
api_key=api_key,
api_base=api_base,
Expand Down
7 changes: 6 additions & 1 deletion litellm/types/llms/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,9 +889,14 @@ class Config:
"include",
]


OpenAIImageVariationOptionalParams = Literal["n", "size", "response_format", "user"]

# Optional params accepted by OpenAI's /audio/speech (TTS) endpoint.
# Individual provider speech configs narrow this set per model family —
# e.g. `instructions` is only advertised by the gpt-* speech config,
# not the tts-1 config.
OpenAISpeechOptionalParams = Literal[
    "instructions",
    "response_format",
    "speed",
]


class ComputerToolParam(TypedDict, total=False):
display_height: Required[float]
Expand Down
70 changes: 70 additions & 0 deletions litellm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2457,6 +2457,76 @@ def _check_valid_arg(supported_params):
return optional_params


def get_optional_params_speech(
    model: str,
    response_format: Optional[str] = None,
    speed: Optional[float] = None,
    instructions: Optional[str] = None,
    custom_llm_provider: Optional[str] = None,
    drop_params: Optional[bool] = None,
    **kwargs,
):
    """
    Build the optional-params dict for an audio speech (TTS) request.

    Collects the non-default OpenAI speech params (response_format, speed,
    instructions), filters/maps them through the provider's speech config
    when one exists, and passes any extra kwargs through unchanged.

    Raises:
        UnsupportedParamsError: if a non-default param is unsupported by the
            provider and neither `drop_params` nor `litellm.drop_params` is set.
    """
    # retrieve all parameters passed to the function
    passed_params = locals()
    custom_llm_provider = passed_params.pop("custom_llm_provider")
    drop_params = passed_params.pop("drop_params")
    special_params = passed_params.pop("kwargs")
    # `model` is passed to the provider call separately — it must not leak
    # into optional_params, or the downstream
    # `create(model=model, **optional_params)` call would receive it twice.
    passed_params.pop("model", None)
    for k, v in special_params.items():
        passed_params[k] = v

    default_params = {
        "response_format": None,
        "speed": None,
        "instructions": None,
    }

    non_default_params = {
        k: v
        for k, v in passed_params.items()
        if (k in default_params and v != default_params[k])
    }
    optional_params = {}

    ## raise exception if a non-default param isn't supported by this provider
    def _check_valid_arg(supported_params):
        if len(non_default_params.keys()) > 0:
            keys = list(non_default_params.keys())
            for k in keys:
                if (
                    drop_params is True or litellm.drop_params is True
                ) and k not in supported_params:  # drop the unsupported non-default values
                    non_default_params.pop(k, None)
                elif k not in supported_params:
                    raise UnsupportedParamsError(
                        status_code=500,
                        message=f"Setting `{k}` is not supported by {custom_llm_provider} for speech. To drop it from the call, set `litellm.drop_params = True`.",
                    )
        return non_default_params

    provider_config: Optional[BaseSpeechConfig] = None
    if custom_llm_provider is not None:
        provider_config = ProviderConfigManager.get_provider_speech_config(
            model=model,
            provider=LlmProviders(custom_llm_provider),
        )

    if provider_config is not None:  # providers with a dedicated speech config
        supported_params = provider_config.get_supported_openai_params(model=model)
        _check_valid_arg(supported_params=supported_params)
        optional_params = provider_config.map_openai_params(
            non_default_params=non_default_params,
            optional_params=optional_params,
            model=model,
            drop_params=drop_params if drop_params is not None else False,
        )
    else:
        # no provider-specific config: pass the non-default params through
        # unchanged instead of silently dropping response_format/speed/etc.
        optional_params.update(non_default_params)

    for k in passed_params.keys():  # pass additional kwargs without modification
        if k not in default_params.keys():
            optional_params[k] = passed_params[k]
    return optional_params




def get_optional_params_image_gen(
model: Optional[str] = None,
n: Optional[int] = None,
Expand Down
93 changes: 93 additions & 0 deletions tests/local_testing/test_audio_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,3 +315,96 @@ def test_audio_speech_cost_calc():
]
print(f"standard_logging_payload: {standard_logging_payload}")
assert standard_logging_payload["response_cost"] > 0

@pytest.mark.parametrize(
    "sync_mode",
    [True],
)
@pytest.mark.parametrize(
    "model, api_key, api_base",
    [
        ("openai/gpt-4o-mini-tts", os.getenv("OPENAI_API_KEY"), None),
    ],
)
@pytest.mark.asyncio
async def test_audio_speech_litellm(sync_mode, model, api_base, api_key):
    """Live smoke test: litellm.speech() with `instructions` returns audio bytes."""
    speech_file_path = Path(__file__).parent / "speech.mp3"
    litellm._turn_on_debug()

    call_kwargs = dict(
        model=model,
        voice="alloy",
        instructions="speak the text as though you are like a crazy person, almost goofy and laughing at the end",
        input="say hello to the world",
        api_base=api_base,
        api_key=api_key,
        organization=None,
        project=None,
        max_retries=1,
        timeout=600,
        client=None,
        optional_params={},
    )
    if sync_mode:
        response = litellm.speech(**call_kwargs)

    from litellm.types.llms.openai import HttpxBinaryResponseContent

    print("response", response)

    # response carries the raw audio payload; persist it for manual inspection
    assert isinstance(response, HttpxBinaryResponseContent)
    with open(speech_file_path, "wb") as f:
        f.write(response.content)


@pytest.mark.parametrize(
    "sync_mode",
    [True],
)
@pytest.mark.parametrize(
    "model, api_key, api_base",
    [
        ("openai/gpt-4o-mini-tts", os.getenv("OPENAI_API_KEY"), None),
    ],
)
@pytest.mark.asyncio
async def test_audio_speech_passes_instructions_to_openai(sync_mode, model, api_base, api_key):
    """
    Verify litellm.speech() forwards `instructions` verbatim to the OpenAI
    SDK's audio.speech.create() call. The client is mocked, so no network
    request is made and no real API key is needed.
    """
    litellm._turn_on_debug()
    if sync_mode:
        from openai import OpenAI

        litellm.set_verbose = True
        client = OpenAI(api_key="fake-api-key")

        test_instructions = "speak the text as though you are like a crazy person, almost goofy and laughing at the end"
        with patch.object(
            client.audio.speech, "create"
        ) as mock_client:
            try:
                litellm.speech(
                    model=model,
                    voice="alloy",
                    instructions=test_instructions,
                    input="say hello to the world",
                    api_base=api_base,
                    api_key=api_key,
                    organization=None,
                    project=None,
                    max_retries=1,
                    timeout=600,
                    client=client,
                    optional_params={},
                )
            except Exception as e:
                # expected: the mocked create() returns a MagicMock, so
                # downstream response handling may raise — only the
                # outgoing request kwargs matter for this test
                print(f"Error: {e}")

        mock_client.assert_called_once()
        request_body = mock_client.call_args.kwargs

        print("request_body: ", request_body)

        assert request_body["instructions"] == test_instructions
Loading