Merge pull request #30 from janko-development/main

sfortis · web-flow · commit 6478068c6df6 · 2025-03-24T09:13:29.000+02:00
Add support for gpt-4o-mini-tts model
diff --git a/README.md b/README.md
@@ -9,9 +9,11 @@ The OpenAI TTS component for Home Assistant makes it possible to use the OpenAI
 - **Customizable speech model** – [Check supported voices and models](https://platform.openai.com/docs/guides/text-to-speech).  
 - **Integration with Home Assistant** – Works seamlessly with assistants, automations, and scripts.  
 - **Custom endpoint option** – Allows you to use your own OpenAI compatible API endpoint.
-- ⭐(New!) **Chime option** – Useful for announcements on speakers. *(See Devices → OpenAI TTS → CONFIGURE button)*  
-- ⭐(New!) **User-configurable chime sounds** – Drop your own chime sound into  `config/custom_components/openai_tts/chime` folder (MP3).  
-- ⭐(New!) **Audio normalization option** – Uses more CPU but improves audio clarity on mobile phones and small speakers. *(See Devices → OpenAI TTS → CONFIGURE button)*  
+- **Chime option** – Useful for announcements on speakers. *(See Devices → OpenAI TTS → CONFIGURE button)*
+- **User-configurable chime sounds** – Drop your own chime sound into  `config/custom_components/openai_tts/chime` folder (MP3).
+- **Audio normalization option** – Uses more CPU but improves audio clarity on mobile phones and small speakers. *(See Devices → OpenAI TTS → CONFIGURE button)*
+- ⭐(New!) **Support for the new gpt-4o-mini-tts model** – A fast and powerful language model.
+- ⭐(New!) **Text-to-Speech Instructions option** – Instruct the text-to-speech model to speak in a specific way (only works with the newest gpt-4o-mini-tts model). [OpenAI's next-generation audio models](https://openai.com/index/introducing-our-next-generation-audio-models/)
 
 
 
@@ -34,6 +36,9 @@ data:
   cache: true
   media_player_entity_id: media_player.bedroom_speaker
   message: My speech has improved now!
+  options:
+    chime: true                          # Enable or disable the chime
+    instructions: "Speak like a pirate"  # Instructions for the text-to-speech model on how to speak
 ```
 
 ## HACS installation ( *preferred!* ) 
diff --git a/custom_components/openai_tts/config_flow.py b/custom_components/openai_tts/config_flow.py
@@ -12,6 +12,11 @@
 from homeassistant import data_entry_flow
 from homeassistant.config_entries import ConfigFlow, OptionsFlow
 from homeassistant.helpers.selector import selector
+from homeassistant.helpers.selector import (
+    TextSelector,
+    TextSelectorConfig,
+    TextSelectorType,
+)
 from homeassistant.exceptions import HomeAssistantError
 
 from .const import (
@@ -27,6 +32,7 @@
     CONF_CHIME_ENABLE,    # Use constant for chime enable toggle
     CONF_CHIME_SOUND,
     CONF_NORMALIZE_AUDIO,
+    CONF_INSTRUCTIONS,
 )
 
 _LOGGER = logging.getLogger(__name__)
@@ -174,6 +180,14 @@ async def async_step_init(self, user_input: dict | None = None):
                 }
             }),
 
+
+            vol.Optional(
+                 CONF_INSTRUCTIONS,
+                 default=self.config_entry.options.get(CONF_INSTRUCTIONS, self.config_entry.data.get(CONF_INSTRUCTIONS, None))
+            ): TextSelector(
+                TextSelectorConfig(type=TextSelectorType.TEXT,multiline=True)
+            ),
+
             # Normalization toggle using its constant; label will be picked from strings.json.
             vol.Optional(
                 CONF_NORMALIZE_AUDIO,
diff --git a/custom_components/openai_tts/const.py b/custom_components/openai_tts/const.py
@@ -10,9 +10,10 @@
 CONF_URL = "url"
 UNIQUE_ID = "unique_id"
 
-MODELS = ["tts-1", "tts-1-hd"]
-VOICES = ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]
+MODELS = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
+VOICES = ["alloy", "ash", "ballad", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]
 
 CONF_CHIME_ENABLE = "chime"
 CONF_CHIME_SOUND = "chime_sound"
 CONF_NORMALIZE_AUDIO = "normalize_audio"
+CONF_INSTRUCTIONS = "instructions"
diff --git a/custom_components/openai_tts/openaitts_engine.py b/custom_components/openai_tts/openaitts_engine.py
@@ -25,7 +25,7 @@ def __init__(self, api_key: str, voice: str, model: str, speed: float, url: str)
         self._speed = speed
         self._url = url
 
-    def get_tts(self, text: str, speed: float = None, voice: str = None) -> AudioResponse:
+    def get_tts(self, text: str, speed: float = None, instructions: str = None, voice: str = None) -> AudioResponse:
         """Synchronous TTS request using urllib.request.
         If the API call fails, waits for 1 second and retries once.
         """
@@ -45,6 +45,8 @@ def get_tts(self, text: str, speed: float = None, voice: str = None) -> AudioRes
             "response_format": "mp3",
             "speed": speed
         }
+        if instructions is not None and self._model == "gpt-4o-mini-tts":
+            data["instructions"] = instructions
 
         max_retries = 1
         attempt = 0
diff --git a/custom_components/openai_tts/strings.json b/custom_components/openai_tts/strings.json
@@ -26,6 +26,7 @@
           "chime_sound": "Chime sound",
           "speed": "Speed (0.25 to 4.0)",
           "voice": "Voice",
+          "instructions": "Instructions",
           "normalize_audio": "Enable loudness for generated audio (uses more CPU)"
         }
       }
diff --git a/custom_components/openai_tts/translations/en.json b/custom_components/openai_tts/translations/en.json
@@ -26,6 +26,7 @@
           "chime_sound": "Chime sound",
           "speed": "Speed (0.25 to 4.0)",
           "voice": "Voice",
+          "instructions": "Instructions",
           "normalize_audio": "Enable loudness for generated audio (uses more CPU)"
         }
       }
diff --git a/custom_components/openai_tts/tts.py b/custom_components/openai_tts/tts.py
@@ -20,6 +20,7 @@
     CONF_MODEL,
     CONF_SPEED,
     CONF_VOICE,
+    CONF_INSTRUCTIONS,
     CONF_URL,
     DOMAIN,
     UNIQUE_ID,
@@ -65,6 +66,10 @@ def __init__(self, hass: HomeAssistant, config: ConfigEntry, engine: OpenAITTSEn
     def default_language(self) -> str:
         return "en"
 
+    @property
+    def supported_options(self) -> list:
+        return ["instructions", "chime"]
+        
     @property
     def supported_languages(self) -> list:
         return self._engine.get_supported_langs()
@@ -97,18 +102,20 @@ def get_tts_audio(
             # Retrieve settings.
             current_speed = self._config.options.get(CONF_SPEED, self._config.data.get(CONF_SPEED, 1.0))
             effective_voice = self._config.options.get(CONF_VOICE, self._config.data.get(CONF_VOICE))
+            instructions = options.get(CONF_INSTRUCTIONS, self._config.options.get(CONF_INSTRUCTIONS, self._config.data.get(CONF_INSTRUCTIONS)))
             _LOGGER.debug("Effective speed: %s", current_speed)
             _LOGGER.debug("Effective voice: %s", effective_voice)
+            _LOGGER.debug("Instructions: %s", instructions)
 
             _LOGGER.debug("Creating TTS API request")
             api_start = time.monotonic()
-            speech = self._engine.get_tts(message, speed=current_speed, voice=effective_voice)
+            speech = self._engine.get_tts(message, speed=current_speed, voice=effective_voice, instructions=instructions)
             api_duration = (time.monotonic() - api_start) * 1000
             _LOGGER.debug("TTS API call completed in %.2f ms", api_duration)
             audio_content = speech.content
 
             # Retrieve options.
-            chime_enabled = self._config.options.get(CONF_CHIME_ENABLE, self._config.data.get(CONF_CHIME_ENABLE, False))
+            chime_enabled = options.get(CONF_CHIME_ENABLE,self._config.options.get(CONF_CHIME_ENABLE, self._config.data.get(CONF_CHIME_ENABLE, False)))
             normalize_audio = self._config.options.get(CONF_NORMALIZE_AUDIO, self._config.data.get(CONF_NORMALIZE_AUDIO, False))
             _LOGGER.debug("Chime enabled: %s", chime_enabled)
             _LOGGER.debug("Normalization option: %s", normalize_audio)

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@`
`26`	`26`	`"chime_sound": "Chime sound",`
`27`	`27`	`"speed": "Speed (0.25 to 4.0)",`
`28`	`28`	`"voice": "Voice",`
	`29`	`+ "instructions": "Instructions",`
`29`	`30`	`"normalize_audio": "Enable loudness for generated audio (uses more CPU)"`
`30`	`31`	`}`
`31`	`32`	`}`