stability fixes

sfortis · sfortis · commit 2e8ebec0afca · 2025-03-15T09:31:25.000+02:00
diff --git a/custom_components/openai_tts/manifest.json b/custom_components/openai_tts/manifest.json
@@ -10,5 +10,5 @@
   "iot_class": "cloud_polling",
   "issue_tracker": "https://github.com/sfortis/openai_tts/issues",
   "requirements": [],
-  "version": "0.3.0b0"
+  "version": "0.3.1b0"
 }
diff --git a/custom_components/openai_tts/openaitts_engine.py b/custom_components/openai_tts/openaitts_engine.py
@@ -1,10 +1,14 @@
 """
 TTS Engine for OpenAI TTS.
 """
-import asyncio
-import threading
+import json
 import logging
-import aiohttp
+import time
+from urllib.request import Request, urlopen
+from urllib.error import HTTPError, URLError
+from asyncio import CancelledError
+
+from homeassistant.exceptions import HomeAssistantError
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -21,64 +25,66 @@ def __init__(self, api_key: str, voice: str, model: str, speed: float, url: str)
         self._speed = speed
         self._url = url
 
-        # Create a dedicated event loop running in a background thread.
-        self._loop = asyncio.new_event_loop()
-        self._session = None
-        self._thread = threading.Thread(target=self._start_loop, daemon=True)
-        self._thread.start()
-        # Initialize the aiohttp session in the background event loop.
-        asyncio.run_coroutine_threadsafe(self._init_session(), self._loop).result()
-
-    def _start_loop(self):
-        asyncio.set_event_loop(self._loop)
-        self._loop.run_forever()
+    def get_tts(self, text: str, speed: float = None, voice: str = None) -> AudioResponse:
+        """Synchronous TTS request using urllib.request
+        If the API call fails, waits for 1 second and retries once. 'speed' and 'voice', when given, override the stored values.
+        """
+        if speed is None:
+            speed = self._speed
+        if voice is None:
+            voice = self._voice
 
-    async def _init_session(self):
-        # Create a persistent aiohttp session for reuse.
-        self._session = aiohttp.ClientSession()
+        headers = {"Content-Type": "application/json"}
+        if self._api_key:
+            headers["Authorization"] = f"Bearer {self._api_key}"
 
-    async def _async_get_tts(self, text: str, speed: float, voice: str) -> AudioResponse:
-        headers = {"Authorization": f"Bearer {self._api_key}"} if self._api_key else {}
         data = {
             "model": self._model,
             "input": text,
             "voice": voice,
             "response_format": "wav",
-            "speed": speed,
-            "stream": True
+            "speed": speed
         }
-        # Use separate timeouts for connecting and reading.
-        timeout = aiohttp.ClientTimeout(total=None, sock_connect=5, sock_read=25)
-        async with self._session.post(self._url, headers=headers, json=data, timeout=timeout) as resp:
-            resp.raise_for_status()
-            audio_chunks = []
-            # Optimize the chunk size to 4096 bytes.
-            async for chunk in resp.content.iter_chunked(4096):
-                if chunk:
-                    audio_chunks.append(chunk)
-            audio_data = b"".join(audio_chunks)
-            return AudioResponse(audio_data)
 
-    def get_tts(self, text: str, speed: float = None, voice: str = None) -> AudioResponse:
-        """Synchronous wrapper that runs the asynchronous TTS request on a dedicated event loop.
-           If 'speed' or 'voice' are provided, they override the stored values.
-        """
-        try:
-            if speed is None:
-                speed = self._speed
-            if voice is None:
-                voice = self._voice
-            future = asyncio.run_coroutine_threadsafe(self._async_get_tts(text, speed, voice), self._loop)
-            return future.result()
-        except Exception as e:
-            _LOGGER.error("Error in asynchronous get_tts: %s", e)
-            raise e
+        max_retries = 1
+        attempt = 0
+        while True:
+            try:
+                req = Request(
+                    self._url,
+                    data=json.dumps(data).encode("utf-8"),
+                    headers=headers,
+                    method="POST"
+                )
+                # 30-second timeout per blocking socket operation (connect/read), not a cap on the total request time.
+                with urlopen(req, timeout=30) as response:
+                    content = response.read()
+                return AudioResponse(content)
+            except CancelledError as ce:
+                _LOGGER.exception("TTS request cancelled")
+                raise  # Propagate cancellation.
+            except (HTTPError, URLError) as net_err:
+                _LOGGER.exception("Network error in synchronous get_tts on attempt %d", attempt + 1)
+                if attempt < max_retries:
+                    attempt += 1
+                    time.sleep(1)  # Wait for 1 second before retrying.
+                    _LOGGER.debug("Retrying HTTP call (attempt %d)", attempt + 1)
+                    continue
+                else:
+                    raise HomeAssistantError("Network error occurred while fetching TTS audio") from net_err
+            except Exception as exc:
+                _LOGGER.exception("Unknown error in synchronous get_tts on attempt %d", attempt + 1)
+                if attempt < max_retries:
+                    attempt += 1
+                    time.sleep(1)
+                    _LOGGER.debug("Retrying HTTP call (attempt %d)", attempt + 1)
+                    continue
+                else:
+                    raise HomeAssistantError("An unknown error occurred while fetching TTS audio") from exc
 
     def close(self):
-        """Clean up the aiohttp session and event loop on shutdown."""
-        if self._session:
-            asyncio.run_coroutine_threadsafe(self._session.close(), self._loop).result()
-        self._loop.call_soon_threadsafe(self._loop.stop())
+        """Nothing to close in the synchronous version."""
+        pass
 
     @staticmethod
     def get_supported_langs() -> list:
diff --git a/custom_components/openai_tts/tts.py b/custom_components/openai_tts/tts.py
@@ -4,10 +4,10 @@
 from __future__ import annotations
 import io
 import math
-import re
 import struct
 import wave
 import logging
+from asyncio import CancelledError
 
 from homeassistant.components.tts import TextToSpeechEntity
 from homeassistant.config_entries import ConfigEntry
@@ -20,16 +20,22 @@
 
 _LOGGER = logging.getLogger(__name__)
 
-# --- Helper Functions - Chime & silence synthesis --
+# --- Helper Functions - Chime & Silence Synthesis ---
 
 def synthesize_chime(sample_rate: int = 44100, channels: int = 1, sampwidth: int = 2, duration: float = 1.0) -> bytes:
-    _LOGGER.debug("Synthesizing chime: sample_rate=%d, channels=%d, sampwidth=%d, duration=%.2f", sample_rate, channels, sampwidth, duration)
-    frequency1 = 440.0   # Note A
+    _LOGGER.debug(
+        "Synthesizing chime: sample_rate=%d, channels=%d, sampwidth=%d, duration=%.2f",
+        sample_rate,
+        channels,
+        sampwidth,
+        duration,
+    )
+    frequency1 = 440.0  # Note A
     frequency2 = 587.33  # Note D
     amplitude = 0.8
     num_samples = int(sample_rate * duration)
     output = io.BytesIO()
-    with wave.open(output, 'wb') as wf:
+    with wave.open(output, "wb") as wf:
         wf.setnchannels(channels)
         wf.setsampwidth(sampwidth)
         wf.setframerate(sample_rate)
@@ -40,33 +46,43 @@ def synthesize_chime(sample_rate: int = 44100, channels: int = 1, sampwidth: int
             sample2 = math.sin(2 * math.pi * frequency2 * t)
             sample = amplitude * fade * ((sample1 + sample2) / 2)
             int_sample = int(sample * 32767)
-            wf.writeframes(struct.pack('<h', int_sample))
+            wf.writeframes(struct.pack("<h", int_sample))
     chime_data = output.getvalue()
     _LOGGER.debug("Chime synthesized, length: %d bytes", len(chime_data))
     return chime_data
 
 def synthesize_silence(sample_rate: int, channels: int, sampwidth: int, duration: float = 0.3) -> bytes:
-    _LOGGER.debug("Synthesizing silence: sample_rate=%d, channels=%d, sampwidth=%d, duration=%.2f", sample_rate, channels, sampwidth, duration)
+    _LOGGER.debug(
+        "Synthesizing silence: sample_rate=%d, channels=%d, sampwidth=%d, duration=%.2f",
+        sample_rate,
+        channels,
+        sampwidth,
+        duration,
+    )
     num_samples = int(sample_rate * duration)
     output = io.BytesIO()
-    with wave.open(output, 'wb') as wf:
+    with wave.open(output, "wb") as wf:
         wf.setnchannels(channels)
         wf.setsampwidth(sampwidth)
         wf.setframerate(sample_rate)
         for _ in range(num_samples):
-            wf.writeframes(struct.pack('<h', 0))
+            wf.writeframes(struct.pack("<h", 0))
     silence_data = output.getvalue()
     _LOGGER.debug("Silence synthesized, length: %d bytes", len(silence_data))
     return silence_data
 
 def combine_wav_files(chime_bytes: bytes, pause_bytes: bytes, tts_bytes: bytes) -> bytes:
-    _LOGGER.debug("Combining WAV files: chime (%d bytes), pause (%d bytes), TTS (%d bytes)",
-                  len(chime_bytes), len(pause_bytes), len(tts_bytes))
+    _LOGGER.debug(
+        "Combining WAV files: chime (%d bytes), pause (%d bytes), TTS (%d bytes)",
+        len(chime_bytes),
+        len(pause_bytes),
+        len(tts_bytes),
+    )
     chime_io = io.BytesIO(chime_bytes)
     pause_io = io.BytesIO(pause_bytes)
     tts_io = io.BytesIO(tts_bytes)
-    
-    with wave.open(chime_io, 'rb') as w1, wave.open(pause_io, 'rb') as w2, wave.open(tts_io, 'rb') as w3:
+
+    with wave.open(chime_io, "rb") as w1, wave.open(pause_io, "rb") as w2, wave.open(tts_io, "rb") as w3:
         params1 = w1.getparams()
         params2 = w2.getparams()
         params3 = w3.getparams()
@@ -75,9 +91,9 @@ def combine_wav_files(chime_bytes: bytes, pause_bytes: bytes, tts_bytes: bytes)
         frames_chime = w1.readframes(w1.getnframes())
         frames_pause = w2.readframes(w2.getnframes())
         frames_tts = w3.readframes(w3.getnframes())
-    
+
     output = io.BytesIO()
-    with wave.open(output, 'wb') as wout:
+    with wave.open(output, "wb") as wout:
         wout.setparams(params1)
         wout.writeframes(frames_chime)
         wout.writeframes(frames_pause)
@@ -110,7 +126,7 @@ async def async_setup_entry(
         config_entry.data[CONF_VOICE],
         config_entry.data[CONF_MODEL],
         config_entry.data.get(CONF_SPEED, 1.0),
-        config_entry.data[CONF_URL]
+        config_entry.data[CONF_URL],
     )
     async_add_entities([OpenAITTSEntity(hass, config_entry, engine)])
 
@@ -142,14 +158,16 @@ def device_info(self) -> dict:
         return {
             "identifiers": {(DOMAIN, self._attr_unique_id)},
             "model": self._config.data.get(CONF_MODEL),
-            "manufacturer": "OpenAI"
+            "manufacturer": "OpenAI",
         }
 
     @property
     def name(self) -> str:
         return _map_model(self._config.data.get(CONF_MODEL, "")).upper()
 
-    def get_tts_audio(self, message: str, language: str, options: dict | None = None) -> tuple[str, bytes] | tuple[None, None]:
+    def get_tts_audio(
+        self, message: str, language: str, options: dict | None = None
+    ) -> tuple[str, bytes] | tuple[None, None]:
         try:
             if len(message) > 4096:
                 raise MaxLengthExceeded("Message exceeds maximum allowed length")
@@ -167,27 +185,35 @@ def get_tts_audio(self, message: str, language: str, options: dict | None = None
             if chime_enabled:
                 _LOGGER.debug("Chime option enabled; synthesizing chime and pause.")
                 tts_io = io.BytesIO(audio_content)
-                with wave.open(tts_io, 'rb') as tts_wave:
+                with wave.open(tts_io, "rb") as tts_wave:
                     sample_rate = tts_wave.getframerate()
                     channels = tts_wave.getnchannels()
                     sampwidth = tts_wave.getsampwidth()
                     tts_frames = tts_wave.getnframes()
-                _LOGGER.debug("TTS parameters: sample_rate=%d, channels=%d, sampwidth=%d, frames=%d",
-                              sample_rate, channels, sampwidth, tts_frames)
+                _LOGGER.debug(
+                    "TTS parameters: sample_rate=%d, channels=%d, sampwidth=%d, frames=%d",
+                    sample_rate,
+                    channels,
+                    sampwidth,
+                    tts_frames,
+                )
                 chime_audio = synthesize_chime(sample_rate=sample_rate, channels=channels, sampwidth=sampwidth, duration=1.0)
                 pause_audio = synthesize_silence(sample_rate=sample_rate, channels=channels, sampwidth=sampwidth, duration=0.3)
                 try:
                     combined_audio = combine_wav_files(chime_audio, pause_audio, audio_content)
                     _LOGGER.debug("Combined audio generated (chime -> pause -> TTS).")
                     return "wav", combined_audio
                 except Exception as ce:
-                    _LOGGER.error("Error combining audio: %s", ce)
+                    _LOGGER.exception("Error combining audio")
                     return "wav", audio_content
             else:
                 _LOGGER.debug("Chime option disabled; returning TTS audio only.")
                 return "wav", audio_content
+        except CancelledError as ce:
+            _LOGGER.exception("TTS task cancelled")
+            return None, None
         except MaxLengthExceeded as mle:
-            _LOGGER.error("Maximum message length exceeded: %s", mle)
+            _LOGGER.exception("Maximum message length exceeded")
         except Exception as e:
-            _LOGGER.error("Unknown error in get_tts_audio: %s", e)
+            _LOGGER.exception("Unknown error in get_tts_audio")
         return None, None

Original file line number	Diff line number	Diff line change
`@@ -10,5 +10,5 @@`
`10`	`10`	`"iot_class": "cloud_polling",`
`11`	`11`	`"issue_tracker": "https://github.com/sfortis/openai_tts/issues",`
`12`	`12`	`"requirements": [],`
`13`		`- "version": "0.3.0b0"`
	`13`	`+ "version": "0.3.1b0"`
`14`	`14`	`}`