Skip to content

Commit fb16a09

Browse files
authored
Merge pull request #37 from Cohee1207/streaming
Add HTTP streaming for local models
2 parents 6d5eca8 + 894b715 commit fb16a09

File tree

2 files changed

+96
-3
lines changed

2 files changed

+96
-3
lines changed

xtts_api_server/server.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from TTS.api import TTS
2-
from fastapi import FastAPI, HTTPException
2+
from fastapi import FastAPI, HTTPException, Request, Query
33
from fastapi.middleware.cors import CORSMiddleware
44
from fastapi.responses import FileResponse,StreamingResponse
55

@@ -179,6 +179,35 @@ def set_speaker_folder(speaker_req: SpeakerFolderRequest):
179179
logger.error(e)
180180
raise HTTPException(status_code=400, detail=str(e))
181181

182+
@app.get('/tts_stream')
async def tts_stream(request: Request, text: str = Query(), speaker_wav: str = Query(), language: str = Query()):
    """Stream synthesized speech over HTTP as a chunked WAV response.

    Query parameters:
        text: the text to synthesize.
        speaker_wav: speaker name or path to a reference wav.
        language: language code (validated against supported_languages).

    Raises:
        HTTPException 400: when the model is not local, or the language
            code is not in supported_languages.
    """
    # Chunked generation is only wired up for the local inference path.
    if XTTS.model_source != "local":
        raise HTTPException(status_code=400,
                            detail="HTTP Streaming is only supported for local models.")
    # Fail fast on a bad language code before any inference work starts.
    if language.lower() not in supported_languages:
        raise HTTPException(status_code=400,
                            detail="Language code sent is either unsupported or misspelled.")

    async def audio_stream():
        # With stream=True, process_tts_to_file returns an async generator
        # of raw PCM chunks instead of synchronously writing a file.
        pcm_chunks = XTTS.process_tts_to_file(
            text=text,
            speaker_name_or_path=speaker_wav,
            language=language.lower(),
            stream=True,
        )
        # Send the WAV header first so clients can begin decoding right away.
        yield XTTS.get_wav_header()
        async for piece in pcm_chunks:
            # Abandon generation as soon as the client disconnects.
            if await request.is_disconnected():
                break
            yield piece

    return StreamingResponse(audio_stream(), media_type='audio/x-wav')
182211
@app.post("/tts_to_audio/")
183212
async def tts_to_audio(request: SynthesisRequest):
184213
if STREAM_MODE or STREAM_MODE_IMPROVE:

xtts_api_server/tts_funcs.py

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
import re
1919
import json
2020
import socket
21+
import io
22+
import wave
23+
import numpy as np
2124

2225
# List of supported language codes
2326
supported_languages = {
@@ -87,6 +90,16 @@ def check_model_version_old_format(self,model_version):
8790
return "v"+model_version
8891
return model_version
8992

93+
def get_wav_header(self, channels:int=1, sample_rate:int=24000, width:int=2) -> bytes:
    """Build a 44-byte RIFF/WAV header suitable for prefixing a live PCM stream.

    Args:
        channels: number of audio channels (default mono).
        sample_rate: frames per second (default 24 kHz, XTTS output rate).
        width: sample width in bytes (default 2 = 16-bit PCM).

    Returns:
        The header bytes; raw PCM frames are appended to it by the caller.

    The stdlib ``wave`` writer is handed zero frames, so it stamps both the
    RIFF chunk size and the ``data`` chunk size as 0 — strict decoders read
    that as "no audio" and stop before any streamed samples arrive. Patch
    both size fields to 0xFFFFFFFF, the conventional "unknown/maximum
    length" value for open-ended WAV streams.
    """
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as out:
        out.setnchannels(channels)
        out.setsampwidth(width)
        out.setframerate(sample_rate)
        out.writeframes(b"")
    header = bytearray(wav_buf.getvalue())
    unknown_size = (0xFFFFFFFF).to_bytes(4, "little")
    # RIFF chunk size field lives at byte offset 4..8.
    header[4:8] = unknown_size
    # The data chunk size immediately follows the b"data" tag.
    data_tag = header.rindex(b"data")
    header[data_tag + 4:data_tag + 8] = unknown_size
    return bytes(header)
102+
90103
# CACHE FUNCS
91104
def check_cache(self, text_params):
92105
if not self.enable_cache_results:
@@ -336,6 +349,48 @@ def clean_text(self,text):
336349
text = re.sub(r'"\s?(.*?)\s?"', r"'\1'", text)
337350
return text
338351

352+
async def stream_generation(self,text,speaker_name,speaker_wav,language,output_file):
    """Yield 16-bit little-endian PCM chunks from local streaming inference.

    Also accumulates every chunk and, once the stream ends, saves the full
    waveform to output_file at 24 kHz. Logs total processing time.

    Args:
        text: text to synthesize.
        speaker_name: speaker identifier used for latent lookup/creation.
        speaker_wav: reference wav used when latents must be created.
        language: language code passed through to the model.
        output_file: destination path for the assembled waveform.
    """
    start_ts = time.time()  # wall-clock start for the timing log below

    gpt_cond_latent, speaker_embedding = self.get_or_create_latents(speaker_name, speaker_wav)
    collected = []

    stream = self.model.inference_stream(
        text,
        language,
        speaker_embedding=speaker_embedding,
        gpt_cond_latent=gpt_cond_latent,
        temperature=0.75,
        length_penalty=1.0,
        repetition_penalty=5.0,
        top_k=50,
        top_p=0.85,
        enable_text_splitting=True,
        stream_chunk_size=100,
    )

    # NOTE(review): this loop contains no await — inference blocks the event
    # loop while each chunk is produced; confirm that is acceptable here.
    for piece in stream:
        if isinstance(piece, list):
            piece = torch.cat(piece, dim=0)
        collected.append(piece)
        samples = piece.cpu().numpy()
        samples = samples[None, : int(samples.shape[0])]
        # Convert float audio in [-1, 1] to int16 PCM bytes for the stream.
        samples = np.clip(samples, -1, 1)
        samples = (samples * 32767).astype(np.int16)
        yield samples.tobytes()

    if collected:
        full_wav = torch.cat(collected, dim=0)
        torchaudio.save(output_file, full_wav.cpu().squeeze().unsqueeze(0), 24000)
    else:
        logger.warning("No audio generated.")

    elapsed = time.time() - start_ts

    logger.info(f"Processing time: {elapsed:.2f} seconds.")
393+
339394
def local_generation(self,text,speaker_name,speaker_wav,language,output_file):
340395
# Log time
341396
generate_start_time = time.time() # Record the start time of loading the model
@@ -398,7 +453,7 @@ def get_speaker_wav(self, speaker_name_or_path):
398453

399454

400455
# MAIN FUNC
401-
def process_tts_to_file(self, text, speaker_name_or_path, language, file_name_or_path="out.wav"):
456+
def process_tts_to_file(self, text, speaker_name_or_path, language, file_name_or_path="out.wav", stream=False):
402457
try:
403458
speaker_wav = self.get_speaker_wav(speaker_name_or_path)
404459
# Determine output path based on whether a full path or a file name was provided
@@ -441,7 +496,16 @@ def process_tts_to_file(self, text, speaker_name_or_path, language, file_name_or
441496

442497
# Define generation if model via api or locally
443498
if self.model_source == "local":
444-
self.local_generation(clear_text,speaker_name_or_path,speaker_wav,language,output_file)
499+
if stream:
500+
async def stream_fn():
501+
async for chunk in self.stream_generation(clear_text,speaker_name_or_path,speaker_wav,language,output_file):
502+
yield chunk
503+
self.switch_model_device()
504+
# After generation completes successfully...
505+
self.update_cache(text_params,output_file)
506+
return stream_fn()
507+
else:
508+
self.local_generation(clear_text,speaker_name_or_path,speaker_wav,language,output_file)
445509
else:
446510
self.api_generation(clear_text,speaker_wav,language,output_file)
447511

0 commit comments

Comments
 (0)