feat: ✨ Added Defaults and custom language and speaker to endpoint

timhagel · timhagel · commit 52ddc5c2a3b8 · 2024-03-16T20:43:01.000-07:00
diff --git a/README.md b/README.md
@@ -1,23 +1,40 @@
 # MeloTTS API Server
-A quick easy way to access [MeloTTS](https://github.com/myshell-ai/MeloTTS) through REST API calls.
 
-Currently only locked to english with american accent. Easy fix if requested, or you can just change the hardcode speaker_ids before build if needed.
+A quick easy way to access [MeloTTS](https://github.com/myshell-ai/MeloTTS) through REST API calls.
 
-## Usage 
 Assuming you have Docker installed and set up
+
 ### Build
-    git clone git@github.com:timhagel/MeloTTS-API-Server.git
+
+    git clone git@github.com:timhagel/melotts-api-server.git
     cd melotts-api-server
     docker build -t melotts-api-server .
- ### Run
-    docker run -p 8888:8080 melotts-api-server
+
+### Run (English)
+
+    docker run -p 8888:8080 -e DEFAULT_SPEED=1 -e DEFAULT_LANGUAGE=EN -e DEFAULT_SPEAKER_ID=EN-US  melotts-api-server
+
 ### Call API
-**localhost:8888/text_to_speech**
+
+**localhost:8888/convert/tts**
+
+##### Use Environment Defaults
 
     {
         "text": "Put input here"
     }
-Response : en-us.wav
 
-### Acknowledgement
+Response : .wav
+
+##### Customize (Everything except for "text" is optional)
+
+    {
+        "text": "input",
+        "speed": "speed",
+        "language": "language",
+        "speaker_id": "speaker_id"
+    }
+
+## Acknowledgement
+
 This is just an API server for the awesome work of [MeloTTS](https://github.com/myshell-ai/MeloTTS) from [MyShell](https://github.com/myshell-ai)
diff --git a/app.py b/app.py
@@ -1,27 +1,46 @@
-from fastapi import FastAPI, Body
+import os
+import uvicorn
+from fastapi import FastAPI, Body, Depends
 from pydantic import BaseModel
 from fastapi.responses import FileResponse
 from melo.api import TTS
+from dotenv import load_dotenv
+import tempfile
 
-speed = 1.0
+load_dotenv()
+DEFAULT_SPEED = float(os.getenv('DEFAULT_SPEED'))
+DEFAULT_LANGUAGE = os.getenv('DEFAULT_LANGUAGE')
+DEFAULT_SPEAKER_ID = os.getenv('DEFAULT_SPEAKER_ID')
 device = 'auto' # Will automatically use GPU if available
 
 class TextModel(BaseModel):
     text: str
+    speed: float = DEFAULT_SPEED
+    language: str = DEFAULT_LANGUAGE
+    speaker_id: str = DEFAULT_SPEAKER_ID
 
 app = FastAPI()
 
-@app.post("/text_to_speech")
-async def create_upload_file(body: TextModel = Body(...)):
-    model = TTS(language='EN', device=device)
+def get_tts_model(body: TextModel):
+    return TTS(language=body.language, device=device)
+
+@app.post("/convert/tts")
+async def create_upload_file(body: TextModel = Body(...), model: TTS = Depends(get_tts_model)):
     speaker_ids = model.hps.data.spk2id
 
-    output_path = 'en-us.wav'
-    model.tts_to_file(body.text, speaker_ids['EN-US'], output_path, speed=speed)
+    # Create a temporary file
+    output_path = body.language + "_" + body.speaker_id + ".wav"
+    model.tts_to_file(body.text, speaker_ids[body.speaker_id], output_path, speed=body.speed)
+
+    # Create a temporary file
+    output_path = body.language + "_" + body.speaker_id + ".wav"
+    model.tts_to_file(body.text, speaker_ids[body.speaker_id], output_path, speed=body.speed)
 
+    print(os.path.basename(output_path))
     # Return the audio file
-    return FileResponse("en-us.wav", media_type="audio/mpeg", filename="en-us.wav")
+    response = FileResponse(output_path, media_type="audio/mpeg", filename=os.path.basename(output_path))
+
+    return response
 
 if __name__ == "__main__":
-    import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8080)
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
-fastapi[all] == 0.110.0
+fastapi[all] == 0.110.0
+python-dotenv == 1.0.0

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-fastapi[all] == 0.110.0`
	`1`	`+fastapi[all] == 0.110.0`
	`2`	`+python-dotenv == 1.0.0`