Implement a basic TTS server

2025-03-06 09:09:30 +01:00
parent f14902c6e7
commit d54de1d291
8 changed files with 98 additions and 5 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,8 @@ dependencies = [
    "aioconsole==0.8.1",
    "piper-phonemize==1.1.0",
    "piper-tts==1.2.0",
+    "fastapi==0.115.11",
+    "uvicorn==0.34.0",
 ]

 [project.optional-dependencies]
--- a/src/multilang_translator/main_backend.py
+++ b/src/multilang_translator/main_backend.py
--- a/src/multilang_translator/main_cloud.py
+++ b/src/multilang_translator/main_cloud.py
--- a/src/multilang_translator/main_local.py
+++ b/src/multilang_translator/main_local.py
@@ -13,11 +13,12 @@ import aioconsole

 from auracast import multicast_control
 from auracast import auracast_config
-import multilang_translator.translator_config as translator_config
-from translator import llm_translator
-from translator.test_content import TESTSENTENCE
 from voice_provider import text_to_speech

+from multilang_translator import translator_config
+from multilang_translator.translator import llm_translator
+from multilang_translator.translator.test_content import TESTSENTENCE
+
 # TODO: look for a end to end translation solution

 def transcribe():
--- a/src/voice_client/client.py
+++ b/src/voice_client/client.py
@@ -0,0 +1,44 @@
+import requests
+import numpy as np
+import soundfile as sf
+
+from voice_models.request_models import SynthesizeRequest
+
+
+
+API_URL = "http://127.0.0.1:8099/synthesize/"  # default endpoint so the function also works when imported
+
+
+def request_synthesis(request_data: SynthesizeRequest):
+    """POST ``request_data`` to ``API_URL`` and return the decoded audio.
+
+    Returns the LC3 payload as ``bytes`` when ``request_data.return_lc3`` is
+    set, otherwise a float32 numpy array built from the hex "audio_pcm" field.
+    On any status other than 200 the error is printed and ``None`` is returned.
+    """
+    response = requests.post(API_URL, json=request_data.model_dump())
+    if response.status_code != 200:
+        print(f"Error: {response.status_code}, {response.text}")
+        return None
+    response_data = response.json()
+    if request_data.return_lc3:
+        # Audio travels hex-encoded inside the JSON body; decode back to raw bytes.
+        return bytes.fromhex(response_data["audio_lc3"])
+    return np.frombuffer(bytes.fromhex(response_data["audio_pcm"]), dtype=np.float32)
+
+if __name__ == "__main__":
+    API_URL = "http://127.0.0.1:8099/synthesize/"
+    target_rate = 16000
+
+    # Example request
+    request_data = SynthesizeRequest(
+        text="Hello, this is a test.",
+        target_sample_rate=target_rate,
+        framework="piper",
+        model="de_DE-kerstin-low",
+        return_lc3=False  # Set to True to receive LC3 compressed output
+    )
+    audio = request_synthesis(request_data)
+    # request_synthesis returns None on an HTTP error; only write the WAV on success.
+    if audio is not None:
+        sf.write('hello.wav', audio, target_rate)
--- a/src/voice_models/request_models.py
+++ b/src/voice_models/request_models.py
@@ -0,0 +1,9 @@
+from pydantic import BaseModel
+
+class SynthesizeRequest(BaseModel):  # JSON body accepted by the POST /synthesize/ endpoint
+    text: str  # text to synthesize (required, no default)
+    target_sample_rate: int = 16000  # forwarded to synthesize() as target_sample_rate
+    framework: str = "piper"  # forwarded to synthesize() as framework
+    model: str = "en_US-lessac-medium"  # forwarded to synthesize() as model
+    return_lc3: bool = False  # True: response carries "audio_lc3"; False: "audio_pcm"
+
--- a/src/voice_provider/text_to_speech.py
+++ b/src/voice_provider/text_to_speech.py
@@ -47,7 +47,13 @@ def synth_piper(text, model="en_US-lessac-medium"):


 # TODO: framework should probably be a dataclass that holds all the relevant informations, also model
-def synthesize(text, target_sample_rate, framework, model="en_US-lessac-medium", return_lc3=True):
+def synthesize(
+        text, 
+        target_sample_rate, 
+        framework, 
+        model="en_US-lessac-medium", 
+        return_lc3=True
+        ):

    if framework == 'piper':
        model_json, audio_raw = synth_piper(text, model)
@@ -85,5 +91,4 @@ if __name__ == '__main__':

    sf.write('hello.wav', audio, target_rate)

-    # TODO: "WARNING:piper.download:Wrong size (expected=5952, actual=4158
    print('Done.')
--- a/src/voice_provider/tts_server.py
+++ b/src/voice_provider/tts_server.py
@@ -0,0 +1,32 @@
+from fastapi import FastAPI, HTTPException
+import numpy as np
+
+from voice_models.request_models import SynthesizeRequest
+from voice_provider.text_to_speech import synthesize
+
+app = FastAPI()
+
+
+@app.post("/synthesize/")
+async def synthesize_speech(request: SynthesizeRequest):
+    """Synthesize ``request.text`` and return the audio hex-encoded in JSON.
+
+    Responds with {"audio_lc3": hex} when ``request.return_lc3`` is set, otherwise
+    {"audio_pcm": hex} holding float32 samples; any failure becomes an HTTP 500.
+    """
+    from fastapi.concurrency import run_in_threadpool  # local import: only this handler needs it
+    try:
+        # synthesize() is blocking; run it in a worker thread so the event loop stays responsive.
+        audio = await run_in_threadpool(
+            synthesize, text=request.text, target_sample_rate=request.target_sample_rate,
+            framework=request.framework, model=request.model, return_lc3=request.return_lc3,
+        )
+        if request.return_lc3:
+            return {"audio_lc3": audio.hex()}
+        return {"audio_pcm": audio.astype(np.float32).tobytes().hex()}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+if __name__ == "__main__":
+    import uvicorn  # only needed when this module is run as a script
+    uvicorn.run(app, host="127.0.0.1", port=8099)  # same host/port the voice_client demo posts to