Implement a basic TTS server

This commit is contained in:
2025-03-06 09:09:30 +01:00
parent f14902c6e7
commit d54de1d291
8 changed files with 98 additions and 5 deletions

View File

@@ -10,6 +10,8 @@ dependencies = [
"aioconsole==0.8.1",
"piper-phonemize==1.1.0",
"piper-tts==1.2.0",
"fastapi==0.115.11",
"uvicorn==0.34.0",
]
[project.optional-dependencies]

View File

View File

View File

@@ -13,11 +13,12 @@ import aioconsole
from auracast import multicast_control
from auracast import auracast_config
import multilang_translator.translator_config as translator_config
from translator import llm_translator
from translator.test_content import TESTSENTENCE
from voice_provider import text_to_speech
from multilang_translator import translator_config
from multilang_translator.translator import llm_translator
from multilang_translator.translator.test_content import TESTSENTENCE
# TODO: look for an end-to-end translation solution
def transcribe():

View File

@@ -0,0 +1,44 @@
import requests
import numpy as np
import soundfile as sf
from voice_models.request_models import SynthesizeRequest
def request_synthesis(request_data: SynthesizeRequest, api_url=None, timeout=120.0):
    """Send a synthesis request to the TTS server and decode the returned audio.

    Args:
        request_data: Request parameters; serialized with ``model_dump()`` and
            sent as the JSON body of the POST request.
        api_url: Endpoint to POST to. When ``None`` the module-level
            ``API_URL`` is used. That global is only defined when this file
            is run as a script, so importing callers should pass the URL
            explicitly.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Pass ``None`` to wait indefinitely.

    Returns:
        The LC3 payload as ``bytes`` when ``request_data.return_lc3`` is set,
        otherwise a float32 numpy array decoded from the PCM payload.
        ``None`` if the server answered with a status code other than 200
        (the status and response text are printed in that case).
    """
    if api_url is None:
        # Backward-compatible fallback to the global set by the script entry point.
        api_url = API_URL

    response = requests.post(
        api_url,
        json=request_data.model_dump(),
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    response_data = response.json()
    if request_data.return_lc3:
        # The LC3 audio is transported as a hex string; turn it back into bytes.
        return bytes.fromhex(response_data["audio_lc3"])

    # The PCM audio is transported as hex-encoded raw float32 sample bytes.
    audio_bytes = bytes.fromhex(response_data["audio_pcm"])
    return np.frombuffer(audio_bytes, dtype=np.float32)
if __name__ == "__main__":
    # request_synthesis() reads this module-level URL.
    API_URL = "http://127.0.0.1:8099/synthesize/"
    target_rate = 16000

    # Example request
    request_data = SynthesizeRequest(
        text="Hello, this is a test.",
        target_sample_rate=target_rate,
        framework="piper",
        model="de_DE-kerstin-low",
        return_lc3=False,  # Set to True to receive LC3 compressed output
    )

    audio = request_synthesis(request_data)
    if audio is None:
        # The request failed and request_synthesis() already printed the
        # server's status and response text; there is nothing to write.
        raise SystemExit(1)

    # Store the decoded PCM samples as a WAV file at the requested rate.
    sf.write('hello.wav', audio, target_rate)

View File

@@ -0,0 +1,9 @@
from pydantic import BaseModel
class SynthesizeRequest(BaseModel):
    """Parameters of a speech synthesis request sent to the TTS server."""

    # Text to synthesize.
    text: str
    # Sample rate requested for the returned audio (presumably in Hz — confirm).
    target_sample_rate: int = 16000
    # Name of the synthesis framework to use.
    framework: str = "piper"
    # Identifier of the voice model handed to the framework.
    model: str = "en_US-lessac-medium"
    # When True the audio is returned LC3-compressed instead of as PCM.
    return_lc3: bool = False

View File

@@ -47,7 +47,13 @@ def synth_piper(text, model="en_US-lessac-medium"):
# TODO: framework should probably be a dataclass that holds all the relevant information, including the model
def synthesize(text, target_sample_rate, framework, model="en_US-lessac-medium", return_lc3=True):
def synthesize(
text,
target_sample_rate,
framework,
model="en_US-lessac-medium",
return_lc3=True
):
if framework == 'piper':
model_json, audio_raw = synth_piper(text, model)
@@ -85,5 +91,4 @@ if __name__ == '__main__':
sf.write('hello.wav', audio, target_rate)
# TODO: "WARNING:piper.download:Wrong size (expected=5952, actual=4158
print('Done.')

View File

@@ -0,0 +1,32 @@
from fastapi import FastAPI, HTTPException
import numpy as np
from voice_models.request_models import SynthesizeRequest
from voice_provider.text_to_speech import synthesize
# Application object on which the route handlers in this module are registered.
app = FastAPI()
@app.post("/synthesize/")
async def synthesize_speech(request: SynthesizeRequest):
    """Synthesize speech for ``request`` and return the audio hex-encoded.

    Returns:
        ``{"audio_lc3": <hex string>}`` when ``request.return_lc3`` is set,
        otherwise ``{"audio_pcm": <hex string>}`` where the hex string encodes
        the samples as raw float32 bytes.

    Raises:
        HTTPException: with status 500 and the error text as detail when
            synthesis or encoding raises.
    """
    import asyncio  # local import so the module's import block stays untouched

    try:
        # synthesize() is a plain blocking call. Running it directly inside
        # this coroutine would stall the event loop (and every other request)
        # for the duration of the synthesis, so hand it to a worker thread.
        # NOTE(review): assumes synthesize() may be called from worker
        # threads concurrently — confirm for the underlying TTS backend.
        audio = await asyncio.to_thread(
            synthesize,
            text=request.text,
            target_sample_rate=request.target_sample_rate,
            framework=request.framework,
            model=request.model,
            return_lc3=request.return_lc3,
        )

        if request.return_lc3:
            return {"audio_lc3": audio.hex()}

        audio_bytes = audio.astype(np.float32).tobytes()
        return {"audio_pcm": audio_bytes.hex()}
    except Exception as e:
        # Keep the original exception attached as the cause for server-side logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
if __name__ == "__main__":
    # Script entry point: serve the app on the loopback interface only.
    import uvicorn

    server_options = {"host": "127.0.0.1", "port": 8099}
    uvicorn.run(app, **server_options)