Implement a basic tts server
This commit is contained in:
@@ -10,6 +10,8 @@ dependencies = [
|
||||
"aioconsole==0.8.1",
|
||||
"piper-phonemize==1.1.0",
|
||||
"piper-tts==1.2.0",
|
||||
"fastapi==0.115.11",
|
||||
"uvicorn==0.34.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
0
src/multilang_translator/main_backend.py
Normal file
0
src/multilang_translator/main_backend.py
Normal file
0
src/multilang_translator/main_cloud.py
Normal file
0
src/multilang_translator/main_cloud.py
Normal file
@@ -13,11 +13,12 @@ import aioconsole
|
||||
|
||||
from auracast import multicast_control
|
||||
from auracast import auracast_config
|
||||
import multilang_translator.translator_config as translator_config
|
||||
from translator import llm_translator
|
||||
from translator.test_content import TESTSENTENCE
|
||||
from voice_provider import text_to_speech
|
||||
|
||||
from multilang_translator import translator_config
|
||||
from multilang_translator.translator import llm_translator
|
||||
from multilang_translator.translator.test_content import TESTSENTENCE
|
||||
|
||||
# TODO: look for an end-to-end translation solution
|
||||
|
||||
def transcribe():
|
||||
|
||||
44
src/voice_client/client.py
Normal file
44
src/voice_client/client.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import requests
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
|
||||
from voice_models.request_models import SynthesizeRequest
|
||||
|
||||
|
||||
|
||||
def request_synthesis(request_data: SynthesizeRequest, api_url=None, timeout=None):
    """Send a synthesis request to the TTS server and decode the returned audio.

    Args:
        request_data: Request parameters; serialized with ``model_dump()`` and
            sent as the JSON body of the POST.
        api_url: Endpoint to POST to. When omitted, the module-level
            ``API_URL`` is used; that name is only defined when this file is
            run as a script, so importers should pass ``api_url`` explicitly.
        timeout: Forwarded to ``requests.post``. ``None`` (the default) waits
            indefinitely, matching the previous behavior.

    Returns:
        ``bytes`` decoded from the hex ``audio_lc3`` field when
        ``request_data.return_lc3`` is set, otherwise a float32 numpy array
        decoded from the hex ``audio_pcm`` field. ``None`` when the server
        answers with a status other than 200 (the error is printed).
    """
    if api_url is None:
        api_url = API_URL

    response = requests.post(
        api_url,
        json=request_data.model_dump(),
        timeout=timeout,
    )

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    response_data = response.json()

    if request_data.return_lc3:
        # LC3 audio arrives hex-encoded; hand back the raw bytes.
        return bytes.fromhex(response_data["audio_lc3"])

    # Convert hex-encoded PCM bytes back to a float32 numpy array.
    audio_bytes = bytes.fromhex(response_data["audio_pcm"])
    return np.frombuffer(audio_bytes, dtype=np.float32)
|
||||
|
||||
if __name__ == "__main__":
    # Example client run: request one utterance and store it as a WAV file.
    API_URL = "http://127.0.0.1:8099/synthesize/"

    target_rate = 16000

    # Example request
    request_data = SynthesizeRequest(
        text="Hello, this is a test.",
        target_sample_rate=target_rate,
        framework="piper",
        model="de_DE-kerstin-low",
        return_lc3=False,  # Set to True to receive LC3 compressed output
    )

    audio = request_synthesis(request_data)

    # request_synthesis prints the server error and returns None on failure;
    # stop here instead of handing None to soundfile.
    if audio is None:
        raise SystemExit(1)

    sf.write('hello.wav', audio, target_rate)
|
||||
9
src/voice_models/request_models.py
Normal file
9
src/voice_models/request_models.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from pydantic import BaseModel
|
||||
|
||||
class SynthesizeRequest(BaseModel):
    """Request body accepted by the TTS server's ``/synthesize/`` endpoint."""

    # Text to synthesize (required; no default).
    text: str

    # Sample rate requested for the returned audio.
    target_sample_rate: int = 16000

    # Name of the synthesis backend to use.
    framework: str = "piper"

    # Voice model identifier passed to the backend.
    model: str = "en_US-lessac-medium"

    # When True the response carries "audio_lc3", otherwise "audio_pcm".
    return_lc3: bool = False
|
||||
|
||||
@@ -47,7 +47,13 @@ def synth_piper(text, model="en_US-lessac-medium"):
|
||||
|
||||
|
||||
# TODO: framework should probably be a dataclass that holds all the relevant information, including the model
|
||||
def synthesize(text, target_sample_rate, framework, model="en_US-lessac-medium", return_lc3=True):
|
||||
def synthesize(
|
||||
text,
|
||||
target_sample_rate,
|
||||
framework,
|
||||
model="en_US-lessac-medium",
|
||||
return_lc3=True
|
||||
):
|
||||
|
||||
if framework == 'piper':
|
||||
model_json, audio_raw = synth_piper(text, model)
|
||||
@@ -85,5 +91,4 @@ if __name__ == '__main__':
|
||||
|
||||
sf.write('hello.wav', audio, target_rate)
|
||||
|
||||
# TODO: "WARNING:piper.download:Wrong size (expected=5952, actual=4158
|
||||
print('Done.')
|
||||
|
||||
32
src/voice_provider/tts_server.py
Normal file
32
src/voice_provider/tts_server.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from fastapi import FastAPI, HTTPException
|
||||
import numpy as np
|
||||
|
||||
from voice_models.request_models import SynthesizeRequest
|
||||
from voice_provider.text_to_speech import synthesize
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
|
||||
@app.post("/synthesize/")
async def synthesize_speech(request: SynthesizeRequest):
    """Synthesize speech for the request and return it hex-encoded in JSON.

    Args:
        request: Parsed request body; its fields are forwarded unchanged to
            ``synthesize``.

    Returns:
        ``{"audio_lc3": <hex>}`` when ``request.return_lc3`` is set, otherwise
        ``{"audio_pcm": <hex>}`` holding the float32 sample bytes.

    Raises:
        HTTPException: Status 500 with the error text as detail if synthesis
            or encoding fails.
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    try:
        # synthesize() is a blocking call. Running it in the worker thread
        # pool keeps the event loop free to serve other requests.
        # NOTE(review): overlapping requests can now run synthesize()
        # concurrently - confirm the backend tolerates that.
        audio = await run_in_threadpool(
            synthesize,
            text=request.text,
            target_sample_rate=request.target_sample_rate,
            framework=request.framework,
            model=request.model,
            return_lc3=request.return_lc3,
        )

        if request.return_lc3:
            return {"audio_lc3": audio.hex()}

        audio_bytes = audio.astype(np.float32).tobytes()
        return {"audio_pcm": audio_bytes.hex()}

    except Exception as e:
        # Chain the cause so server-side tracebacks show the original failure.
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: serve the app on the loopback interface.
    from uvicorn import run as serve

    serve(app, host="127.0.0.1", port=8099)
|
||||
Reference in New Issue
Block a user