12 Commits

Author SHA1 Message Date
pstruebi 3759a026d4 update mock database with wireguard internal addresses 2025-03-27 16:42:39 +00:00
pstruebi 6bfbc6e180 add speaker wav 2025-03-25 12:44:26 +01:00
pstruebi e02593c78d use different xtts voice 2025-03-25 12:34:16 +01:00
pstruebi c095b058d6 add basic support for xtts 2025-03-25 12:02:59 +01:00
pstruebi a9dbe52a7e update environment setup 2025-03-25 09:22:42 +00:00
pstruebi 1d4a2b3b45 update lock file 2025-03-25 09:54:04 +01:00
pstruebi 36dd34b042 some adjustements for server deployment 2025-03-25 08:53:22 +00:00
pstruebi 4971f1e7f6 update docker build 2025-03-23 16:36:33 +01:00
pstruebi 466fb1762e update dockerfile 2025-03-20 12:25:01 +00:00
pstruebi b9ca04af82 change project structure for packaging that makes sense with poetry 2025-03-20 11:56:33 +01:00
pstruebi 17cf41166b adjust dependencies and dockerfiles 2025-03-19 13:15:04 +00:00
pstruebi 5e5c3e2040 refractoring 2025-03-19 13:45:46 +01:00
33 changed files with 5660 additions and 109 deletions
+1
View File
@@ -1,5 +1,6 @@
*.pyc
*.wav
!/src/voice_provider/speaker_wav/*
*.lc3
*.onnx
*.onnx.json
+35
View File
@@ -0,0 +1,35 @@
# Image for the auracast translator API server (listens on port 7999).
FROM python:3.11-slim

# Install system dependencies: git to fetch git-hosted Python dependencies,
# gcc to build packages that ship native extensions.
RUN apt-get update && apt-get install -y \
    git \
    gcc \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Accept new ssh servers without an interactive host-key prompt.
# `sed -i` edits ssh_config in place; without -i the filtered config is only
# printed to the build log and existing directives are never removed.
# NOTE(review): "StrictHostKeyChecking no" also accepts *changed* host keys;
# consider "accept-new" if the installed OpenSSH supports it.
RUN sed -i '/^StrictHostKeyChecking/d' /etc/ssh/ssh_config; \
    echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config

# Install and configure poetry. Dependencies are installed into the system
# interpreter, so no virtualenv is created inside the container.
RUN --mount=type=cache,target=/root/.cache \
    pip install poetry
RUN poetry config virtualenvs.create false

WORKDIR /app

# Copy the app code and the dependency manifests.
COPY ./src .
COPY poetry.lock .
COPY pyproject.toml .

# Install all non-dev dependencies. The ssh mount forwards the build host's
# SSH agent/key so the ssh:// git dependencies can be cloned; --no-root skips
# installing the project itself (it is run from /app via `python -m`).
RUN --mount=type=cache,target=/root/.cache \
    --mount=type=ssh,required=true \
    poetry install --no-interaction --without dev --no-root

# Expose the API port
EXPOSE 7999

# Run the translator server directly from the module path
CMD ["python", "-m", "auracast_translator.translator_server.translator_server"]
+13
View File
@@ -0,0 +1,13 @@
# Compose definition for the translator API container.
services:
  auracast-translator:
    build:
      context: .
      dockerfile: Dockerfile
      # SSH key exposed to the `RUN --mount=type=ssh` step of the Dockerfile
      # under the id "default".
      ssh:
        - default=~/.ssh/id_ed25519 #lappi
    # Publish the API port (host:container).
    ports:
      - "7999:7999"
    environment:
      # Unbuffered stdout/stderr so log lines show up immediately.
      - PYTHONUNBUFFERED=1
    restart: unless-stopped
Generated
+5392
View File
File diff suppressed because it is too large Load Diff
+38 -22
View File
@@ -1,27 +1,43 @@
[project]
name = "multilang_translator"
requires-python = ">= 3.11"
version = '0.1'
dependencies = [
"auracast @git+https://git@gitea.pstruebi.xyz/auracaster/bumble-auracast",
"requests==2.32.3",
"ollama==0.4.7",
"aioconsole==0.8.1",
"fastapi==0.115.11",
"uvicorn==0.34.0",
"aiohttp==3.9.3",
[tool.poetry]
name = "auracast_translator"
version = "0.1.0"
authors = ["Patrick S <pstruebi>"]
description = "Announcement System"
readme = "readme.md"
packages = [
{ include = "translator_models", from = "src" },
{ include = "translator_client", from = "src" },
{ include = "auracast_translator", from = "src" },
]
[project.optional-dependencies]
test = [
"pytest >= 8.2",
]
[tool.poetry.dependencies]
python = "~3.11"
setuptools= ">=77"
coqui-tts = "0.26"
[tool.poetry.group.tts.dependencies]
piper-phonemize = "==1.1.0"
piper-tts = "==1.2.0"
[tool.poetry.group.general.dependencies]
requests="2.32.3"
aiohttp="3.9.3"
fastapi="0.115.11"
uvicorn="0.34.0"
ollama="0.4.7"
piper-tts="1.2.0"
librosa="0.10.1"
aioconsole="0.8.1"
lc3 = { git = "ssh://git@ssh.pstruebi.xyz:222/auracaster/liblc3.git", rev = "7558637303106c7ea971e7bb8cedf379d3e08bcc" }
auracast = { git = "ssh://git@ssh.pstruebi.xyz:222/auracaster/bumble-auracast.git" }
#[tool.poetry.group.gpu.dependencies]
#onnxruntime-gpu = "^1.20.1"
# TODO: for running piper on gpu investigate
# https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements
# put everything in pytorch container according to piper github:
# https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch
# Use a separate container for the voice provider
[tool.poetry.group.dev.dependencies]
pytest = {version=">8.2", optional=true}
[tool.pytest.ini_options]
addopts = [
@@ -29,5 +45,5 @@ addopts = [
]
[build-system]
requires = ["setuptools>=61", "wheel", "setuptools_scm>=8"]
build-backend = "setuptools.build_meta"
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
+3 -8
View File
@@ -1,11 +1,6 @@
# Prerequisites
sudo apt install liblc3-tools
use python3.9
pip install piper-tts soundfile librosa pyserial pytest
# Install the project
poetry env use python3.11
poetry install
# Piper update voices
piper --update-voices -m en_US-lessac-medium
# TODO:
- investigate using a pipeline instead of writing to intermediate files to gain performance
@@ -10,14 +10,14 @@ from auracast import auracast_config
import voice_client
import voice_models
from multilang_translator import translator_config
from multilang_translator.translator import llm_translator
from auracast_translator.translator_models import translator_models
from auracast_translator.translator import llm_translator
import voice_client.tts_client
import voice_models.request_models
async def announcement_from_german_text(
config: translator_config.TranslatorConfigGroup,
config: translator_models.TranslatorConfigGroup,
text_de
):
base_lang = "deu"
@@ -9,9 +9,9 @@ from auracast import multicast_control
from auracast import auracast_config
from voice_provider import text_to_speech
from multilang_translator import translator_config
from multilang_translator.translator import llm_translator
from multilang_translator.translator.test_content import TESTSENTENCE
from auracast_translator import translator_config
from auracast_translator.translator import llm_translator
from auracast_translator.translator.test_content import TESTSENTENCE
# TODO: look for a end to end translation solution
@@ -6,7 +6,7 @@ import time
import ollama
import aiohttp
from multilang_translator.translator import syspromts
from auracast_translator.translator import syspromts
# ollama.create( # TODO: create models on startup
# model='example',
@@ -125,7 +125,7 @@ async def translate_de_to_x_async(
if __name__ == "__main__":
import time
from multilang_translator.translator import test_content
from auracast_translator.translator import test_content
start=time.time()
@@ -3,7 +3,7 @@ Database file for endpoint definitions.
This file contains configurations for auracast endpoints including their IP addresses and capabilities.
"""
from typing import List, Optional
from multilang_translator.translator_models.translator_models import EndpointGroup, Endpoint
from translator_models.translator_models import EndpointGroup, Endpoint
SUPPORTED_LANGUAGES = ["deu", "eng", "fra", "spa", "ita"]
@@ -13,19 +13,19 @@ ENDPOINTS: dict[int: Endpoint] = { # for now make sure, .id and key are the same
0: Endpoint(
id=0,
name="Local Endpoint",
url="http://localhost:5000",
url="http://10.13.13.3:5000", #"http://localhost:5000", #patricks laptop
max_broadcasts=3,
),
1: Endpoint(
id=1,
name="Gate 1",
url="http://pi3:5000",
url="http://10.13.13.4:5000", #pi4
max_broadcasts=3,
),
2: Endpoint(
id=2,
name="Gate 2",
url="http://192.168.1.102:5000",
url="http://10.13.13.5:5000",
max_broadcasts=3,
),
}
@@ -7,7 +7,7 @@ import logging as log
import sys
import os
# Add the parent directory to the Python path to find the multilang_translator package
# Add the parent directory to the Python path to find the auracast_translator package
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
if parent_dir not in sys.path:
@@ -20,7 +20,7 @@ if __name__ == "__main__":
)
log.info("Starting Translator API server")
uvicorn.run(
"multilang_translator.translator_server.translator_server:app",
"auracast_translator.translator_server.translator_server:app",
host="0.0.0.0",
port=7999,
reload=True,
@@ -11,9 +11,9 @@ from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
# Import models
from multilang_translator.translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
from multilang_translator.translator import llm_translator
from multilang_translator.translator_server import endpoints_db
from translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
from auracast_translator.translator import llm_translator
from auracast_translator.translator_server import endpoints_db
from voice_provider import text_to_speech
# Import the endpoints database and multicast client
@@ -154,7 +154,10 @@ async def make_announcement(text: str, ep_group: EndpointGroup):
ep_group.sampling_rate_hz,
trans_conf.tts_system,
trans_conf.tts_model,
return_lc3=True
return_lc3=True,
language=trans_conf.xtts_language,
speaker=trans_conf.xtts_speaker,
speaker_wav=trans_conf.xtts_speaker_wav
)
synthesis_tasks.append(task)
@@ -329,9 +332,10 @@ async def get_available_languages():
if __name__ == "__main__":
import os
import uvicorn
log.basicConfig(
level=log.DEBUG,
level=os.environ.get('LOG_LEVEL', log.DEBUG),
format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
)
# with reload=True logging of modules does not function as expected
@@ -342,5 +346,5 @@ if __name__ == "__main__":
port=7999,
#reload=True,
#log_config=None,
#log_level="info"
log_level="debug"
)
@@ -1,20 +0,0 @@
import os
from pydantic import BaseModel
VENV_DIR = os.path.join(os.path.dirname(__file__), './../../venv')
class TranslatorLangConfig(BaseModel):
translator_llm: str = 'llama3.2:3b-instruct-q4_0' # TODO: this was migrated to translator_models - remove this
llm_client: str = 'ollama'
llm_host_url: str | None = 'http://localhost:11434'
llm_host_token: str | None = None
tts_system: str = 'piper'
tts_model: str ='de_DE-kerstin-low'
class TranslatorConfig(BaseModel):
deu: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'de_DE-thorsten-high')
eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium')
fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium')
spa: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'es_ES-sharvard-medium')
ita: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'it_IT-paola-medium')
@@ -3,10 +3,9 @@ API client functions for interacting with the Translator API.
"""
import requests
from typing import List, Optional, Dict, Any, Tuple
from enum import Enum
from multilang_translator.translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
from translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
# This can be overridden through environment variables
@@ -35,12 +35,19 @@ class TranslatorLangConfig(BaseModel):
# llm_host_url: str | None = 'http://localhost:11434'
# llm_host_token: str | None = None
tts_system: str = 'piper'
tts_model: str ='de_DE-kerstin-low'
tts_system: str = 'piper' # Options: 'piper', 'xtts'
tts_model: str = 'de_DE-kerstin-low' # For piper: model name, for xtts: unused
xtts_language: str = 'de' # Language code for XTTS
xtts_speaker: Optional[str] = None # Speaker name for XTTS
xtts_speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning
class TranslatorConfig(BaseModel):
deu: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'de_DE-thorsten-high')
deu: TranslatorLangConfig = TranslatorLangConfig(
tts_system='xtts',
xtts_language='de',
xtts_speaker_wav='female.wav'
)
eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium')
fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium')
spa: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'es_ES-sharvard-medium')
+4 -1
View File
@@ -1,4 +1,5 @@
from pydantic import BaseModel
from typing import Optional
class SynthesizeRequest(BaseModel):
text: str
@@ -6,4 +7,6 @@ class SynthesizeRequest(BaseModel):
framework: str = "piper"
model: str = "en_US-lessac-medium"
return_lc3: bool = False
language: str = "en" # Language code for XTTS
speaker: Optional[str] = None # Speaker name for XTTS
speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning
Binary file not shown.
+108 -14
View File
@@ -1,4 +1,7 @@
import os
# Set environment variable to auto-accept Coqui TTS license
os.environ["COQUI_TOS_AGREED"] = "1"
import shutil
import subprocess
import time
@@ -6,16 +9,35 @@ import json
import logging as log
import numpy as np
import asyncio
import torch
from voice_provider.utils.resample import resample_array
from voice_provider.utils.encode_lc3 import encode_lc3
# Now import TTS - the license will be auto-accepted
from TTS.api import TTS
# Get device for XTTS
if torch.cuda.is_available():
log.info('XTTS will run on GPU')
XTTS_DEVICE = "cuda"
else:
log.info('XTTS will run on CPU')
XTTS_DEVICE = "cpu"
# Load XTTS model globally - only once
log.info("Initializing XTTS model...")
start_init = time.time()
XTTS_MODEL = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(XTTS_DEVICE)
end_init = time.time()
log.info(f"XTTS initialization completed in {end_init - start_init:.2f} seconds")
PIPER_EXE = shutil.which('piper')
TTS_DIR = os.path.join(os.path.dirname(__file__))
PIPER_WORKDIR = f'{TTS_DIR}/piper'
if not PIPER_EXE:
PIPER_EXE = f'{TTS_DIR}/../../venv/bin/piper'
PIPER_EXE = f'{TTS_DIR}/../../.venv/bin/piper'
def synth_piper(text, model="en_US-lessac-medium"):
pwd = os.getcwd()
@@ -26,7 +48,7 @@ def synth_piper(text, model="en_US-lessac-medium"):
ret = subprocess.run( # TODO: wrap this whole thing in a class and open a permanent pipe to the model
[
PIPER_EXE,
'--cuda',
#'--cuda',
'--model', model,
'--output-raw'
],
@@ -52,7 +74,10 @@ def synthesize(
target_sample_rate,
framework,
model="en_US-lessac-medium",
return_lc3=True
return_lc3=True,
language="en",
speaker=None,
speaker_wav=None
):
if framework == 'piper':
@@ -64,7 +89,40 @@ def synthesize(
elif framework == 'koro':
pass
elif framework == 'xtts':
pass
start = time.time()
# Generate audio using XTTS
# XTTS always outputs at 24kHz
xtts_sample_rate = 24000
# Validate speaker parameters - XTTS needs either speaker or speaker_wav
if speaker is None and speaker_wav is None:
# Use the first available speaker if none specified
speaker = XTTS_MODEL.speakers[0]
log.info(f"No speaker specified, using default: {speaker}")
# Generate audio samples using tts.tts
if speaker_wav:
# expand path to speaker_wav folder
speaker_wav = os.path.join(os.path.dirname(__file__), 'speaker_wav', speaker_wav)
log.info(f"Generating XTTS audio with speaker_wav: {speaker_wav}")
audio_list = XTTS_MODEL.tts(text=text, speaker_wav=speaker_wav, language=language)
else:
log.info(f"Generating XTTS audio with speaker: {speaker}")
audio_list = XTTS_MODEL.tts(text=text, speaker=speaker, language=language)
# Ensure audio_np is a numpy array and properly scaled
audio_np = np.array(audio_list, dtype=np.float32)# / (2**15-1)
# Log some info about the audio data
log.info(f"XTTS audio shape: {audio_np.shape}, dtype: {audio_np.dtype}, "
f"min: {audio_np.min():.4f}, max: {audio_np.max():.4f}")
# Resample from 24kHz to the target sample rate (no speedup is passed, so the default of 1.0 applies)
audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate)
log.info(f"XTTS synthesis completed in {time.time() - start:.2f} seconds")
elif framework == 'zonos':
pass
else: raise NotImplementedError('unknown framework')
@@ -82,7 +140,10 @@ async def synthesize_async(
target_sample_rate,
framework,
model="en_US-lessac-medium",
return_lc3=True
return_lc3=True,
language="en",
speaker=None,
speaker_wav=None
):
"""
Asynchronous version of the synthesize function that runs in a thread pool.
@@ -90,9 +151,12 @@ async def synthesize_async(
Args:
text: Text to synthesize
target_sample_rate: Target sample rate for the audio
framework: TTS framework to use (e.g., 'piper')
framework: TTS framework to use (e.g., 'piper', 'xtts')
model: Model to use for synthesis
return_lc3: Whether to return LC3-encoded audio
language: Language code (used by XTTS)
speaker: Speaker ID for XTTS
speaker_wav: Path to speaker sample for XTTS voice cloning
Returns:
LC3-encoded audio as string or raw audio as numpy array
@@ -101,23 +165,53 @@ async def synthesize_async(
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(
None,
lambda: synthesize(text, target_sample_rate, framework, model, return_lc3)
lambda: synthesize(
text,
target_sample_rate,
framework,
model,
return_lc3,
language,
speaker,
speaker_wav
)
)
return result
if __name__ == '__main__':
import logging
import soundfile as sf
logging.basicConfig(
level=logging.INFO,
log.basicConfig(
level=log.INFO,
format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
)
target_rate=16000
target_rate = 16000
audio = synthesize('Hello World', target_rate, 'piper', model= 'de_DE-kerstin-low', return_lc3=False)
sf.write('hello.wav', audio, target_rate)
# First, print available XTTS speakers
print("Available XTTS speakers:")
print(XTTS_MODEL.speakers)
# Demo of Piper
print("Testing Piper TTS...")
audio_piper = synthesize('Hello World', target_rate, 'piper', model='de_DE-kerstin-low', return_lc3=False)
sf.write('hello_piper.wav', audio_piper, target_rate)
# Demo of XTTS with Annmarie Nele for German
speaker_wav = 'female.wav'
print(f"Testing XTTS with German language using speaker: {speaker_wav}")
text_to_synthesize = "Dies ist ein Test der XTTS Stimme auf Deutsch mit Annmarie Nele als Sprecherin."
audio_xtts = synthesize(
text=text_to_synthesize,
target_sample_rate=target_rate,
framework='xtts',
language='de',
speaker_wav=speaker_wav,
return_lc3=False
)
# Save the wav file
sf.write('hello_xtts_german.wav', audio_xtts, target_rate)
print('Done.')
+4 -1
View File
@@ -17,7 +17,10 @@ async def synthesize_speech(request: SynthesizeRequest):
target_sample_rate=request.target_sample_rate,
framework=request.framework,
model=request.model,
return_lc3=request.return_lc3
return_lc3=request.return_lc3,
language=request.language,
speaker=request.speaker,
speaker_wav=request.speaker_wav
)
if request.return_lc3:
+15 -6
View File
@@ -24,20 +24,29 @@ def resample_file(filename, out_filename, target_rate):
log.info("Resampling of %s took %s s", os.path.basename(filename), round(time.time() - start, 3))
def resample_array(audio, rate, target_rate):
def resample_array(audio, rate, target_rate, speedup=1.0):
start=time.time()
# Load the original audio file
if rate == target_rate: # Nothing to do
log.info('audio already at target rate, skipping resample')
if rate == target_rate and speedup == 1.0: # Nothing to do
log.info('audio already at target rate with no speedup, skipping resample')
return audio
# Apply speedup if needed
if speedup != 1.0:
# To speed up, declare a higher source rate (rate * speedup): the resampler
# then emits proportionally fewer samples, which shortens the audio duration
effective_orig_sr = rate * speedup
log.info(f"Applying speedup factor of {speedup}")
else:
effective_orig_sr = rate
# Convert the sample rate to target rate
resampled_audio = librosa.resample(audio, orig_sr=rate, target_sr=target_rate)
resampled_audio = librosa.resample(audio, orig_sr=effective_orig_sr, target_sr=target_rate)
# Save the resampled audio as a new .wav file
log.info("Resampling took %s s", round(time.time() - start, 3))
log.info("Resampling took %s s", round(time.time() - start, 3))
return resampled_audio
@@ -45,4 +54,4 @@ if __name__ == "__main__":
import os
os.chdir(os.path.dirname(__file__))
file_dir = '../text_to_speech/'
resample_file(f'{file_dir}/welcome.wav', f'{file_dir}/welcome_resampled.wav')
resample_file(f'{file_dir}/welcome.wav', f'{file_dir}/welcome_resampled.wav', 16000)
+3 -3
View File
@@ -4,9 +4,9 @@ import time
import os
import subprocess
from multilang_translator.backend_controller.broadcaster_config import broadcaster_config, BROADCAST_CONFIG
from multilang_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
from multilang_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
from auracast_translator.backend_controller.broadcaster_config import broadcaster_config, BROADCAST_CONFIG
from auracast_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
from auracast_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
log.basicConfig(
level=log.INFO,
+3 -3
View File
@@ -4,9 +4,9 @@ import time
import os
import subprocess
from multilang_translator.translator_config import LANG_CONFIG
from multilang_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
from multilang_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
from auracast_translator.translator_config import LANG_CONFIG
from auracast_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
from auracast_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
def test_config_broadcaster(ft_configure_broadcaster):
+2 -2
View File
@@ -1,5 +1,5 @@
from multilang_translator.main_local import announcement_from_german_text
from multilang_translator.translator import test_content
from auracast_translator.main_local import announcement_from_german_text
from auracast_translator.translator import test_content
def test_announcement_from_german_text(
+3 -3
View File
@@ -1,6 +1,6 @@
from multilang_translator.translator.llm_translator import translator_de_en, translator_de_fr, translator_de_it
from multilang_translator.translator.test_content import TESTSENTENCE_DE_BROKER, TESTSENTENCE_DE_RAINBOW
from multilang_translator.main_local import translate_from_german
from auracast_translator.translator.llm_translator import translator_de_en, translator_de_fr, translator_de_it
from auracast_translator.translator.test_content import TESTSENTENCE_DE_BROKER, TESTSENTENCE_DE_RAINBOW
from auracast_translator.main_local import translate_from_german
import time
+1 -1
View File
@@ -1,4 +1,4 @@
from multilang_translator.text_to_speech.text_to_speech import synthesize
from auracast_translator.text_to_speech.text_to_speech import synthesize
def test_synthesize():
synthesize("Hello, how are you?", "en_US-lessac-medium", "hello.wav")