Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 3759a026d4 | |||
| 6bfbc6e180 | |||
| e02593c78d | |||
| c095b058d6 | |||
| a9dbe52a7e | |||
| 1d4a2b3b45 | |||
| 36dd34b042 | |||
| 4971f1e7f6 | |||
| 466fb1762e | |||
| b9ca04af82 | |||
| 17cf41166b | |||
| 5e5c3e2040 |
@@ -1,5 +1,6 @@
|
|||||||
*.pyc
|
*.pyc
|
||||||
*.wav
|
*.wav
|
||||||
|
!/src/voice_provider/speaker_wav/*
|
||||||
*.lc3
|
*.lc3
|
||||||
*.onnx
|
*.onnx
|
||||||
*.onnx.json
|
*.onnx.json
|
||||||
|
|||||||
+35
@@ -0,0 +1,35 @@
|
|||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
# Install system dependencies and poetry
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
git \
|
||||||
|
gcc \
|
||||||
|
&& apt-get clean \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Disable strict SSH host key checking so unknown git hosts are accepted without a prompt
|
||||||
|
RUN sed /^StrictHostKeyChecking/d /etc/ssh/ssh_config; \
|
||||||
|
echo StrictHostKeyChecking no >> /etc/ssh/ssh_config
|
||||||
|
|
||||||
|
# Install and configure poetry
|
||||||
|
RUN --mount=type=cache,target=/root/.cache \
|
||||||
|
pip install poetry
|
||||||
|
RUN poetry config virtualenvs.create false
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# copy the app code
|
||||||
|
COPY ./src .
|
||||||
|
COPY poetry.lock .
|
||||||
|
COPY pyproject.toml .
|
||||||
|
|
||||||
|
# Install the project with all dependencies
|
||||||
|
RUN --mount=type=cache,target=/root/.cache \
|
||||||
|
--mount=type=ssh,required=true \
|
||||||
|
poetry install --no-interaction --without dev --no-root
|
||||||
|
|
||||||
|
# Expose the API port
|
||||||
|
EXPOSE 7999
|
||||||
|
|
||||||
|
# Run the translator server directly from the module path
|
||||||
|
CMD ["python", "-m", "auracast_translator.translator_server.translator_server"]
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
services:
|
||||||
|
auracast-translator:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
ssh:
|
||||||
|
- default=~/.ssh/id_ed25519 #lappi
|
||||||
|
ports:
|
||||||
|
- "7999:7999"
|
||||||
|
environment:
|
||||||
|
- PYTHONUNBUFFERED=1
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
Generated
+5392
File diff suppressed because it is too large
Load Diff
+38
-22
@@ -1,27 +1,43 @@
|
|||||||
[project]
|
[tool.poetry]
|
||||||
name = "multilang_translator"
|
name = "auracast_translator"
|
||||||
requires-python = ">= 3.11"
|
version = "0.1.0"
|
||||||
version = '0.1'
|
authors = ["Patrick S <pstruebi>"]
|
||||||
|
description = "Announcement System"
|
||||||
dependencies = [
|
readme = "readme.md"
|
||||||
"auracast @git+https://git@gitea.pstruebi.xyz/auracaster/bumble-auracast",
|
packages = [
|
||||||
"requests==2.32.3",
|
{ include = "translator_models", from = "src" },
|
||||||
"ollama==0.4.7",
|
{ include = "translator_client", from = "src" },
|
||||||
"aioconsole==0.8.1",
|
{ include = "auracast_translator", from = "src" },
|
||||||
"fastapi==0.115.11",
|
|
||||||
"uvicorn==0.34.0",
|
|
||||||
"aiohttp==3.9.3",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[tool.poetry.dependencies]
|
||||||
test = [
|
python = "~3.11"
|
||||||
"pytest >= 8.2",
|
setuptools= ">=77"
|
||||||
]
|
coqui-tts = "0.26"
|
||||||
|
|
||||||
[tool.poetry.group.tts.dependencies]
|
[tool.poetry.group.general.dependencies]
|
||||||
piper-phonemize = "==1.1.0"
|
requests="2.32.3"
|
||||||
piper-tts = "==1.2.0"
|
aiohttp="3.9.3"
|
||||||
|
fastapi="0.115.11"
|
||||||
|
uvicorn="0.34.0"
|
||||||
|
ollama="0.4.7"
|
||||||
|
piper-tts="1.2.0"
|
||||||
|
librosa="0.10.1"
|
||||||
|
aioconsole="0.8.1"
|
||||||
|
lc3 = { git = "ssh://git@ssh.pstruebi.xyz:222/auracaster/liblc3.git", rev = "7558637303106c7ea971e7bb8cedf379d3e08bcc" }
|
||||||
|
auracast = { git = "ssh://git@ssh.pstruebi.xyz:222/auracaster/bumble-auracast.git" }
|
||||||
|
|
||||||
|
#[tool.poetry.group.gpu.dependencies]
|
||||||
|
#onnxruntime-gpu = "^1.20.1"
|
||||||
|
# TODO: for running piper on gpu investigate
|
||||||
|
# https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements
|
||||||
|
# put everything in pytorch container according to piper github:
|
||||||
|
# https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch
|
||||||
|
# Use a separate container for the voice provider
|
||||||
|
|
||||||
|
|
||||||
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
pytest = {version=">8.2", optional=true}
|
||||||
|
|
||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
addopts = [
|
addopts = [
|
||||||
@@ -29,5 +45,5 @@ addopts = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["setuptools>=61", "wheel", "setuptools_scm>=8"]
|
requires = ["poetry-core"]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|||||||
@@ -1,11 +1,6 @@
|
|||||||
# Prerequisites
|
# Install the project
|
||||||
sudo apt install liblc3-tools
|
poetry env use python3.11
|
||||||
|
poetry install
|
||||||
use python3.9
|
|
||||||
pip install piper-tts soundfile librosa pyserial pytest
|
|
||||||
|
|
||||||
# Piper update voices
|
# Piper update voices
|
||||||
piper --update-voices -m en_US-lessac-medium
|
piper --update-voices -m en_US-lessac-medium
|
||||||
|
|
||||||
# TODO:
|
|
||||||
- investigate using a pipeline instead of writing to intermediate files to gain performance
|
|
||||||
@@ -10,14 +10,14 @@ from auracast import auracast_config
|
|||||||
import voice_client
|
import voice_client
|
||||||
import voice_models
|
import voice_models
|
||||||
|
|
||||||
from multilang_translator import translator_config
|
from auracast_translator.translator_models import translator_models
|
||||||
from multilang_translator.translator import llm_translator
|
from auracast_translator.translator import llm_translator
|
||||||
import voice_client.tts_client
|
import voice_client.tts_client
|
||||||
import voice_models.request_models
|
import voice_models.request_models
|
||||||
|
|
||||||
|
|
||||||
async def announcement_from_german_text(
|
async def announcement_from_german_text(
|
||||||
config: translator_config.TranslatorConfigGroup,
|
config: translator_models.TranslatorConfigGroup,
|
||||||
text_de
|
text_de
|
||||||
):
|
):
|
||||||
base_lang = "deu"
|
base_lang = "deu"
|
||||||
@@ -9,9 +9,9 @@ from auracast import multicast_control
|
|||||||
from auracast import auracast_config
|
from auracast import auracast_config
|
||||||
from voice_provider import text_to_speech
|
from voice_provider import text_to_speech
|
||||||
|
|
||||||
from multilang_translator import translator_config
|
from auracast_translator import translator_config
|
||||||
from multilang_translator.translator import llm_translator
|
from auracast_translator.translator import llm_translator
|
||||||
from multilang_translator.translator.test_content import TESTSENTENCE
|
from auracast_translator.translator.test_content import TESTSENTENCE
|
||||||
|
|
||||||
# TODO: look for an end-to-end translation solution
|
# TODO: look for an end-to-end translation solution
|
||||||
|
|
||||||
+2
-2
@@ -6,7 +6,7 @@ import time
|
|||||||
import ollama
|
import ollama
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
|
||||||
from multilang_translator.translator import syspromts
|
from auracast_translator.translator import syspromts
|
||||||
|
|
||||||
# ollama.create( # TODO: create models on startup
|
# ollama.create( # TODO: create models on startup
|
||||||
# model='example',
|
# model='example',
|
||||||
@@ -125,7 +125,7 @@ async def translate_de_to_x_async(
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import time
|
import time
|
||||||
from multilang_translator.translator import test_content
|
from auracast_translator.translator import test_content
|
||||||
|
|
||||||
|
|
||||||
start=time.time()
|
start=time.time()
|
||||||
+4
-4
@@ -3,7 +3,7 @@ Database file for endpoint definitions.
|
|||||||
This file contains configurations for auracast endpoints including their IP addresses and capabilities.
|
This file contains configurations for auracast endpoints including their IP addresses and capabilities.
|
||||||
"""
|
"""
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from multilang_translator.translator_models.translator_models import EndpointGroup, Endpoint
|
from translator_models.translator_models import EndpointGroup, Endpoint
|
||||||
|
|
||||||
|
|
||||||
SUPPORTED_LANGUAGES = ["deu", "eng", "fra", "spa", "ita"]
|
SUPPORTED_LANGUAGES = ["deu", "eng", "fra", "spa", "ita"]
|
||||||
@@ -13,19 +13,19 @@ ENDPOINTS: dict[int: Endpoint] = { # for now make sure, .id and key are the same
|
|||||||
0: Endpoint(
|
0: Endpoint(
|
||||||
id=0,
|
id=0,
|
||||||
name="Local Endpoint",
|
name="Local Endpoint",
|
||||||
url="http://localhost:5000",
|
url="http://10.13.13.3:5000", #"http://localhost:5000", #patricks laptop
|
||||||
max_broadcasts=3,
|
max_broadcasts=3,
|
||||||
),
|
),
|
||||||
1: Endpoint(
|
1: Endpoint(
|
||||||
id=1,
|
id=1,
|
||||||
name="Gate 1",
|
name="Gate 1",
|
||||||
url="http://pi3:5000",
|
url="http://10.13.13.4:5000", #pi4
|
||||||
max_broadcasts=3,
|
max_broadcasts=3,
|
||||||
),
|
),
|
||||||
2: Endpoint(
|
2: Endpoint(
|
||||||
id=2,
|
id=2,
|
||||||
name="Gate 2",
|
name="Gate 2",
|
||||||
url="http://192.168.1.102:5000",
|
url="http://10.13.13.5:5000",
|
||||||
max_broadcasts=3,
|
max_broadcasts=3,
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
+2
-2
@@ -7,7 +7,7 @@ import logging as log
|
|||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# Add the parent directory to the Python path to find the multilang_translator package
|
# Add the parent directory to the Python path to find the auracast_translator package
|
||||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
|
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
|
||||||
if parent_dir not in sys.path:
|
if parent_dir not in sys.path:
|
||||||
@@ -20,7 +20,7 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
log.info("Starting Translator API server")
|
log.info("Starting Translator API server")
|
||||||
uvicorn.run(
|
uvicorn.run(
|
||||||
"multilang_translator.translator_server.translator_server:app",
|
"auracast_translator.translator_server.translator_server:app",
|
||||||
host="0.0.0.0",
|
host="0.0.0.0",
|
||||||
port=7999,
|
port=7999,
|
||||||
reload=True,
|
reload=True,
|
||||||
+10
-6
@@ -11,9 +11,9 @@ from fastapi import FastAPI, HTTPException
|
|||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
# Import models
|
# Import models
|
||||||
from multilang_translator.translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
|
from translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
|
||||||
from multilang_translator.translator import llm_translator
|
from auracast_translator.translator import llm_translator
|
||||||
from multilang_translator.translator_server import endpoints_db
|
from auracast_translator.translator_server import endpoints_db
|
||||||
from voice_provider import text_to_speech
|
from voice_provider import text_to_speech
|
||||||
|
|
||||||
# Import the endpoints database and multicast client
|
# Import the endpoints database and multicast client
|
||||||
@@ -154,7 +154,10 @@ async def make_announcement(text: str, ep_group: EndpointGroup):
|
|||||||
ep_group.sampling_rate_hz,
|
ep_group.sampling_rate_hz,
|
||||||
trans_conf.tts_system,
|
trans_conf.tts_system,
|
||||||
trans_conf.tts_model,
|
trans_conf.tts_model,
|
||||||
return_lc3=True
|
return_lc3=True,
|
||||||
|
language=trans_conf.xtts_language,
|
||||||
|
speaker=trans_conf.xtts_speaker,
|
||||||
|
speaker_wav=trans_conf.xtts_speaker_wav
|
||||||
)
|
)
|
||||||
synthesis_tasks.append(task)
|
synthesis_tasks.append(task)
|
||||||
|
|
||||||
@@ -329,9 +332,10 @@ async def get_available_languages():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
import os
|
||||||
import uvicorn
|
import uvicorn
|
||||||
log.basicConfig(
|
log.basicConfig(
|
||||||
level=log.DEBUG,
|
level=os.environ.get('LOG_LEVEL', log.DEBUG),
|
||||||
format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
|
format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
|
||||||
)
|
)
|
||||||
# with reload=True logging of modules does not function as expected
|
# with reload=True logging of modules does not function as expected
|
||||||
@@ -342,5 +346,5 @@ if __name__ == "__main__":
|
|||||||
port=7999,
|
port=7999,
|
||||||
#reload=True,
|
#reload=True,
|
||||||
#log_config=None,
|
#log_config=None,
|
||||||
#log_level="info"
|
log_level="debug"
|
||||||
)
|
)
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
import os
|
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
VENV_DIR = os.path.join(os.path.dirname(__file__), './../../venv')
|
|
||||||
|
|
||||||
class TranslatorLangConfig(BaseModel):
|
|
||||||
translator_llm: str = 'llama3.2:3b-instruct-q4_0' # TODO: this was migrated to translator_models - remove this
|
|
||||||
llm_client: str = 'ollama'
|
|
||||||
llm_host_url: str | None = 'http://localhost:11434'
|
|
||||||
llm_host_token: str | None = None
|
|
||||||
tts_system: str = 'piper'
|
|
||||||
tts_model: str ='de_DE-kerstin-low'
|
|
||||||
|
|
||||||
class TranslatorConfig(BaseModel):
|
|
||||||
deu: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'de_DE-thorsten-high')
|
|
||||||
eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium')
|
|
||||||
fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium')
|
|
||||||
spa: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'es_ES-sharvard-medium')
|
|
||||||
ita: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'it_IT-paola-medium')
|
|
||||||
|
|
||||||
+1
-2
@@ -3,10 +3,9 @@ API client functions for interacting with the Translator API.
|
|||||||
"""
|
"""
|
||||||
import requests
|
import requests
|
||||||
from typing import List, Optional, Dict, Any, Tuple
|
from typing import List, Optional, Dict, Any, Tuple
|
||||||
from enum import Enum
|
|
||||||
|
|
||||||
|
|
||||||
from multilang_translator.translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
|
from translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
|
||||||
|
|
||||||
|
|
||||||
# This can be overridden through environment variables
|
# This can be overridden through environment variables
|
||||||
+10
-3
@@ -35,12 +35,19 @@ class TranslatorLangConfig(BaseModel):
|
|||||||
# llm_host_url: str | None = 'http://localhost:11434'
|
# llm_host_url: str | None = 'http://localhost:11434'
|
||||||
# llm_host_token: str | None = None
|
# llm_host_token: str | None = None
|
||||||
|
|
||||||
tts_system: str = 'piper'
|
tts_system: str = 'piper' # Options: 'piper', 'xtts'
|
||||||
tts_model: str ='de_DE-kerstin-low'
|
tts_model: str = 'de_DE-kerstin-low' # For piper: model name, for xtts: unused
|
||||||
|
xtts_language: str = 'de' # Language code for XTTS
|
||||||
|
xtts_speaker: Optional[str] = None # Speaker name for XTTS
|
||||||
|
xtts_speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning
|
||||||
|
|
||||||
|
|
||||||
class TranslatorConfig(BaseModel):
|
class TranslatorConfig(BaseModel):
|
||||||
deu: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'de_DE-thorsten-high')
|
deu: TranslatorLangConfig = TranslatorLangConfig(
|
||||||
|
tts_system='xtts',
|
||||||
|
xtts_language='de',
|
||||||
|
xtts_speaker_wav='female.wav'
|
||||||
|
)
|
||||||
eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium')
|
eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium')
|
||||||
fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium')
|
fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium')
|
||||||
spa: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'es_ES-sharvard-medium')
|
spa: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'es_ES-sharvard-medium')
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
class SynthesizeRequest(BaseModel):
|
class SynthesizeRequest(BaseModel):
|
||||||
text: str
|
text: str
|
||||||
@@ -6,4 +7,6 @@ class SynthesizeRequest(BaseModel):
|
|||||||
framework: str = "piper"
|
framework: str = "piper"
|
||||||
model: str = "en_US-lessac-medium"
|
model: str = "en_US-lessac-medium"
|
||||||
return_lc3: bool = False
|
return_lc3: bool = False
|
||||||
|
language: str = "en" # Language code for XTTS
|
||||||
|
speaker: Optional[str] = None # Speaker name for XTTS
|
||||||
|
speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning
|
||||||
|
|||||||
Binary file not shown.
@@ -1,4 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
|
# Set environment variable to auto-accept Coqui TTS license
|
||||||
|
os.environ["COQUI_TOS_AGREED"] = "1"
|
||||||
|
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
@@ -6,16 +9,35 @@ import json
|
|||||||
import logging as log
|
import logging as log
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import torch
|
||||||
from voice_provider.utils.resample import resample_array
|
from voice_provider.utils.resample import resample_array
|
||||||
from voice_provider.utils.encode_lc3 import encode_lc3
|
from voice_provider.utils.encode_lc3 import encode_lc3
|
||||||
|
|
||||||
|
# Now import TTS - the license will be auto-accepted
|
||||||
|
from TTS.api import TTS
|
||||||
|
|
||||||
|
# Get device for XTTS
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
log.info('XTTS will run on GPU')
|
||||||
|
XTTS_DEVICE = "cuda"
|
||||||
|
else:
|
||||||
|
log.info('XTTS will run on CPU')
|
||||||
|
XTTS_DEVICE = "cpu"
|
||||||
|
|
||||||
|
# Load XTTS model globally - only once
|
||||||
|
log.info("Initializing XTTS model...")
|
||||||
|
start_init = time.time()
|
||||||
|
XTTS_MODEL = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(XTTS_DEVICE)
|
||||||
|
end_init = time.time()
|
||||||
|
log.info(f"XTTS initialization completed in {end_init - start_init:.2f} seconds")
|
||||||
|
|
||||||
PIPER_EXE = shutil.which('piper')
|
PIPER_EXE = shutil.which('piper')
|
||||||
|
|
||||||
TTS_DIR = os.path.join(os.path.dirname(__file__))
|
TTS_DIR = os.path.join(os.path.dirname(__file__))
|
||||||
PIPER_WORKDIR = f'{TTS_DIR}/piper'
|
PIPER_WORKDIR = f'{TTS_DIR}/piper'
|
||||||
|
|
||||||
if not PIPER_EXE:
|
if not PIPER_EXE:
|
||||||
PIPER_EXE = f'{TTS_DIR}/../../venv/bin/piper'
|
PIPER_EXE = f'{TTS_DIR}/../../.venv/bin/piper'
|
||||||
|
|
||||||
def synth_piper(text, model="en_US-lessac-medium"):
|
def synth_piper(text, model="en_US-lessac-medium"):
|
||||||
pwd = os.getcwd()
|
pwd = os.getcwd()
|
||||||
@@ -26,7 +48,7 @@ def synth_piper(text, model="en_US-lessac-medium"):
|
|||||||
ret = subprocess.run( # TODO: wrap this whole thing in a class and open a permanent pipe to the model
|
ret = subprocess.run( # TODO: wrap this whole thing in a class and open a permanent pipe to the model
|
||||||
[
|
[
|
||||||
PIPER_EXE,
|
PIPER_EXE,
|
||||||
'--cuda',
|
#'--cuda',
|
||||||
'--model', model,
|
'--model', model,
|
||||||
'--output-raw'
|
'--output-raw'
|
||||||
],
|
],
|
||||||
@@ -52,7 +74,10 @@ def synthesize(
|
|||||||
target_sample_rate,
|
target_sample_rate,
|
||||||
framework,
|
framework,
|
||||||
model="en_US-lessac-medium",
|
model="en_US-lessac-medium",
|
||||||
return_lc3=True
|
return_lc3=True,
|
||||||
|
language="en",
|
||||||
|
speaker=None,
|
||||||
|
speaker_wav=None
|
||||||
):
|
):
|
||||||
|
|
||||||
if framework == 'piper':
|
if framework == 'piper':
|
||||||
@@ -64,7 +89,40 @@ def synthesize(
|
|||||||
elif framework == 'koro':
|
elif framework == 'koro':
|
||||||
pass
|
pass
|
||||||
elif framework == 'xtts':
|
elif framework == 'xtts':
|
||||||
pass
|
start = time.time()
|
||||||
|
|
||||||
|
# Generate audio using XTTS
|
||||||
|
# XTTS always outputs at 24kHz
|
||||||
|
xtts_sample_rate = 24000
|
||||||
|
|
||||||
|
# Validate speaker parameters - XTTS needs either speaker or speaker_wav
|
||||||
|
if speaker is None and speaker_wav is None:
|
||||||
|
# Use the first available speaker if none specified
|
||||||
|
speaker = XTTS_MODEL.speakers[0]
|
||||||
|
log.info(f"No speaker specified, using default: {speaker}")
|
||||||
|
|
||||||
|
# Generate audio samples using tts.tts
|
||||||
|
if speaker_wav:
|
||||||
|
# expand path to speaker_wav folder
|
||||||
|
speaker_wav = os.path.join(os.path.dirname(__file__), 'speaker_wav', speaker_wav)
|
||||||
|
log.info(f"Generating XTTS audio with speaker_wav: {speaker_wav}")
|
||||||
|
audio_list = XTTS_MODEL.tts(text=text, speaker_wav=speaker_wav, language=language)
|
||||||
|
else:
|
||||||
|
log.info(f"Generating XTTS audio with speaker: {speaker}")
|
||||||
|
audio_list = XTTS_MODEL.tts(text=text, speaker=speaker, language=language)
|
||||||
|
|
||||||
|
# Ensure audio_np is a numpy array and properly scaled
|
||||||
|
audio_np = np.array(audio_list, dtype=np.float32)# / (2**15-1)
|
||||||
|
|
||||||
|
# Log some info about the audio data
|
||||||
|
log.info(f"XTTS audio shape: {audio_np.shape}, dtype: {audio_np.dtype}, "
|
||||||
|
f"min: {audio_np.min():.4f}, max: {audio_np.max():.4f}")
|
||||||
|
|
||||||
|
# Resample from the 24 kHz XTTS output to the target sample rate (no speedup is passed, so the default of 1.0 applies)
|
||||||
|
audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate)
|
||||||
|
|
||||||
|
log.info(f"XTTS synthesis completed in {time.time() - start:.2f} seconds")
|
||||||
|
|
||||||
elif framework == 'zonos':
|
elif framework == 'zonos':
|
||||||
pass
|
pass
|
||||||
else: raise NotImplementedError('unknown framework')
|
else: raise NotImplementedError('unknown framework')
|
||||||
@@ -82,7 +140,10 @@ async def synthesize_async(
|
|||||||
target_sample_rate,
|
target_sample_rate,
|
||||||
framework,
|
framework,
|
||||||
model="en_US-lessac-medium",
|
model="en_US-lessac-medium",
|
||||||
return_lc3=True
|
return_lc3=True,
|
||||||
|
language="en",
|
||||||
|
speaker=None,
|
||||||
|
speaker_wav=None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Asynchronous version of the synthesize function that runs in a thread pool.
|
Asynchronous version of the synthesize function that runs in a thread pool.
|
||||||
@@ -90,9 +151,12 @@ async def synthesize_async(
|
|||||||
Args:
|
Args:
|
||||||
text: Text to synthesize
|
text: Text to synthesize
|
||||||
target_sample_rate: Target sample rate for the audio
|
target_sample_rate: Target sample rate for the audio
|
||||||
framework: TTS framework to use (e.g., 'piper')
|
framework: TTS framework to use (e.g., 'piper', 'xtts')
|
||||||
model: Model to use for synthesis
|
model: Model to use for synthesis
|
||||||
return_lc3: Whether to return LC3-encoded audio
|
return_lc3: Whether to return LC3-encoded audio
|
||||||
|
language: Language code (used by XTTS)
|
||||||
|
speaker: Speaker ID for XTTS
|
||||||
|
speaker_wav: Path to speaker sample for XTTS voice cloning
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
LC3-encoded audio as string or raw audio as numpy array
|
LC3-encoded audio as string or raw audio as numpy array
|
||||||
@@ -101,23 +165,53 @@ async def synthesize_async(
|
|||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
result = await loop.run_in_executor(
|
result = await loop.run_in_executor(
|
||||||
None,
|
None,
|
||||||
lambda: synthesize(text, target_sample_rate, framework, model, return_lc3)
|
lambda: synthesize(
|
||||||
|
text,
|
||||||
|
target_sample_rate,
|
||||||
|
framework,
|
||||||
|
model,
|
||||||
|
return_lc3,
|
||||||
|
language,
|
||||||
|
speaker,
|
||||||
|
speaker_wav
|
||||||
|
)
|
||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import logging
|
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
|
||||||
logging.basicConfig(
|
log.basicConfig(
|
||||||
level=logging.INFO,
|
level=log.INFO,
|
||||||
format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
|
format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
|
||||||
)
|
)
|
||||||
target_rate=16000
|
target_rate = 16000
|
||||||
|
|
||||||
audio = synthesize('Hello World', target_rate, 'piper', model= 'de_DE-kerstin-low', return_lc3=False)
|
# First, print available XTTS speakers
|
||||||
|
print("Available XTTS speakers:")
|
||||||
|
print(XTTS_MODEL.speakers)
|
||||||
|
|
||||||
sf.write('hello.wav', audio, target_rate)
|
# Demo of Piper
|
||||||
|
print("Testing Piper TTS...")
|
||||||
|
audio_piper = synthesize('Hello World', target_rate, 'piper', model='de_DE-kerstin-low', return_lc3=False)
|
||||||
|
sf.write('hello_piper.wav', audio_piper, target_rate)
|
||||||
|
|
||||||
|
# Demo of XTTS in German, cloning the voice from the 'female.wav' speaker sample
|
||||||
|
speaker_wav = 'female.wav'
|
||||||
|
print(f"Testing XTTS with German language using speaker: {speaker_wav}")
|
||||||
|
text_to_synthesize = "Dies ist ein Test der XTTS Stimme auf Deutsch mit Annmarie Nele als Sprecherin."
|
||||||
|
|
||||||
|
audio_xtts = synthesize(
|
||||||
|
text=text_to_synthesize,
|
||||||
|
target_sample_rate=target_rate,
|
||||||
|
framework='xtts',
|
||||||
|
language='de',
|
||||||
|
speaker_wav=speaker_wav,
|
||||||
|
return_lc3=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Save the wav file
|
||||||
|
sf.write('hello_xtts_german.wav', audio_xtts, target_rate)
|
||||||
|
|
||||||
print('Done.')
|
print('Done.')
|
||||||
|
|||||||
@@ -17,7 +17,10 @@ async def synthesize_speech(request: SynthesizeRequest):
|
|||||||
target_sample_rate=request.target_sample_rate,
|
target_sample_rate=request.target_sample_rate,
|
||||||
framework=request.framework,
|
framework=request.framework,
|
||||||
model=request.model,
|
model=request.model,
|
||||||
return_lc3=request.return_lc3
|
return_lc3=request.return_lc3,
|
||||||
|
language=request.language,
|
||||||
|
speaker=request.speaker,
|
||||||
|
speaker_wav=request.speaker_wav
|
||||||
)
|
)
|
||||||
|
|
||||||
if request.return_lc3:
|
if request.return_lc3:
|
||||||
|
|||||||
@@ -24,16 +24,25 @@ def resample_file(filename, out_filename, target_rate):
|
|||||||
log.info("Resampling of %s took %s s", os.path.basename(filename), round(time.time() - start, 3))
|
log.info("Resampling of %s took %s s", os.path.basename(filename), round(time.time() - start, 3))
|
||||||
|
|
||||||
|
|
||||||
def resample_array(audio, rate, target_rate):
|
def resample_array(audio, rate, target_rate, speedup=1.0):
|
||||||
start=time.time()
|
start=time.time()
|
||||||
# Load the original audio file
|
# Load the original audio file
|
||||||
|
|
||||||
if rate == target_rate: # Nothing to do
|
if rate == target_rate and speedup == 1.0: # Nothing to do
|
||||||
log.info('audio already at target rate, skipping resample')
|
log.info('audio already at target rate with no speedup, skipping resample')
|
||||||
return audio
|
return audio
|
||||||
|
|
||||||
|
# Apply speedup if needed
|
||||||
|
if speedup != 1.0:
|
||||||
|
# To speed up, treat the input as if it were sampled at rate * speedup; resampling to the target rate then yields fewer samples
|
||||||
|
# This effectively shortens the audio duration
|
||||||
|
effective_orig_sr = rate * speedup
|
||||||
|
log.info(f"Applying speedup factor of {speedup}")
|
||||||
|
else:
|
||||||
|
effective_orig_sr = rate
|
||||||
|
|
||||||
# Convert the sample rate to target rate
|
# Convert the sample rate to target rate
|
||||||
resampled_audio = librosa.resample(audio, orig_sr=rate, target_sr=target_rate)
|
resampled_audio = librosa.resample(audio, orig_sr=effective_orig_sr, target_sr=target_rate)
|
||||||
|
|
||||||
# Save the resampled audio as a new .wav file
|
# Save the resampled audio as a new .wav file
|
||||||
|
|
||||||
@@ -45,4 +54,4 @@ if __name__ == "__main__":
|
|||||||
import os
|
import os
|
||||||
os.chdir(os.path.dirname(__file__))
|
os.chdir(os.path.dirname(__file__))
|
||||||
file_dir = '../text_to_speech/'
|
file_dir = '../text_to_speech/'
|
||||||
resample_file(f'{file_dir}/welcome.wav', f'{file_dir}/welcome_resampled.wav')
|
resample_file(f'{file_dir}/welcome.wav', f'{file_dir}/welcome_resampled.wav', 16000)
|
||||||
|
|||||||
+3
-3
@@ -4,9 +4,9 @@ import time
|
|||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
from multilang_translator.backend_controller.broadcaster_config import broadcaster_config, BROADCAST_CONFIG
|
from auracast_translator.backend_controller.broadcaster_config import broadcaster_config, BROADCAST_CONFIG
|
||||||
from multilang_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
|
from auracast_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
|
||||||
from multilang_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
|
from auracast_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
|
||||||
|
|
||||||
log.basicConfig(
|
log.basicConfig(
|
||||||
level=log.INFO,
|
level=log.INFO,
|
||||||
|
|||||||
@@ -4,9 +4,9 @@ import time
|
|||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
from multilang_translator.translator_config import LANG_CONFIG
|
from auracast_translator.translator_config import LANG_CONFIG
|
||||||
from multilang_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
|
from auracast_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
|
||||||
from multilang_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
|
from auracast_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
|
||||||
|
|
||||||
|
|
||||||
def test_config_broadcaster(ft_configure_broadcaster):
|
def test_config_broadcaster(ft_configure_broadcaster):
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from multilang_translator.main_local import announcement_from_german_text
|
from auracast_translator.main_local import announcement_from_german_text
|
||||||
from multilang_translator.translator import test_content
|
from auracast_translator.translator import test_content
|
||||||
|
|
||||||
|
|
||||||
def test_announcement_from_german_text(
|
def test_announcement_from_german_text(
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from multilang_translator.translator.llm_translator import translator_de_en, translator_de_fr, translator_de_it
|
from auracast_translator.translator.llm_translator import translator_de_en, translator_de_fr, translator_de_it
|
||||||
from multilang_translator.translator.test_content import TESTSENTENCE_DE_BROKER, TESTSENTENCE_DE_RAINBOW
|
from auracast_translator.translator.test_content import TESTSENTENCE_DE_BROKER, TESTSENTENCE_DE_RAINBOW
|
||||||
from multilang_translator.main_local import translate_from_german
|
from auracast_translator.main_local import translate_from_german
|
||||||
|
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
|||||||
+1
-1
@@ -1,4 +1,4 @@
|
|||||||
from multilang_translator.text_to_speech.text_to_speech import synthesize
|
from auracast_translator.text_to_speech.text_to_speech import synthesize
|
||||||
|
|
||||||
def test_synthesize():
|
def test_synthesize():
|
||||||
synthesize("Hello, how are you?", "en_US-lessac-medium", "hello.wav")
|
synthesize("Hello, how are you?", "en_US-lessac-medium", "hello.wav")
|
||||||
Reference in New Issue
Block a user