Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 3759a026d4 | |||
| 6bfbc6e180 | |||
| e02593c78d | |||
| c095b058d6 | |||
| a9dbe52a7e | |||
| 1d4a2b3b45 | |||
| 36dd34b042 | |||
| 4971f1e7f6 | |||
| 466fb1762e | |||
| b9ca04af82 | |||
| 17cf41166b | |||
| 5e5c3e2040 |
@@ -1,5 +1,6 @@
|
||||
*.pyc
|
||||
*.wav
|
||||
!/src/voice_provider/speaker_wav/*
|
||||
*.lc3
|
||||
*.onnx
|
||||
*.onnx.json
|
||||
|
||||
+35
@@ -0,0 +1,35 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
# Install system dependencies and poetry
|
||||
RUN apt-get update && apt-get install -y \
|
||||
git \
|
||||
gcc \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Disable strict SSH host-key checking so unknown SSH servers are accepted without a prompt
|
||||
RUN sed /^StrictHostKeyChecking/d /etc/ssh/ssh_config; \
|
||||
echo StrictHostKeyChecking no >> /etc/ssh/ssh_config
|
||||
|
||||
# Install and configure poetry
|
||||
RUN --mount=type=cache,target=/root/.cache \
|
||||
pip install poetry
|
||||
RUN poetry config virtualenvs.create false
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# copy the app code
|
||||
COPY ./src .
|
||||
COPY poetry.lock .
|
||||
COPY pyproject.toml .
|
||||
|
||||
# Install the project with all dependencies
|
||||
RUN --mount=type=cache,target=/root/.cache \
|
||||
--mount=type=ssh,required=true \
|
||||
poetry install --no-interaction --without dev --no-root
|
||||
|
||||
# Expose the API port
|
||||
EXPOSE 7999
|
||||
|
||||
# Run the translator server directly from the module path
|
||||
CMD ["python", "-m", "auracast_translator.translator_server.translator_server"]
|
||||
@@ -0,0 +1,13 @@
|
||||
services:
|
||||
auracast-translator:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
ssh:
|
||||
- default=~/.ssh/id_ed25519 #lappi
|
||||
ports:
|
||||
- "7999:7999"
|
||||
environment:
|
||||
- PYTHONUNBUFFERED=1
|
||||
restart: unless-stopped
|
||||
|
||||
Generated
+5392
File diff suppressed because it is too large
Load Diff
+38
-22
@@ -1,27 +1,43 @@
|
||||
[project]
|
||||
name = "multilang_translator"
|
||||
requires-python = ">= 3.11"
|
||||
version = '0.1'
|
||||
|
||||
dependencies = [
|
||||
"auracast @git+https://git@gitea.pstruebi.xyz/auracaster/bumble-auracast",
|
||||
"requests==2.32.3",
|
||||
"ollama==0.4.7",
|
||||
"aioconsole==0.8.1",
|
||||
"fastapi==0.115.11",
|
||||
"uvicorn==0.34.0",
|
||||
"aiohttp==3.9.3",
|
||||
[tool.poetry]
|
||||
name = "auracast_translator"
|
||||
version = "0.1.0"
|
||||
authors = ["Patrick S <pstruebi>"]
|
||||
description = "Announcement System"
|
||||
readme = "readme.md"
|
||||
packages = [
|
||||
{ include = "translator_models", from = "src" },
|
||||
{ include = "translator_client", from = "src" },
|
||||
{ include = "auracast_translator", from = "src" },
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
test = [
|
||||
"pytest >= 8.2",
|
||||
]
|
||||
[tool.poetry.dependencies]
|
||||
python = "~3.11"
|
||||
setuptools= ">=77"
|
||||
coqui-tts = "0.26"
|
||||
|
||||
[tool.poetry.group.tts.dependencies]
|
||||
piper-phonemize = "==1.1.0"
|
||||
piper-tts = "==1.2.0"
|
||||
[tool.poetry.group.general.dependencies]
|
||||
requests="2.32.3"
|
||||
aiohttp="3.9.3"
|
||||
fastapi="0.115.11"
|
||||
uvicorn="0.34.0"
|
||||
ollama="0.4.7"
|
||||
piper-tts="1.2.0"
|
||||
librosa="0.10.1"
|
||||
aioconsole="0.8.1"
|
||||
lc3 = { git = "ssh://git@ssh.pstruebi.xyz:222/auracaster/liblc3.git", rev = "7558637303106c7ea971e7bb8cedf379d3e08bcc" }
|
||||
auracast = { git = "ssh://git@ssh.pstruebi.xyz:222/auracaster/bumble-auracast.git" }
|
||||
|
||||
#[tool.poetry.group.gpu.dependencies]
|
||||
#onnxruntime-gpu = "^1.20.1"
|
||||
# TODO: for running piper on gpu investigate
|
||||
# https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements
|
||||
# put everything in pytorch container according to piper github:
|
||||
# https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch
|
||||
# Use a separate container for the voice provider
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = {version=">8.2", optional=true}
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
addopts = [
|
||||
@@ -29,5 +45,5 @@ addopts = [
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["setuptools>=61", "wheel", "setuptools_scm>=8"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
@@ -1,11 +1,6 @@
|
||||
# Prerequisites
|
||||
sudo apt install liblc3-tools
|
||||
|
||||
use python3.9
|
||||
pip install piper-tts soundfile librosa pyserial pytest
|
||||
# Install the project
|
||||
poetry env use python3.11
|
||||
poetry install
|
||||
|
||||
# Piper update voices
|
||||
piper --update-voices -m en_US-lessac-medium
|
||||
|
||||
# TODO:
|
||||
- investigate using a pipeline instead of writing to intermediate files to gain performance
|
||||
@@ -10,14 +10,14 @@ from auracast import auracast_config
|
||||
import voice_client
|
||||
import voice_models
|
||||
|
||||
from multilang_translator import translator_config
|
||||
from multilang_translator.translator import llm_translator
|
||||
from auracast_translator.translator_models import translator_models
|
||||
from auracast_translator.translator import llm_translator
|
||||
import voice_client.tts_client
|
||||
import voice_models.request_models
|
||||
|
||||
|
||||
async def announcement_from_german_text(
|
||||
config: translator_config.TranslatorConfigGroup,
|
||||
config: translator_models.TranslatorConfigGroup,
|
||||
text_de
|
||||
):
|
||||
base_lang = "deu"
|
||||
@@ -9,9 +9,9 @@ from auracast import multicast_control
|
||||
from auracast import auracast_config
|
||||
from voice_provider import text_to_speech
|
||||
|
||||
from multilang_translator import translator_config
|
||||
from multilang_translator.translator import llm_translator
|
||||
from multilang_translator.translator.test_content import TESTSENTENCE
|
||||
from auracast_translator import translator_config
|
||||
from auracast_translator.translator import llm_translator
|
||||
from auracast_translator.translator.test_content import TESTSENTENCE
|
||||
|
||||
# TODO: look for an end-to-end translation solution
|
||||
|
||||
+2
-2
@@ -6,7 +6,7 @@ import time
|
||||
import ollama
|
||||
import aiohttp
|
||||
|
||||
from multilang_translator.translator import syspromts
|
||||
from auracast_translator.translator import syspromts
|
||||
|
||||
# ollama.create( # TODO: create models on startup
|
||||
# model='example',
|
||||
@@ -125,7 +125,7 @@ async def translate_de_to_x_async(
|
||||
|
||||
if __name__ == "__main__":
|
||||
import time
|
||||
from multilang_translator.translator import test_content
|
||||
from auracast_translator.translator import test_content
|
||||
|
||||
|
||||
start=time.time()
|
||||
+4
-4
@@ -3,7 +3,7 @@ Database file for endpoint definitions.
|
||||
This file contains configurations for auracast endpoints including their IP addresses and capabilities.
|
||||
"""
|
||||
from typing import List, Optional
|
||||
from multilang_translator.translator_models.translator_models import EndpointGroup, Endpoint
|
||||
from translator_models.translator_models import EndpointGroup, Endpoint
|
||||
|
||||
|
||||
SUPPORTED_LANGUAGES = ["deu", "eng", "fra", "spa", "ita"]
|
||||
@@ -13,19 +13,19 @@ ENDPOINTS: dict[int: Endpoint] = { # for now make sure, .id and key are the same
|
||||
0: Endpoint(
|
||||
id=0,
|
||||
name="Local Endpoint",
|
||||
url="http://localhost:5000",
|
||||
url="http://10.13.13.3:5000", #"http://localhost:5000", #patricks laptop
|
||||
max_broadcasts=3,
|
||||
),
|
||||
1: Endpoint(
|
||||
id=1,
|
||||
name="Gate 1",
|
||||
url="http://pi3:5000",
|
||||
url="http://10.13.13.4:5000", #pi4
|
||||
max_broadcasts=3,
|
||||
),
|
||||
2: Endpoint(
|
||||
id=2,
|
||||
name="Gate 2",
|
||||
url="http://192.168.1.102:5000",
|
||||
url="http://10.13.13.5:5000",
|
||||
max_broadcasts=3,
|
||||
),
|
||||
}
|
||||
+2
-2
@@ -7,7 +7,7 @@ import logging as log
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add the parent directory to the Python path to find the multilang_translator package
|
||||
# Add the parent directory to the Python path to find the auracast_translator package
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
|
||||
if parent_dir not in sys.path:
|
||||
@@ -20,7 +20,7 @@ if __name__ == "__main__":
|
||||
)
|
||||
log.info("Starting Translator API server")
|
||||
uvicorn.run(
|
||||
"multilang_translator.translator_server.translator_server:app",
|
||||
"auracast_translator.translator_server.translator_server:app",
|
||||
host="0.0.0.0",
|
||||
port=7999,
|
||||
reload=True,
|
||||
+10
-6
@@ -11,9 +11,9 @@ from fastapi import FastAPI, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
# Import models
|
||||
from multilang_translator.translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
|
||||
from multilang_translator.translator import llm_translator
|
||||
from multilang_translator.translator_server import endpoints_db
|
||||
from translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
|
||||
from auracast_translator.translator import llm_translator
|
||||
from auracast_translator.translator_server import endpoints_db
|
||||
from voice_provider import text_to_speech
|
||||
|
||||
# Import the endpoints database and multicast client
|
||||
@@ -154,7 +154,10 @@ async def make_announcement(text: str, ep_group: EndpointGroup):
|
||||
ep_group.sampling_rate_hz,
|
||||
trans_conf.tts_system,
|
||||
trans_conf.tts_model,
|
||||
return_lc3=True
|
||||
return_lc3=True,
|
||||
language=trans_conf.xtts_language,
|
||||
speaker=trans_conf.xtts_speaker,
|
||||
speaker_wav=trans_conf.xtts_speaker_wav
|
||||
)
|
||||
synthesis_tasks.append(task)
|
||||
|
||||
@@ -329,9 +332,10 @@ async def get_available_languages():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
import uvicorn
|
||||
log.basicConfig(
|
||||
level=log.DEBUG,
|
||||
level=os.environ.get('LOG_LEVEL', log.DEBUG),
|
||||
format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
|
||||
)
|
||||
# with reload=True logging of modules does not function as expected
|
||||
@@ -342,5 +346,5 @@ if __name__ == "__main__":
|
||||
port=7999,
|
||||
#reload=True,
|
||||
#log_config=None,
|
||||
#log_level="info"
|
||||
log_level="debug"
|
||||
)
|
||||
@@ -1,20 +0,0 @@
|
||||
import os
|
||||
from pydantic import BaseModel
|
||||
|
||||
VENV_DIR = os.path.join(os.path.dirname(__file__), './../../venv')
|
||||
|
||||
class TranslatorLangConfig(BaseModel):
|
||||
translator_llm: str = 'llama3.2:3b-instruct-q4_0' # TODO: this was migrated to translator_models - remove this
|
||||
llm_client: str = 'ollama'
|
||||
llm_host_url: str | None = 'http://localhost:11434'
|
||||
llm_host_token: str | None = None
|
||||
tts_system: str = 'piper'
|
||||
tts_model: str ='de_DE-kerstin-low'
|
||||
|
||||
class TranslatorConfig(BaseModel):
|
||||
deu: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'de_DE-thorsten-high')
|
||||
eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium')
|
||||
fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium')
|
||||
spa: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'es_ES-sharvard-medium')
|
||||
ita: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'it_IT-paola-medium')
|
||||
|
||||
+1
-2
@@ -3,10 +3,9 @@ API client functions for interacting with the Translator API.
|
||||
"""
|
||||
import requests
|
||||
from typing import List, Optional, Dict, Any, Tuple
|
||||
from enum import Enum
|
||||
|
||||
|
||||
from multilang_translator.translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
|
||||
from translator_models.translator_models import AnnouncementStates, Endpoint, EndpointGroup
|
||||
|
||||
|
||||
# This can be overridden through environment variables
|
||||
+10
-3
@@ -35,12 +35,19 @@ class TranslatorLangConfig(BaseModel):
|
||||
# llm_host_url: str | None = 'http://localhost:11434'
|
||||
# llm_host_token: str | None = None
|
||||
|
||||
tts_system: str = 'piper'
|
||||
tts_model: str ='de_DE-kerstin-low'
|
||||
tts_system: str = 'piper' # Options: 'piper', 'xtts'
|
||||
tts_model: str = 'de_DE-kerstin-low' # For piper: model name, for xtts: unused
|
||||
xtts_language: str = 'de' # Language code for XTTS
|
||||
xtts_speaker: Optional[str] = None # Speaker name for XTTS
|
||||
xtts_speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning
|
||||
|
||||
|
||||
class TranslatorConfig(BaseModel):
|
||||
deu: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'de_DE-thorsten-high')
|
||||
deu: TranslatorLangConfig = TranslatorLangConfig(
|
||||
tts_system='xtts',
|
||||
xtts_language='de',
|
||||
xtts_speaker_wav='female.wav'
|
||||
)
|
||||
eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium')
|
||||
fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium')
|
||||
spa: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'es_ES-sharvard-medium')
|
||||
@@ -1,4 +1,5 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
|
||||
class SynthesizeRequest(BaseModel):
|
||||
text: str
|
||||
@@ -6,4 +7,6 @@ class SynthesizeRequest(BaseModel):
|
||||
framework: str = "piper"
|
||||
model: str = "en_US-lessac-medium"
|
||||
return_lc3: bool = False
|
||||
|
||||
language: str = "en" # Language code for XTTS
|
||||
speaker: Optional[str] = None # Speaker name for XTTS
|
||||
speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning
|
||||
|
||||
Binary file not shown.
@@ -1,4 +1,7 @@
|
||||
import os
|
||||
# Set environment variable to auto-accept Coqui TTS license
|
||||
os.environ["COQUI_TOS_AGREED"] = "1"
|
||||
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
@@ -6,16 +9,35 @@ import json
|
||||
import logging as log
|
||||
import numpy as np
|
||||
import asyncio
|
||||
import torch
|
||||
from voice_provider.utils.resample import resample_array
|
||||
from voice_provider.utils.encode_lc3 import encode_lc3
|
||||
|
||||
# Now import TTS - the license will be auto-accepted
|
||||
from TTS.api import TTS
|
||||
|
||||
# Get device for XTTS
|
||||
if torch.cuda.is_available():
|
||||
log.info('XTTS will run on GPU')
|
||||
XTTS_DEVICE = "cuda"
|
||||
else:
|
||||
log.info('XTTS will run on CPU')
|
||||
XTTS_DEVICE = "cpu"
|
||||
|
||||
# Load XTTS model globally - only once
|
||||
log.info("Initializing XTTS model...")
|
||||
start_init = time.time()
|
||||
XTTS_MODEL = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(XTTS_DEVICE)
|
||||
end_init = time.time()
|
||||
log.info(f"XTTS initialization completed in {end_init - start_init:.2f} seconds")
|
||||
|
||||
PIPER_EXE = shutil.which('piper')
|
||||
|
||||
TTS_DIR = os.path.join(os.path.dirname(__file__))
|
||||
PIPER_WORKDIR = f'{TTS_DIR}/piper'
|
||||
|
||||
if not PIPER_EXE:
|
||||
PIPER_EXE = f'{TTS_DIR}/../../venv/bin/piper'
|
||||
PIPER_EXE = f'{TTS_DIR}/../../.venv/bin/piper'
|
||||
|
||||
def synth_piper(text, model="en_US-lessac-medium"):
|
||||
pwd = os.getcwd()
|
||||
@@ -26,7 +48,7 @@ def synth_piper(text, model="en_US-lessac-medium"):
|
||||
ret = subprocess.run( # TODO: wrap this whole thing in a class and open a permanent pipe to the model
|
||||
[
|
||||
PIPER_EXE,
|
||||
'--cuda',
|
||||
#'--cuda',
|
||||
'--model', model,
|
||||
'--output-raw'
|
||||
],
|
||||
@@ -52,7 +74,10 @@ def synthesize(
|
||||
target_sample_rate,
|
||||
framework,
|
||||
model="en_US-lessac-medium",
|
||||
return_lc3=True
|
||||
return_lc3=True,
|
||||
language="en",
|
||||
speaker=None,
|
||||
speaker_wav=None
|
||||
):
|
||||
|
||||
if framework == 'piper':
|
||||
@@ -64,7 +89,40 @@ def synthesize(
|
||||
elif framework == 'koro':
|
||||
pass
|
||||
elif framework == 'xtts':
|
||||
pass
|
||||
start = time.time()
|
||||
|
||||
# Generate audio using XTTS
|
||||
# XTTS always outputs at 24kHz
|
||||
xtts_sample_rate = 24000
|
||||
|
||||
# Validate speaker parameters - XTTS needs either speaker or speaker_wav
|
||||
if speaker is None and speaker_wav is None:
|
||||
# Use the first available speaker if none specified
|
||||
speaker = XTTS_MODEL.speakers[0]
|
||||
log.info(f"No speaker specified, using default: {speaker}")
|
||||
|
||||
# Generate audio samples using tts.tts
|
||||
if speaker_wav:
|
||||
# expand path to speaker_wav folder
|
||||
speaker_wav = os.path.join(os.path.dirname(__file__), 'speaker_wav', speaker_wav)
|
||||
log.info(f"Generating XTTS audio with speaker_wav: {speaker_wav}")
|
||||
audio_list = XTTS_MODEL.tts(text=text, speaker_wav=speaker_wav, language=language)
|
||||
else:
|
||||
log.info(f"Generating XTTS audio with speaker: {speaker}")
|
||||
audio_list = XTTS_MODEL.tts(text=text, speaker=speaker, language=language)
|
||||
|
||||
# Convert the XTTS output to a float32 numpy array (no rescaling is applied)
|
||||
audio_np = np.array(audio_list, dtype=np.float32)# / (2**15-1)
|
||||
|
||||
# Log some info about the audio data
|
||||
log.info(f"XTTS audio shape: {audio_np.shape}, dtype: {audio_np.dtype}, "
|
||||
f"min: {audio_np.min():.4f}, max: {audio_np.max():.4f}")
|
||||
|
||||
# Resample from 24kHz to the target sample rate (no speedup is passed, so the default of 1.0 applies)
|
||||
audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate)
|
||||
|
||||
log.info(f"XTTS synthesis completed in {time.time() - start:.2f} seconds")
|
||||
|
||||
elif framework == 'zonos':
|
||||
pass
|
||||
else: raise NotImplementedError('unknown framework')
|
||||
@@ -82,7 +140,10 @@ async def synthesize_async(
|
||||
target_sample_rate,
|
||||
framework,
|
||||
model="en_US-lessac-medium",
|
||||
return_lc3=True
|
||||
return_lc3=True,
|
||||
language="en",
|
||||
speaker=None,
|
||||
speaker_wav=None
|
||||
):
|
||||
"""
|
||||
Asynchronous version of the synthesize function that runs in a thread pool.
|
||||
@@ -90,9 +151,12 @@ async def synthesize_async(
|
||||
Args:
|
||||
text: Text to synthesize
|
||||
target_sample_rate: Target sample rate for the audio
|
||||
framework: TTS framework to use (e.g., 'piper')
|
||||
framework: TTS framework to use (e.g., 'piper', 'xtts')
|
||||
model: Model to use for synthesis
|
||||
return_lc3: Whether to return LC3-encoded audio
|
||||
language: Language code (used by XTTS)
|
||||
speaker: Speaker ID for XTTS
|
||||
speaker_wav: Path to speaker sample for XTTS voice cloning
|
||||
|
||||
Returns:
|
||||
LC3-encoded audio as string or raw audio as numpy array
|
||||
@@ -101,23 +165,53 @@ async def synthesize_async(
|
||||
loop = asyncio.get_event_loop()
|
||||
result = await loop.run_in_executor(
|
||||
None,
|
||||
lambda: synthesize(text, target_sample_rate, framework, model, return_lc3)
|
||||
lambda: synthesize(
|
||||
text,
|
||||
target_sample_rate,
|
||||
framework,
|
||||
model,
|
||||
return_lc3,
|
||||
language,
|
||||
speaker,
|
||||
speaker_wav
|
||||
)
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import logging
|
||||
import soundfile as sf
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
log.basicConfig(
|
||||
level=log.INFO,
|
||||
format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
|
||||
)
|
||||
target_rate=16000
|
||||
target_rate = 16000
|
||||
|
||||
audio = synthesize('Hello World', target_rate, 'piper', model= 'de_DE-kerstin-low', return_lc3=False)
|
||||
|
||||
sf.write('hello.wav', audio, target_rate)
|
||||
# First, print available XTTS speakers
|
||||
print("Available XTTS speakers:")
|
||||
print(XTTS_MODEL.speakers)
|
||||
|
||||
# Demo of Piper
|
||||
print("Testing Piper TTS...")
|
||||
audio_piper = synthesize('Hello World', target_rate, 'piper', model='de_DE-kerstin-low', return_lc3=False)
|
||||
sf.write('hello_piper.wav', audio_piper, target_rate)
|
||||
|
||||
# Demo of XTTS with Annmarie Nele for German
|
||||
speaker_wav = 'female.wav'
|
||||
print(f"Testing XTTS with German language using speaker: {speaker_wav}")
|
||||
text_to_synthesize = "Dies ist ein Test der XTTS Stimme auf Deutsch mit Annmarie Nele als Sprecherin."
|
||||
|
||||
audio_xtts = synthesize(
|
||||
text=text_to_synthesize,
|
||||
target_sample_rate=target_rate,
|
||||
framework='xtts',
|
||||
language='de',
|
||||
speaker_wav=speaker_wav,
|
||||
return_lc3=False
|
||||
)
|
||||
|
||||
# Save the wav file
|
||||
sf.write('hello_xtts_german.wav', audio_xtts, target_rate)
|
||||
|
||||
print('Done.')
|
||||
|
||||
@@ -17,7 +17,10 @@ async def synthesize_speech(request: SynthesizeRequest):
|
||||
target_sample_rate=request.target_sample_rate,
|
||||
framework=request.framework,
|
||||
model=request.model,
|
||||
return_lc3=request.return_lc3
|
||||
return_lc3=request.return_lc3,
|
||||
language=request.language,
|
||||
speaker=request.speaker,
|
||||
speaker_wav=request.speaker_wav
|
||||
)
|
||||
|
||||
if request.return_lc3:
|
||||
|
||||
@@ -24,20 +24,29 @@ def resample_file(filename, out_filename, target_rate):
|
||||
log.info("Resampling of %s took %s s", os.path.basename(filename), round(time.time() - start, 3))
|
||||
|
||||
|
||||
def resample_array(audio, rate, target_rate):
|
||||
def resample_array(audio, rate, target_rate, speedup=1.0):
|
||||
start=time.time()
|
||||
# Load the original audio file
|
||||
|
||||
if rate == target_rate: # Nothing to do
|
||||
log.info('audio already at target rate, skipping resample')
|
||||
if rate == target_rate and speedup == 1.0: # Nothing to do
|
||||
log.info('audio already at target rate with no speedup, skipping resample')
|
||||
return audio
|
||||
|
||||
# Apply speedup if needed
|
||||
if speedup != 1.0:
|
||||
# When speeding up, treat the input as if its sample rate were rate * speedup
|
||||
# This effectively shortens the audio duration
|
||||
effective_orig_sr = rate * speedup
|
||||
log.info(f"Applying speedup factor of {speedup}")
|
||||
else:
|
||||
effective_orig_sr = rate
|
||||
|
||||
# Convert the sample rate to target rate
|
||||
resampled_audio = librosa.resample(audio, orig_sr=rate, target_sr=target_rate)
|
||||
resampled_audio = librosa.resample(audio, orig_sr=effective_orig_sr, target_sr=target_rate)
|
||||
|
||||
# Save the resampled audio as a new .wav file
|
||||
|
||||
log.info("Resampling took %s s", round(time.time() - start, 3))
|
||||
log.info("Resampling took %s s", round(time.time() - start, 3))
|
||||
return resampled_audio
|
||||
|
||||
|
||||
@@ -45,4 +54,4 @@ if __name__ == "__main__":
|
||||
import os
|
||||
os.chdir(os.path.dirname(__file__))
|
||||
file_dir = '../text_to_speech/'
|
||||
resample_file(f'{file_dir}/welcome.wav', f'{file_dir}/welcome_resampled.wav')
|
||||
resample_file(f'{file_dir}/welcome.wav', f'{file_dir}/welcome_resampled.wav', 16000)
|
||||
|
||||
+3
-3
@@ -4,9 +4,9 @@ import time
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from multilang_translator.backend_controller.broadcaster_config import broadcaster_config, BROADCAST_CONFIG
|
||||
from multilang_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
|
||||
from multilang_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
|
||||
from auracast_translator.backend_controller.broadcaster_config import broadcaster_config, BROADCAST_CONFIG
|
||||
from auracast_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
|
||||
from auracast_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
|
||||
|
||||
log.basicConfig(
|
||||
level=log.INFO,
|
||||
|
||||
@@ -4,9 +4,9 @@ import time
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from multilang_translator.translator_config import LANG_CONFIG
|
||||
from multilang_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
|
||||
from multilang_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
|
||||
from auracast_translator.translator_config import LANG_CONFIG
|
||||
from auracast_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
|
||||
from auracast_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster
|
||||
|
||||
|
||||
def test_config_broadcaster(ft_configure_broadcaster):
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from multilang_translator.main_local import announcement_from_german_text
|
||||
from multilang_translator.translator import test_content
|
||||
from auracast_translator.main_local import announcement_from_german_text
|
||||
from auracast_translator.translator import test_content
|
||||
|
||||
|
||||
def test_announcement_from_german_text(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from multilang_translator.translator.llm_translator import translator_de_en, translator_de_fr, translator_de_it
|
||||
from multilang_translator.translator.test_content import TESTSENTENCE_DE_BROKER, TESTSENTENCE_DE_RAINBOW
|
||||
from multilang_translator.main_local import translate_from_german
|
||||
from auracast_translator.translator.llm_translator import translator_de_en, translator_de_fr, translator_de_it
|
||||
from auracast_translator.translator.test_content import TESTSENTENCE_DE_BROKER, TESTSENTENCE_DE_RAINBOW
|
||||
from auracast_translator.main_local import translate_from_german
|
||||
|
||||
|
||||
import time
|
||||
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
from multilang_translator.text_to_speech.text_to_speech import synthesize
|
||||
from auracast_translator.text_to_speech.text_to_speech import synthesize
|
||||
|
||||
def test_synthesize():
|
||||
synthesize("Hello, how are you?", "en_US-lessac-medium", "hello.wav")
|
||||
Reference in New Issue
Block a user