add basic support for xtts

This commit is contained in:
2025-03-25 12:02:59 +01:00
parent a9dbe52a7e
commit c095b058d6
7 changed files with 3113 additions and 130 deletions

3072
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -13,6 +13,7 @@ packages = [
[tool.poetry.dependencies]
python = "~3.11"
setuptools= ">=77"
coqui-tts = "0.26"
[tool.poetry.group.general.dependencies]
requests="2.32.3"
@@ -36,7 +37,7 @@ auracast = { git = "ssh://git@ssh.pstruebi.xyz:222/auracaster/bumble-auracast.gi
[tool.poetry.group.dev.dependencies]
pytest = ">8.2"
pytest = {version=">8.2", optional=true}
[tool.pytest.ini_options]
addopts = [

View File

@@ -35,12 +35,18 @@ class TranslatorLangConfig(BaseModel):
# llm_host_url: str | None = 'http://localhost:11434'
# llm_host_token: str | None = None
tts_system: str = 'piper'
tts_model: str ='de_DE-kerstin-low'
tts_system: str = 'piper' # Options: 'piper', 'xtts'
tts_model: str = 'de_DE-kerstin-low' # For piper: model name, for xtts: unused
tts_language: str = 'de' # Language code for XTTS
tts_speaker: Optional[str] = None # Speaker name for XTTS
class TranslatorConfig(BaseModel):
    """Per-target-language TTS configuration.

    German is synthesized via XTTS with a fixed speaker voice; the
    remaining languages keep their Piper voice models.
    """

    # German: XTTS with a dedicated speaker (tts_model is unused for xtts).
    deu: TranslatorLangConfig = TranslatorLangConfig(
        tts_system='xtts',
        tts_language='de',
        tts_speaker='Annmarie Nele',
    )
    # Other languages stay on Piper with language-specific voice models.
    eng: TranslatorLangConfig = TranslatorLangConfig(tts_model='en_GB-alba-medium')
    fra: TranslatorLangConfig = TranslatorLangConfig(tts_model='fr_FR-siwis-medium')
    spa: TranslatorLangConfig = TranslatorLangConfig(tts_model='es_ES-sharvard-medium')

View File

@@ -1,4 +1,5 @@
from pydantic import BaseModel
from typing import Optional
class SynthesizeRequest(BaseModel):
text: str
@@ -6,4 +7,6 @@ class SynthesizeRequest(BaseModel):
framework: str = "piper"
model: str = "en_US-lessac-medium"
return_lc3: bool = False
language: str = "en" # Language code for XTTS
speaker: Optional[str] = None # Speaker name for XTTS
speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning

View File

@@ -1,4 +1,7 @@
import os
# Set environment variable to auto-accept Coqui TTS license
os.environ["COQUI_TOS_AGREED"] = "1"
import shutil
import subprocess
import time
@@ -6,9 +9,28 @@ import json
import logging as log
import numpy as np
import asyncio
import torch
from voice_provider.utils.resample import resample_array
from voice_provider.utils.encode_lc3 import encode_lc3
# Now import TTS - the license will be auto-accepted
from TTS.api import TTS
# Get device for XTTS
if torch.cuda.is_available():
log.info('XTTS will run on GPU')
XTTS_DEVICE = "cuda"
else:
log.info('XTTS will run on CPU')
XTTS_DEVICE = "cpu"
# Load XTTS model globally - only once
log.info("Initializing XTTS model...")
start_init = time.time()
XTTS_MODEL = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(XTTS_DEVICE)
end_init = time.time()
log.info(f"XTTS initialization completed in {end_init - start_init:.2f} seconds")
PIPER_EXE = shutil.which('piper')
TTS_DIR = os.path.join(os.path.dirname(__file__))
@@ -26,7 +48,7 @@ def synth_piper(text, model="en_US-lessac-medium"):
ret = subprocess.run( # TODO: wrap this whole thing in a class and open a permanent pipe to the model
[
PIPER_EXE,
'--cuda',
#'--cuda',
'--model', model,
'--output-raw'
],
@@ -52,7 +74,10 @@ def synthesize(
target_sample_rate,
framework,
model="en_US-lessac-medium",
return_lc3=True
return_lc3=True,
language="en",
speaker=None,
speaker_wav=None
):
if framework == 'piper':
@@ -64,7 +89,38 @@ def synthesize(
elif framework == 'koro':
pass
elif framework == 'xtts':
pass
start = time.time()
# Generate audio using XTTS
# XTTS always outputs at 24kHz
xtts_sample_rate = 24000
# Validate speaker parameters - XTTS needs either speaker or speaker_wav
if speaker is None and speaker_wav is None:
# Use the first available speaker if none specified
speaker = XTTS_MODEL.speakers[0]
log.info(f"No speaker specified, using default: {speaker}")
# Generate audio samples using tts.tts
if speaker_wav:
log.info(f"Generating XTTS audio with speaker_wav: {speaker_wav}")
audio_list = XTTS_MODEL.tts(text=text, speaker_wav=speaker_wav, language=language)
else:
log.info(f"Generating XTTS audio with speaker: {speaker}")
audio_list = XTTS_MODEL.tts(text=text, speaker=speaker, language=language)
# Ensure audio_np is a numpy array and properly scaled
audio_np = np.array(audio_list, dtype=np.float32)# / (2**15-1)
# Log some info about the audio data
log.info(f"XTTS audio shape: {audio_np.shape}, dtype: {audio_np.dtype}, "
f"min: {audio_np.min():.4f}, max: {audio_np.max():.4f}")
# Resample from 24kHz to target sample rate with speedup factor
audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate, speedup=1.05)
log.info(f"XTTS synthesis completed in {time.time() - start:.2f} seconds")
elif framework == 'zonos':
pass
else: raise NotImplementedError('unknown framework')
@@ -82,7 +138,10 @@ async def synthesize_async(
target_sample_rate,
framework,
model="en_US-lessac-medium",
return_lc3=True
return_lc3=True,
language="en",
speaker=None,
speaker_wav=None
):
"""
Asynchronous version of the synthesize function that runs in a thread pool.
@@ -90,9 +149,12 @@ async def synthesize_async(
Args:
text: Text to synthesize
target_sample_rate: Target sample rate for the audio
framework: TTS framework to use (e.g., 'piper')
framework: TTS framework to use (e.g., 'piper', 'xtts')
model: Model to use for synthesis
return_lc3: Whether to return LC3-encoded audio
language: Language code (used by XTTS)
speaker: Speaker ID for XTTS
speaker_wav: Path to speaker sample for XTTS voice cloning
Returns:
LC3-encoded audio as string or raw audio as numpy array
@@ -101,23 +163,64 @@ async def synthesize_async(
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(
None,
lambda: synthesize(text, target_sample_rate, framework, model, return_lc3)
lambda: synthesize(
text,
target_sample_rate,
framework,
model,
return_lc3,
language,
speaker,
speaker_wav
)
)
return result
if __name__ == '__main__':
    # Manual smoke test: synthesize with Piper and XTTS and write WAV files.
    import soundfile as sf

    log.basicConfig(
        level=log.INFO,
        format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
    )
    target_rate = 16000

    # List the speaker voices bundled with the XTTS model.
    print("Available XTTS speakers:")
    print(XTTS_MODEL.speakers)

    # Demo of Piper
    print("Testing Piper TTS...")
    audio_piper = synthesize('Hello World', target_rate, 'piper', model='de_DE-kerstin-low', return_lc3=False)
    sf.write('hello_piper.wav', audio_piper, target_rate)

    # Demo of XTTS with Annmarie Nele for German
    german_speaker = 'Annmarie Nele'
    print(f"Testing XTTS with German language using speaker: {german_speaker}")
    text_to_synthesize = "Dies ist ein Test der XTTS Stimme auf Deutsch mit Annmarie Nele als Sprecherin."
    audio_xtts = synthesize(
        text=text_to_synthesize,
        target_sample_rate=target_rate,
        framework='xtts',
        language='de',
        speaker=german_speaker,
        return_lc3=False
    )
    # Save the wav file
    sf.write('hello_xtts_german.wav', audio_xtts, target_rate)

    # Also exercise the LC3-encoded output path.
    lc3_xtts = synthesize(
        text=text_to_synthesize,
        target_sample_rate=target_rate,
        framework='xtts',
        language='de',
        speaker=german_speaker,
        return_lc3=True
    )
    print(f"Generated LC3 data length: {len(lc3_xtts)} bytes")
    print('Done.')

View File

@@ -17,7 +17,10 @@ async def synthesize_speech(request: SynthesizeRequest):
target_sample_rate=request.target_sample_rate,
framework=request.framework,
model=request.model,
return_lc3=request.return_lc3
return_lc3=request.return_lc3,
language=request.language,
speaker=request.speaker,
speaker_wav=request.speaker_wav
)
if request.return_lc3:

View File

@@ -24,20 +24,29 @@ def resample_file(filename, out_filename, target_rate):
log.info("Resampling of %s took %s s", os.path.basename(filename), round(time.time() - start, 3))
def resample_array(audio, rate, target_rate, speedup=1.0):
    """Resample an audio array to ``target_rate``, optionally speeding it up.

    Args:
        audio: Audio sample array at sample rate ``rate``.
        rate: Original sample rate in Hz.
        target_rate: Desired output sample rate in Hz.
        speedup: Playback speed factor; values > 1.0 shorten the audio.
            Defaults to 1.0 (no change), preserving the old call signature.

    Returns:
        The resampled (and possibly sped-up) audio array, or the input
        array unchanged when no work is needed.
    """
    start = time.time()
    # Fast path: rates already match and no speedup requested.
    if rate == target_rate and speedup == 1.0:
        log.info('audio already at target rate with no speedup, skipping resample')
        return audio
    # A speedup is realized by declaring a higher original sample rate:
    # the resampler then compresses the signal, shortening its duration.
    if speedup != 1.0:
        log.info("Applying speedup factor of %s", speedup)
        effective_orig_sr = rate * speedup
    else:
        effective_orig_sr = rate
    resampled_audio = librosa.resample(audio, orig_sr=effective_orig_sr, target_sr=target_rate)
    log.info("Resampling took %s s", round(time.time() - start, 3))
    return resampled_audio