add basic support for xtts
This commit is contained in:
3072
poetry.lock
generated
3072
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -13,6 +13,7 @@ packages = [
|
||||
[tool.poetry.dependencies]
|
||||
python = "~3.11"
|
||||
setuptools= ">=77"
|
||||
coqui-tts = "0.26"
|
||||
|
||||
[tool.poetry.group.general.dependencies]
|
||||
requests="2.32.3"
|
||||
@@ -36,7 +37,7 @@ auracast = { git = "ssh://git@ssh.pstruebi.xyz:222/auracaster/bumble-auracast.gi
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = ">8.2"
|
||||
pytest = {version=">8.2", optional=true}
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
addopts = [
|
||||
|
||||
@@ -35,12 +35,18 @@ class TranslatorLangConfig(BaseModel):
|
||||
# llm_host_url: str | None = 'http://localhost:11434'
|
||||
# llm_host_token: str | None = None
|
||||
|
||||
tts_system: str = 'piper'
|
||||
tts_model: str ='de_DE-kerstin-low'
|
||||
tts_system: str = 'piper' # Options: 'piper', 'xtts'
|
||||
tts_model: str = 'de_DE-kerstin-low' # For piper: model name, for xtts: unused
|
||||
tts_language: str = 'de' # Language code for XTTS
|
||||
tts_speaker: Optional[str] = None # Speaker name for XTTS
|
||||
|
||||
|
||||
class TranslatorConfig(BaseModel):
|
||||
deu: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'de_DE-thorsten-high')
|
||||
deu: TranslatorLangConfig = TranslatorLangConfig(
|
||||
tts_system='xtts',
|
||||
tts_language='de',
|
||||
tts_speaker='Annmarie Nele'
|
||||
)
|
||||
eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium')
|
||||
fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium')
|
||||
spa: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'es_ES-sharvard-medium')
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
|
||||
class SynthesizeRequest(BaseModel):
|
||||
text: str
|
||||
@@ -6,4 +7,6 @@ class SynthesizeRequest(BaseModel):
|
||||
framework: str = "piper"
|
||||
model: str = "en_US-lessac-medium"
|
||||
return_lc3: bool = False
|
||||
|
||||
language: str = "en" # Language code for XTTS
|
||||
speaker: Optional[str] = None # Speaker name for XTTS
|
||||
speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
import os
|
||||
# Set environment variable to auto-accept Coqui TTS license
|
||||
os.environ["COQUI_TOS_AGREED"] = "1"
|
||||
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
@@ -6,9 +9,28 @@ import json
|
||||
import logging as log
|
||||
import numpy as np
|
||||
import asyncio
|
||||
import torch
|
||||
from voice_provider.utils.resample import resample_array
|
||||
from voice_provider.utils.encode_lc3 import encode_lc3
|
||||
|
||||
# Now import TTS - the license will be auto-accepted
|
||||
from TTS.api import TTS
|
||||
|
||||
# Get device for XTTS
|
||||
if torch.cuda.is_available():
|
||||
log.info('XTTS will run on GPU')
|
||||
XTTS_DEVICE = "cuda"
|
||||
else:
|
||||
log.info('XTTS will run on CPU')
|
||||
XTTS_DEVICE = "cpu"
|
||||
|
||||
# Load XTTS model globally - only once
|
||||
log.info("Initializing XTTS model...")
|
||||
start_init = time.time()
|
||||
XTTS_MODEL = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(XTTS_DEVICE)
|
||||
end_init = time.time()
|
||||
log.info(f"XTTS initialization completed in {end_init - start_init:.2f} seconds")
|
||||
|
||||
PIPER_EXE = shutil.which('piper')
|
||||
|
||||
TTS_DIR = os.path.join(os.path.dirname(__file__))
|
||||
@@ -26,7 +48,7 @@ def synth_piper(text, model="en_US-lessac-medium"):
|
||||
ret = subprocess.run( # TODO: wrap this whole thing in a class and open a permanent pipe to the model
|
||||
[
|
||||
PIPER_EXE,
|
||||
'--cuda',
|
||||
#'--cuda',
|
||||
'--model', model,
|
||||
'--output-raw'
|
||||
],
|
||||
@@ -52,7 +74,10 @@ def synthesize(
|
||||
target_sample_rate,
|
||||
framework,
|
||||
model="en_US-lessac-medium",
|
||||
return_lc3=True
|
||||
return_lc3=True,
|
||||
language="en",
|
||||
speaker=None,
|
||||
speaker_wav=None
|
||||
):
|
||||
|
||||
if framework == 'piper':
|
||||
@@ -64,7 +89,38 @@ def synthesize(
|
||||
elif framework == 'koro':
|
||||
pass
|
||||
elif framework == 'xtts':
|
||||
pass
|
||||
start = time.time()
|
||||
|
||||
# Generate audio using XTTS
|
||||
# XTTS always outputs at 24kHz
|
||||
xtts_sample_rate = 24000
|
||||
|
||||
# Validate speaker parameters - XTTS needs either speaker or speaker_wav
|
||||
if speaker is None and speaker_wav is None:
|
||||
# Use the first available speaker if none specified
|
||||
speaker = XTTS_MODEL.speakers[0]
|
||||
log.info(f"No speaker specified, using default: {speaker}")
|
||||
|
||||
# Generate audio samples using tts.tts
|
||||
if speaker_wav:
|
||||
log.info(f"Generating XTTS audio with speaker_wav: {speaker_wav}")
|
||||
audio_list = XTTS_MODEL.tts(text=text, speaker_wav=speaker_wav, language=language)
|
||||
else:
|
||||
log.info(f"Generating XTTS audio with speaker: {speaker}")
|
||||
audio_list = XTTS_MODEL.tts(text=text, speaker=speaker, language=language)
|
||||
|
||||
# Convert the XTTS output to a float32 numpy array (no rescaling is applied)
|
||||
audio_np = np.array(audio_list, dtype=np.float32)# / (2**15-1)
|
||||
|
||||
# Log some info about the audio data
|
||||
log.info(f"XTTS audio shape: {audio_np.shape}, dtype: {audio_np.dtype}, "
|
||||
f"min: {audio_np.min():.4f}, max: {audio_np.max():.4f}")
|
||||
|
||||
# Resample from 24kHz to target sample rate with speedup factor
|
||||
audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate, speedup=1.05)
|
||||
|
||||
log.info(f"XTTS synthesis completed in {time.time() - start:.2f} seconds")
|
||||
|
||||
elif framework == 'zonos':
|
||||
pass
|
||||
else: raise NotImplementedError('unknown framework')
|
||||
@@ -82,7 +138,10 @@ async def synthesize_async(
|
||||
target_sample_rate,
|
||||
framework,
|
||||
model="en_US-lessac-medium",
|
||||
return_lc3=True
|
||||
return_lc3=True,
|
||||
language="en",
|
||||
speaker=None,
|
||||
speaker_wav=None
|
||||
):
|
||||
"""
|
||||
Asynchronous version of the synthesize function that runs in a thread pool.
|
||||
@@ -90,9 +149,12 @@ async def synthesize_async(
|
||||
Args:
|
||||
text: Text to synthesize
|
||||
target_sample_rate: Target sample rate for the audio
|
||||
framework: TTS framework to use (e.g., 'piper')
|
||||
framework: TTS framework to use (e.g., 'piper', 'xtts')
|
||||
model: Model to use for synthesis
|
||||
return_lc3: Whether to return LC3-encoded audio
|
||||
language: Language code (used by XTTS)
|
||||
speaker: Speaker name for XTTS (used when speaker_wav is not given)
|
||||
speaker_wav: Path to speaker sample for XTTS voice cloning
|
||||
|
||||
Returns:
|
||||
LC3-encoded audio as string or raw audio as numpy array
|
||||
@@ -101,23 +163,64 @@ async def synthesize_async(
|
||||
loop = asyncio.get_event_loop()
|
||||
result = await loop.run_in_executor(
|
||||
None,
|
||||
lambda: synthesize(text, target_sample_rate, framework, model, return_lc3)
|
||||
lambda: synthesize(
|
||||
text,
|
||||
target_sample_rate,
|
||||
framework,
|
||||
model,
|
||||
return_lc3,
|
||||
language,
|
||||
speaker,
|
||||
speaker_wav
|
||||
)
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import logging
|
||||
import soundfile as sf
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
log.basicConfig(
|
||||
level=log.INFO,
|
||||
format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
|
||||
)
|
||||
target_rate=16000
|
||||
target_rate = 16000
|
||||
|
||||
audio = synthesize('Hello World', target_rate, 'piper', model= 'de_DE-kerstin-low', return_lc3=False)
|
||||
# First, print available XTTS speakers
|
||||
print("Available XTTS speakers:")
|
||||
print(XTTS_MODEL.speakers)
|
||||
|
||||
# Demo of Piper
|
||||
print("Testing Piper TTS...")
|
||||
audio_piper = synthesize('Hello World', target_rate, 'piper', model='de_DE-kerstin-low', return_lc3=False)
|
||||
sf.write('hello_piper.wav', audio_piper, target_rate)
|
||||
|
||||
sf.write('hello.wav', audio, target_rate)
|
||||
# Demo of XTTS with Annmarie Nele for German
|
||||
german_speaker = 'Annmarie Nele'
|
||||
print(f"Testing XTTS with German language using speaker: {german_speaker}")
|
||||
text_to_synthesize = "Dies ist ein Test der XTTS Stimme auf Deutsch mit Annmarie Nele als Sprecherin."
|
||||
|
||||
audio_xtts = synthesize(
|
||||
text=text_to_synthesize,
|
||||
target_sample_rate=target_rate,
|
||||
framework='xtts',
|
||||
language='de',
|
||||
speaker=german_speaker,
|
||||
return_lc3=False
|
||||
)
|
||||
|
||||
# Save the wav file
|
||||
sf.write('hello_xtts_german.wav', audio_xtts, target_rate)
|
||||
|
||||
# Also test with LC3 encoding
|
||||
lc3_xtts = synthesize(
|
||||
text=text_to_synthesize,
|
||||
target_sample_rate=target_rate,
|
||||
framework='xtts',
|
||||
language='de',
|
||||
speaker=german_speaker,
|
||||
return_lc3=True
|
||||
)
|
||||
print(f"Generated LC3 data length: {len(lc3_xtts)} bytes")
|
||||
|
||||
print('Done.')
|
||||
|
||||
@@ -17,7 +17,10 @@ async def synthesize_speech(request: SynthesizeRequest):
|
||||
target_sample_rate=request.target_sample_rate,
|
||||
framework=request.framework,
|
||||
model=request.model,
|
||||
return_lc3=request.return_lc3
|
||||
return_lc3=request.return_lc3,
|
||||
language=request.language,
|
||||
speaker=request.speaker,
|
||||
speaker_wav=request.speaker_wav
|
||||
)
|
||||
|
||||
if request.return_lc3:
|
||||
|
||||
@@ -24,20 +24,29 @@ def resample_file(filename, out_filename, target_rate):
|
||||
log.info("Resampling of %s took %s s", os.path.basename(filename), round(time.time() - start, 3))
|
||||
|
||||
|
||||
def resample_array(audio, rate, target_rate):
|
||||
def resample_array(audio, rate, target_rate, speedup=1.0):
|
||||
start=time.time()
|
||||
# The audio is passed in as an in-memory array; nothing is loaded from disk
|
||||
|
||||
if rate == target_rate: # Nothing to do
|
||||
log.info('audio already at target rate, skipping resample')
|
||||
if rate == target_rate and speedup == 1.0: # Nothing to do
|
||||
log.info('audio already at target rate with no speedup, skipping resample')
|
||||
return audio
|
||||
|
||||
# Apply speedup if needed
|
||||
if speedup != 1.0:
|
||||
# Speed up by declaring the input rate as rate * speedup: the resampler then
|
||||
# emits fewer samples, shortening the audio (pitch rises by the same factor)
|
||||
effective_orig_sr = rate * speedup
|
||||
log.info(f"Applying speedup factor of {speedup}")
|
||||
else:
|
||||
effective_orig_sr = rate
|
||||
|
||||
# Convert the sample rate to target rate
|
||||
resampled_audio = librosa.resample(audio, orig_sr=rate, target_sr=target_rate)
|
||||
resampled_audio = librosa.resample(audio, orig_sr=effective_orig_sr, target_sr=target_rate)
|
||||
|
||||
# The resampled array is returned to the caller; nothing is written to disk
|
||||
|
||||
log.info("Resampling took %s s", round(time.time() - start, 3))
|
||||
log.info("Resampling took %s s", round(time.time() - start, 3))
|
||||
return resampled_audio
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user