add basic support for xtts

This commit is contained in:
2025-03-25 12:02:59 +01:00
parent a9dbe52a7e
commit c095b058d6
7 changed files with 3113 additions and 130 deletions

3072
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -13,6 +13,7 @@ packages = [
[tool.poetry.dependencies]
python = "~3.11"
setuptools= ">=77"
coqui-tts = "0.26"
[tool.poetry.group.general.dependencies]
requests="2.32.3"
@@ -36,7 +37,7 @@ auracast = { git = "ssh://git@ssh.pstruebi.xyz:222/auracaster/bumble-auracast.gi
[tool.poetry.group.dev.dependencies]
pytest = ">8.2"
pytest = {version=">8.2", optional=true}
[tool.pytest.ini_options]
addopts = [

View File

@@ -35,12 +35,18 @@ class TranslatorLangConfig(BaseModel):
# llm_host_url: str | None = 'http://localhost:11434'
# llm_host_token: str | None = None
tts_system: str = 'piper'
tts_model: str ='de_DE-kerstin-low'
tts_system: str = 'piper' # Options: 'piper', 'xtts'
tts_model: str = 'de_DE-kerstin-low' # For piper: model name, for xtts: unused
tts_language: str = 'de' # Language code for XTTS
tts_speaker: Optional[str] = None # Speaker name for XTTS
class TranslatorConfig(BaseModel):
    """Per-target-language TTS configuration.

    German is synthesized via XTTS with a fixed speaker voice; the
    remaining languages keep their Piper voice models.
    """

    # German: XTTS with a dedicated speaker (tts_model is unused for xtts).
    deu: TranslatorLangConfig = TranslatorLangConfig(
        tts_system='xtts',
        tts_language='de',
        tts_speaker='Annmarie Nele',
    )
    # Other languages stay on Piper with language-specific voice models.
    eng: TranslatorLangConfig = TranslatorLangConfig(tts_model='en_GB-alba-medium')
    fra: TranslatorLangConfig = TranslatorLangConfig(tts_model='fr_FR-siwis-medium')
    spa: TranslatorLangConfig = TranslatorLangConfig(tts_model='es_ES-sharvard-medium')

View File

@@ -1,4 +1,5 @@
from pydantic import BaseModel
from typing import Optional
class SynthesizeRequest(BaseModel):
text: str
@@ -6,4 +7,6 @@ class SynthesizeRequest(BaseModel):
framework: str = "piper"
model: str = "en_US-lessac-medium"
return_lc3: bool = False
language: str = "en" # Language code for XTTS
speaker: Optional[str] = None # Speaker name for XTTS
speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning

View File

@@ -1,4 +1,7 @@
import os
# Set environment variable to auto-accept Coqui TTS license
os.environ["COQUI_TOS_AGREED"] = "1"
import shutil
import subprocess
import time
@@ -6,9 +9,28 @@ import json
import logging as log
import numpy as np
import asyncio
import torch
from voice_provider.utils.resample import resample_array
from voice_provider.utils.encode_lc3 import encode_lc3
# Now import TTS - the license will be auto-accepted
from TTS.api import TTS
# Get device for XTTS
if torch.cuda.is_available():
log.info('XTTS will run on GPU')
XTTS_DEVICE = "cuda"
else:
log.info('XTTS will run on CPU')
XTTS_DEVICE = "cpu"
# Load XTTS model globally - only once
log.info("Initializing XTTS model...")
start_init = time.time()
XTTS_MODEL = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(XTTS_DEVICE)
end_init = time.time()
log.info(f"XTTS initialization completed in {end_init - start_init:.2f} seconds")
PIPER_EXE = shutil.which('piper')
TTS_DIR = os.path.join(os.path.dirname(__file__))
@@ -26,7 +48,7 @@ def synth_piper(text, model="en_US-lessac-medium"):
ret = subprocess.run( # TODO: wrap this whole thing in a class and open a permanent pipe to the model
[
PIPER_EXE,
'--cuda',
#'--cuda',
'--model', model,
'--output-raw'
],
@@ -52,7 +74,10 @@ def synthesize(
target_sample_rate,
framework,
model="en_US-lessac-medium",
return_lc3=True
return_lc3=True,
language="en",
speaker=None,
speaker_wav=None
):
if framework == 'piper':
@@ -64,7 +89,38 @@ def synthesize(
elif framework == 'koro':
pass
elif framework == 'xtts':
pass
start = time.time()
# Generate audio using XTTS
# XTTS always outputs at 24kHz
xtts_sample_rate = 24000
# Validate speaker parameters - XTTS needs either speaker or speaker_wav
if speaker is None and speaker_wav is None:
# Use the first available speaker if none specified
speaker = XTTS_MODEL.speakers[0]
log.info(f"No speaker specified, using default: {speaker}")
# Generate audio samples using tts.tts
if speaker_wav:
log.info(f"Generating XTTS audio with speaker_wav: {speaker_wav}")
audio_list = XTTS_MODEL.tts(text=text, speaker_wav=speaker_wav, language=language)
else:
log.info(f"Generating XTTS audio with speaker: {speaker}")
audio_list = XTTS_MODEL.tts(text=text, speaker=speaker, language=language)
# Ensure audio_np is a numpy array and properly scaled
audio_np = np.array(audio_list, dtype=np.float32)# / (2**15-1)
# Log some info about the audio data
log.info(f"XTTS audio shape: {audio_np.shape}, dtype: {audio_np.dtype}, "
f"min: {audio_np.min():.4f}, max: {audio_np.max():.4f}")
# Resample from 24kHz to target sample rate with speedup factor
audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate, speedup=1.05)
log.info(f"XTTS synthesis completed in {time.time() - start:.2f} seconds")
elif framework == 'zonos':
pass
else: raise NotImplementedError('unknown framework')
@@ -82,7 +138,10 @@ async def synthesize_async(
target_sample_rate,
framework,
model="en_US-lessac-medium",
return_lc3=True
return_lc3=True,
language="en",
speaker=None,
speaker_wav=None
):
"""
Asynchronous version of the synthesize function that runs in a thread pool.
@@ -90,9 +149,12 @@ async def synthesize_async(
Args:
text: Text to synthesize
target_sample_rate: Target sample rate for the audio
framework: TTS framework to use (e.g., 'piper')
framework: TTS framework to use (e.g., 'piper', 'xtts')
model: Model to use for synthesis
return_lc3: Whether to return LC3-encoded audio
language: Language code (used by XTTS)
speaker: Speaker ID for XTTS
speaker_wav: Path to speaker sample for XTTS voice cloning
Returns:
LC3-encoded audio as string or raw audio as numpy array
@@ -101,23 +163,64 @@ async def synthesize_async(
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(
None,
lambda: synthesize(text, target_sample_rate, framework, model, return_lc3)
lambda: synthesize(
text,
target_sample_rate,
framework,
model,
return_lc3,
language,
speaker,
speaker_wav
)
)
return result
if __name__ == '__main__':
    # Manual smoke test: synthesize with Piper and XTTS and write WAV files.
    import soundfile as sf

    log.basicConfig(
        level=log.INFO,
        format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
    )
    target_rate = 16000

    # List the speaker voices bundled with the XTTS model.
    print("Available XTTS speakers:")
    print(XTTS_MODEL.speakers)

    # Demo of Piper
    print("Testing Piper TTS...")
    audio_piper = synthesize('Hello World', target_rate, 'piper', model='de_DE-kerstin-low', return_lc3=False)
    sf.write('hello_piper.wav', audio_piper, target_rate)

    # Demo of XTTS with Annmarie Nele for German
    german_speaker = 'Annmarie Nele'
    print(f"Testing XTTS with German language using speaker: {german_speaker}")
    text_to_synthesize = "Dies ist ein Test der XTTS Stimme auf Deutsch mit Annmarie Nele als Sprecherin."
    audio_xtts = synthesize(
        text=text_to_synthesize,
        target_sample_rate=target_rate,
        framework='xtts',
        language='de',
        speaker=german_speaker,
        return_lc3=False
    )
    # Save the wav file
    sf.write('hello_xtts_german.wav', audio_xtts, target_rate)

    # Also exercise the LC3-encoded output path.
    lc3_xtts = synthesize(
        text=text_to_synthesize,
        target_sample_rate=target_rate,
        framework='xtts',
        language='de',
        speaker=german_speaker,
        return_lc3=True
    )
    print(f"Generated LC3 data length: {len(lc3_xtts)} bytes")
    print('Done.')

View File

@@ -17,7 +17,10 @@ async def synthesize_speech(request: SynthesizeRequest):
target_sample_rate=request.target_sample_rate,
framework=request.framework,
model=request.model,
return_lc3=request.return_lc3
return_lc3=request.return_lc3,
language=request.language,
speaker=request.speaker,
speaker_wav=request.speaker_wav
)
if request.return_lc3:

View File

@@ -24,20 +24,29 @@ def resample_file(filename, out_filename, target_rate):
log.info("Resampling of %s took %s s", os.path.basename(filename), round(time.time() - start, 3))
def resample_array(audio, rate, target_rate, speedup=1.0):
    """Resample an audio array to ``target_rate``, optionally speeding it up.

    Args:
        audio: Audio sample array at sample rate ``rate``.
        rate: Original sample rate in Hz.
        target_rate: Desired output sample rate in Hz.
        speedup: Playback speed factor; values > 1.0 shorten the audio.
            Defaults to 1.0 (no change), preserving the old call signature.

    Returns:
        The resampled (and possibly sped-up) audio array, or the input
        array unchanged when no work is needed.
    """
    start = time.time()
    # Fast path: rates already match and no speedup requested.
    if rate == target_rate and speedup == 1.0:
        log.info('audio already at target rate with no speedup, skipping resample')
        return audio
    # A speedup is realized by declaring a higher original sample rate:
    # the resampler then compresses the signal, shortening its duration.
    if speedup != 1.0:
        log.info("Applying speedup factor of %s", speedup)
        effective_orig_sr = rate * speedup
    else:
        effective_orig_sr = rate
    resampled_audio = librosa.resample(audio, orig_sr=effective_orig_sr, target_sr=target_rate)
    log.info("Resampling took %s s", round(time.time() - start, 3))
    return resampled_audio