diff --git a/src/auracast_translator/translator_server/translator_server.py b/src/auracast_translator/translator_server/translator_server.py index 5e9291b..b4759b8 100644 --- a/src/auracast_translator/translator_server/translator_server.py +++ b/src/auracast_translator/translator_server/translator_server.py @@ -154,7 +154,10 @@ async def make_announcement(text: str, ep_group: EndpointGroup): ep_group.sampling_rate_hz, trans_conf.tts_system, trans_conf.tts_model, - return_lc3=True + return_lc3=True, + language=trans_conf.xtts_language, + speaker=trans_conf.xtts_speaker, + speaker_wav=trans_conf.xtts_speaker_wav ) synthesis_tasks.append(task) diff --git a/src/translator_models/translator_models.py b/src/translator_models/translator_models.py index a004c31..17fc154 100644 --- a/src/translator_models/translator_models.py +++ b/src/translator_models/translator_models.py @@ -37,15 +37,16 @@ class TranslatorLangConfig(BaseModel): tts_system: str = 'piper' # Options: 'piper', 'xtts' tts_model: str = 'de_DE-kerstin-low' # For piper: model name, for xtts: unused - tts_language: str = 'de' # Language code for XTTS - tts_speaker: Optional[str] = None # Speaker name for XTTS + xtts_language: str = 'de' # Language code for XTTS + xtts_speaker: Optional[str] = None # Speaker name for XTTS + xtts_speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning class TranslatorConfig(BaseModel): deu: TranslatorLangConfig = TranslatorLangConfig( tts_system='xtts', - tts_language='de', - tts_speaker='Annmarie Nele' + xtts_language='de', + xtts_speaker_wav='female.wav' ) eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium') fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium') diff --git a/src/voice_provider/text_to_speech.py b/src/voice_provider/text_to_speech.py index a530b8f..384f111 100644 --- a/src/voice_provider/text_to_speech.py +++ b/src/voice_provider/text_to_speech.py @@ -103,6 +103,8 @@ def synthesize( # Generate audio samples using tts.tts if speaker_wav: + # expand path to speaker_wav folder + speaker_wav = os.path.join(os.path.dirname(__file__), 'speaker_wav', speaker_wav) log.info(f"Generating XTTS audio with speaker_wav: {speaker_wav}") audio_list = XTTS_MODEL.tts(text=text, speaker_wav=speaker_wav, language=language) else: @@ -117,7 +119,7 @@ def synthesize( f"min: {audio_np.min():.4f}, max: {audio_np.max():.4f}") # Resample from 24kHz to target sample rate with speedup factor - audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate, speedup=1.05) + audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate) log.info(f"XTTS synthesis completed in {time.time() - start:.2f} seconds") @@ -196,8 +198,8 @@ if __name__ == '__main__': sf.write('hello_piper.wav', audio_piper, target_rate) # Demo of XTTS with Annmarie Nele for German - german_speaker = 'Annmarie Nele' - print(f"Testing XTTS with German language using speaker: {german_speaker}") + speaker_wav = 'female.wav' + print(f"Testing XTTS with German language using speaker: {speaker_wav}") text_to_synthesize = "Dies ist ein Test der XTTS Stimme auf Deutsch mit Annmarie Nele als Sprecherin." audio_xtts = synthesize( @@ -205,22 +207,11 @@ if __name__ == '__main__': target_sample_rate=target_rate, framework='xtts', language='de', - speaker=german_speaker, + speaker_wav=speaker_wav, return_lc3=False ) # Save the wav file sf.write('hello_xtts_german.wav', audio_xtts, target_rate) - # Also test with LC3 encoding - lc3_xtts = synthesize( - text=text_to_synthesize, - target_sample_rate=target_rate, - framework='xtts', - language='de', - speaker=german_speaker, - return_lc3=True - ) - print(f"Generated LC3 data length: {len(lc3_xtts)} bytes") - print('Done.')