Use a different XTTS voice: a speaker WAV sample ('female.wav') instead of the named speaker 'Annmarie Nele'

This commit is contained in:
2025-03-25 12:34:16 +01:00
parent c095b058d6
commit e02593c78d
3 changed files with 15 additions and 20 deletions
@@ -154,7 +154,10 @@ async def make_announcement(text: str, ep_group: EndpointGroup):
ep_group.sampling_rate_hz,
trans_conf.tts_system,
trans_conf.tts_model,
return_lc3=True
return_lc3=True,
language=trans_conf.xtts_language,
speaker=trans_conf.xtts_speaker,
speaker_wav=trans_conf.xtts_speaker_wav
)
synthesis_tasks.append(task)
+5 -4
View File
@@ -37,15 +37,16 @@ class TranslatorLangConfig(BaseModel):
tts_system: str = 'piper' # Options: 'piper', 'xtts'
tts_model: str = 'de_DE-kerstin-low' # For piper: model name, for xtts: unused
tts_language: str = 'de' # Language code for XTTS
tts_speaker: Optional[str] = None # Speaker name for XTTS
xtts_language: str = 'de' # Language code for XTTS
xtts_speaker: Optional[str] = None # Speaker name for XTTS
xtts_speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning
class TranslatorConfig(BaseModel):
deu: TranslatorLangConfig = TranslatorLangConfig(
tts_system='xtts',
tts_language='de',
tts_speaker='Annmarie Nele'
xtts_language='de',
xtts_speaker_wav='female.wav'
)
eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium')
fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium')
+6 -15
View File
@@ -103,6 +103,8 @@ def synthesize(
# Generate audio samples using tts.tts
if speaker_wav:
# Resolve the speaker_wav file name to a full path inside the 'speaker_wav' folder next to this module
speaker_wav = os.path.join(os.path.dirname(__file__), 'speaker_wav', speaker_wav)
log.info(f"Generating XTTS audio with speaker_wav: {speaker_wav}")
audio_list = XTTS_MODEL.tts(text=text, speaker_wav=speaker_wav, language=language)
else:
@@ -117,7 +119,7 @@ def synthesize(
f"min: {audio_np.min():.4f}, max: {audio_np.max():.4f}")
# Resample from 24kHz to the target sample rate (the 1.05 speedup factor is no longer applied)
audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate, speedup=1.05)
audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate)
log.info(f"XTTS synthesis completed in {time.time() - start:.2f} seconds")
@@ -196,8 +198,8 @@ if __name__ == '__main__':
sf.write('hello_piper.wav', audio_piper, target_rate)
# Demo of XTTS for German, using a speaker sample WAV for voice cloning
german_speaker = 'Annmarie Nele'
print(f"Testing XTTS with German language using speaker: {german_speaker}")
speaker_wav = 'female.wav'
print(f"Testing XTTS with German language using speaker: {speaker_wav}")
text_to_synthesize = "Dies ist ein Test der XTTS Stimme auf Deutsch mit Annmarie Nele als Sprecherin."
audio_xtts = synthesize(
@@ -205,22 +207,11 @@ if __name__ == '__main__':
target_sample_rate=target_rate,
framework='xtts',
language='de',
speaker=german_speaker,
speaker_wav=speaker_wav,
return_lc3=False
)
# Save the wav file
sf.write('hello_xtts_german.wav', audio_xtts, target_rate)
# Also test with LC3 encoding
lc3_xtts = synthesize(
text=text_to_synthesize,
target_sample_rate=target_rate,
framework='xtts',
language='de',
speaker=german_speaker,
return_lc3=True
)
print(f"Generated LC3 data length: {len(lc3_xtts)} bytes")
print('Done.')