Use a different XTTS voice
This commit is contained in:
@@ -154,7 +154,10 @@ async def make_announcement(text: str, ep_group: EndpointGroup):
|
||||
ep_group.sampling_rate_hz,
|
||||
trans_conf.tts_system,
|
||||
trans_conf.tts_model,
|
||||
return_lc3=True
|
||||
return_lc3=True,
|
||||
language=trans_conf.xtts_language,
|
||||
speaker=trans_conf.xtts_speaker,
|
||||
speaker_wav=trans_conf.xtts_speaker_wav
|
||||
)
|
||||
synthesis_tasks.append(task)
|
||||
|
||||
|
||||
@@ -37,15 +37,16 @@ class TranslatorLangConfig(BaseModel):
|
||||
|
||||
tts_system: str = 'piper' # Options: 'piper', 'xtts'
|
||||
tts_model: str = 'de_DE-kerstin-low' # For piper: model name, for xtts: unused
|
||||
tts_language: str = 'de' # Language code for XTTS
|
||||
tts_speaker: Optional[str] = None # Speaker name for XTTS
|
||||
xtts_language: str = 'de' # Language code for XTTS
|
||||
xtts_speaker: Optional[str] = None # Speaker name for XTTS
|
||||
xtts_speaker_wav: Optional[str] = None # Path to speaker sample for XTTS voice cloning
|
||||
|
||||
|
||||
class TranslatorConfig(BaseModel):
|
||||
deu: TranslatorLangConfig = TranslatorLangConfig(
|
||||
tts_system='xtts',
|
||||
tts_language='de',
|
||||
tts_speaker='Annmarie Nele'
|
||||
xtts_language='de',
|
||||
xtts_speaker_wav='female.wav'
|
||||
)
|
||||
eng: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'en_GB-alba-medium')
|
||||
fra: TranslatorLangConfig = TranslatorLangConfig(tts_model = 'fr_FR-siwis-medium')
|
||||
|
||||
@@ -103,6 +103,8 @@ def synthesize(
|
||||
|
||||
# Generate audio samples using tts.tts
|
||||
if speaker_wav:
|
||||
# expand path to speaker_wav folder
|
||||
speaker_wav = os.path.join(os.path.dirname(__file__), 'speaker_wav', speaker_wav)
|
||||
log.info(f"Generating XTTS audio with speaker_wav: {speaker_wav}")
|
||||
audio_list = XTTS_MODEL.tts(text=text, speaker_wav=speaker_wav, language=language)
|
||||
else:
|
||||
@@ -117,7 +119,7 @@ def synthesize(
|
||||
f"min: {audio_np.min():.4f}, max: {audio_np.max():.4f}")
|
||||
|
||||
# Resample from 24kHz to the target sample rate (no speedup applied)
|
||||
audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate, speedup=1.05)
|
||||
audio = resample_array(audio_np, xtts_sample_rate, target_sample_rate)
|
||||
|
||||
log.info(f"XTTS synthesis completed in {time.time() - start:.2f} seconds")
|
||||
|
||||
@@ -196,8 +198,8 @@ if __name__ == '__main__':
|
||||
sf.write('hello_piper.wav', audio_piper, target_rate)
|
||||
|
||||
# Demo of XTTS for German, using the 'female.wav' speaker sample for voice cloning
|
||||
german_speaker = 'Annmarie Nele'
|
||||
print(f"Testing XTTS with German language using speaker: {german_speaker}")
|
||||
speaker_wav = 'female.wav'
|
||||
print(f"Testing XTTS with German language using speaker: {speaker_wav}")
|
||||
text_to_synthesize = "Dies ist ein Test der XTTS Stimme auf Deutsch mit Annmarie Nele als Sprecherin."
|
||||
|
||||
audio_xtts = synthesize(
|
||||
@@ -205,22 +207,11 @@ if __name__ == '__main__':
|
||||
target_sample_rate=target_rate,
|
||||
framework='xtts',
|
||||
language='de',
|
||||
speaker=german_speaker,
|
||||
speaker_wav=speaker_wav,
|
||||
return_lc3=False
|
||||
)
|
||||
|
||||
# Save the wav file
|
||||
sf.write('hello_xtts_german.wav', audio_xtts, target_rate)
|
||||
|
||||
# Also test with LC3 encoding
|
||||
lc3_xtts = synthesize(
|
||||
text=text_to_synthesize,
|
||||
target_sample_rate=target_rate,
|
||||
framework='xtts',
|
||||
language='de',
|
||||
speaker=german_speaker,
|
||||
return_lc3=True
|
||||
)
|
||||
print(f"Generated LC3 data length: {len(lc3_xtts)} bytes")
|
||||
|
||||
print('Done.')
|
||||
|
||||
Reference in New Issue
Block a user