From 00895f52b203dcee2d7056102ee4a66f920cafe5 Mon Sep 17 00:00:00 2001 From: pstruebi Date: Tue, 25 Feb 2025 11:53:49 +0100 Subject: [PATCH] make it work with bumble auracast backend --- multilang_translator/config.py | 19 +++-- multilang_translator/main.py | 75 ++++++++++--------- .../text_to_speech/resample.py | 18 ----- .../text_to_speech/text_to_speech.py | 15 ++-- multilang_translator/utils/__init__.py | 0 multilang_translator/utils/resample.py | 27 +++++++ 6 files changed, 91 insertions(+), 63 deletions(-) delete mode 100644 multilang_translator/text_to_speech/resample.py create mode 100644 multilang_translator/utils/__init__.py create mode 100644 multilang_translator/utils/resample.py diff --git a/multilang_translator/config.py b/multilang_translator/config.py index d5643f7..806dbef 100644 --- a/multilang_translator/config.py +++ b/multilang_translator/config.py @@ -3,28 +3,35 @@ import os ANNOUNCEMENT_DIR = os.path.join(os.path.dirname(__file__), 'announcements') VENV_DIR = os.path.join(os.path.dirname(__file__), '../venv') PIPER_EXE_PATH = f'{VENV_DIR}/bin/piper' -SAMPLING_RATE_HZ = int(16e3) FRAME_DUR_MS = 10 +SAMPLING_RATE_HZ = int(16e3) BITRATE_BPS = int(32e3) + +def mk_filename_lc3(lang=''): + return f"{ANNOUNCEMENT_DIR}/announcement_{lang}_{SAMPLING_RATE_HZ//1000}_{FRAME_DUR_MS}_{BITRATE_BPS//1000}.lc3" + LANG_CONFIG = { "de": { - "file": f"{ANNOUNCEMENT_DIR}/announcement_{SAMPLING_RATE_HZ//1000}_{FRAME_DUR_MS}_{BITRATE_BPS//1000}_de", + "filepath_wav": f"{ANNOUNCEMENT_DIR}/announcement_de.wav", + "filepath_wav_resamp": f"{ANNOUNCEMENT_DIR}/announcement_de_resamp.wav", "tts": 'de_DE-kerstin-low', }, "en": { - "file": f"{ANNOUNCEMENT_DIR}/announcement_{SAMPLING_RATE_HZ//1000}_{FRAME_DUR_MS}_{BITRATE_BPS//1000}_en", + "filepath_wav": f"{ANNOUNCEMENT_DIR}/announcement_en.wav", + "filepath_wav_resamp": f"{ANNOUNCEMENT_DIR}/announcement_en_resamp.wav", "tts": 'en_US-lessac-medium' }, "fr": { - "file": f"{ANNOUNCEMENT_DIR}/announcement_{SAMPLING_RATE_HZ//1000}_{FRAME_DUR_MS}_{BITRATE_BPS//1000}_fr", + "filepath_wav": f"{ANNOUNCEMENT_DIR}/announcement_fr.wav", + "filepath_wav_resamp": f"{ANNOUNCEMENT_DIR}/announcement_fr_resamp.wav", "tts": 'fr_FR-siwis-medium' }, # "es": { - # "file": f"{ANNOUNCEMENT_DIR}/announcement_{SAMPLING_RATE_HZ//1000}_{FRAME_DUR_MS}_{BITRATE_BPS//1000}_es", + # "filepath_wav": f"{ANNOUNCEMENT_DIR}/announcement_es.wav", # "tts": 'es_ES-sharvard-medium' # }, # "it": { - # "file": f"{ANNOUNCEMENT_DIR}/announcement_{SAMPLING_RATE_HZ//1000}_{FRAME_DUR_MS}_{BITRATE_BPS//1000}_it", + # "filepath_wav": f"{ANNOUNCEMENT_DIR}/announcement_it.wav", # "tts": 'it_IT-paola-medium' # } } diff --git a/multilang_translator/main.py b/multilang_translator/main.py index c872ce2..4e8b99e 100644 --- a/multilang_translator/main.py +++ b/multilang_translator/main.py @@ -8,6 +8,7 @@ import asyncio from copy import copy import time import logging as log +from utils import resample from translator import llm_translator, test_content from text_to_speech import text_to_speech from encode import encode_lc3 @@ -15,75 +16,79 @@ from auracast import multicast_control from auracast import auracast_config from config import LANG_CONFIG, BITRATE_BPS, SAMPLING_RATE_HZ, FRAME_DUR_MS +# TODO: look for a end to end translation solution def transcribe(): - pass + pass # TODO: Implement transcribing input audio e.g. with whisper -def syntesize(text, tts_model, output_file): - audio_dur = text_to_speech.synthesize(text, tts_model, output_file) - #resample.resample(output_file, output_file, target_rate=SAMPLING_RATE_HZ) - #encode_lc3.encode_lc3(output_file, bps=BITRATE_BPS, frame_dur_ms=FRAME_DUR_MS) +def syntesize_resample(text, tts_model, file_wav, file_wav_resamp): + audio_dur = text_to_speech.synthesize(text, tts_model, file_wav) + resample.resample_file(file_wav, file_wav_resamp, target_rate=SAMPLING_RATE_HZ) return audio_dur + def translate_from_german(text_de): config = copy(LANG_CONFIG) base_lang = "de" - file = config[base_lang]["file"] - audio_dur_s = {} - audio_dur_s [base_lang] = syntesize(text_de, config['de']["tts"], f'{file}.wav') - - del config[base_lang] - + file = config[base_lang]["filepath_wav"] + file_resamp = config[base_lang]['filepath_wav_resamp'] + tts_json = {} + #tts_json[base_lang] = syntesize_resample(text_de, config['de']["tts"], file, file_resamp) + # delete source language since no translation is needed for it + #del config[base_lang] for key, val in config.items(): - text = llm_translator.translate_de_to_x(text_de, key) - file = val['file'] - audio_dur_s[key] = syntesize(text, val['tts'], f'{file}.wav') - return audio_dur_s + if key == base_lang: + text = text_de + else: + text = llm_translator.translate_de_to_x(text_de, key) + + file = val['filepath_wav'] + file_resamp = val['filepath_wav_resamp'] + tts_json[key] = syntesize_resample(text, val['tts'], file, file_resamp) + + return tts_json + async def announcement_from_german_text(caster:multicast_control.Multicaster, text_de): - translate_from_german(text_de) - # Transfer the files to broadcaster memory + tts_json = translate_from_german(text_de) + start = time.time() - await caster.init_audio() - await caster.start_streaming() - - #for val in LANG_CONFIG.values(): - # copy_to_broadcaster(f'{val["file"]}.lc3') + rates = [d['audio']['sample_rate'] for d in tts_json.values()] + #for i, big in enumerate(caster.big_conf): + # big.input_format = f'int16le,{rates[i]},1' - #log.info("Transfering files to broadcaster took %s s", round(time.time() - start, 3)) - - #time.sleep(2) - - # Instruct the broadcaster to stream the files - # for i, d in enumerate(list(LANG_CONFIG.items())): - # key, val = d - # broadcaster_play_file(i, f'{os.path.basename(val["file"])}.lc3') - # time.sleep(audio_durs[key]) + await caster.init_audio() + caster.start_streaming() log.info("Starting all broadcasts took %s s", round(time.time() - start, 3)) async def main(): + log.basicConfig( + level=log.DEBUG, + format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s' + ) + + global_conf = auracast_config.global_base_config #global_conf.transport='serial:/dev/serial/by-id/usb-SEGGER_J-Link_001057705357-if02,1000000,rtscts' # transport for nrf54l15dk global_conf.transport='serial:/dev/serial/by-id/usb-ZEPHYR_Zephyr_HCI_UART_sample_81BD14B8D71B5662-if00,115200,rtscts' #nrf52dongle hci_uart usb cdc - big_conf = [ # TODO: integrate this in the LANG_CONFIG dict + big_conf = [ # TODO: integrate this in the LANG_CONFIG dict, better: make a hirachry of dataclasses auracast_config.broadcast_de, auracast_config.broadcast_en, auracast_config.broadcast_fr, #auracast_config.broadcast_es, #auracast_config.broadcast_it, ] - files = [v['file'] for v in LANG_CONFIG.values()] + files = [v['filepath_wav_resamp'] for v in LANG_CONFIG.values()] for i, conf in enumerate(big_conf): conf.loop_wav = False conf.audio_source = f'file:{files[i]}' - conf.input_format = 'int16le,48000,1' # TODO: Use actual samplint rate from piper caster = multicast_control.Multicaster(global_conf, big_conf) await caster.init_broadcast() @@ -92,5 +97,7 @@ async def main(): await announcement_from_german_text(caster, test_content.TESTSENTENCE_DE_HELLO) + await asyncio.wait([caster.streamer.task]) + if __name__ == '__main__': asyncio.run(main()) \ No newline at end of file diff --git a/multilang_translator/text_to_speech/resample.py b/multilang_translator/text_to_speech/resample.py deleted file mode 100644 index 384d737..0000000 --- a/multilang_translator/text_to_speech/resample.py +++ /dev/null @@ -1,18 +0,0 @@ -# resample .wave from 22.05 to 24kHz sampling rate - -import librosa -import soundfile as sf - - -def resample(filename, out_filename, target_rate=int(24e3)): - # Load the original audio file - audio, rate = librosa.load(filename) - - # Convert the sample rate to 24 kHz - resampled_audio = librosa.resample(audio, orig_sr=rate, target_sr=target_rate) - - # Save the resampled audio as a new .wav file - sf.write(out_filename, resampled_audio, target_rate) - -if __name__ == "__main__": - resample('text_to_speech/welcome.wav', 'text_to_speech/welcome_resampled.wav') diff --git a/multilang_translator/text_to_speech/text_to_speech.py b/multilang_translator/text_to_speech/text_to_speech.py index d4aba98..d5895ba 100644 --- a/multilang_translator/text_to_speech/text_to_speech.py +++ b/multilang_translator/text_to_speech/text_to_speech.py @@ -3,7 +3,6 @@ import subprocess import time import json import logging as log -import wave from multilang_translator import config TTS_DIR = os.path.join(os.path.dirname(__file__)) @@ -23,15 +22,21 @@ def synthesize(text, model="en_US-lessac-medium", output_file="out.wav"): log.info('%s', ret.stderr) assert ret.returncode == 0, 'Piper returncode was not 0.' - - os.chdir(pwd) - log.info("Running piper took %s s", round(time.time() - start, 3)) + + log.info("Running piper for model %s took %s s", model, round(time.time() - start, 3)) with open (f'{model}.onnx.json') as f: # TODO: wrap everything into a class, store the json permanentl - model_json = json.load(f) + os.chdir(pwd) return model_json if __name__ == '__main__': + import logging + + logging.basicConfig( + level=logging.INFO, + format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s' + ) + synthesize('Hello World') diff --git a/multilang_translator/utils/__init__.py b/multilang_translator/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/multilang_translator/utils/resample.py b/multilang_translator/utils/resample.py new file mode 100644 index 0000000..2d06f6b --- /dev/null +++ b/multilang_translator/utils/resample.py @@ -0,0 +1,27 @@ +# resample .wav source to target sampling rate +import logging as log +import time +import os +import librosa +import soundfile as sf + + +def resample_file(filename, out_filename, target_rate=int(24e3)): + start=time.time() + # Load the original audio file + audio, rate = librosa.load(filename) + + if rate == target_rate: # Nothing to do + sf.write(out_filename, audio, target_rate) + return + + # Convert the sample rate to 24 kHz + resampled_audio = librosa.resample(audio, orig_sr=rate, target_sr=target_rate) + + # Save the resampled audio as a new .wav file + sf.write(out_filename, resampled_audio, target_rate) + + log.info("Resampling of %s took %s s", os.path.basename(filename), round(time.time() - start, 3)) + +if __name__ == "__main__": + resample_file('text_to_speech/welcome.wav', 'text_to_speech/welcome_resampled.wav')