diff --git a/.vscode/launch.json b/.vscode/launch.json index 7774467..889dcfc 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -10,7 +10,8 @@ "type": "debugpy", "request": "launch", "program": "${file}", - "console": "integratedTerminal" + "console": "integratedTerminal", + "justMyCode": true } ] } \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json index fa6c685..43682fb 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -13,6 +13,11 @@ "label": "pip install -e bumble", "type": "shell", "command": "./venv/bin/python -m pip install -e ../bumble --config-settings editable_mode=compat" + }, + { + "label": "pip install -e auracast", + "type": "shell", + "command": "./venv/bin/python -m pip install -e ../bumble-auracast --config-settings editable_mode=compat" } ] } \ No newline at end of file diff --git a/multilang_translator/config.py b/multilang_translator/config.py index 806dbef..e54ca61 100644 --- a/multilang_translator/config.py +++ b/multilang_translator/config.py @@ -7,8 +7,6 @@ FRAME_DUR_MS = 10 SAMPLING_RATE_HZ = int(16e3) BITRATE_BPS = int(32e3) -def mk_filename_lc3(lang=''): - return f"{ANNOUNCEMENT_DIR}/announcement_{lang}_{SAMPLING_RATE_HZ//1000}_{FRAME_DUR_MS}_{BITRATE_BPS//1000}.lc3" LANG_CONFIG = { "de": { diff --git a/multilang_translator/main.py b/multilang_translator/main.py index a7fecc7..a9adc42 100644 --- a/multilang_translator/main.py +++ b/multilang_translator/main.py @@ -32,34 +32,34 @@ def syntesize_resample(text, tts_model, file_wav, file_wav_resamp): return audio_dur -def translate_from_german(text_de, model): +async def announcement_from_german_text( + caster: multicast_control.Multicaster, + text_de + ): + TRANSLATOR_LLM = 'llama3.2:3b-instruct-q4_0' + config = copy(LANG_CONFIG) base_lang = "de" - file = config[base_lang]["filepath_wav"] - file_resamp = config[base_lang]['filepath_wav_resamp'] - tts_json = {} - - for key, val in config.items(): + for i, d in enumerate(config.items()): + key, val = d if key == base_lang: text = text_de else: - text = llm_translator.translate_de_to_x(text_de, key, model=model) - + text = llm_translator.translate_de_to_x(text_de, key, model=TRANSLATOR_LLM) + log.info('%s', text) - file = val['filepath_wav'] - file_resamp = val['filepath_wav_resamp'] - tts_json[key] = syntesize_resample(text, val['tts'], file, file_resamp) - return tts_json - - -async def announcement_from_german_text(caster:multicast_control.Multicaster, text_de): - - tts_json = translate_from_german(text_de, model = 'llama3.2:3b-instruct-q4_0') + lc3_audio = text_to_speech.synthesize( + text, + SAMPLING_RATE_HZ, + 'piper', + val['tts'], + return_lc3=True + ) + caster.big_conf[i].audio_source = lc3_audio start = time.time() - await caster.init_audio() caster.start_streaming() log.info("Starting all broadcasts took %s s", round(time.time() - start, 3)) @@ -74,7 +74,7 @@ async def command_line_ui(caster: multicast_control.Multicaster): prompt += "\n".join([f"{i}: {s}" for i,s in enumerate(sentence_list)]) prompt += "\n" command = await aioconsole.ainput(prompt) - + if command.strip().lower() == "quit": print("👋 Exiting...") if caster.device: @@ -103,25 +103,22 @@ async def main(): #global_conf.transport='serial:/dev/serial/by-id/usb-SEGGER_J-Link_001057705357-if02,1000000,rtscts' # transport for nrf54l15dk global_conf.transport='serial:/dev/serial/by-id/usb-ZEPHYR_Zephyr_HCI_UART_sample_81BD14B8D71B5662-if00,115200,rtscts' #nrf52dongle hci_uart usb cdc - big_conf = [ + big_conf = [ auracast_config.broadcast_de, auracast_config.broadcast_en, auracast_config.broadcast_fr, #auracast_config.broadcast_es, #auracast_config.broadcast_it, ] - files = [v['filepath_wav_resamp'] for v in LANG_CONFIG.values()] for i, conf in enumerate(big_conf): - conf.loop_wav = False - conf.audio_source = f'file:{files[i]}' + conf.loop = False caster = multicast_control.Multicaster(global_conf, big_conf) await caster.init_broadcast() - - #await announcement_from_german_text(caster, test_content.TESTSENTENCE_DE_HELLO) - await command_line_ui(caster) + #await announcement_from_german_text(caster, test_content.TESTSENTENCE.DE_HELLO) #await asyncio.wait([caster.streamer.task]) + await command_line_ui(caster) if __name__ == '__main__': asyncio.run(main()) diff --git a/multilang_translator/text_to_speech/encode_lc3.py b/multilang_translator/text_to_speech/encode_lc3.py new file mode 100644 index 0000000..c120771 --- /dev/null +++ b/multilang_translator/text_to_speech/encode_lc3.py @@ -0,0 +1,32 @@ +import numpy as np +import lc3 + +def encode( + audio: np.array, + output_sample_rate_hz, + octets_per_frame, + frame_duration_us=10000, + pcm_bit_depth = 16 + ): + + encoder = lc3.Encoder( + frame_duration_us=frame_duration_us, + sample_rate_hz=output_sample_rate_hz, + num_channels=1, + #input_sample_rate_hz=input_sample_rate, + ) + + lc3_frame_samples = encoder.get_frame_samples() # number of the pcm samples per lc3 frame + + # reshape array into slices of lc3_frame_samples and padd with zeros + pad_width = (lc3_frame_samples - len(audio) % lc3_frame_samples) % lc3_frame_samples # Compute padding length + arr_padded = np.pad(audio, (0, pad_width), mode='constant', constant_values=0) + reshaped_arr = arr_padded.reshape(-1, lc3_frame_samples) + + lc3_bytes = b'' + for pcm_frame in reshaped_arr: + lc3_bytes += encoder.encode( + pcm_frame, num_bytes=octets_per_frame, bit_depth=pcm_bit_depth + ) + + return lc3_bytes \ No newline at end of file diff --git a/multilang_translator/text_to_speech/text_to_speech.py b/multilang_translator/text_to_speech/text_to_speech.py index d5895ba..5ce3dde 100644 --- a/multilang_translator/text_to_speech/text_to_speech.py +++ b/multilang_translator/text_to_speech/text_to_speech.py @@ -3,40 +3,80 @@ import subprocess import time import json import logging as log +import numpy as np from multilang_translator import config +from multilang_translator.utils.resample import resample_array +from multilang_translator.text_to_speech import encode_lc3 TTS_DIR = os.path.join(os.path.dirname(__file__)) -def synthesize(text, model="en_US-lessac-medium", output_file="out.wav"): - +def synth_piper(text, model="en_US-lessac-medium",): pwd = os.getcwd() os.chdir(TTS_DIR) start = time.time() - ret = subprocess.run( # TODO: wrap this whole thing in a class and open a permanent instance of the model - [config.PIPER_EXE_PATH, '--model', model, '--output_file', output_file], + ret = subprocess.run( # TODO: wrap this whole thing in a class and open a permanent pipe to the model + [config.PIPER_EXE_PATH, + '--cuda', + '--model', model, + '--output-raw'], input=text.encode('utf-8'), capture_output=True ) - log.info('%s', ret.stdout) - log.info('%s', ret.stderr) + log.warning('Piper stderr:\n%s', ret.stderr) assert ret.returncode == 0, 'Piper returncode was not 0.' + audio = ret.stdout log.info("Running piper for model %s took %s s", model, round(time.time() - start, 3)) - with open (f'{model}.onnx.json') as f: # TODO: wrap everything into a class, store the json permanentl + with open (f'{model}.onnx.json') as f: # TODO: wrap everything into a class, store the json permanently model_json = json.load(f) os.chdir(pwd) - return model_json + return model_json, audio + + +# TODO: framework should probably be a dataclass that holds all the relevant informations, also model +# TODO: make a common repo that hold the configuration dataclasses ? +def synthesize(text, target_sample_rate, framework, model="en_US-lessac-medium", return_lc3=True): + + if framework == 'piper': + model_json, audio_raw = synth_piper(text, model) + tts_sample_rate = model_json['audio']['sample_rate'] + audio_np = np.frombuffer(audio_raw, dtype=np.dtype('