refactoring/config (#2)

- Implement streaming LC3 without using files
- use pydantic for config management

Reviewed-on: https://gitea.pstruebi.xyz/auracaster/multilang-translator-local/pulls/2
This commit was merged in pull request #2.
This commit is contained in:
2025-03-05 17:58:49 +01:00
parent 0f0c8a1040
commit 7fa677d865
13 changed files with 158 additions and 121 deletions

View File

@@ -1,39 +0,0 @@
import os

# Directory where generated announcement WAV files are written.
ANNOUNCEMENT_DIR = os.path.join(os.path.dirname(__file__), 'announcements')
# Project virtualenv, one level above the package; piper is installed there.
VENV_DIR = os.path.join(os.path.dirname(__file__), '../venv')
PIPER_EXE_PATH = f'{VENV_DIR}/bin/piper'

FRAME_DUR_MS = 10
SAMPLING_RATE_HZ = int(16e3)
BITRATE_BPS = int(32e3)


def _lang_entry(lang, tts_model):
    """Return the per-language config dict for *lang* using the given TTS voice."""
    return {
        "filepath_wav": f"{ANNOUNCEMENT_DIR}/announcement_{lang}.wav",
        "filepath_wav_resamp": f"{ANNOUNCEMENT_DIR}/announcement_{lang}_resamp.wav",
        "tts": tts_model,
    }


LANG_CONFIG = {
    "de": _lang_entry("de", 'de_DE-kerstin-low'),
    "en": _lang_entry("en", 'en_US-lessac-medium'),
    "fr": _lang_entry("fr", 'fr_FR-siwis-medium'),
    # "es": _lang_entry("es", 'es_ES-sharvard-medium'),
    # "it": _lang_entry("it", 'it_IT-paola-medium'),
}

os.makedirs(ANNOUNCEMENT_DIR, exist_ok=True)
# TODO. use dataclasses from Multicaster with inherit

View File

@@ -4,6 +4,7 @@ list prompt example
"""
from __future__ import print_function, unicode_literals
from typing import List
from dataclasses import asdict
import asyncio
from copy import copy
@@ -11,13 +12,12 @@ import time
import logging as log
import aioconsole
import multilang_translator.translator_config as translator_config
from utils import resample
from translator import llm_translator, test_content
from text_to_speech import text_to_speech
from encode import encode_lc3
from auracast import multicast_control
from auracast import auracast_config
from config import LANG_CONFIG, SAMPLING_RATE_HZ
from translator.test_content import TESTSENTENCE
# TODO: look for a end to end translation solution
@@ -26,35 +26,34 @@ def transcribe():
pass # TODO: Implement transcribing input audio e.g. with whisper
def syntesize_resample(text, tts_model, file_wav, file_wav_resamp):
    """Synthesize *text* to ``file_wav`` with *tts_model*, then resample it.

    The resampled copy is written to ``file_wav_resamp`` at SAMPLING_RATE_HZ.
    Returns whatever ``text_to_speech.synthesize`` reports — presumably the
    audio duration; confirm against that module.
    NOTE(review): the name has a typo ("syntesize"); renaming would break callers.
    """
    audio_dur = text_to_speech.synthesize(text, tts_model, file_wav)
    # The broadcast pipeline expects audio at SAMPLING_RATE_HZ, so resample the TTS output.
    resample.resample_file(file_wav, file_wav_resamp, target_rate=SAMPLING_RATE_HZ)
    return audio_dur
async def announcement_from_german_text(
global_config: auracast_config.AuracastGlobalConfig,
translator_config: List[translator_config.TranslatorConfigDe],
caster: multicast_control.Multicaster,
text_de
):
TRANSLATOR_LLM = 'llama3.2:3b-instruct-q4_0'
base_lang = "deu"
config = copy(LANG_CONFIG)
base_lang = "de"
for i, d in enumerate(config.items()):
key, val = d
if key == base_lang:
for i, trans in enumerate(translator_config):
if trans.big.language == base_lang:
text = text_de
else:
text = llm_translator.translate_de_to_x(text_de, key, model=TRANSLATOR_LLM)
text = llm_translator.translate_de_to_x(
text_de,
trans.big.language,
model=trans.translator_llm,
client = trans.llm_client,
host=trans.llm_host_url,
token=trans.llm_host_token
)
log.info('%s', text)
lc3_audio = text_to_speech.synthesize(
text,
SAMPLING_RATE_HZ,
'piper',
val['tts'],
global_config.auracast_sampling_rate_hz,
trans.tts_system,
trans.tts_model,
return_lc3=True
)
caster.big_conf[i].audio_source = lc3_audio
@@ -65,7 +64,7 @@ async def announcement_from_german_text(
log.info("Starting all broadcasts took %s s", round(time.time() - start, 3))
async def command_line_ui(caster: multicast_control.Multicaster):
async def command_line_ui(global_conf, translator_conf, caster: multicast_control.Multicaster):
while True:
# make a list of all available testsentence
sentence_list = list(asdict(TESTSENTENCE).values())
@@ -86,43 +85,55 @@ async def command_line_ui(caster: multicast_control.Multicaster):
# Check if command is a single number
elif command.strip().isdigit():
ind = int(command.strip())
await announcement_from_german_text(caster, sentence_list[ind])
await announcement_from_german_text(
global_conf,
translator_conf,
caster,
sentence_list[ind])
await asyncio.wait([caster.streamer.task])
# Interpret the command as announcement
else:
await announcement_from_german_text(caster, command)
await asyncio.wait([caster.streamer.task])
async def main():
log.basicConfig(
level=log.INFO,
format='%(module)s.py:%(lineno)d %(levelname)s: %(message)s'
)
global_conf = auracast_config.global_base_config
global_conf = auracast_config.AuracastGlobalConfig()
#global_conf.transport='serial:/dev/serial/by-id/usb-SEGGER_J-Link_001057705357-if02,1000000,rtscts' # transport for nrf54l15dk
global_conf.transport='serial:/dev/serial/by-id/usb-ZEPHYR_Zephyr_HCI_UART_sample_81BD14B8D71B5662-if00,115200,rtscts' #nrf52dongle hci_uart usb cdc
big_conf = [
auracast_config.broadcast_de,
auracast_config.broadcast_en,
auracast_config.broadcast_fr,
translator_conf = [
translator_config.TranslatorConfigDe(),
translator_config.TranslatorConfigEn(),
translator_config.TranslatorConfigFr(),
#auracast_config.broadcast_es,
#auracast_config.broadcast_it,
]
for i, conf in enumerate(big_conf):
conf.loop = False
for conf in translator_conf:
conf.big.loop = False
conf.llm_client = 'openwebui' # comment out for local llm
conf.llm_host_url = 'https://ollama.pstruebi.xyz'
conf.llm_host_token = 'sk-17124cb84df14cc6ab2d9e17d0724d13'
caster = multicast_control.Multicaster(global_conf, big_conf)
caster = multicast_control.Multicaster(global_conf, [conf.big for conf in translator_conf])
await caster.init_broadcast()
#await announcement_from_german_text(caster, test_content.TESTSENTENCE.DE_HELLO)
#await asyncio.wait([caster.streamer.task])
await command_line_ui(caster)
# await announcement_from_german_text(
# global_conf,
# translator_conf,
# caster,
# test_content.TESTSENTENCE.DE_HELLO
# )
# await asyncio.wait([caster.streamer.task])
await command_line_ui(global_conf, translator_conf, caster)
if __name__ == '__main__':
asyncio.run(main())
# TODO: integrate this in the LANG_CONFIG dict, better: make a hierarchy of dataclasses
# TODO: remove the necessity for files
# TODO: add support for multiple radios

View File

@@ -4,28 +4,29 @@ import time
import json
import logging as log
import numpy as np
from multilang_translator import config
from multilang_translator import translator_config
from multilang_translator.utils.resample import resample_array
from multilang_translator.text_to_speech import encode_lc3
TTS_DIR = os.path.join(os.path.dirname(__file__))
PIPER_DIR = f'{TTS_DIR}/piper'
os.makedirs(PIPER_DIR, exist_ok=True)
def synth_piper(text, model="en_US-lessac-medium",):
def synth_piper(text, model="en_US-lessac-medium"):
pwd = os.getcwd()
os.chdir(PIPER_DIR)
start = time.time()
# make sure piper has voices.json in working directory, otherwise it attempts to always load models
ret = subprocess.run( # TODO: wrap this whole thing in a class and open a permanent pipe to the model
[config.PIPER_EXE_PATH,
'--cuda',
'--data-dir', PIPER_DIR,
'--download-dir', PIPER_DIR,
'--model', model,
'--output-raw'
],
[translator_config.PIPER_EXE_PATH,
'--cuda',
'--model', model,
'--output-raw'
],
input=text.encode('utf-8'),
capture_output=True
)
os.chdir(pwd)
log.warning('Piper stderr:\n%s', ret.stderr)
assert ret.returncode == 0, 'Piper returncode was not 0.'
@@ -47,7 +48,7 @@ def synthesize(text, target_sample_rate, framework, model="en_US-lessac-medium",
tts_sample_rate = model_json['audio']['sample_rate']
audio_np = np.frombuffer(audio_raw, dtype=np.dtype('<i2')).astype(np.float32) /(2**15-1)# convert to float fraction
audio = resample_array(audio_np, tts_sample_rate, target_sample_rate)
elif framework == 'koro':
pass
elif framework == 'xtts':
@@ -57,9 +58,8 @@ def synthesize(text, target_sample_rate, framework, model="en_US-lessac-medium",
else: raise NotImplementedError('unknown framework')
if return_lc3:
audio_pcm = (audio_np * 2**15-1).astype(np.int16)
audio_pcm = (audio * 2**15-1).astype(np.int16)
lc3 = encode_lc3.encode(audio_pcm, target_sample_rate, 40) # TODO: octetts per frame should be parameter
return lc3
else:
return audio
@@ -79,5 +79,5 @@ if __name__ == '__main__':
sf.write('hello.wav', audio, target_rate)
# TODO: "WARNING:piper.download:Wrong size (expected=5952, actual=4158
print('Done.')

View File

@@ -1,2 +0,0 @@
from .credentials import *
from .syspromts import *

View File

@@ -1,2 +0,0 @@
# SECURITY(review): hard-coded credentials committed to the repository.
# The token below looks like a live API key; it should be revoked and loaded
# from an environment variable or a secrets store instead of being checked in.
BASE_URL='https://ollama.hinterwaldner.duckdns.org'
TOKEN = 'sk-17124cb84df14cc6ab2d9e17d0724d13'

View File

@@ -5,45 +5,75 @@ import logging as log
import time
import ollama
from multilang_translator.translator import credentials
from multilang_translator.translator import syspromts
from multilang_translator.translator import test_content
# ollama.create( # TODO: create models on startup
# model='example',
# from_='llama3.2', system="You are Mario from Super Mario Bros."
# )
def query_model(model, query):
url = f'{credentials.BASE_URL}/api/chat/completions'
async def chat():
message = {'role': 'user', 'content': 'Why is the sky blue?'}
response = await ollama.AsyncClient().chat(model='llama3.2', messages=[message])
def query_openwebui(model, system, query, url, token):
url = f'{url}/api/chat/completions'
headers = {
'Authorization': f'Bearer {credentials.TOKEN}',
'Authorization': f'Bearer {token}',
}
payload = {
'model': model,
'messages': [{'role': 'user', 'content': query}],
'messages': [
{'role': 'system', 'content': system},
{'role': 'user', 'content': query}
],
}
start = time.time()
response = requests.post(url, headers=headers, json=payload)
log.info("Translating the text took %s s", round(time.time() - start, 2))
return response.json()
return response.json()['choices'][0]['message']['content']
def translate_de_to_x(text:str, target_language: str, model='llama3.2:3b-instruct-q4_0'): # remember to use instruct models
def query_ollama(model, system, query, host='http://localhost:11434'):
    """Send a system+user prompt pair to an Ollama server and return the reply text.

    *host* is the base URL of the Ollama HTTP API; the default targets a
    local instance on the standard port.
    """
    messages = [
        {'role': 'system', 'content': system},
        {'role': 'user', 'content': query},
    ]
    client = ollama.Client(host=host)
    return client.chat(model=model, messages=messages).message.content
def translate_de_to_x( # TODO: use async ollama client later - implenent a translate async function
text:str,
target_language: str,
client='ollama',
model='llama3.2:3b-instruct-q4_0', # remember to use instruct models
host = None,
token = None
):
start=time.time()
s = getattr(syspromts, f"TRANSLATOR_DE_{target_language.upper()}")
response = ollama.chat(
model = model,
messages = [
{'role': 'system', 'content': s},
{'role': 'user', 'content': text}
],
)
s = getattr(syspromts, f"TRANSLATOR_DEU_{target_language.upper()}")
if client == 'ollama':
response = query_ollama(model, s, text, host=host)
elif client == 'openwebui':
response = query_openwebui(model, s, text, url=host, token=token)
else: raise NotImplementedError('llm client not implemented')
log.info('Running the translator to %s took %s s', target_language, round(time.time() - start, 3))
return response['message']['content']
return response
if __name__ == "__main__":
import time
from multilang_translator.translator import test_content
start=time.time()
response = translate_de_to_x('Der Zug ist da.', target_language='en', model='llama3.2:1b-instruct-q4_0')

View File

@@ -1,4 +1,6 @@
def _translator_prompt(target_language_de):
    """Build the German system prompt instructing translation into *target_language_de*.

    Grammar fix over the previous literals: "die folgende Satz" -> "den
    folgenden Satz" and "mit der übersetzten Satz" -> "mit dem übersetzten
    Satz" (Satz is masculine). This also implements the TODO below: the
    prompts are generated instead of duplicated.
    """
    return (
        'Du bist ein Übersetzer. Übersetze den folgenden Satz aus dem Deutschen '
        f'ins {target_language_de}. Antworte nur mit dem übersetzten Satz.\n'
    )


# Legacy ISO 639-1 style names, kept for backward compatibility.
TRANSLATOR_DE_EN = _translator_prompt('Englische')
TRANSLATOR_DE_FR = _translator_prompt('Französische')
TRANSLATOR_DE_ES = _translator_prompt('Spanische')
TRANSLATOR_DE_IT = _translator_prompt('Italienische')
# TODO: make this more elegant. this can probably be generated and the base lang be assumed by the llm?
# ISO 639-3 style names, resolved dynamically via getattr() in llm_translator.
TRANSLATOR_DEU_ENG = _translator_prompt('Englische')
TRANSLATOR_DEU_FRA = _translator_prompt('Französische')
TRANSLATOR_DEU_SPA = _translator_prompt('Spanische')
TRANSLATOR_DEU_ITA = _translator_prompt('Italienische')

View File

@@ -5,7 +5,6 @@ class TestContent:
DE_HELLO: str = 'Hallo Welt.'
DE_GATE_OPENED: str = "Gate 23 ist jetzt geöffnet."
DE_TRAIN_ARRIVING: str = "Der Zug Nach Wien fährt heute von Gleis 3."
DE_SECURITY_CHECKPOINT_OPENING: str = "Sicherheitskontrolle 5 ist jetzt geöffnet. Bitte setzen Sie sich in Bewegung, um Ihre Wartezeit während Sicherungsprüfungen zu minimieren."
DE_SECURITY_CHECKPOINT_OPENING: str = "Sicherheitskontrolle 5 ist jetzt geöffnet. Bitte setzen Sie sich in Bewegung, um Ihre Wartezeit während Sicherheitsüberprüfungen zu minimieren."
DE_RAINBOW: str = 'Der Regenbogen ist ein atmosphärisch-optisches Phänomen, das als kreisbogenförmiges farbiges Lichtband in einer von der Sonne beschienenen Wolke oder Regenwand wahrgenommen wird und ein großes Farbspektrum anzeigt.'
DE_WAVE_PARTICLE: str = 'Der Wellen-Teilchen-Dualismus ist eine Konzeption, die postuliert, dass Teilchen sowohl als Wellen auf der Mikroebene verhalten sich und genau bestimme Eigenschaften wie Impuls und Energietrang besaßen.'
TESTSENTENCE = TestContent()

View File

@@ -0,0 +1,37 @@
import os
from pydantic import BaseModel
from auracast import auracast_config
# Directory where generated announcement WAV files are written.
ANNOUNCEMENT_DIR = os.path.join(os.path.dirname(__file__), 'announcements')
# Project virtualenv, one level above the package.
VENV_DIR = os.path.join(os.path.dirname(__file__), '../venv')
# piper TTS CLI installed into the virtualenv.
PIPER_EXE_PATH = f'{VENV_DIR}/bin/piper'
class TranslatorBaseconfig(BaseModel):
    """Base settings for one translated broadcast stream (pydantic model).

    Language-specific subclasses override ``big`` and ``tts_model``.
    NOTE(review): the defaults here assume pydantic copies mutable default
    model instances per-instance; confirm, otherwise switch ``big`` to
    ``Field(default_factory=...)`` to avoid sharing one instance.
    """
    # Broadcast (BIG) configuration for this stream's audio; defaults to the
    # German config — subclasses are expected to override it.
    big: auracast_config.AuracastBigConfig = auracast_config.AuracastBigConfigDe()
    # LLM used for translation (instruct model expected by llm_translator).
    translator_llm: str = 'llama3.2:3b-instruct-q4_0'
    # 'ollama' or 'openwebui' — selects the client in llm_translator.translate_de_to_x.
    llm_client: str = 'ollama'
    # Base URL of the LLM host; token only needed for the openwebui client.
    llm_host_url: str | None = 'http://localhost:11434'
    llm_host_token: str | None = None
    # TTS backend and voice model passed to text_to_speech.synthesize.
    tts_system: str = 'piper'
    tts_model: str = 'de_DE-kerstin-low'
class TranslatorConfigDe(TranslatorBaseconfig):
    """German stream: German BIG config and a German piper voice."""
    big: auracast_config.AuracastBigConfig = auracast_config.AuracastBigConfigDe()
    tts_model: str = 'de_DE-thorsten-high'


class TranslatorConfigEn(TranslatorBaseconfig):
    """English stream: English BIG config and an English piper voice."""
    big: auracast_config.AuracastBigConfig = auracast_config.AuracastBigConfigEn()
    tts_model: str = 'en_GB-alba-medium'


class TranslatorConfigFr(TranslatorBaseconfig):
    """French stream: French BIG config and a French piper voice."""
    big: auracast_config.AuracastBigConfig = auracast_config.AuracastBigConfigFr()
    tts_model: str = 'fr_FR-siwis-medium'


class TranslatorConfigEs(TranslatorBaseconfig):
    """Spanish stream: Spanish BIG config and a Spanish piper voice."""
    big: auracast_config.AuracastBigConfig = auracast_config.AuracastBigConfigEs()
    tts_model: str = 'es_ES-sharvard-medium'


class TranslatorConfigIt(TranslatorBaseconfig):
    """Italian stream: Italian BIG config and an Italian piper voice."""
    big: auracast_config.AuracastBigConfig = auracast_config.AuracastBigConfigIt()
    tts_model: str = 'it_IT-paola-medium'

View File

@@ -6,7 +6,7 @@ import librosa
import soundfile as sf
def resample_file(filename, out_filename, target_rate=int(24e3)):
def resample_file(filename, out_filename, target_rate):
start=time.time()
# Load the original audio file
audio, rate = librosa.load(filename)
@@ -24,7 +24,7 @@ def resample_file(filename, out_filename, target_rate=int(24e3)):
log.info("Resampling of %s took %s s", os.path.basename(filename), round(time.time() - start, 3))
def resample_array(audio, rate, target_rate=int(24e3)):
def resample_array(audio, rate, target_rate):
start=time.time()
# Load the original audio file

View File

@@ -5,10 +5,11 @@ version = '0.1'
dependencies = [
"auracast @git+https://git@gitea.pstruebi.xyz/auracaster/bumble-auracast",
"requests",
"ollama",
"aioconsole",
"piper-tts==1.2.0"
"requests==2.32.3",
"ollama==0.4.7",
"aioconsole==0.8.1",
"piper-phonemize==1.1.0",
"piper-tts==1.2.0",
]
[project.optional-dependencies]

View File

@@ -4,7 +4,7 @@ import time
import os
import subprocess
from multilang_translator.config import LANG_CONFIG
from multilang_translator.translator_config import LANG_CONFIG
from multilang_translator.backend_controller.broadcaster_play_once import broadcaster_play_file
from multilang_translator.backend_controller.broadcaster_copy_files import copy_to_broadcaster