From 53fd074d22edba52b636ccbf9ebf833bcd5bda07 Mon Sep 17 00:00:00 2001 From: pober Date: Thu, 28 May 2026 10:12:51 +0200 Subject: [PATCH] STT experiments, implements both vosx and whisper; whisper is too slow (cpu heavy) vosx has low acc; bigger vosx needs too much storage. --- pyproject.toml | 4 +- src/auracast/faster_whisper_textcast.py | 259 +++++++++++++++++++++ src/auracast/server/multicast_frontend.py | 90 +++++++- src/auracast/server/multicast_server.py | 139 +++++++++++ src/auracast/vosk_textcast.py | 270 ++++++++++++++++++++++ 5 files changed, 759 insertions(+), 3 deletions(-) create mode 100644 src/auracast/faster_whisper_textcast.py create mode 100644 src/auracast/vosk_textcast.py diff --git a/pyproject.toml b/pyproject.toml index b416cc9..ed6456f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,9 @@ dependencies = [ "smbus2 (>=0.5.0,<0.6.0)", "samplerate (>=0.2.2,<0.3.0)", "rpi-gpio (>=0.7.1,<0.8.0)", - "pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc" + "pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc", + "vosk (>=0.3.45)", + "faster-whisper (>=1.0.0)" ] [project.optional-dependencies] diff --git a/src/auracast/faster_whisper_textcast.py b/src/auracast/faster_whisper_textcast.py new file mode 100644 index 0000000..a85567a --- /dev/null +++ b/src/auracast/faster_whisper_textcast.py @@ -0,0 +1,259 @@ +"""faster-whisper speech-to-text → TextCast streamer. + +Captures mono audio from an analog ALSA/sounddevice input, runs +faster-whisper offline ASR in a background thread (chunked, every +CHUNK_S seconds), and broadcasts recognised text over the TextCast BLE +broadcast using the same SDU framing as text_multicast.py. + +Usage (CLI): + poetry run python -m auracast.faster_whisper_textcast \\ + --model tiny.en \\ + --device ch1 \\ + --transport serial:/dev/ttyAMA3,1000000,rtscts +""" +from __future__ import annotations + +import asyncio +import logging +import os +import queue +import threading +import time +from typing import Optional + +import numpy as np +import samplerate +import sounddevice as sd + +from auracast import auracast_config, multicast +from auracast.text_multicast import ( + SDU_SIZE, + SDU_INTERVAL_US, + _make_text_frame, + _make_idle_frame, +) + +log = logging.getLogger('faster_whisper_textcast') + +CAPTURE_SAMPLE_RATE = 48_000 +WHISPER_SAMPLE_RATE = 16_000 +BLOCK_FRAMES_48K = 4800 # 100 ms capture blocks +CHUNK_S = 3.0 # transcribe every N seconds of audio +CAPTION_HOLD_S = 4.0 # keep caption visible after last transcription +SILENCE_RMS = 0.003 # skip transcription if chunk is below this RMS +BROADCAST_NAME = 'LiveCaption' + +VALID_MODELS = ['tiny.en', 'base.en', 'small.en', 'tiny', 'base', 'small'] + + +def _tail_to_fit(text: str, max_bytes: int) -> str: + """Return the tail of *text* that fits in *max_bytes* UTF-8 bytes.""" + encoded = text.encode('utf-8') + if len(encoded) <= max_bytes: + return text + tail = encoded[-max_bytes:].decode('utf-8', errors='ignore') + sp = tail.find(' ') + return tail[sp + 1:] if sp != -1 else tail + + +def _resolve_device(device: str) -> Optional[int]: + """Return sounddevice index for a name or numeric string, or None for default.""" + if not device: + return None + if device.isdigit(): + return int(device) + for i, d in enumerate(sd.query_devices()): + if d['name'] == device and d['max_input_channels'] > 0: + return i + log.warning("Device '%s' not found in sounddevice list – using default input", device) + return None + + +async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None: + """ISO SDU write loop – runs at ~10 ms per iteration.""" + iso_queue = bigs['big0']['iso_queue'] + last_sent: str = '' + + while True: + now = time.monotonic() + with lock: + text: str = shared.get('text', '') + expiry: float = shared.get('expiry', 0.0) + + if text and now < expiry: + display_text = _tail_to_fit(text, SDU_SIZE - 2) + if display_text != last_sent: + log.info("Caption: %s", display_text) + last_sent = display_text + frame = _make_text_frame(display_text) + else: + if last_sent: + log.info("Caption cleared") + last_sent = '' + with lock: + shared['text'] = '' + frame = _make_idle_frame() + + await iso_queue.write(frame) + + +def _whisper_thread( + model_size: str, + device: str, + shared: dict, + lock: threading.Lock, + stop_event: threading.Event, +) -> None: + """Blocking audio capture + faster-whisper transcription loop.""" + try: + from faster_whisper import WhisperModel # type: ignore + except ImportError: + log.error("faster-whisper is not installed. Run: poetry add faster-whisper") + return + + log.info("Loading faster-whisper model '%s' (int8, CPU) …", model_size) + model = WhisperModel(model_size, device="cpu", compute_type="int8") + log.info("Model '%s' loaded.", model_size) + + audio_q: queue.Queue = queue.Queue() + resampler = samplerate.Resampler('sinc_fastest', channels=1) + ratio = WHISPER_SAMPLE_RATE / CAPTURE_SAMPLE_RATE + chunk_frames = int(CHUNK_S * WHISPER_SAMPLE_RATE) + audio_buffer = np.zeros(0, dtype=np.float32) + + dev_idx = _resolve_device(device) + + def _cb(indata: np.ndarray, frames: int, time_info, status) -> None: + if status: + log.warning("Audio status: %s", status) + if stop_event.is_set(): + raise sd.CallbackStop() + mono = indata[:, 0].astype(np.float32) + downsampled = resampler.process(mono, ratio, end_of_input=False) + audio_q.put(downsampled.copy()) + + try: + with sd.InputStream( + samplerate=CAPTURE_SAMPLE_RATE, + blocksize=BLOCK_FRAMES_48K, + device=dev_idx, + dtype='float32', + channels=1, + callback=_cb, + ): + log.info("WhisperCast listening on device '%s' (idx=%s) …", device, dev_idx) + while not stop_event.is_set(): + try: + chunk = audio_q.get(timeout=0.2) + audio_buffer = np.concatenate([audio_buffer, chunk]) + except queue.Empty: + continue + + if len(audio_buffer) < chunk_frames: + continue + + pcm = audio_buffer[:chunk_frames].copy() + audio_buffer = audio_buffer[chunk_frames:] + + rms = float(np.sqrt(np.mean(pcm ** 2))) + if rms < SILENCE_RMS: + continue + + t0 = time.monotonic() + segments, _ = model.transcribe( + pcm, + beam_size=1, + language="en", + vad_filter=True, + vad_parameters={"min_silence_duration_ms": 300}, + ) + text = ' '.join(s.text.strip() for s in segments).strip() + elapsed = time.monotonic() - t0 + + if text: + log.info("Transcribed (%.2fs): %s", elapsed, text) + with lock: + shared['text'] = text + shared['expiry'] = time.monotonic() + CAPTION_HOLD_S + else: + log.debug("Silent chunk skipped (rms=%.4f, took=%.2fs)", rms, elapsed) + + except Exception as exc: + log.error("WhisperCast thread error: %s", exc, exc_info=True) + + +async def broadcast_whisper( + transport: str, + model_size: str = 'tiny.en', + device: str = 'ch1', +) -> None: + """Start a faster-whisper → TextCast broadcast. Runs until cancelled.""" + if model_size not in VALID_MODELS: + raise ValueError(f"Unknown model '{model_size}'. Valid: {VALID_MODELS}") + + config = auracast_config.AuracastConfigGroup( + bigs=[ + auracast_config.AuracastBigConfig( + name=BROADCAST_NAME, + program_info='Live Captions', + language='eng', + audio_source='file:dummy', + iso_que_len=4, + ), + ], + auracast_sampling_rate_hz=16000, + octets_per_frame=SDU_SIZE, + frame_duration_us=SDU_INTERVAL_US, + presentation_delay_us=40_000, + qos_config=auracast_config.AuracastQosRobust(), + transport=transport, + ) + + shared: dict = {'text': '', 'expiry': 0.0} + lock = threading.Lock() + stop_event = threading.Event() + + async with multicast.create_device(config) as ble_device: + bigs = await multicast.init_broadcast(ble_device, config, config.bigs) + + t = threading.Thread( + target=_whisper_thread, + args=(model_size, device, shared, lock, stop_event), + daemon=True, + ) + t.start() + log.info("WhisperCast started (device=%s, model=%s)", device, model_size) + + try: + await _iso_write_loop(bigs, shared, lock) + except asyncio.CancelledError: + log.info("WhisperCast cancelled – shutting down") + stop_event.set() + t.join(timeout=5.0) + raise + + +def main() -> None: + global CHUNK_S + import argparse + parser = argparse.ArgumentParser(description='faster-whisper → Auracast TextCast') + parser.add_argument( + '--model', default='tiny.en', choices=VALID_MODELS, + help='Whisper model size (default: tiny.en)', + ) + parser.add_argument('--device', default='ch1', + help='sounddevice input name or index (default: ch1)') + parser.add_argument( + '--transport', + default=os.environ.get('AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'), + help='Bumble HCI transport string', + ) + parser.add_argument('--chunk', type=float, default=CHUNK_S, + help=f'Seconds per transcription chunk (default: {CHUNK_S})') + args = parser.parse_args() + CHUNK_S = args.chunk + multicast.run_async(broadcast_whisper(args.transport, args.model, args.device)) + + +if __name__ == '__main__': + main() diff --git a/src/auracast/server/multicast_frontend.py b/src/auracast/server/multicast_frontend.py index 607b870..c8e22ed 100644 --- a/src/auracast/server/multicast_frontend.py +++ b/src/auracast/server/multicast_frontend.py @@ -142,6 +142,8 @@ except Exception: # Define is_streaming early from the fetched status for use throughout the UI is_streaming = bool(saved_settings.get("is_streaming", False)) textcast_is_streaming = bool(saved_settings.get("textcast_is_streaming", False)) +voskcast_is_streaming = bool(saved_settings.get("voskcast_is_streaming", False)) +whispercast_is_streaming = bool(saved_settings.get("whispercast_is_streaming", False)) # Extract secondary status, if provided by the backend /status endpoint. secondary_status = saved_settings.get("secondary") or {} @@ -187,6 +189,8 @@ options = [ "Analog", "Network - Dante", "TextCast", + "VoskCast", + "WhisperCast", ] saved_audio_mode = saved_settings.get("audio_mode", "Demo") if saved_audio_mode not in options: @@ -198,7 +202,7 @@ audio_mode = st.selectbox( "Audio Mode", options, index=options.index(saved_audio_mode) if saved_audio_mode in options else options.index("Demo"), - disabled=is_streaming or textcast_is_streaming, + disabled=is_streaming or textcast_is_streaming or voskcast_is_streaming or whispercast_is_streaming, help=( "Select the audio input source. Choose 'USB' for a connected USB audio device (via PipeWire), " "'Network' (AES67) for network RTP/AES67 sources, " @@ -230,6 +234,8 @@ else: # Start/Stop buttons and status (moved to top) if audio_mode == "TextCast": start_stream, stop_stream = render_stream_controls(textcast_is_streaming, "Start TextCast", "Stop TextCast", "TextCast", False) +elif audio_mode == "VoskCast": + start_stream, stop_stream = render_stream_controls(voskcast_is_streaming, "Start VoskCast", "Stop VoskCast", "VoskCast", False) elif audio_mode == "Demo": start_stream, stop_stream = render_stream_controls(is_streaming, "Start Demo", "Stop Demo", running_mode, secondary_is_streaming) else: @@ -266,6 +272,54 @@ if audio_mode == "TextCast": except Exception as _e: st.error(f"Could not load sample: {_e}") +# WhisperCast: model size + input device +if audio_mode == "WhisperCast": + st.markdown("#### Live Speech Recognition (faster-whisper)") + _whisper_default_model = saved_settings.get("whispercast_model", "tiny.en") + _whisper_default_device = saved_settings.get("whispercast_device", "ch1") + col_wm, col_wd = st.columns([2, 1]) + with col_wm: + whisper_model_size = st.selectbox( + "Whisper Model", + ["tiny.en", "base.en", "small.en"], + index=["tiny.en", "base.en", "small.en"].index(_whisper_default_model) + if _whisper_default_model in ["tiny.en", "base.en", "small.en"] else 0, + disabled=whispercast_is_streaming, + help="tiny.en (~39 MB, ~3-5s latency), base.en (~74 MB, ~5-8s latency)", + ) + with col_wd: + whisper_device = st.selectbox( + "Input", + ["ch1", "ch2"], + index=0 if _whisper_default_device == "ch1" else 1, + disabled=whispercast_is_streaming, + help="Analog input channel", + ) + st.caption("Model downloads automatically on first use. Each sentence appears after ~3s of speech.") + +# VoskCast: model path + input device +if audio_mode == "VoskCast": + st.markdown("#### Live Speech Recognition (Vosk)") + _vosk_default_model = saved_settings.get("voskcast_model") or os.environ.get("VOSK_MODEL_PATH", "~/vosk-model-en-us") + _vosk_default_device = saved_settings.get("voskcast_device", "ch1") + col_model, col_dev = st.columns([3, 1]) + with col_model: + vosk_model_path = st.text_input( + "Vosk Model Path", + value=_vosk_default_model, + disabled=voskcast_is_streaming, + help="Local path to the Vosk model directory. Download from https://alphacephei.com/vosk/models", + ) + with col_dev: + vosk_device = st.selectbox( + "Input", + ["ch1", "ch2"], + index=0 if _vosk_default_device == "ch1" else 1, + disabled=voskcast_is_streaming, + help="Analog input channel (ch1 = left, ch2 = right)", + ) + st.caption("Partial results appear immediately; final results are held for 4 s then cleared.") + # Analog gain control (only for Analog mode, placed below start button) analog_gain_db_left = 0 # default (dB) analog_gain_db_right = 0 # default (dB) @@ -1830,6 +1884,10 @@ if stop_stream: try: if audio_mode == "TextCast": r = requests.post(f"{BACKEND_URL}/stop_textcast").json() + elif audio_mode == "VoskCast": + r = requests.post(f"{BACKEND_URL}/stop_voskcast").json() + elif audio_mode == "WhisperCast": + r = requests.post(f"{BACKEND_URL}/stop_whispercast").json() else: r = requests.post(f"{BACKEND_URL}/stop_audio").json() if audio_mode == "Demo": @@ -1860,6 +1918,34 @@ if start_stream: except Exception as e: st.error(f"Error: {e}") + elif audio_mode == "VoskCast": + try: + rs = requests.post( + f"{BACKEND_URL}/start_voskcast", + json={"model": vosk_model_path, "device": vosk_device}, + ) + if rs.ok: + st.success("VoskCast started.") + st.rerun() + else: + st.error(f"Start failed: {rs.text}") + except Exception as e: + st.error(f"Error: {e}") + + elif audio_mode == "WhisperCast": + try: + rs = requests.post( + f"{BACKEND_URL}/start_whispercast", + json={"model": whisper_model_size, "device": whisper_device}, + ) + if rs.ok: + st.success("WhisperCast started.") + st.rerun() + else: + st.error(f"Start failed: {rs.text}") + except Exception as e: + st.error(f"Error: {e}") + else: # Always send stop to ensure backend is in a clean state, regardless of current status @@ -2084,7 +2170,7 @@ if start_stream: st.error(f"Failed to initialize Dante Radio 2: {r2.text}") except Exception as e: st.error(f"Error while starting Dante radios: {e}") - if audio_mode not in ("Demo", "Analog", "Network - Dante"): + if audio_mode not in ("Demo", "Analog", "Network - Dante", "VoskCast", "WhisperCast", "TextCast"): # USB/Network: single config as before, using shared controls q = QUALITY_MAP[quality] config = auracast_config.AuracastConfigGroup( diff --git a/src/auracast/server/multicast_server.py b/src/auracast/server/multicast_server.py index b4d4b72..a6d3e06 100644 --- a/src/auracast/server/multicast_server.py +++ b/src/auracast/server/multicast_server.py @@ -213,6 +213,12 @@ _stream_lock = asyncio.Lock() # serialize initialize/stop_audio on API side _textcast_task: asyncio.Task | None = None DCP_UPLOAD_PATH = os.path.join(os.path.dirname(__file__), 'uploaded_subtitles.xml') +# VoskCast state +_voskcast_task: asyncio.Task | None = None + +# WhisperCast state +_whispercast_task: asyncio.Task | None = None + # BLE / audio event loop – set in __main__ before uvicorn starts. # All coroutines that touch Bumble objects or the audio pipeline MUST run # on this loop. HTTP handlers call _on_ble_loop() to cross into it. @@ -784,6 +790,133 @@ async def _stop_textcast_impl(): return {"status": "stopped", "was_running": was_running} +@app.post("/start_voskcast") +async def start_voskcast(body: dict = {}): + """Start Vosk STT → TextCast. Body (optional): {"model": "...", "device": "ch1"}""" + return await _on_ble_loop(_start_voskcast_impl(body)) + + +async def _start_voskcast_impl(body: dict) -> dict: + global _voskcast_task + from auracast.vosk_textcast import broadcast_vosk, DEFAULT_MODEL_PATH + + model = body.get('model') or DEFAULT_MODEL_PATH + device = body.get('device', 'ch1') + + await _stop_all() + await _stop_textcast_impl() + await _stop_voskcast_impl() + + _voskcast_task = asyncio.get_event_loop().create_task( + broadcast_vosk(TRANSPORT1, model, device) + ) + + settings = { + 'is_streaming': True, + 'audio_mode': 'VoskCast', + 'voskcast_is_streaming': True, + 'voskcast_device': device, + 'voskcast_model': model, + 'timestamp': datetime.utcnow().isoformat(), + } + save_stream_settings(settings) + _led_on() + log.info("VoskCast started (device=%s, model=%s)", device, model) + return {"status": "started"} + + +@app.post("/stop_voskcast") +async def stop_voskcast(): + """Stop an active VoskCast broadcast.""" + return await _on_ble_loop(_stop_voskcast_impl()) + + +async def _stop_voskcast_impl() -> dict: + global _voskcast_task + was_running = False + if _voskcast_task is not None and not _voskcast_task.done(): + was_running = True + _voskcast_task.cancel() + try: + await asyncio.wait_for(asyncio.shield(_voskcast_task), timeout=4.0) + except (asyncio.CancelledError, asyncio.TimeoutError, Exception): + pass + _voskcast_task = None + _led_off() + settings = load_stream_settings() or {} + if settings.get('audio_mode') == 'VoskCast': + settings['is_streaming'] = False + settings['voskcast_is_streaming'] = False + settings['timestamp'] = datetime.utcnow().isoformat() + save_stream_settings(settings) + log.info("VoskCast stopped") + return {"status": "stopped", "was_running": was_running} + + +@app.post("/start_whispercast") +async def start_whispercast(body: dict = {}): + """Start faster-whisper → TextCast. Body (optional): {"model": "tiny.en", "device": "ch1"}""" + return await _on_ble_loop(_start_whispercast_impl(body)) + + +async def _start_whispercast_impl(body: dict) -> dict: + global _whispercast_task + from auracast.faster_whisper_textcast import broadcast_whisper + + model = body.get('model', 'tiny.en') + device = body.get('device', 'ch1') + + await _stop_all() + await _stop_textcast_impl() + await _stop_voskcast_impl() + await _stop_whispercast_impl() + + _whispercast_task = asyncio.get_event_loop().create_task( + broadcast_whisper(TRANSPORT1, model, device) + ) + + settings = { + 'is_streaming': True, + 'audio_mode': 'WhisperCast', + 'whispercast_is_streaming': True, + 'whispercast_device': device, + 'whispercast_model': model, + 'timestamp': datetime.utcnow().isoformat(), + } + save_stream_settings(settings) + _led_on() + log.info("WhisperCast started (device=%s, model=%s)", device, model) + return {"status": "started"} + + +@app.post("/stop_whispercast") +async def stop_whispercast(): + """Stop an active WhisperCast broadcast.""" + return await _on_ble_loop(_stop_whispercast_impl()) + + +async def _stop_whispercast_impl() -> dict: + global _whispercast_task + was_running = False + if _whispercast_task is not None and not _whispercast_task.done(): + was_running = True + _whispercast_task.cancel() + try: + await asyncio.wait_for(asyncio.shield(_whispercast_task), timeout=5.0) + except (asyncio.CancelledError, asyncio.TimeoutError, Exception): + pass + _whispercast_task = None + _led_off() + settings = load_stream_settings() or {} + if settings.get('audio_mode') == 'WhisperCast': + settings['is_streaming'] = False + settings['whispercast_is_streaming'] = False + settings['timestamp'] = datetime.utcnow().isoformat() + save_stream_settings(settings) + log.info("WhisperCast stopped") + return {"status": "stopped", "was_running": was_running} + + @app.post("/adc_gain") async def set_adc_gain(payload: dict): """Set ADC gain in dB for left and right channels without restarting the stream. @@ -845,6 +978,12 @@ async def get_status(): status["textcast_is_streaming"] = ( _textcast_task is not None and not _textcast_task.done() ) + status["voskcast_is_streaming"] = ( + _voskcast_task is not None and not _voskcast_task.done() + ) + status["whispercast_is_streaming"] = ( + _whispercast_task is not None and not _whispercast_task.done() + ) return status diff --git a/src/auracast/vosk_textcast.py b/src/auracast/vosk_textcast.py new file mode 100644 index 0000000..b74a974 --- /dev/null +++ b/src/auracast/vosk_textcast.py @@ -0,0 +1,270 @@ +"""Vosk speech-to-text → TextCast streamer. + +Captures mono audio from an analog ALSA/sounddevice input, runs Vosk +offline ASR in a background thread, and broadcasts recognised text over +the TextCast BLE broadcast using the same SDU framing as text_multicast.py. + +Usage (CLI): + poetry run python -m auracast.vosk_textcast \\ + --model /path/to/vosk-model-en-us \\ + --device ch1 \\ + --transport serial:/dev/ttyAMA3,1000000,rtscts + +Environment: + VOSK_MODEL_PATH – default Vosk model directory + AURACAST_TRANSPORT – default HCI transport string +""" +from __future__ import annotations + +import argparse +import asyncio +import json +import logging +import os +import threading +import time +from typing import Optional + +import numpy as np +import samplerate +import sounddevice as sd + +from auracast import auracast_config, multicast +from auracast.text_multicast import ( + SDU_SIZE, + SDU_INTERVAL_US, + _make_text_frame, + _make_idle_frame, +) + +log = logging.getLogger('vosk_textcast') + +VOSK_SAMPLE_RATE = 16_000 # Vosk models expect 16 kHz +CAPTURE_SAMPLE_RATE = 48_000 # Hardware capture rate (always 48 kHz) +BLOCK_FRAMES_48K = 4800 # 100 ms blocks at 48 kHz → 1600 frames at 16 kHz +CAPTION_HOLD_S = 4.0 # Keep caption visible N seconds after last speech +BROADCAST_NAME = 'LiveCaption' + +DEFAULT_MODEL_PATH = os.environ.get( + 'VOSK_MODEL_PATH', + os.path.expanduser('~/vosk-model-en-us'), +) + + +def _tail_to_fit(text: str, max_bytes: int) -> str: + """Return the tail of *text* that fits in *max_bytes* UTF-8 bytes.""" + encoded = text.encode('utf-8') + if len(encoded) <= max_bytes: + return text + tail = encoded[-max_bytes:].decode('utf-8', errors='ignore') + sp = tail.find(' ') + return tail[sp + 1:] if sp != -1 else tail + + +def _new_words(old: str, new: str) -> str: + """Return the words appended to *new* beyond the shared prefix with *old*. + + If *new* doesn't start with *old* (different utterance), return *new* in full. + """ + old_words = old.split() + new_words = new.split() + if new_words[:len(old_words)] == old_words: + extra = new_words[len(old_words):] + return ' '.join(extra) + return new + + +def _resolve_device(device: str) -> Optional[int]: + """Return sounddevice index for a name or numeric string, or None for default.""" + if not device: + return None + if device.isdigit(): + return int(device) + for i, d in enumerate(sd.query_devices()): + if d['name'] == device and d['max_input_channels'] > 0: + return i + log.warning("Device '%s' not found in sounddevice list – using default input", device) + return None + + +async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None: + """ISO SDU write loop. + + Runs at ~10 ms per iteration (flow-controlled by the BLE controller). + Sends the current recognised text (partial or final) as-is. + """ + iso_queue = bigs['big0']['iso_queue'] + last_sent: str = '' + + while True: + now = time.monotonic() + with lock: + text: str = shared.get('text', '') + expiry: float = shared.get('expiry', 0.0) + + if text and now < expiry: + display_text = _tail_to_fit(text, SDU_SIZE - 2) + if display_text != last_sent: + log.info("Caption: %s", display_text) + last_sent = display_text + frame = _make_text_frame(display_text) + else: + if last_sent: + log.info("Caption cleared") + last_sent = '' + with lock: + shared['text'] = '' + frame = _make_idle_frame() + + await iso_queue.write(frame) + + +def _vosk_thread( + model_path: str, + device: str, + shared: dict, + lock: threading.Lock, + stop_event: threading.Event, +) -> None: + """Blocking audio capture + Vosk recognition loop. Runs in a daemon thread.""" + try: + from vosk import KaldiRecognizer, Model # type: ignore + except ImportError: + log.error("vosk is not installed. Run: poetry add vosk") + return + + log.info("Loading Vosk model from %s …", model_path) + model = Model(model_path) + rec = KaldiRecognizer(model, VOSK_SAMPLE_RATE) + rec.SetMaxAlternatives(0) + rec.SetWords(False) + + resampler = samplerate.Resampler('sinc_fastest', channels=1) + ratio = VOSK_SAMPLE_RATE / CAPTURE_SAMPLE_RATE + + dev_idx = _resolve_device(device) + last_word_count = [0] # word count of last partial sent to display + + def _cb(indata: np.ndarray, frames: int, time_info, status) -> None: + if status: + log.warning("Audio status: %s", status) + if stop_event.is_set(): + raise sd.CallbackStop() + + # Resample 48 kHz → 16 kHz + mono = indata[:, 0].astype(np.float32) + downsampled = resampler.process(mono, ratio, end_of_input=False) + pcm16 = (downsampled * 32767).astype(np.int16).tobytes() + + if rec.AcceptWaveform(pcm16): + result = json.loads(rec.Result()) + final_text = result.get('text', '').strip() + if final_text: + log.info("Final: %s", final_text) + with lock: + shared['text'] = _tail_to_fit(final_text, SDU_SIZE - 2) + shared['expiry'] = time.monotonic() + CAPTION_HOLD_S + last_word_count[0] = 0 # reset for next sentence + else: + partial_text = json.loads(rec.PartialResult()).get('partial', '').strip() + if partial_text: + wc = len(partial_text.split()) + if wc > last_word_count[0]: # new word arrived + last_word_count[0] = wc + with lock: + shared['text'] = _tail_to_fit(partial_text, SDU_SIZE - 2) + shared['expiry'] = time.monotonic() + CAPTION_HOLD_S + + try: + with sd.InputStream( + samplerate=CAPTURE_SAMPLE_RATE, + blocksize=BLOCK_FRAMES_48K, + device=dev_idx, + dtype='float32', + channels=1, + callback=_cb, + ): + log.info("Vosk listening on device '%s' (idx=%s) …", device, dev_idx) + stop_event.wait() + except Exception as exc: + log.error("Vosk audio thread error: %s", exc, exc_info=True) + + +async def broadcast_vosk( + transport: str, + model_path: str = DEFAULT_MODEL_PATH, + device: str = 'ch1', +) -> None: + """Start a Vosk STT → TextCast broadcast. Runs until cancelled.""" + model_path = os.path.expanduser(model_path) + if not os.path.exists(model_path): + raise FileNotFoundError( + f"Vosk model not found at '{model_path}'. " + "Download from https://alphacephei.com/vosk/models and set VOSK_MODEL_PATH." + ) + + config = auracast_config.AuracastConfigGroup( + bigs=[ + auracast_config.AuracastBigConfig( + name=BROADCAST_NAME, + program_info='Live Captions', + language='eng', + audio_source='file:dummy', + iso_que_len=4, + ), + ], + auracast_sampling_rate_hz=16000, + octets_per_frame=SDU_SIZE, + frame_duration_us=SDU_INTERVAL_US, + presentation_delay_us=40_000, + qos_config=auracast_config.AuracastQosRobust(), + transport=transport, + ) + + shared: dict = {'text': '', 'expiry': 0.0} + lock = threading.Lock() + stop_event = threading.Event() + + async with multicast.create_device(config) as ble_device: + bigs = await multicast.init_broadcast(ble_device, config, config.bigs) + + t = threading.Thread( + target=_vosk_thread, + args=(model_path, device, shared, lock, stop_event), + daemon=True, + ) + t.start() + log.info("VoskCast started (device=%s, model=%s)", device, model_path) + + try: + await _iso_write_loop(bigs, shared, lock) + except asyncio.CancelledError: + log.info("VoskCast cancelled – shutting down") + stop_event.set() + t.join(timeout=3.0) + raise + + +def main() -> None: + parser = argparse.ArgumentParser(description='Vosk STT → Auracast TextCast') + parser.add_argument( + '--model', + default=DEFAULT_MODEL_PATH, + help=f'Path to Vosk model directory (default: {DEFAULT_MODEL_PATH})', + ) + parser.add_argument( + '--device', + default='ch1', + help='sounddevice input device name or index (default: ch1)', + ) + parser.add_argument( + '--transport', + default=os.environ.get('AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'), + help='Bumble HCI transport string', + ) + args = parser.parse_args() + multicast.run_async(broadcast_vosk(args.transport, args.model, args.device)) + + +if __name__ == '__main__': + main()