STT experiments, implements both vosx and whisper; whisper is too slow (cpu heavy) vosx has low acc; bigger vosx needs too much storage.

2026-05-28 10:12:51 +02:00
parent d1471fae79
commit 53fd074d22
5 changed files with 759 additions and 3 deletions
@@ -19,7 +19,9 @@ dependencies = [
    "smbus2 (>=0.5.0,<0.6.0)",
    "samplerate (>=0.2.2,<0.3.0)",
    "rpi-gpio (>=0.7.1,<0.8.0)",
-    "pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc"
+    "pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc",
+    "vosk (>=0.3.45)",
+    "faster-whisper (>=1.0.0)"
 ]

 [project.optional-dependencies]
@@ -0,0 +1,259 @@
+"""faster-whisper speech-to-text → TextCast streamer.
+
+Captures mono audio from an analog ALSA/sounddevice input, runs
+faster-whisper offline ASR in a background thread (chunked, every
+CHUNK_S seconds), and broadcasts recognised text over the TextCast BLE
+broadcast using the same SDU framing as text_multicast.py.
+
+Usage (CLI):
+    poetry run python -m auracast.faster_whisper_textcast \\
+        --model tiny.en \\
+        --device ch1 \\
+        --transport serial:/dev/ttyAMA3,1000000,rtscts
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import queue
+import threading
+import time
+from typing import Optional
+
+import numpy as np
+import samplerate
+import sounddevice as sd
+
+from auracast import auracast_config, multicast
+from auracast.text_multicast import (
+    SDU_SIZE,
+    SDU_INTERVAL_US,
+    _make_text_frame,
+    _make_idle_frame,
+)
+
+log = logging.getLogger('faster_whisper_textcast')
+
+CAPTURE_SAMPLE_RATE = 48_000
+WHISPER_SAMPLE_RATE = 16_000
+BLOCK_FRAMES_48K = 4800       # 100 ms capture blocks
+CHUNK_S = 3.0                  # transcribe every N seconds of audio
+CAPTION_HOLD_S = 4.0           # keep caption visible after last transcription
+SILENCE_RMS = 0.003            # skip transcription if chunk is below this RMS
+BROADCAST_NAME = 'LiveCaption'
+
+VALID_MODELS = ['tiny.en', 'base.en', 'small.en', 'tiny', 'base', 'small']
+
+
+def _tail_to_fit(text: str, max_bytes: int) -> str:
+    """Return the tail of *text* that fits in *max_bytes* UTF-8 bytes."""
+    encoded = text.encode('utf-8')
+    if len(encoded) <= max_bytes:
+        return text
+    tail = encoded[-max_bytes:].decode('utf-8', errors='ignore')
+    sp = tail.find(' ')
+    return tail[sp + 1:] if sp != -1 else tail
+
+
+def _resolve_device(device: str) -> Optional[int]:
+    """Return sounddevice index for a name or numeric string, or None for default."""
+    if not device:
+        return None
+    if device.isdigit():
+        return int(device)
+    for i, d in enumerate(sd.query_devices()):
+        if d['name'] == device and d['max_input_channels'] > 0:
+            return i
+    log.warning("Device '%s' not found in sounddevice list – using default input", device)
+    return None
+
+
+async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None:
+    """ISO SDU write loop – runs at ~10 ms per iteration."""
+    iso_queue = bigs['big0']['iso_queue']
+    last_sent: str = ''
+
+    while True:
+        now = time.monotonic()
+        with lock:
+            text: str = shared.get('text', '')
+            expiry: float = shared.get('expiry', 0.0)
+
+        if text and now < expiry:
+            display_text = _tail_to_fit(text, SDU_SIZE - 2)
+            if display_text != last_sent:
+                log.info("Caption: %s", display_text)
+                last_sent = display_text
+            frame = _make_text_frame(display_text)
+        else:
+            if last_sent:
+                log.info("Caption cleared")
+                last_sent = ''
+                with lock:
+                    shared['text'] = ''
+            frame = _make_idle_frame()
+
+        await iso_queue.write(frame)
+
+
+def _whisper_thread(
+    model_size: str,
+    device: str,
+    shared: dict,
+    lock: threading.Lock,
+    stop_event: threading.Event,
+) -> None:
+    """Blocking audio capture + faster-whisper transcription loop."""
+    try:
+        from faster_whisper import WhisperModel  # type: ignore
+    except ImportError:
+        log.error("faster-whisper is not installed. Run: poetry add faster-whisper")
+        return
+
+    log.info("Loading faster-whisper model '%s' (int8, CPU) …", model_size)
+    model = WhisperModel(model_size, device="cpu", compute_type="int8")
+    log.info("Model '%s' loaded.", model_size)
+
+    audio_q: queue.Queue = queue.Queue()
+    resampler = samplerate.Resampler('sinc_fastest', channels=1)
+    ratio = WHISPER_SAMPLE_RATE / CAPTURE_SAMPLE_RATE
+    chunk_frames = int(CHUNK_S * WHISPER_SAMPLE_RATE)
+    audio_buffer = np.zeros(0, dtype=np.float32)
+
+    dev_idx = _resolve_device(device)
+
+    def _cb(indata: np.ndarray, frames: int, time_info, status) -> None:
+        if status:
+            log.warning("Audio status: %s", status)
+        if stop_event.is_set():
+            raise sd.CallbackStop()
+        mono = indata[:, 0].astype(np.float32)
+        downsampled = resampler.process(mono, ratio, end_of_input=False)
+        audio_q.put(downsampled.copy())
+
+    try:
+        with sd.InputStream(
+            samplerate=CAPTURE_SAMPLE_RATE,
+            blocksize=BLOCK_FRAMES_48K,
+            device=dev_idx,
+            dtype='float32',
+            channels=1,
+            callback=_cb,
+        ):
+            log.info("WhisperCast listening on device '%s' (idx=%s) …", device, dev_idx)
+            while not stop_event.is_set():
+                try:
+                    chunk = audio_q.get(timeout=0.2)
+                    audio_buffer = np.concatenate([audio_buffer, chunk])
+                except queue.Empty:
+                    continue
+
+                if len(audio_buffer) < chunk_frames:
+                    continue
+
+                pcm = audio_buffer[:chunk_frames].copy()
+                audio_buffer = audio_buffer[chunk_frames:]
+
+                rms = float(np.sqrt(np.mean(pcm ** 2)))
+                if rms < SILENCE_RMS:
+                    continue
+
+                t0 = time.monotonic()
+                segments, _ = model.transcribe(
+                    pcm,
+                    beam_size=1,
+                    language="en",
+                    vad_filter=True,
+                    vad_parameters={"min_silence_duration_ms": 300},
+                )
+                text = ' '.join(s.text.strip() for s in segments).strip()
+                elapsed = time.monotonic() - t0
+
+                if text:
+                    log.info("Transcribed (%.2fs): %s", elapsed, text)
+                    with lock:
+                        shared['text'] = text
+                        shared['expiry'] = time.monotonic() + CAPTION_HOLD_S
+                else:
+                    log.debug("Silent chunk skipped (rms=%.4f, took=%.2fs)", rms, elapsed)
+
+    except Exception as exc:
+        log.error("WhisperCast thread error: %s", exc, exc_info=True)
+
+
+async def broadcast_whisper(
+    transport: str,
+    model_size: str = 'tiny.en',
+    device: str = 'ch1',
+) -> None:
+    """Start a faster-whisper → TextCast broadcast. Runs until cancelled."""
+    if model_size not in VALID_MODELS:
+        raise ValueError(f"Unknown model '{model_size}'. Valid: {VALID_MODELS}")
+
+    config = auracast_config.AuracastConfigGroup(
+        bigs=[
+            auracast_config.AuracastBigConfig(
+                name=BROADCAST_NAME,
+                program_info='Live Captions',
+                language='eng',
+                audio_source='file:dummy',
+                iso_que_len=4,
+            ),
+        ],
+        auracast_sampling_rate_hz=16000,
+        octets_per_frame=SDU_SIZE,
+        frame_duration_us=SDU_INTERVAL_US,
+        presentation_delay_us=40_000,
+        qos_config=auracast_config.AuracastQosRobust(),
+        transport=transport,
+    )
+
+    shared: dict = {'text': '', 'expiry': 0.0}
+    lock = threading.Lock()
+    stop_event = threading.Event()
+
+    async with multicast.create_device(config) as ble_device:
+        bigs = await multicast.init_broadcast(ble_device, config, config.bigs)
+
+        t = threading.Thread(
+            target=_whisper_thread,
+            args=(model_size, device, shared, lock, stop_event),
+            daemon=True,
+        )
+        t.start()
+        log.info("WhisperCast started (device=%s, model=%s)", device, model_size)
+
+        try:
+            await _iso_write_loop(bigs, shared, lock)
+        except asyncio.CancelledError:
+            log.info("WhisperCast cancelled – shutting down")
+            stop_event.set()
+            t.join(timeout=5.0)
+            raise
+
+
+def main() -> None:
+    global CHUNK_S
+    import argparse
+    parser = argparse.ArgumentParser(description='faster-whisper → Auracast TextCast')
+    parser.add_argument(
+        '--model', default='tiny.en', choices=VALID_MODELS,
+        help='Whisper model size (default: tiny.en)',
+    )
+    parser.add_argument('--device', default='ch1',
+                        help='sounddevice input name or index (default: ch1)')
+    parser.add_argument(
+        '--transport',
+        default=os.environ.get('AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'),
+        help='Bumble HCI transport string',
+    )
+    parser.add_argument('--chunk', type=float, default=CHUNK_S,
+                        help=f'Seconds per transcription chunk (default: {CHUNK_S})')
+    args = parser.parse_args()
+    CHUNK_S = args.chunk
+    multicast.run_async(broadcast_whisper(args.transport, args.model, args.device))
+
+
+if __name__ == '__main__':
+    main()
@@ -142,6 +142,8 @@ except Exception:
 # Define is_streaming early from the fetched status for use throughout the UI
 is_streaming = bool(saved_settings.get("is_streaming", False))
 textcast_is_streaming = bool(saved_settings.get("textcast_is_streaming", False))
+voskcast_is_streaming = bool(saved_settings.get("voskcast_is_streaming", False))
+whispercast_is_streaming = bool(saved_settings.get("whispercast_is_streaming", False))

 # Extract secondary status, if provided by the backend /status endpoint.
 secondary_status = saved_settings.get("secondary") or {}
@@ -187,6 +189,8 @@ options = [
    "Analog",
    "Network - Dante",
    "TextCast",
+    "VoskCast",
+    "WhisperCast",
    ]
 saved_audio_mode = saved_settings.get("audio_mode", "Demo")
 if saved_audio_mode not in options:
@@ -198,7 +202,7 @@ audio_mode = st.selectbox(
    "Audio Mode",
    options,
    index=options.index(saved_audio_mode) if saved_audio_mode in options else options.index("Demo"),
-    disabled=is_streaming or textcast_is_streaming,
+    disabled=is_streaming or textcast_is_streaming or voskcast_is_streaming or whispercast_is_streaming,
    help=(
        "Select the audio input source. Choose 'USB' for a connected USB audio device (via PipeWire), "
        "'Network' (AES67) for network RTP/AES67 sources, "
@@ -230,6 +234,8 @@ else:
 # Start/Stop buttons and status (moved to top)
 if audio_mode == "TextCast":
    start_stream, stop_stream = render_stream_controls(textcast_is_streaming, "Start TextCast", "Stop TextCast", "TextCast", False)
+elif audio_mode == "VoskCast":
+    start_stream, stop_stream = render_stream_controls(voskcast_is_streaming, "Start VoskCast", "Stop VoskCast", "VoskCast", False)
 elif audio_mode == "Demo":
    start_stream, stop_stream = render_stream_controls(is_streaming, "Start Demo", "Stop Demo", running_mode, secondary_is_streaming)
 else:
@@ -266,6 +272,54 @@ if audio_mode == "TextCast":
            except Exception as _e:
                st.error(f"Could not load sample: {_e}")

+# WhisperCast: model size + input device
+if audio_mode == "WhisperCast":
+    st.markdown("#### Live Speech Recognition (faster-whisper)")
+    _whisper_default_model = saved_settings.get("whispercast_model", "tiny.en")
+    _whisper_default_device = saved_settings.get("whispercast_device", "ch1")
+    col_wm, col_wd = st.columns([2, 1])
+    with col_wm:
+        whisper_model_size = st.selectbox(
+            "Whisper Model",
+            ["tiny.en", "base.en", "small.en"],
+            index=["tiny.en", "base.en", "small.en"].index(_whisper_default_model)
+                  if _whisper_default_model in ["tiny.en", "base.en", "small.en"] else 0,
+            disabled=whispercast_is_streaming,
+            help="tiny.en (~39 MB, ~3-5s latency), base.en (~74 MB, ~5-8s latency)",
+        )
+    with col_wd:
+        whisper_device = st.selectbox(
+            "Input",
+            ["ch1", "ch2"],
+            index=0 if _whisper_default_device == "ch1" else 1,
+            disabled=whispercast_is_streaming,
+            help="Analog input channel",
+        )
+    st.caption("Model downloads automatically on first use. Each sentence appears after ~3s of speech.")
+
+# VoskCast: model path + input device
+if audio_mode == "VoskCast":
+    st.markdown("#### Live Speech Recognition (Vosk)")
+    _vosk_default_model = saved_settings.get("voskcast_model") or os.environ.get("VOSK_MODEL_PATH", "~/vosk-model-en-us")
+    _vosk_default_device = saved_settings.get("voskcast_device", "ch1")
+    col_model, col_dev = st.columns([3, 1])
+    with col_model:
+        vosk_model_path = st.text_input(
+            "Vosk Model Path",
+            value=_vosk_default_model,
+            disabled=voskcast_is_streaming,
+            help="Local path to the Vosk model directory. Download from https://alphacephei.com/vosk/models",
+        )
+    with col_dev:
+        vosk_device = st.selectbox(
+            "Input",
+            ["ch1", "ch2"],
+            index=0 if _vosk_default_device == "ch1" else 1,
+            disabled=voskcast_is_streaming,
+            help="Analog input channel (ch1 = left, ch2 = right)",
+        )
+    st.caption("Partial results appear immediately; final results are held for 4 s then cleared.")
+
 # Analog gain control (only for Analog mode, placed below start button)
 analog_gain_db_left = 0  # default (dB)
 analog_gain_db_right = 0  # default (dB)
@@ -1830,6 +1884,10 @@ if stop_stream:
    try:
        if audio_mode == "TextCast":
            r = requests.post(f"{BACKEND_URL}/stop_textcast").json()
+        elif audio_mode == "VoskCast":
+            r = requests.post(f"{BACKEND_URL}/stop_voskcast").json()
+        elif audio_mode == "WhisperCast":
+            r = requests.post(f"{BACKEND_URL}/stop_whispercast").json()
        else:
            r = requests.post(f"{BACKEND_URL}/stop_audio").json()
            if audio_mode == "Demo":
@@ -1860,6 +1918,34 @@ if start_stream:
            except Exception as e:
                st.error(f"Error: {e}")

+    elif audio_mode == "VoskCast":
+        try:
+            rs = requests.post(
+                f"{BACKEND_URL}/start_voskcast",
+                json={"model": vosk_model_path, "device": vosk_device},
+            )
+            if rs.ok:
+                st.success("VoskCast started.")
+                st.rerun()
+            else:
+                st.error(f"Start failed: {rs.text}")
+        except Exception as e:
+            st.error(f"Error: {e}")
+
+    elif audio_mode == "WhisperCast":
+        try:
+            rs = requests.post(
+                f"{BACKEND_URL}/start_whispercast",
+                json={"model": whisper_model_size, "device": whisper_device},
+            )
+            if rs.ok:
+                st.success("WhisperCast started.")
+                st.rerun()
+            else:
+                st.error(f"Start failed: {rs.text}")
+        except Exception as e:
+            st.error(f"Error: {e}")
+
    else:

    # Always send stop to ensure backend is in a clean state, regardless of current status
@@ -2084,7 +2170,7 @@ if start_stream:
                    st.error(f"Failed to initialize Dante Radio 2: {r2.text}")
        except Exception as e:
            st.error(f"Error while starting Dante radios: {e}")
-    if audio_mode not in ("Demo", "Analog", "Network - Dante"):
+    if audio_mode not in ("Demo", "Analog", "Network - Dante", "VoskCast", "WhisperCast", "TextCast"):
        # USB/Network: single config as before, using shared controls
        q = QUALITY_MAP[quality]
        config = auracast_config.AuracastConfigGroup(
@@ -213,6 +213,12 @@ _stream_lock = asyncio.Lock()  # serialize initialize/stop_audio on API side
 _textcast_task: asyncio.Task | None = None
 DCP_UPLOAD_PATH = os.path.join(os.path.dirname(__file__), 'uploaded_subtitles.xml')

+# VoskCast state
+_voskcast_task: asyncio.Task | None = None
+
+# WhisperCast state
+_whispercast_task: asyncio.Task | None = None
+
 # BLE / audio event loop – set in __main__ before uvicorn starts.
 # All coroutines that touch Bumble objects or the audio pipeline MUST run
 # on this loop.  HTTP handlers call _on_ble_loop() to cross into it.
@@ -784,6 +790,133 @@ async def _stop_textcast_impl():
    return {"status": "stopped", "was_running": was_running}


+@app.post("/start_voskcast")
+async def start_voskcast(body: dict = {}):
+    """Start Vosk STT → TextCast. Body (optional): {"model": "...", "device": "ch1"}"""
+    return await _on_ble_loop(_start_voskcast_impl(body))
+
+
+async def _start_voskcast_impl(body: dict) -> dict:
+    global _voskcast_task
+    from auracast.vosk_textcast import broadcast_vosk, DEFAULT_MODEL_PATH
+
+    model = body.get('model') or DEFAULT_MODEL_PATH
+    device = body.get('device', 'ch1')
+
+    await _stop_all()
+    await _stop_textcast_impl()
+    await _stop_voskcast_impl()
+
+    _voskcast_task = asyncio.get_event_loop().create_task(
+        broadcast_vosk(TRANSPORT1, model, device)
+    )
+
+    settings = {
+        'is_streaming': True,
+        'audio_mode': 'VoskCast',
+        'voskcast_is_streaming': True,
+        'voskcast_device': device,
+        'voskcast_model': model,
+        'timestamp': datetime.utcnow().isoformat(),
+    }
+    save_stream_settings(settings)
+    _led_on()
+    log.info("VoskCast started (device=%s, model=%s)", device, model)
+    return {"status": "started"}
+
+
+@app.post("/stop_voskcast")
+async def stop_voskcast():
+    """Stop an active VoskCast broadcast."""
+    return await _on_ble_loop(_stop_voskcast_impl())
+
+
+async def _stop_voskcast_impl() -> dict:
+    global _voskcast_task
+    was_running = False
+    if _voskcast_task is not None and not _voskcast_task.done():
+        was_running = True
+        _voskcast_task.cancel()
+        try:
+            await asyncio.wait_for(asyncio.shield(_voskcast_task), timeout=4.0)
+        except (asyncio.CancelledError, asyncio.TimeoutError, Exception):
+            pass
+        _voskcast_task = None
+        _led_off()
+        settings = load_stream_settings() or {}
+        if settings.get('audio_mode') == 'VoskCast':
+            settings['is_streaming'] = False
+            settings['voskcast_is_streaming'] = False
+            settings['timestamp'] = datetime.utcnow().isoformat()
+            save_stream_settings(settings)
+        log.info("VoskCast stopped")
+    return {"status": "stopped", "was_running": was_running}
+
+
+@app.post("/start_whispercast")
+async def start_whispercast(body: dict = {}):
+    """Start faster-whisper → TextCast. Body (optional): {"model": "tiny.en", "device": "ch1"}"""
+    return await _on_ble_loop(_start_whispercast_impl(body))
+
+
+async def _start_whispercast_impl(body: dict) -> dict:
+    global _whispercast_task
+    from auracast.faster_whisper_textcast import broadcast_whisper
+
+    model = body.get('model', 'tiny.en')
+    device = body.get('device', 'ch1')
+
+    await _stop_all()
+    await _stop_textcast_impl()
+    await _stop_voskcast_impl()
+    await _stop_whispercast_impl()
+
+    _whispercast_task = asyncio.get_event_loop().create_task(
+        broadcast_whisper(TRANSPORT1, model, device)
+    )
+
+    settings = {
+        'is_streaming': True,
+        'audio_mode': 'WhisperCast',
+        'whispercast_is_streaming': True,
+        'whispercast_device': device,
+        'whispercast_model': model,
+        'timestamp': datetime.utcnow().isoformat(),
+    }
+    save_stream_settings(settings)
+    _led_on()
+    log.info("WhisperCast started (device=%s, model=%s)", device, model)
+    return {"status": "started"}
+
+
+@app.post("/stop_whispercast")
+async def stop_whispercast():
+    """Stop an active WhisperCast broadcast."""
+    return await _on_ble_loop(_stop_whispercast_impl())
+
+
+async def _stop_whispercast_impl() -> dict:
+    global _whispercast_task
+    was_running = False
+    if _whispercast_task is not None and not _whispercast_task.done():
+        was_running = True
+        _whispercast_task.cancel()
+        try:
+            await asyncio.wait_for(asyncio.shield(_whispercast_task), timeout=5.0)
+        except (asyncio.CancelledError, asyncio.TimeoutError, Exception):
+            pass
+        _whispercast_task = None
+        _led_off()
+        settings = load_stream_settings() or {}
+        if settings.get('audio_mode') == 'WhisperCast':
+            settings['is_streaming'] = False
+            settings['whispercast_is_streaming'] = False
+            settings['timestamp'] = datetime.utcnow().isoformat()
+            save_stream_settings(settings)
+        log.info("WhisperCast stopped")
+    return {"status": "stopped", "was_running": was_running}
+
+
@app.post("/adc_gain")
 async def set_adc_gain(payload: dict):
    """Set ADC gain in dB for left and right channels without restarting the stream.
@@ -845,6 +978,12 @@ async def get_status():
    status["textcast_is_streaming"] = (
        _textcast_task is not None and not _textcast_task.done()
    )
+    status["voskcast_is_streaming"] = (
+        _voskcast_task is not None and not _voskcast_task.done()
+    )
+    status["whispercast_is_streaming"] = (
+        _whispercast_task is not None and not _whispercast_task.done()
+    )

    return status

@@ -0,0 +1,270 @@
+"""Vosk speech-to-text → TextCast streamer.
+
+Captures mono audio from an analog ALSA/sounddevice input, runs Vosk
+offline ASR in a background thread, and broadcasts recognised text over
+the TextCast BLE broadcast using the same SDU framing as text_multicast.py.
+
+Usage (CLI):
+    poetry run python -m auracast.vosk_textcast \\
+        --model /path/to/vosk-model-en-us \\
+        --device ch1 \\
+        --transport serial:/dev/ttyAMA3,1000000,rtscts
+
+Environment:
+    VOSK_MODEL_PATH   – default Vosk model directory
+    AURACAST_TRANSPORT – default HCI transport string
+"""
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import threading
+import time
+from typing import Optional
+
+import numpy as np
+import samplerate
+import sounddevice as sd
+
+from auracast import auracast_config, multicast
+from auracast.text_multicast import (
+    SDU_SIZE,
+    SDU_INTERVAL_US,
+    _make_text_frame,
+    _make_idle_frame,
+)
+
+log = logging.getLogger('vosk_textcast')
+
+VOSK_SAMPLE_RATE = 16_000   # Vosk models expect 16 kHz
+CAPTURE_SAMPLE_RATE = 48_000 # Hardware capture rate (always 48 kHz)
+BLOCK_FRAMES_48K = 4800      # 100 ms blocks at 48 kHz → 1600 frames at 16 kHz
+CAPTION_HOLD_S = 4.0         # Keep caption visible N seconds after last speech
+BROADCAST_NAME = 'LiveCaption'
+
+DEFAULT_MODEL_PATH = os.environ.get(
+    'VOSK_MODEL_PATH',
+    os.path.expanduser('~/vosk-model-en-us'),
+)
+
+
+def _tail_to_fit(text: str, max_bytes: int) -> str:
+    """Return the tail of *text* that fits in *max_bytes* UTF-8 bytes."""
+    encoded = text.encode('utf-8')
+    if len(encoded) <= max_bytes:
+        return text
+    tail = encoded[-max_bytes:].decode('utf-8', errors='ignore')
+    sp = tail.find(' ')
+    return tail[sp + 1:] if sp != -1 else tail
+
+
+def _new_words(old: str, new: str) -> str:
+    """Return the words appended to *new* beyond the shared prefix with *old*.
+
+    If *new* doesn't start with *old* (different utterance), return *new* in full.
+    """
+    old_words = old.split()
+    new_words = new.split()
+    if new_words[:len(old_words)] == old_words:
+        extra = new_words[len(old_words):]
+        return ' '.join(extra)
+    return new
+
+
+def _resolve_device(device: str) -> Optional[int]:
+    """Return sounddevice index for a name or numeric string, or None for default."""
+    if not device:
+        return None
+    if device.isdigit():
+        return int(device)
+    for i, d in enumerate(sd.query_devices()):
+        if d['name'] == device and d['max_input_channels'] > 0:
+            return i
+    log.warning("Device '%s' not found in sounddevice list – using default input", device)
+    return None
+
+
+async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None:
+    """ISO SDU write loop.
+
+    Runs at ~10 ms per iteration (flow-controlled by the BLE controller).
+    Sends the current recognised text (partial or final) as-is.
+    """
+    iso_queue = bigs['big0']['iso_queue']
+    last_sent: str = ''
+
+    while True:
+        now = time.monotonic()
+        with lock:
+            text: str = shared.get('text', '')
+            expiry: float = shared.get('expiry', 0.0)
+
+        if text and now < expiry:
+            display_text = _tail_to_fit(text, SDU_SIZE - 2)
+            if display_text != last_sent:
+                log.info("Caption: %s", display_text)
+                last_sent = display_text
+            frame = _make_text_frame(display_text)
+        else:
+            if last_sent:
+                log.info("Caption cleared")
+                last_sent = ''
+                with lock:
+                    shared['text'] = ''
+            frame = _make_idle_frame()
+
+        await iso_queue.write(frame)
+
+
+def _vosk_thread(
+    model_path: str,
+    device: str,
+    shared: dict,
+    lock: threading.Lock,
+    stop_event: threading.Event,
+) -> None:
+    """Blocking audio capture + Vosk recognition loop.  Runs in a daemon thread."""
+    try:
+        from vosk import KaldiRecognizer, Model  # type: ignore
+    except ImportError:
+        log.error("vosk is not installed. Run: poetry add vosk")
+        return
+
+    log.info("Loading Vosk model from %s …", model_path)
+    model = Model(model_path)
+    rec = KaldiRecognizer(model, VOSK_SAMPLE_RATE)
+    rec.SetMaxAlternatives(0)
+    rec.SetWords(False)
+
+    resampler = samplerate.Resampler('sinc_fastest', channels=1)
+    ratio = VOSK_SAMPLE_RATE / CAPTURE_SAMPLE_RATE
+
+    dev_idx = _resolve_device(device)
+    last_word_count = [0]  # word count of last partial sent to display
+
+    def _cb(indata: np.ndarray, frames: int, time_info, status) -> None:
+        if status:
+            log.warning("Audio status: %s", status)
+        if stop_event.is_set():
+            raise sd.CallbackStop()
+
+        # Resample 48 kHz → 16 kHz
+        mono = indata[:, 0].astype(np.float32)
+        downsampled = resampler.process(mono, ratio, end_of_input=False)
+        pcm16 = (downsampled * 32767).astype(np.int16).tobytes()
+
+        if rec.AcceptWaveform(pcm16):
+            result = json.loads(rec.Result())
+            final_text = result.get('text', '').strip()
+            if final_text:
+                log.info("Final: %s", final_text)
+                with lock:
+                    shared['text'] = _tail_to_fit(final_text, SDU_SIZE - 2)
+                    shared['expiry'] = time.monotonic() + CAPTION_HOLD_S
+                last_word_count[0] = 0  # reset for next sentence
+        else:
+            partial_text = json.loads(rec.PartialResult()).get('partial', '').strip()
+            if partial_text:
+                wc = len(partial_text.split())
+                if wc > last_word_count[0]:  # new word arrived
+                    last_word_count[0] = wc
+                    with lock:
+                        shared['text'] = _tail_to_fit(partial_text, SDU_SIZE - 2)
+                        shared['expiry'] = time.monotonic() + CAPTION_HOLD_S
+
+    try:
+        with sd.InputStream(
+            samplerate=CAPTURE_SAMPLE_RATE,
+            blocksize=BLOCK_FRAMES_48K,
+            device=dev_idx,
+            dtype='float32',
+            channels=1,
+            callback=_cb,
+        ):
+            log.info("Vosk listening on device '%s' (idx=%s) …", device, dev_idx)
+            stop_event.wait()
+    except Exception as exc:
+        log.error("Vosk audio thread error: %s", exc, exc_info=True)
+
+
+async def broadcast_vosk(
+    transport: str,
+    model_path: str = DEFAULT_MODEL_PATH,
+    device: str = 'ch1',
+) -> None:
+    """Start a Vosk STT → TextCast broadcast.  Runs until cancelled."""
+    model_path = os.path.expanduser(model_path)
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(
+            f"Vosk model not found at '{model_path}'. "
+            "Download from https://alphacephei.com/vosk/models and set VOSK_MODEL_PATH."
+        )
+
+    config = auracast_config.AuracastConfigGroup(
+        bigs=[
+            auracast_config.AuracastBigConfig(
+                name=BROADCAST_NAME,
+                program_info='Live Captions',
+                language='eng',
+                audio_source='file:dummy',
+                iso_que_len=4,
+            ),
+        ],
+        auracast_sampling_rate_hz=16000,
+        octets_per_frame=SDU_SIZE,
+        frame_duration_us=SDU_INTERVAL_US,
+        presentation_delay_us=40_000,
+        qos_config=auracast_config.AuracastQosRobust(),
+        transport=transport,
+    )
+
+    shared: dict = {'text': '', 'expiry': 0.0}
+    lock = threading.Lock()
+    stop_event = threading.Event()
+
+    async with multicast.create_device(config) as ble_device:
+        bigs = await multicast.init_broadcast(ble_device, config, config.bigs)
+
+        t = threading.Thread(
+            target=_vosk_thread,
+            args=(model_path, device, shared, lock, stop_event),
+            daemon=True,
+        )
+        t.start()
+        log.info("VoskCast started (device=%s, model=%s)", device, model_path)
+
+        try:
+            await _iso_write_loop(bigs, shared, lock)
+        except asyncio.CancelledError:
+            log.info("VoskCast cancelled – shutting down")
+            stop_event.set()
+            t.join(timeout=3.0)
+            raise
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description='Vosk STT → Auracast TextCast')
+    parser.add_argument(
+        '--model',
+        default=DEFAULT_MODEL_PATH,
+        help=f'Path to Vosk model directory (default: {DEFAULT_MODEL_PATH})',
+    )
+    parser.add_argument(
+        '--device',
+        default='ch1',
+        help='sounddevice input device name or index (default: ch1)',
+    )
+    parser.add_argument(
+        '--transport',
+        default=os.environ.get('AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'),
+        help='Bumble HCI transport string',
+    )
+    args = parser.parse_args()
+    multicast.run_async(broadcast_vosk(args.transport, args.model, args.device))
+
+
+if __name__ == '__main__':
+    main()