STT experiments, implements both vosx and whisper; whisper is too slow (cpu heavy) vosx has low acc; bigger vosx needs too much storage.

This commit is contained in:
2026-05-28 10:12:51 +02:00
parent d1471fae79
commit 53fd074d22
5 changed files with 759 additions and 3 deletions
+3 -1
View File
@@ -19,7 +19,9 @@ dependencies = [
"smbus2 (>=0.5.0,<0.6.0)",
"samplerate (>=0.2.2,<0.3.0)",
"rpi-gpio (>=0.7.1,<0.8.0)",
"pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc"
"pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc",
"vosk (>=0.3.45)",
"faster-whisper (>=1.0.0)"
]
[project.optional-dependencies]
+259
View File
@@ -0,0 +1,259 @@
"""faster-whisper speech-to-text → TextCast streamer.
Captures mono audio from an analog ALSA/sounddevice input, runs
faster-whisper offline ASR in a background thread (chunked, every
CHUNK_S seconds), and broadcasts recognised text over the TextCast BLE
broadcast using the same SDU framing as text_multicast.py.
Usage (CLI):
poetry run python -m auracast.faster_whisper_textcast \\
--model tiny.en \\
--device ch1 \\
--transport serial:/dev/ttyAMA3,1000000,rtscts
"""
from __future__ import annotations
import asyncio
import logging
import os
import queue
import threading
import time
from typing import Optional
import numpy as np
import samplerate
import sounddevice as sd
from auracast import auracast_config, multicast
from auracast.text_multicast import (
SDU_SIZE,
SDU_INTERVAL_US,
_make_text_frame,
_make_idle_frame,
)
log = logging.getLogger('faster_whisper_textcast')
CAPTURE_SAMPLE_RATE = 48_000
WHISPER_SAMPLE_RATE = 16_000
BLOCK_FRAMES_48K = 4800 # 100 ms capture blocks
CHUNK_S = 3.0 # transcribe every N seconds of audio
CAPTION_HOLD_S = 4.0 # keep caption visible after last transcription
SILENCE_RMS = 0.003 # skip transcription if chunk is below this RMS
BROADCAST_NAME = 'LiveCaption'
VALID_MODELS = ['tiny.en', 'base.en', 'small.en', 'tiny', 'base', 'small']
def _tail_to_fit(text: str, max_bytes: int) -> str:
"""Return the tail of *text* that fits in *max_bytes* UTF-8 bytes."""
encoded = text.encode('utf-8')
if len(encoded) <= max_bytes:
return text
tail = encoded[-max_bytes:].decode('utf-8', errors='ignore')
sp = tail.find(' ')
return tail[sp + 1:] if sp != -1 else tail
def _resolve_device(device: str) -> Optional[int]:
"""Return sounddevice index for a name or numeric string, or None for default."""
if not device:
return None
if device.isdigit():
return int(device)
for i, d in enumerate(sd.query_devices()):
if d['name'] == device and d['max_input_channels'] > 0:
return i
log.warning("Device '%s' not found in sounddevice list using default input", device)
return None
async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None:
"""ISO SDU write loop runs at ~10 ms per iteration."""
iso_queue = bigs['big0']['iso_queue']
last_sent: str = ''
while True:
now = time.monotonic()
with lock:
text: str = shared.get('text', '')
expiry: float = shared.get('expiry', 0.0)
if text and now < expiry:
display_text = _tail_to_fit(text, SDU_SIZE - 2)
if display_text != last_sent:
log.info("Caption: %s", display_text)
last_sent = display_text
frame = _make_text_frame(display_text)
else:
if last_sent:
log.info("Caption cleared")
last_sent = ''
with lock:
shared['text'] = ''
frame = _make_idle_frame()
await iso_queue.write(frame)
def _whisper_thread(
model_size: str,
device: str,
shared: dict,
lock: threading.Lock,
stop_event: threading.Event,
) -> None:
"""Blocking audio capture + faster-whisper transcription loop."""
try:
from faster_whisper import WhisperModel # type: ignore
except ImportError:
log.error("faster-whisper is not installed. Run: poetry add faster-whisper")
return
log.info("Loading faster-whisper model '%s' (int8, CPU) …", model_size)
model = WhisperModel(model_size, device="cpu", compute_type="int8")
log.info("Model '%s' loaded.", model_size)
audio_q: queue.Queue = queue.Queue()
resampler = samplerate.Resampler('sinc_fastest', channels=1)
ratio = WHISPER_SAMPLE_RATE / CAPTURE_SAMPLE_RATE
chunk_frames = int(CHUNK_S * WHISPER_SAMPLE_RATE)
audio_buffer = np.zeros(0, dtype=np.float32)
dev_idx = _resolve_device(device)
def _cb(indata: np.ndarray, frames: int, time_info, status) -> None:
if status:
log.warning("Audio status: %s", status)
if stop_event.is_set():
raise sd.CallbackStop()
mono = indata[:, 0].astype(np.float32)
downsampled = resampler.process(mono, ratio, end_of_input=False)
audio_q.put(downsampled.copy())
try:
with sd.InputStream(
samplerate=CAPTURE_SAMPLE_RATE,
blocksize=BLOCK_FRAMES_48K,
device=dev_idx,
dtype='float32',
channels=1,
callback=_cb,
):
log.info("WhisperCast listening on device '%s' (idx=%s) …", device, dev_idx)
while not stop_event.is_set():
try:
chunk = audio_q.get(timeout=0.2)
audio_buffer = np.concatenate([audio_buffer, chunk])
except queue.Empty:
continue
if len(audio_buffer) < chunk_frames:
continue
pcm = audio_buffer[:chunk_frames].copy()
audio_buffer = audio_buffer[chunk_frames:]
rms = float(np.sqrt(np.mean(pcm ** 2)))
if rms < SILENCE_RMS:
continue
t0 = time.monotonic()
segments, _ = model.transcribe(
pcm,
beam_size=1,
language="en",
vad_filter=True,
vad_parameters={"min_silence_duration_ms": 300},
)
text = ' '.join(s.text.strip() for s in segments).strip()
elapsed = time.monotonic() - t0
if text:
log.info("Transcribed (%.2fs): %s", elapsed, text)
with lock:
shared['text'] = text
shared['expiry'] = time.monotonic() + CAPTION_HOLD_S
else:
log.debug("Silent chunk skipped (rms=%.4f, took=%.2fs)", rms, elapsed)
except Exception as exc:
log.error("WhisperCast thread error: %s", exc, exc_info=True)
async def broadcast_whisper(
transport: str,
model_size: str = 'tiny.en',
device: str = 'ch1',
) -> None:
"""Start a faster-whisper → TextCast broadcast. Runs until cancelled."""
if model_size not in VALID_MODELS:
raise ValueError(f"Unknown model '{model_size}'. Valid: {VALID_MODELS}")
config = auracast_config.AuracastConfigGroup(
bigs=[
auracast_config.AuracastBigConfig(
name=BROADCAST_NAME,
program_info='Live Captions',
language='eng',
audio_source='file:dummy',
iso_que_len=4,
),
],
auracast_sampling_rate_hz=16000,
octets_per_frame=SDU_SIZE,
frame_duration_us=SDU_INTERVAL_US,
presentation_delay_us=40_000,
qos_config=auracast_config.AuracastQosRobust(),
transport=transport,
)
shared: dict = {'text': '', 'expiry': 0.0}
lock = threading.Lock()
stop_event = threading.Event()
async with multicast.create_device(config) as ble_device:
bigs = await multicast.init_broadcast(ble_device, config, config.bigs)
t = threading.Thread(
target=_whisper_thread,
args=(model_size, device, shared, lock, stop_event),
daemon=True,
)
t.start()
log.info("WhisperCast started (device=%s, model=%s)", device, model_size)
try:
await _iso_write_loop(bigs, shared, lock)
except asyncio.CancelledError:
log.info("WhisperCast cancelled shutting down")
stop_event.set()
t.join(timeout=5.0)
raise
def main() -> None:
global CHUNK_S
import argparse
parser = argparse.ArgumentParser(description='faster-whisper → Auracast TextCast')
parser.add_argument(
'--model', default='tiny.en', choices=VALID_MODELS,
help='Whisper model size (default: tiny.en)',
)
parser.add_argument('--device', default='ch1',
help='sounddevice input name or index (default: ch1)')
parser.add_argument(
'--transport',
default=os.environ.get('AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'),
help='Bumble HCI transport string',
)
parser.add_argument('--chunk', type=float, default=CHUNK_S,
help=f'Seconds per transcription chunk (default: {CHUNK_S})')
args = parser.parse_args()
CHUNK_S = args.chunk
multicast.run_async(broadcast_whisper(args.transport, args.model, args.device))
if __name__ == '__main__':
main()
+88 -2
View File
@@ -142,6 +142,8 @@ except Exception:
# Define is_streaming early from the fetched status for use throughout the UI
is_streaming = bool(saved_settings.get("is_streaming", False))
textcast_is_streaming = bool(saved_settings.get("textcast_is_streaming", False))
voskcast_is_streaming = bool(saved_settings.get("voskcast_is_streaming", False))
whispercast_is_streaming = bool(saved_settings.get("whispercast_is_streaming", False))
# Extract secondary status, if provided by the backend /status endpoint.
secondary_status = saved_settings.get("secondary") or {}
@@ -187,6 +189,8 @@ options = [
"Analog",
"Network - Dante",
"TextCast",
"VoskCast",
"WhisperCast",
]
saved_audio_mode = saved_settings.get("audio_mode", "Demo")
if saved_audio_mode not in options:
@@ -198,7 +202,7 @@ audio_mode = st.selectbox(
"Audio Mode",
options,
index=options.index(saved_audio_mode) if saved_audio_mode in options else options.index("Demo"),
disabled=is_streaming or textcast_is_streaming,
disabled=is_streaming or textcast_is_streaming or voskcast_is_streaming or whispercast_is_streaming,
help=(
"Select the audio input source. Choose 'USB' for a connected USB audio device (via PipeWire), "
"'Network' (AES67) for network RTP/AES67 sources, "
@@ -230,6 +234,8 @@ else:
# Start/Stop buttons and status (moved to top)
if audio_mode == "TextCast":
start_stream, stop_stream = render_stream_controls(textcast_is_streaming, "Start TextCast", "Stop TextCast", "TextCast", False)
elif audio_mode == "VoskCast":
start_stream, stop_stream = render_stream_controls(voskcast_is_streaming, "Start VoskCast", "Stop VoskCast", "VoskCast", False)
elif audio_mode == "Demo":
start_stream, stop_stream = render_stream_controls(is_streaming, "Start Demo", "Stop Demo", running_mode, secondary_is_streaming)
else:
@@ -266,6 +272,54 @@ if audio_mode == "TextCast":
except Exception as _e:
st.error(f"Could not load sample: {_e}")
# WhisperCast: model size + input device
if audio_mode == "WhisperCast":
st.markdown("#### Live Speech Recognition (faster-whisper)")
_whisper_default_model = saved_settings.get("whispercast_model", "tiny.en")
_whisper_default_device = saved_settings.get("whispercast_device", "ch1")
col_wm, col_wd = st.columns([2, 1])
with col_wm:
whisper_model_size = st.selectbox(
"Whisper Model",
["tiny.en", "base.en", "small.en"],
index=["tiny.en", "base.en", "small.en"].index(_whisper_default_model)
if _whisper_default_model in ["tiny.en", "base.en", "small.en"] else 0,
disabled=whispercast_is_streaming,
help="tiny.en (~39 MB, ~3-5s latency), base.en (~74 MB, ~5-8s latency)",
)
with col_wd:
whisper_device = st.selectbox(
"Input",
["ch1", "ch2"],
index=0 if _whisper_default_device == "ch1" else 1,
disabled=whispercast_is_streaming,
help="Analog input channel",
)
st.caption("Model downloads automatically on first use. Each sentence appears after ~3s of speech.")
# VoskCast: model path + input device
if audio_mode == "VoskCast":
st.markdown("#### Live Speech Recognition (Vosk)")
_vosk_default_model = saved_settings.get("voskcast_model") or os.environ.get("VOSK_MODEL_PATH", "~/vosk-model-en-us")
_vosk_default_device = saved_settings.get("voskcast_device", "ch1")
col_model, col_dev = st.columns([3, 1])
with col_model:
vosk_model_path = st.text_input(
"Vosk Model Path",
value=_vosk_default_model,
disabled=voskcast_is_streaming,
help="Local path to the Vosk model directory. Download from https://alphacephei.com/vosk/models",
)
with col_dev:
vosk_device = st.selectbox(
"Input",
["ch1", "ch2"],
index=0 if _vosk_default_device == "ch1" else 1,
disabled=voskcast_is_streaming,
help="Analog input channel (ch1 = left, ch2 = right)",
)
st.caption("Partial results appear immediately; final results are held for 4 s then cleared.")
# Analog gain control (only for Analog mode, placed below start button)
analog_gain_db_left = 0 # default (dB)
analog_gain_db_right = 0 # default (dB)
@@ -1830,6 +1884,10 @@ if stop_stream:
try:
if audio_mode == "TextCast":
r = requests.post(f"{BACKEND_URL}/stop_textcast").json()
elif audio_mode == "VoskCast":
r = requests.post(f"{BACKEND_URL}/stop_voskcast").json()
elif audio_mode == "WhisperCast":
r = requests.post(f"{BACKEND_URL}/stop_whispercast").json()
else:
r = requests.post(f"{BACKEND_URL}/stop_audio").json()
if audio_mode == "Demo":
@@ -1860,6 +1918,34 @@ if start_stream:
except Exception as e:
st.error(f"Error: {e}")
elif audio_mode == "VoskCast":
try:
rs = requests.post(
f"{BACKEND_URL}/start_voskcast",
json={"model": vosk_model_path, "device": vosk_device},
)
if rs.ok:
st.success("VoskCast started.")
st.rerun()
else:
st.error(f"Start failed: {rs.text}")
except Exception as e:
st.error(f"Error: {e}")
elif audio_mode == "WhisperCast":
try:
rs = requests.post(
f"{BACKEND_URL}/start_whispercast",
json={"model": whisper_model_size, "device": whisper_device},
)
if rs.ok:
st.success("WhisperCast started.")
st.rerun()
else:
st.error(f"Start failed: {rs.text}")
except Exception as e:
st.error(f"Error: {e}")
else:
# Always send stop to ensure backend is in a clean state, regardless of current status
@@ -2084,7 +2170,7 @@ if start_stream:
st.error(f"Failed to initialize Dante Radio 2: {r2.text}")
except Exception as e:
st.error(f"Error while starting Dante radios: {e}")
if audio_mode not in ("Demo", "Analog", "Network - Dante"):
if audio_mode not in ("Demo", "Analog", "Network - Dante", "VoskCast", "WhisperCast", "TextCast"):
# USB/Network: single config as before, using shared controls
q = QUALITY_MAP[quality]
config = auracast_config.AuracastConfigGroup(
+139
View File
@@ -213,6 +213,12 @@ _stream_lock = asyncio.Lock() # serialize initialize/stop_audio on API side
_textcast_task: asyncio.Task | None = None
DCP_UPLOAD_PATH = os.path.join(os.path.dirname(__file__), 'uploaded_subtitles.xml')
# VoskCast state
_voskcast_task: asyncio.Task | None = None
# WhisperCast state
_whispercast_task: asyncio.Task | None = None
# BLE / audio event loop set in __main__ before uvicorn starts.
# All coroutines that touch Bumble objects or the audio pipeline MUST run
# on this loop. HTTP handlers call _on_ble_loop() to cross into it.
@@ -784,6 +790,133 @@ async def _stop_textcast_impl():
return {"status": "stopped", "was_running": was_running}
@app.post("/start_voskcast")
async def start_voskcast(body: dict = {}):
"""Start Vosk STT → TextCast. Body (optional): {"model": "...", "device": "ch1"}"""
return await _on_ble_loop(_start_voskcast_impl(body))
async def _start_voskcast_impl(body: dict) -> dict:
global _voskcast_task
from auracast.vosk_textcast import broadcast_vosk, DEFAULT_MODEL_PATH
model = body.get('model') or DEFAULT_MODEL_PATH
device = body.get('device', 'ch1')
await _stop_all()
await _stop_textcast_impl()
await _stop_voskcast_impl()
_voskcast_task = asyncio.get_event_loop().create_task(
broadcast_vosk(TRANSPORT1, model, device)
)
settings = {
'is_streaming': True,
'audio_mode': 'VoskCast',
'voskcast_is_streaming': True,
'voskcast_device': device,
'voskcast_model': model,
'timestamp': datetime.utcnow().isoformat(),
}
save_stream_settings(settings)
_led_on()
log.info("VoskCast started (device=%s, model=%s)", device, model)
return {"status": "started"}
@app.post("/stop_voskcast")
async def stop_voskcast():
"""Stop an active VoskCast broadcast."""
return await _on_ble_loop(_stop_voskcast_impl())
async def _stop_voskcast_impl() -> dict:
global _voskcast_task
was_running = False
if _voskcast_task is not None and not _voskcast_task.done():
was_running = True
_voskcast_task.cancel()
try:
await asyncio.wait_for(asyncio.shield(_voskcast_task), timeout=4.0)
except (asyncio.CancelledError, asyncio.TimeoutError, Exception):
pass
_voskcast_task = None
_led_off()
settings = load_stream_settings() or {}
if settings.get('audio_mode') == 'VoskCast':
settings['is_streaming'] = False
settings['voskcast_is_streaming'] = False
settings['timestamp'] = datetime.utcnow().isoformat()
save_stream_settings(settings)
log.info("VoskCast stopped")
return {"status": "stopped", "was_running": was_running}
@app.post("/start_whispercast")
async def start_whispercast(body: dict = {}):
"""Start faster-whisper → TextCast. Body (optional): {"model": "tiny.en", "device": "ch1"}"""
return await _on_ble_loop(_start_whispercast_impl(body))
async def _start_whispercast_impl(body: dict) -> dict:
global _whispercast_task
from auracast.faster_whisper_textcast import broadcast_whisper
model = body.get('model', 'tiny.en')
device = body.get('device', 'ch1')
await _stop_all()
await _stop_textcast_impl()
await _stop_voskcast_impl()
await _stop_whispercast_impl()
_whispercast_task = asyncio.get_event_loop().create_task(
broadcast_whisper(TRANSPORT1, model, device)
)
settings = {
'is_streaming': True,
'audio_mode': 'WhisperCast',
'whispercast_is_streaming': True,
'whispercast_device': device,
'whispercast_model': model,
'timestamp': datetime.utcnow().isoformat(),
}
save_stream_settings(settings)
_led_on()
log.info("WhisperCast started (device=%s, model=%s)", device, model)
return {"status": "started"}
@app.post("/stop_whispercast")
async def stop_whispercast():
"""Stop an active WhisperCast broadcast."""
return await _on_ble_loop(_stop_whispercast_impl())
async def _stop_whispercast_impl() -> dict:
global _whispercast_task
was_running = False
if _whispercast_task is not None and not _whispercast_task.done():
was_running = True
_whispercast_task.cancel()
try:
await asyncio.wait_for(asyncio.shield(_whispercast_task), timeout=5.0)
except (asyncio.CancelledError, asyncio.TimeoutError, Exception):
pass
_whispercast_task = None
_led_off()
settings = load_stream_settings() or {}
if settings.get('audio_mode') == 'WhisperCast':
settings['is_streaming'] = False
settings['whispercast_is_streaming'] = False
settings['timestamp'] = datetime.utcnow().isoformat()
save_stream_settings(settings)
log.info("WhisperCast stopped")
return {"status": "stopped", "was_running": was_running}
@app.post("/adc_gain")
async def set_adc_gain(payload: dict):
"""Set ADC gain in dB for left and right channels without restarting the stream.
@@ -845,6 +978,12 @@ async def get_status():
status["textcast_is_streaming"] = (
_textcast_task is not None and not _textcast_task.done()
)
status["voskcast_is_streaming"] = (
_voskcast_task is not None and not _voskcast_task.done()
)
status["whispercast_is_streaming"] = (
_whispercast_task is not None and not _whispercast_task.done()
)
return status
+270
View File
@@ -0,0 +1,270 @@
"""Vosk speech-to-text → TextCast streamer.
Captures mono audio from an analog ALSA/sounddevice input, runs Vosk
offline ASR in a background thread, and broadcasts recognised text over
the TextCast BLE broadcast using the same SDU framing as text_multicast.py.
Usage (CLI):
poetry run python -m auracast.vosk_textcast \\
--model /path/to/vosk-model-en-us \\
--device ch1 \\
--transport serial:/dev/ttyAMA3,1000000,rtscts
Environment:
VOSK_MODEL_PATH default Vosk model directory
AURACAST_TRANSPORT default HCI transport string
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import os
import threading
import time
from typing import Optional
import numpy as np
import samplerate
import sounddevice as sd
from auracast import auracast_config, multicast
from auracast.text_multicast import (
SDU_SIZE,
SDU_INTERVAL_US,
_make_text_frame,
_make_idle_frame,
)
log = logging.getLogger('vosk_textcast')
VOSK_SAMPLE_RATE = 16_000 # Vosk models expect 16 kHz
CAPTURE_SAMPLE_RATE = 48_000 # Hardware capture rate (always 48 kHz)
BLOCK_FRAMES_48K = 4800 # 100 ms blocks at 48 kHz → 1600 frames at 16 kHz
CAPTION_HOLD_S = 4.0 # Keep caption visible N seconds after last speech
BROADCAST_NAME = 'LiveCaption'
DEFAULT_MODEL_PATH = os.environ.get(
'VOSK_MODEL_PATH',
os.path.expanduser('~/vosk-model-en-us'),
)
def _tail_to_fit(text: str, max_bytes: int) -> str:
"""Return the tail of *text* that fits in *max_bytes* UTF-8 bytes."""
encoded = text.encode('utf-8')
if len(encoded) <= max_bytes:
return text
tail = encoded[-max_bytes:].decode('utf-8', errors='ignore')
sp = tail.find(' ')
return tail[sp + 1:] if sp != -1 else tail
def _new_words(old: str, new: str) -> str:
"""Return the words appended to *new* beyond the shared prefix with *old*.
If *new* doesn't start with *old* (different utterance), return *new* in full.
"""
old_words = old.split()
new_words = new.split()
if new_words[:len(old_words)] == old_words:
extra = new_words[len(old_words):]
return ' '.join(extra)
return new
def _resolve_device(device: str) -> Optional[int]:
"""Return sounddevice index for a name or numeric string, or None for default."""
if not device:
return None
if device.isdigit():
return int(device)
for i, d in enumerate(sd.query_devices()):
if d['name'] == device and d['max_input_channels'] > 0:
return i
log.warning("Device '%s' not found in sounddevice list using default input", device)
return None
async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None:
"""ISO SDU write loop.
Runs at ~10 ms per iteration (flow-controlled by the BLE controller).
Sends the current recognised text (partial or final) as-is.
"""
iso_queue = bigs['big0']['iso_queue']
last_sent: str = ''
while True:
now = time.monotonic()
with lock:
text: str = shared.get('text', '')
expiry: float = shared.get('expiry', 0.0)
if text and now < expiry:
display_text = _tail_to_fit(text, SDU_SIZE - 2)
if display_text != last_sent:
log.info("Caption: %s", display_text)
last_sent = display_text
frame = _make_text_frame(display_text)
else:
if last_sent:
log.info("Caption cleared")
last_sent = ''
with lock:
shared['text'] = ''
frame = _make_idle_frame()
await iso_queue.write(frame)
def _vosk_thread(
model_path: str,
device: str,
shared: dict,
lock: threading.Lock,
stop_event: threading.Event,
) -> None:
"""Blocking audio capture + Vosk recognition loop. Runs in a daemon thread."""
try:
from vosk import KaldiRecognizer, Model # type: ignore
except ImportError:
log.error("vosk is not installed. Run: poetry add vosk")
return
log.info("Loading Vosk model from %s", model_path)
model = Model(model_path)
rec = KaldiRecognizer(model, VOSK_SAMPLE_RATE)
rec.SetMaxAlternatives(0)
rec.SetWords(False)
resampler = samplerate.Resampler('sinc_fastest', channels=1)
ratio = VOSK_SAMPLE_RATE / CAPTURE_SAMPLE_RATE
dev_idx = _resolve_device(device)
last_word_count = [0] # word count of last partial sent to display
def _cb(indata: np.ndarray, frames: int, time_info, status) -> None:
if status:
log.warning("Audio status: %s", status)
if stop_event.is_set():
raise sd.CallbackStop()
# Resample 48 kHz → 16 kHz
mono = indata[:, 0].astype(np.float32)
downsampled = resampler.process(mono, ratio, end_of_input=False)
pcm16 = (downsampled * 32767).astype(np.int16).tobytes()
if rec.AcceptWaveform(pcm16):
result = json.loads(rec.Result())
final_text = result.get('text', '').strip()
if final_text:
log.info("Final: %s", final_text)
with lock:
shared['text'] = _tail_to_fit(final_text, SDU_SIZE - 2)
shared['expiry'] = time.monotonic() + CAPTION_HOLD_S
last_word_count[0] = 0 # reset for next sentence
else:
partial_text = json.loads(rec.PartialResult()).get('partial', '').strip()
if partial_text:
wc = len(partial_text.split())
if wc > last_word_count[0]: # new word arrived
last_word_count[0] = wc
with lock:
shared['text'] = _tail_to_fit(partial_text, SDU_SIZE - 2)
shared['expiry'] = time.monotonic() + CAPTION_HOLD_S
try:
with sd.InputStream(
samplerate=CAPTURE_SAMPLE_RATE,
blocksize=BLOCK_FRAMES_48K,
device=dev_idx,
dtype='float32',
channels=1,
callback=_cb,
):
log.info("Vosk listening on device '%s' (idx=%s) …", device, dev_idx)
stop_event.wait()
except Exception as exc:
log.error("Vosk audio thread error: %s", exc, exc_info=True)
async def broadcast_vosk(
transport: str,
model_path: str = DEFAULT_MODEL_PATH,
device: str = 'ch1',
) -> None:
"""Start a Vosk STT → TextCast broadcast. Runs until cancelled."""
model_path = os.path.expanduser(model_path)
if not os.path.exists(model_path):
raise FileNotFoundError(
f"Vosk model not found at '{model_path}'. "
"Download from https://alphacephei.com/vosk/models and set VOSK_MODEL_PATH."
)
config = auracast_config.AuracastConfigGroup(
bigs=[
auracast_config.AuracastBigConfig(
name=BROADCAST_NAME,
program_info='Live Captions',
language='eng',
audio_source='file:dummy',
iso_que_len=4,
),
],
auracast_sampling_rate_hz=16000,
octets_per_frame=SDU_SIZE,
frame_duration_us=SDU_INTERVAL_US,
presentation_delay_us=40_000,
qos_config=auracast_config.AuracastQosRobust(),
transport=transport,
)
shared: dict = {'text': '', 'expiry': 0.0}
lock = threading.Lock()
stop_event = threading.Event()
async with multicast.create_device(config) as ble_device:
bigs = await multicast.init_broadcast(ble_device, config, config.bigs)
t = threading.Thread(
target=_vosk_thread,
args=(model_path, device, shared, lock, stop_event),
daemon=True,
)
t.start()
log.info("VoskCast started (device=%s, model=%s)", device, model_path)
try:
await _iso_write_loop(bigs, shared, lock)
except asyncio.CancelledError:
log.info("VoskCast cancelled shutting down")
stop_event.set()
t.join(timeout=3.0)
raise
def main() -> None:
parser = argparse.ArgumentParser(description='Vosk STT → Auracast TextCast')
parser.add_argument(
'--model',
default=DEFAULT_MODEL_PATH,
help=f'Path to Vosk model directory (default: {DEFAULT_MODEL_PATH})',
)
parser.add_argument(
'--device',
default='ch1',
help='sounddevice input device name or index (default: ch1)',
)
parser.add_argument(
'--transport',
default=os.environ.get('AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'),
help='Bumble HCI transport string',
)
args = parser.parse_args()
multicast.run_async(broadcast_vosk(args.transport, args.model, args.device))
if __name__ == '__main__':
main()