STT experiments, implements both vosx and whisper; whisper is too slow (cpu heavy) vosx has low acc; bigger vosx needs too much storage.
This commit is contained in:
+3
-1
@@ -19,7 +19,9 @@ dependencies = [
|
||||
"smbus2 (>=0.5.0,<0.6.0)",
|
||||
"samplerate (>=0.2.2,<0.3.0)",
|
||||
"rpi-gpio (>=0.7.1,<0.8.0)",
|
||||
"pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc"
|
||||
"pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc",
|
||||
"vosk (>=0.3.45)",
|
||||
"faster-whisper (>=1.0.0)"
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -0,0 +1,259 @@
|
||||
"""faster-whisper speech-to-text → TextCast streamer.
|
||||
|
||||
Captures mono audio from an analog ALSA/sounddevice input, runs
|
||||
faster-whisper offline ASR in a background thread (chunked, every
|
||||
CHUNK_S seconds), and broadcasts recognised text over the TextCast BLE
|
||||
broadcast using the same SDU framing as text_multicast.py.
|
||||
|
||||
Usage (CLI):
|
||||
poetry run python -m auracast.faster_whisper_textcast \\
|
||||
--model tiny.en \\
|
||||
--device ch1 \\
|
||||
--transport serial:/dev/ttyAMA3,1000000,rtscts
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import samplerate
|
||||
import sounddevice as sd
|
||||
|
||||
from auracast import auracast_config, multicast
|
||||
from auracast.text_multicast import (
|
||||
SDU_SIZE,
|
||||
SDU_INTERVAL_US,
|
||||
_make_text_frame,
|
||||
_make_idle_frame,
|
||||
)
|
||||
|
||||
log = logging.getLogger('faster_whisper_textcast')
|
||||
|
||||
CAPTURE_SAMPLE_RATE = 48_000
|
||||
WHISPER_SAMPLE_RATE = 16_000
|
||||
BLOCK_FRAMES_48K = 4800 # 100 ms capture blocks
|
||||
CHUNK_S = 3.0 # transcribe every N seconds of audio
|
||||
CAPTION_HOLD_S = 4.0 # keep caption visible after last transcription
|
||||
SILENCE_RMS = 0.003 # skip transcription if chunk is below this RMS
|
||||
BROADCAST_NAME = 'LiveCaption'
|
||||
|
||||
VALID_MODELS = ['tiny.en', 'base.en', 'small.en', 'tiny', 'base', 'small']
|
||||
|
||||
|
||||
def _tail_to_fit(text: str, max_bytes: int) -> str:
|
||||
"""Return the tail of *text* that fits in *max_bytes* UTF-8 bytes."""
|
||||
encoded = text.encode('utf-8')
|
||||
if len(encoded) <= max_bytes:
|
||||
return text
|
||||
tail = encoded[-max_bytes:].decode('utf-8', errors='ignore')
|
||||
sp = tail.find(' ')
|
||||
return tail[sp + 1:] if sp != -1 else tail
|
||||
|
||||
|
||||
def _resolve_device(device: str) -> Optional[int]:
|
||||
"""Return sounddevice index for a name or numeric string, or None for default."""
|
||||
if not device:
|
||||
return None
|
||||
if device.isdigit():
|
||||
return int(device)
|
||||
for i, d in enumerate(sd.query_devices()):
|
||||
if d['name'] == device and d['max_input_channels'] > 0:
|
||||
return i
|
||||
log.warning("Device '%s' not found in sounddevice list – using default input", device)
|
||||
return None
|
||||
|
||||
|
||||
async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None:
|
||||
"""ISO SDU write loop – runs at ~10 ms per iteration."""
|
||||
iso_queue = bigs['big0']['iso_queue']
|
||||
last_sent: str = ''
|
||||
|
||||
while True:
|
||||
now = time.monotonic()
|
||||
with lock:
|
||||
text: str = shared.get('text', '')
|
||||
expiry: float = shared.get('expiry', 0.0)
|
||||
|
||||
if text and now < expiry:
|
||||
display_text = _tail_to_fit(text, SDU_SIZE - 2)
|
||||
if display_text != last_sent:
|
||||
log.info("Caption: %s", display_text)
|
||||
last_sent = display_text
|
||||
frame = _make_text_frame(display_text)
|
||||
else:
|
||||
if last_sent:
|
||||
log.info("Caption cleared")
|
||||
last_sent = ''
|
||||
with lock:
|
||||
shared['text'] = ''
|
||||
frame = _make_idle_frame()
|
||||
|
||||
await iso_queue.write(frame)
|
||||
|
||||
|
||||
def _whisper_thread(
|
||||
model_size: str,
|
||||
device: str,
|
||||
shared: dict,
|
||||
lock: threading.Lock,
|
||||
stop_event: threading.Event,
|
||||
) -> None:
|
||||
"""Blocking audio capture + faster-whisper transcription loop."""
|
||||
try:
|
||||
from faster_whisper import WhisperModel # type: ignore
|
||||
except ImportError:
|
||||
log.error("faster-whisper is not installed. Run: poetry add faster-whisper")
|
||||
return
|
||||
|
||||
log.info("Loading faster-whisper model '%s' (int8, CPU) …", model_size)
|
||||
model = WhisperModel(model_size, device="cpu", compute_type="int8")
|
||||
log.info("Model '%s' loaded.", model_size)
|
||||
|
||||
audio_q: queue.Queue = queue.Queue()
|
||||
resampler = samplerate.Resampler('sinc_fastest', channels=1)
|
||||
ratio = WHISPER_SAMPLE_RATE / CAPTURE_SAMPLE_RATE
|
||||
chunk_frames = int(CHUNK_S * WHISPER_SAMPLE_RATE)
|
||||
audio_buffer = np.zeros(0, dtype=np.float32)
|
||||
|
||||
dev_idx = _resolve_device(device)
|
||||
|
||||
def _cb(indata: np.ndarray, frames: int, time_info, status) -> None:
|
||||
if status:
|
||||
log.warning("Audio status: %s", status)
|
||||
if stop_event.is_set():
|
||||
raise sd.CallbackStop()
|
||||
mono = indata[:, 0].astype(np.float32)
|
||||
downsampled = resampler.process(mono, ratio, end_of_input=False)
|
||||
audio_q.put(downsampled.copy())
|
||||
|
||||
try:
|
||||
with sd.InputStream(
|
||||
samplerate=CAPTURE_SAMPLE_RATE,
|
||||
blocksize=BLOCK_FRAMES_48K,
|
||||
device=dev_idx,
|
||||
dtype='float32',
|
||||
channels=1,
|
||||
callback=_cb,
|
||||
):
|
||||
log.info("WhisperCast listening on device '%s' (idx=%s) …", device, dev_idx)
|
||||
while not stop_event.is_set():
|
||||
try:
|
||||
chunk = audio_q.get(timeout=0.2)
|
||||
audio_buffer = np.concatenate([audio_buffer, chunk])
|
||||
except queue.Empty:
|
||||
continue
|
||||
|
||||
if len(audio_buffer) < chunk_frames:
|
||||
continue
|
||||
|
||||
pcm = audio_buffer[:chunk_frames].copy()
|
||||
audio_buffer = audio_buffer[chunk_frames:]
|
||||
|
||||
rms = float(np.sqrt(np.mean(pcm ** 2)))
|
||||
if rms < SILENCE_RMS:
|
||||
continue
|
||||
|
||||
t0 = time.monotonic()
|
||||
segments, _ = model.transcribe(
|
||||
pcm,
|
||||
beam_size=1,
|
||||
language="en",
|
||||
vad_filter=True,
|
||||
vad_parameters={"min_silence_duration_ms": 300},
|
||||
)
|
||||
text = ' '.join(s.text.strip() for s in segments).strip()
|
||||
elapsed = time.monotonic() - t0
|
||||
|
||||
if text:
|
||||
log.info("Transcribed (%.2fs): %s", elapsed, text)
|
||||
with lock:
|
||||
shared['text'] = text
|
||||
shared['expiry'] = time.monotonic() + CAPTION_HOLD_S
|
||||
else:
|
||||
log.debug("Silent chunk skipped (rms=%.4f, took=%.2fs)", rms, elapsed)
|
||||
|
||||
except Exception as exc:
|
||||
log.error("WhisperCast thread error: %s", exc, exc_info=True)
|
||||
|
||||
|
||||
async def broadcast_whisper(
|
||||
transport: str,
|
||||
model_size: str = 'tiny.en',
|
||||
device: str = 'ch1',
|
||||
) -> None:
|
||||
"""Start a faster-whisper → TextCast broadcast. Runs until cancelled."""
|
||||
if model_size not in VALID_MODELS:
|
||||
raise ValueError(f"Unknown model '{model_size}'. Valid: {VALID_MODELS}")
|
||||
|
||||
config = auracast_config.AuracastConfigGroup(
|
||||
bigs=[
|
||||
auracast_config.AuracastBigConfig(
|
||||
name=BROADCAST_NAME,
|
||||
program_info='Live Captions',
|
||||
language='eng',
|
||||
audio_source='file:dummy',
|
||||
iso_que_len=4,
|
||||
),
|
||||
],
|
||||
auracast_sampling_rate_hz=16000,
|
||||
octets_per_frame=SDU_SIZE,
|
||||
frame_duration_us=SDU_INTERVAL_US,
|
||||
presentation_delay_us=40_000,
|
||||
qos_config=auracast_config.AuracastQosRobust(),
|
||||
transport=transport,
|
||||
)
|
||||
|
||||
shared: dict = {'text': '', 'expiry': 0.0}
|
||||
lock = threading.Lock()
|
||||
stop_event = threading.Event()
|
||||
|
||||
async with multicast.create_device(config) as ble_device:
|
||||
bigs = await multicast.init_broadcast(ble_device, config, config.bigs)
|
||||
|
||||
t = threading.Thread(
|
||||
target=_whisper_thread,
|
||||
args=(model_size, device, shared, lock, stop_event),
|
||||
daemon=True,
|
||||
)
|
||||
t.start()
|
||||
log.info("WhisperCast started (device=%s, model=%s)", device, model_size)
|
||||
|
||||
try:
|
||||
await _iso_write_loop(bigs, shared, lock)
|
||||
except asyncio.CancelledError:
|
||||
log.info("WhisperCast cancelled – shutting down")
|
||||
stop_event.set()
|
||||
t.join(timeout=5.0)
|
||||
raise
|
||||
|
||||
|
||||
def main() -> None:
|
||||
global CHUNK_S
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description='faster-whisper → Auracast TextCast')
|
||||
parser.add_argument(
|
||||
'--model', default='tiny.en', choices=VALID_MODELS,
|
||||
help='Whisper model size (default: tiny.en)',
|
||||
)
|
||||
parser.add_argument('--device', default='ch1',
|
||||
help='sounddevice input name or index (default: ch1)')
|
||||
parser.add_argument(
|
||||
'--transport',
|
||||
default=os.environ.get('AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'),
|
||||
help='Bumble HCI transport string',
|
||||
)
|
||||
parser.add_argument('--chunk', type=float, default=CHUNK_S,
|
||||
help=f'Seconds per transcription chunk (default: {CHUNK_S})')
|
||||
args = parser.parse_args()
|
||||
CHUNK_S = args.chunk
|
||||
multicast.run_async(broadcast_whisper(args.transport, args.model, args.device))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -142,6 +142,8 @@ except Exception:
|
||||
# Define is_streaming early from the fetched status for use throughout the UI
|
||||
is_streaming = bool(saved_settings.get("is_streaming", False))
|
||||
textcast_is_streaming = bool(saved_settings.get("textcast_is_streaming", False))
|
||||
voskcast_is_streaming = bool(saved_settings.get("voskcast_is_streaming", False))
|
||||
whispercast_is_streaming = bool(saved_settings.get("whispercast_is_streaming", False))
|
||||
|
||||
# Extract secondary status, if provided by the backend /status endpoint.
|
||||
secondary_status = saved_settings.get("secondary") or {}
|
||||
@@ -187,6 +189,8 @@ options = [
|
||||
"Analog",
|
||||
"Network - Dante",
|
||||
"TextCast",
|
||||
"VoskCast",
|
||||
"WhisperCast",
|
||||
]
|
||||
saved_audio_mode = saved_settings.get("audio_mode", "Demo")
|
||||
if saved_audio_mode not in options:
|
||||
@@ -198,7 +202,7 @@ audio_mode = st.selectbox(
|
||||
"Audio Mode",
|
||||
options,
|
||||
index=options.index(saved_audio_mode) if saved_audio_mode in options else options.index("Demo"),
|
||||
disabled=is_streaming or textcast_is_streaming,
|
||||
disabled=is_streaming or textcast_is_streaming or voskcast_is_streaming or whispercast_is_streaming,
|
||||
help=(
|
||||
"Select the audio input source. Choose 'USB' for a connected USB audio device (via PipeWire), "
|
||||
"'Network' (AES67) for network RTP/AES67 sources, "
|
||||
@@ -230,6 +234,8 @@ else:
|
||||
# Start/Stop buttons and status (moved to top)
|
||||
if audio_mode == "TextCast":
|
||||
start_stream, stop_stream = render_stream_controls(textcast_is_streaming, "Start TextCast", "Stop TextCast", "TextCast", False)
|
||||
elif audio_mode == "VoskCast":
|
||||
start_stream, stop_stream = render_stream_controls(voskcast_is_streaming, "Start VoskCast", "Stop VoskCast", "VoskCast", False)
|
||||
elif audio_mode == "Demo":
|
||||
start_stream, stop_stream = render_stream_controls(is_streaming, "Start Demo", "Stop Demo", running_mode, secondary_is_streaming)
|
||||
else:
|
||||
@@ -266,6 +272,54 @@ if audio_mode == "TextCast":
|
||||
except Exception as _e:
|
||||
st.error(f"Could not load sample: {_e}")
|
||||
|
||||
# WhisperCast: model size + input device
|
||||
if audio_mode == "WhisperCast":
|
||||
st.markdown("#### Live Speech Recognition (faster-whisper)")
|
||||
_whisper_default_model = saved_settings.get("whispercast_model", "tiny.en")
|
||||
_whisper_default_device = saved_settings.get("whispercast_device", "ch1")
|
||||
col_wm, col_wd = st.columns([2, 1])
|
||||
with col_wm:
|
||||
whisper_model_size = st.selectbox(
|
||||
"Whisper Model",
|
||||
["tiny.en", "base.en", "small.en"],
|
||||
index=["tiny.en", "base.en", "small.en"].index(_whisper_default_model)
|
||||
if _whisper_default_model in ["tiny.en", "base.en", "small.en"] else 0,
|
||||
disabled=whispercast_is_streaming,
|
||||
help="tiny.en (~39 MB, ~3-5s latency), base.en (~74 MB, ~5-8s latency)",
|
||||
)
|
||||
with col_wd:
|
||||
whisper_device = st.selectbox(
|
||||
"Input",
|
||||
["ch1", "ch2"],
|
||||
index=0 if _whisper_default_device == "ch1" else 1,
|
||||
disabled=whispercast_is_streaming,
|
||||
help="Analog input channel",
|
||||
)
|
||||
st.caption("Model downloads automatically on first use. Each sentence appears after ~3s of speech.")
|
||||
|
||||
# VoskCast: model path + input device
|
||||
if audio_mode == "VoskCast":
|
||||
st.markdown("#### Live Speech Recognition (Vosk)")
|
||||
_vosk_default_model = saved_settings.get("voskcast_model") or os.environ.get("VOSK_MODEL_PATH", "~/vosk-model-en-us")
|
||||
_vosk_default_device = saved_settings.get("voskcast_device", "ch1")
|
||||
col_model, col_dev = st.columns([3, 1])
|
||||
with col_model:
|
||||
vosk_model_path = st.text_input(
|
||||
"Vosk Model Path",
|
||||
value=_vosk_default_model,
|
||||
disabled=voskcast_is_streaming,
|
||||
help="Local path to the Vosk model directory. Download from https://alphacephei.com/vosk/models",
|
||||
)
|
||||
with col_dev:
|
||||
vosk_device = st.selectbox(
|
||||
"Input",
|
||||
["ch1", "ch2"],
|
||||
index=0 if _vosk_default_device == "ch1" else 1,
|
||||
disabled=voskcast_is_streaming,
|
||||
help="Analog input channel (ch1 = left, ch2 = right)",
|
||||
)
|
||||
st.caption("Partial results appear immediately; final results are held for 4 s then cleared.")
|
||||
|
||||
# Analog gain control (only for Analog mode, placed below start button)
|
||||
analog_gain_db_left = 0 # default (dB)
|
||||
analog_gain_db_right = 0 # default (dB)
|
||||
@@ -1830,6 +1884,10 @@ if stop_stream:
|
||||
try:
|
||||
if audio_mode == "TextCast":
|
||||
r = requests.post(f"{BACKEND_URL}/stop_textcast").json()
|
||||
elif audio_mode == "VoskCast":
|
||||
r = requests.post(f"{BACKEND_URL}/stop_voskcast").json()
|
||||
elif audio_mode == "WhisperCast":
|
||||
r = requests.post(f"{BACKEND_URL}/stop_whispercast").json()
|
||||
else:
|
||||
r = requests.post(f"{BACKEND_URL}/stop_audio").json()
|
||||
if audio_mode == "Demo":
|
||||
@@ -1860,6 +1918,34 @@ if start_stream:
|
||||
except Exception as e:
|
||||
st.error(f"Error: {e}")
|
||||
|
||||
elif audio_mode == "VoskCast":
|
||||
try:
|
||||
rs = requests.post(
|
||||
f"{BACKEND_URL}/start_voskcast",
|
||||
json={"model": vosk_model_path, "device": vosk_device},
|
||||
)
|
||||
if rs.ok:
|
||||
st.success("VoskCast started.")
|
||||
st.rerun()
|
||||
else:
|
||||
st.error(f"Start failed: {rs.text}")
|
||||
except Exception as e:
|
||||
st.error(f"Error: {e}")
|
||||
|
||||
elif audio_mode == "WhisperCast":
|
||||
try:
|
||||
rs = requests.post(
|
||||
f"{BACKEND_URL}/start_whispercast",
|
||||
json={"model": whisper_model_size, "device": whisper_device},
|
||||
)
|
||||
if rs.ok:
|
||||
st.success("WhisperCast started.")
|
||||
st.rerun()
|
||||
else:
|
||||
st.error(f"Start failed: {rs.text}")
|
||||
except Exception as e:
|
||||
st.error(f"Error: {e}")
|
||||
|
||||
else:
|
||||
|
||||
# Always send stop to ensure backend is in a clean state, regardless of current status
|
||||
@@ -2084,7 +2170,7 @@ if start_stream:
|
||||
st.error(f"Failed to initialize Dante Radio 2: {r2.text}")
|
||||
except Exception as e:
|
||||
st.error(f"Error while starting Dante radios: {e}")
|
||||
if audio_mode not in ("Demo", "Analog", "Network - Dante"):
|
||||
if audio_mode not in ("Demo", "Analog", "Network - Dante", "VoskCast", "WhisperCast", "TextCast"):
|
||||
# USB/Network: single config as before, using shared controls
|
||||
q = QUALITY_MAP[quality]
|
||||
config = auracast_config.AuracastConfigGroup(
|
||||
|
||||
@@ -213,6 +213,12 @@ _stream_lock = asyncio.Lock() # serialize initialize/stop_audio on API side
|
||||
_textcast_task: asyncio.Task | None = None
|
||||
DCP_UPLOAD_PATH = os.path.join(os.path.dirname(__file__), 'uploaded_subtitles.xml')
|
||||
|
||||
# VoskCast state
|
||||
_voskcast_task: asyncio.Task | None = None
|
||||
|
||||
# WhisperCast state
|
||||
_whispercast_task: asyncio.Task | None = None
|
||||
|
||||
# BLE / audio event loop – set in __main__ before uvicorn starts.
|
||||
# All coroutines that touch Bumble objects or the audio pipeline MUST run
|
||||
# on this loop. HTTP handlers call _on_ble_loop() to cross into it.
|
||||
@@ -784,6 +790,133 @@ async def _stop_textcast_impl():
|
||||
return {"status": "stopped", "was_running": was_running}
|
||||
|
||||
|
||||
@app.post("/start_voskcast")
|
||||
async def start_voskcast(body: dict = {}):
|
||||
"""Start Vosk STT → TextCast. Body (optional): {"model": "...", "device": "ch1"}"""
|
||||
return await _on_ble_loop(_start_voskcast_impl(body))
|
||||
|
||||
|
||||
async def _start_voskcast_impl(body: dict) -> dict:
|
||||
global _voskcast_task
|
||||
from auracast.vosk_textcast import broadcast_vosk, DEFAULT_MODEL_PATH
|
||||
|
||||
model = body.get('model') or DEFAULT_MODEL_PATH
|
||||
device = body.get('device', 'ch1')
|
||||
|
||||
await _stop_all()
|
||||
await _stop_textcast_impl()
|
||||
await _stop_voskcast_impl()
|
||||
|
||||
_voskcast_task = asyncio.get_event_loop().create_task(
|
||||
broadcast_vosk(TRANSPORT1, model, device)
|
||||
)
|
||||
|
||||
settings = {
|
||||
'is_streaming': True,
|
||||
'audio_mode': 'VoskCast',
|
||||
'voskcast_is_streaming': True,
|
||||
'voskcast_device': device,
|
||||
'voskcast_model': model,
|
||||
'timestamp': datetime.utcnow().isoformat(),
|
||||
}
|
||||
save_stream_settings(settings)
|
||||
_led_on()
|
||||
log.info("VoskCast started (device=%s, model=%s)", device, model)
|
||||
return {"status": "started"}
|
||||
|
||||
|
||||
@app.post("/stop_voskcast")
|
||||
async def stop_voskcast():
|
||||
"""Stop an active VoskCast broadcast."""
|
||||
return await _on_ble_loop(_stop_voskcast_impl())
|
||||
|
||||
|
||||
async def _stop_voskcast_impl() -> dict:
|
||||
global _voskcast_task
|
||||
was_running = False
|
||||
if _voskcast_task is not None and not _voskcast_task.done():
|
||||
was_running = True
|
||||
_voskcast_task.cancel()
|
||||
try:
|
||||
await asyncio.wait_for(asyncio.shield(_voskcast_task), timeout=4.0)
|
||||
except (asyncio.CancelledError, asyncio.TimeoutError, Exception):
|
||||
pass
|
||||
_voskcast_task = None
|
||||
_led_off()
|
||||
settings = load_stream_settings() or {}
|
||||
if settings.get('audio_mode') == 'VoskCast':
|
||||
settings['is_streaming'] = False
|
||||
settings['voskcast_is_streaming'] = False
|
||||
settings['timestamp'] = datetime.utcnow().isoformat()
|
||||
save_stream_settings(settings)
|
||||
log.info("VoskCast stopped")
|
||||
return {"status": "stopped", "was_running": was_running}
|
||||
|
||||
|
||||
@app.post("/start_whispercast")
|
||||
async def start_whispercast(body: dict = {}):
|
||||
"""Start faster-whisper → TextCast. Body (optional): {"model": "tiny.en", "device": "ch1"}"""
|
||||
return await _on_ble_loop(_start_whispercast_impl(body))
|
||||
|
||||
|
||||
async def _start_whispercast_impl(body: dict) -> dict:
|
||||
global _whispercast_task
|
||||
from auracast.faster_whisper_textcast import broadcast_whisper
|
||||
|
||||
model = body.get('model', 'tiny.en')
|
||||
device = body.get('device', 'ch1')
|
||||
|
||||
await _stop_all()
|
||||
await _stop_textcast_impl()
|
||||
await _stop_voskcast_impl()
|
||||
await _stop_whispercast_impl()
|
||||
|
||||
_whispercast_task = asyncio.get_event_loop().create_task(
|
||||
broadcast_whisper(TRANSPORT1, model, device)
|
||||
)
|
||||
|
||||
settings = {
|
||||
'is_streaming': True,
|
||||
'audio_mode': 'WhisperCast',
|
||||
'whispercast_is_streaming': True,
|
||||
'whispercast_device': device,
|
||||
'whispercast_model': model,
|
||||
'timestamp': datetime.utcnow().isoformat(),
|
||||
}
|
||||
save_stream_settings(settings)
|
||||
_led_on()
|
||||
log.info("WhisperCast started (device=%s, model=%s)", device, model)
|
||||
return {"status": "started"}
|
||||
|
||||
|
||||
@app.post("/stop_whispercast")
|
||||
async def stop_whispercast():
|
||||
"""Stop an active WhisperCast broadcast."""
|
||||
return await _on_ble_loop(_stop_whispercast_impl())
|
||||
|
||||
|
||||
async def _stop_whispercast_impl() -> dict:
|
||||
global _whispercast_task
|
||||
was_running = False
|
||||
if _whispercast_task is not None and not _whispercast_task.done():
|
||||
was_running = True
|
||||
_whispercast_task.cancel()
|
||||
try:
|
||||
await asyncio.wait_for(asyncio.shield(_whispercast_task), timeout=5.0)
|
||||
except (asyncio.CancelledError, asyncio.TimeoutError, Exception):
|
||||
pass
|
||||
_whispercast_task = None
|
||||
_led_off()
|
||||
settings = load_stream_settings() or {}
|
||||
if settings.get('audio_mode') == 'WhisperCast':
|
||||
settings['is_streaming'] = False
|
||||
settings['whispercast_is_streaming'] = False
|
||||
settings['timestamp'] = datetime.utcnow().isoformat()
|
||||
save_stream_settings(settings)
|
||||
log.info("WhisperCast stopped")
|
||||
return {"status": "stopped", "was_running": was_running}
|
||||
|
||||
|
||||
@app.post("/adc_gain")
|
||||
async def set_adc_gain(payload: dict):
|
||||
"""Set ADC gain in dB for left and right channels without restarting the stream.
|
||||
@@ -845,6 +978,12 @@ async def get_status():
|
||||
status["textcast_is_streaming"] = (
|
||||
_textcast_task is not None and not _textcast_task.done()
|
||||
)
|
||||
status["voskcast_is_streaming"] = (
|
||||
_voskcast_task is not None and not _voskcast_task.done()
|
||||
)
|
||||
status["whispercast_is_streaming"] = (
|
||||
_whispercast_task is not None and not _whispercast_task.done()
|
||||
)
|
||||
|
||||
return status
|
||||
|
||||
|
||||
@@ -0,0 +1,270 @@
|
||||
"""Vosk speech-to-text → TextCast streamer.
|
||||
|
||||
Captures mono audio from an analog ALSA/sounddevice input, runs Vosk
|
||||
offline ASR in a background thread, and broadcasts recognised text over
|
||||
the TextCast BLE broadcast using the same SDU framing as text_multicast.py.
|
||||
|
||||
Usage (CLI):
|
||||
poetry run python -m auracast.vosk_textcast \\
|
||||
--model /path/to/vosk-model-en-us \\
|
||||
--device ch1 \\
|
||||
--transport serial:/dev/ttyAMA3,1000000,rtscts
|
||||
|
||||
Environment:
|
||||
VOSK_MODEL_PATH – default Vosk model directory
|
||||
AURACAST_TRANSPORT – default HCI transport string
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import samplerate
|
||||
import sounddevice as sd
|
||||
|
||||
from auracast import auracast_config, multicast
|
||||
from auracast.text_multicast import (
|
||||
SDU_SIZE,
|
||||
SDU_INTERVAL_US,
|
||||
_make_text_frame,
|
||||
_make_idle_frame,
|
||||
)
|
||||
|
||||
log = logging.getLogger('vosk_textcast')
|
||||
|
||||
VOSK_SAMPLE_RATE = 16_000 # Vosk models expect 16 kHz
|
||||
CAPTURE_SAMPLE_RATE = 48_000 # Hardware capture rate (always 48 kHz)
|
||||
BLOCK_FRAMES_48K = 4800 # 100 ms blocks at 48 kHz → 1600 frames at 16 kHz
|
||||
CAPTION_HOLD_S = 4.0 # Keep caption visible N seconds after last speech
|
||||
BROADCAST_NAME = 'LiveCaption'
|
||||
|
||||
DEFAULT_MODEL_PATH = os.environ.get(
|
||||
'VOSK_MODEL_PATH',
|
||||
os.path.expanduser('~/vosk-model-en-us'),
|
||||
)
|
||||
|
||||
|
||||
def _tail_to_fit(text: str, max_bytes: int) -> str:
|
||||
"""Return the tail of *text* that fits in *max_bytes* UTF-8 bytes."""
|
||||
encoded = text.encode('utf-8')
|
||||
if len(encoded) <= max_bytes:
|
||||
return text
|
||||
tail = encoded[-max_bytes:].decode('utf-8', errors='ignore')
|
||||
sp = tail.find(' ')
|
||||
return tail[sp + 1:] if sp != -1 else tail
|
||||
|
||||
|
||||
def _new_words(old: str, new: str) -> str:
|
||||
"""Return the words appended to *new* beyond the shared prefix with *old*.
|
||||
|
||||
If *new* doesn't start with *old* (different utterance), return *new* in full.
|
||||
"""
|
||||
old_words = old.split()
|
||||
new_words = new.split()
|
||||
if new_words[:len(old_words)] == old_words:
|
||||
extra = new_words[len(old_words):]
|
||||
return ' '.join(extra)
|
||||
return new
|
||||
|
||||
|
||||
def _resolve_device(device: str) -> Optional[int]:
|
||||
"""Return sounddevice index for a name or numeric string, or None for default."""
|
||||
if not device:
|
||||
return None
|
||||
if device.isdigit():
|
||||
return int(device)
|
||||
for i, d in enumerate(sd.query_devices()):
|
||||
if d['name'] == device and d['max_input_channels'] > 0:
|
||||
return i
|
||||
log.warning("Device '%s' not found in sounddevice list – using default input", device)
|
||||
return None
|
||||
|
||||
|
||||
async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None:
|
||||
"""ISO SDU write loop.
|
||||
|
||||
Runs at ~10 ms per iteration (flow-controlled by the BLE controller).
|
||||
Sends the current recognised text (partial or final) as-is.
|
||||
"""
|
||||
iso_queue = bigs['big0']['iso_queue']
|
||||
last_sent: str = ''
|
||||
|
||||
while True:
|
||||
now = time.monotonic()
|
||||
with lock:
|
||||
text: str = shared.get('text', '')
|
||||
expiry: float = shared.get('expiry', 0.0)
|
||||
|
||||
if text and now < expiry:
|
||||
display_text = _tail_to_fit(text, SDU_SIZE - 2)
|
||||
if display_text != last_sent:
|
||||
log.info("Caption: %s", display_text)
|
||||
last_sent = display_text
|
||||
frame = _make_text_frame(display_text)
|
||||
else:
|
||||
if last_sent:
|
||||
log.info("Caption cleared")
|
||||
last_sent = ''
|
||||
with lock:
|
||||
shared['text'] = ''
|
||||
frame = _make_idle_frame()
|
||||
|
||||
await iso_queue.write(frame)
|
||||
|
||||
|
||||
def _vosk_thread(
|
||||
model_path: str,
|
||||
device: str,
|
||||
shared: dict,
|
||||
lock: threading.Lock,
|
||||
stop_event: threading.Event,
|
||||
) -> None:
|
||||
"""Blocking audio capture + Vosk recognition loop. Runs in a daemon thread."""
|
||||
try:
|
||||
from vosk import KaldiRecognizer, Model # type: ignore
|
||||
except ImportError:
|
||||
log.error("vosk is not installed. Run: poetry add vosk")
|
||||
return
|
||||
|
||||
log.info("Loading Vosk model from %s …", model_path)
|
||||
model = Model(model_path)
|
||||
rec = KaldiRecognizer(model, VOSK_SAMPLE_RATE)
|
||||
rec.SetMaxAlternatives(0)
|
||||
rec.SetWords(False)
|
||||
|
||||
resampler = samplerate.Resampler('sinc_fastest', channels=1)
|
||||
ratio = VOSK_SAMPLE_RATE / CAPTURE_SAMPLE_RATE
|
||||
|
||||
dev_idx = _resolve_device(device)
|
||||
last_word_count = [0] # word count of last partial sent to display
|
||||
|
||||
def _cb(indata: np.ndarray, frames: int, time_info, status) -> None:
|
||||
if status:
|
||||
log.warning("Audio status: %s", status)
|
||||
if stop_event.is_set():
|
||||
raise sd.CallbackStop()
|
||||
|
||||
# Resample 48 kHz → 16 kHz
|
||||
mono = indata[:, 0].astype(np.float32)
|
||||
downsampled = resampler.process(mono, ratio, end_of_input=False)
|
||||
pcm16 = (downsampled * 32767).astype(np.int16).tobytes()
|
||||
|
||||
if rec.AcceptWaveform(pcm16):
|
||||
result = json.loads(rec.Result())
|
||||
final_text = result.get('text', '').strip()
|
||||
if final_text:
|
||||
log.info("Final: %s", final_text)
|
||||
with lock:
|
||||
shared['text'] = _tail_to_fit(final_text, SDU_SIZE - 2)
|
||||
shared['expiry'] = time.monotonic() + CAPTION_HOLD_S
|
||||
last_word_count[0] = 0 # reset for next sentence
|
||||
else:
|
||||
partial_text = json.loads(rec.PartialResult()).get('partial', '').strip()
|
||||
if partial_text:
|
||||
wc = len(partial_text.split())
|
||||
if wc > last_word_count[0]: # new word arrived
|
||||
last_word_count[0] = wc
|
||||
with lock:
|
||||
shared['text'] = _tail_to_fit(partial_text, SDU_SIZE - 2)
|
||||
shared['expiry'] = time.monotonic() + CAPTION_HOLD_S
|
||||
|
||||
try:
|
||||
with sd.InputStream(
|
||||
samplerate=CAPTURE_SAMPLE_RATE,
|
||||
blocksize=BLOCK_FRAMES_48K,
|
||||
device=dev_idx,
|
||||
dtype='float32',
|
||||
channels=1,
|
||||
callback=_cb,
|
||||
):
|
||||
log.info("Vosk listening on device '%s' (idx=%s) …", device, dev_idx)
|
||||
stop_event.wait()
|
||||
except Exception as exc:
|
||||
log.error("Vosk audio thread error: %s", exc, exc_info=True)
|
||||
|
||||
|
||||
async def broadcast_vosk(
|
||||
transport: str,
|
||||
model_path: str = DEFAULT_MODEL_PATH,
|
||||
device: str = 'ch1',
|
||||
) -> None:
|
||||
"""Start a Vosk STT → TextCast broadcast. Runs until cancelled."""
|
||||
model_path = os.path.expanduser(model_path)
|
||||
if not os.path.exists(model_path):
|
||||
raise FileNotFoundError(
|
||||
f"Vosk model not found at '{model_path}'. "
|
||||
"Download from https://alphacephei.com/vosk/models and set VOSK_MODEL_PATH."
|
||||
)
|
||||
|
||||
config = auracast_config.AuracastConfigGroup(
|
||||
bigs=[
|
||||
auracast_config.AuracastBigConfig(
|
||||
name=BROADCAST_NAME,
|
||||
program_info='Live Captions',
|
||||
language='eng',
|
||||
audio_source='file:dummy',
|
||||
iso_que_len=4,
|
||||
),
|
||||
],
|
||||
auracast_sampling_rate_hz=16000,
|
||||
octets_per_frame=SDU_SIZE,
|
||||
frame_duration_us=SDU_INTERVAL_US,
|
||||
presentation_delay_us=40_000,
|
||||
qos_config=auracast_config.AuracastQosRobust(),
|
||||
transport=transport,
|
||||
)
|
||||
|
||||
shared: dict = {'text': '', 'expiry': 0.0}
|
||||
lock = threading.Lock()
|
||||
stop_event = threading.Event()
|
||||
|
||||
async with multicast.create_device(config) as ble_device:
|
||||
bigs = await multicast.init_broadcast(ble_device, config, config.bigs)
|
||||
|
||||
t = threading.Thread(
|
||||
target=_vosk_thread,
|
||||
args=(model_path, device, shared, lock, stop_event),
|
||||
daemon=True,
|
||||
)
|
||||
t.start()
|
||||
log.info("VoskCast started (device=%s, model=%s)", device, model_path)
|
||||
|
||||
try:
|
||||
await _iso_write_loop(bigs, shared, lock)
|
||||
except asyncio.CancelledError:
|
||||
log.info("VoskCast cancelled – shutting down")
|
||||
stop_event.set()
|
||||
t.join(timeout=3.0)
|
||||
raise
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description='Vosk STT → Auracast TextCast')
|
||||
parser.add_argument(
|
||||
'--model',
|
||||
default=DEFAULT_MODEL_PATH,
|
||||
help=f'Path to Vosk model directory (default: {DEFAULT_MODEL_PATH})',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--device',
|
||||
default='ch1',
|
||||
help='sounddevice input device name or index (default: ch1)',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--transport',
|
||||
default=os.environ.get('AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'),
|
||||
help='Bumble HCI transport string',
|
||||
)
|
||||
args = parser.parse_args()
|
||||
multicast.run_async(broadcast_vosk(args.transport, args.model, args.device))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user