2 Commits

8 changed files with 1208 additions and 13 deletions
+3 -1
View File
@@ -19,7 +19,9 @@ dependencies = [
"smbus2 (>=0.5.0,<0.6.0)",
"samplerate (>=0.2.2,<0.3.0)",
"rpi-gpio (>=0.7.1,<0.8.0)",
"pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc"
"pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc",
"vosk (>=0.3.45)",
"faster-whisper (>=1.0.0)"
]
[project.optional-dependencies]
+72
View File
@@ -0,0 +1,72 @@
"""DCP XML subtitle file parser (Interop and SMPTE 428-7 formats).
Timecode format: HH:MM:SS:FF (frame-based, default 24 fps)
HH:MM:SS.mmm (millisecond decimal, also accepted)
"""
from __future__ import annotations
import re
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from typing import List
@dataclass
class Subtitle:
    """One subtitle cue: a display window and the text shown during it."""

    time_in: float  # seconds (float) - cue start, as produced by _parse_timecode
    time_out: float  # seconds (float) - cue end, as produced by _parse_timecode
    text: str  # caption text; parse_dcp_xml joins multiple <Text> lines with a space
def _parse_timecode(tc: str, fps: int = 24) -> float:
"""Parse a DCP timecode string to float seconds."""
# HH:MM:SS:FF
m = re.match(r'^(\d+):(\d+):(\d+):(\d+)$', tc.strip())
if m:
h, mi, s, f = int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4))
return h * 3600 + mi * 60 + s + f / fps
# HH:MM:SS.mmm
m = re.match(r'^(\d+):(\d+):(\d+)\.(\d+)$', tc.strip())
if m:
h, mi, s = int(m.group(1)), int(m.group(2)), int(m.group(3))
frac = float('0.' + m.group(4))
return h * 3600 + mi * 60 + s + frac
raise ValueError(f"Unrecognized DCP timecode: {tc!r}")
def parse_dcp_xml(path: str, fps: int = 24) -> List[Subtitle]:
    """Parse a DCP XML subtitle file and return a time-sorted list of Subtitles.

    Every ``<Subtitle>`` element anywhere under the root is considered.
    Elements lacking ``TimeIn`` or ``TimeOut``, or containing no visible text,
    are skipped. Multiple ``<Text>`` children are joined with a single space.

    Args:
        path: File to parse (anything ``xml.etree.ElementTree.parse`` accepts).
        fps: Frame rate handed to ``_parse_timecode`` for ``HH:MM:SS:FF`` values.

    Returns:
        Subtitles ordered by ascending ``time_in``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If a ``TimeIn`` / ``TimeOut`` value cannot be parsed.
    """
    tree = ET.parse(path)
    root = tree.getroot()

    # ElementTree spells namespaced tags as "{uri}Name". Reuse the root's
    # "{uri}" prefix (if any) so lookups work with and without a namespace.
    ns_match = re.match(r'\{(.+?)\}', root.tag)
    ns = ns_match.group(0) if ns_match else ''

    subtitles: List[Subtitle] = []
    for subtitle_el in root.iter(f'{ns}Subtitle'):
        time_in_str = subtitle_el.get('TimeIn', '')
        time_out_str = subtitle_el.get('TimeOut', '')
        if not time_in_str or not time_out_str:
            continue

        parts: List[str] = []
        for text_el in subtitle_el.iter(f'{ns}Text'):
            # itertext() also collects text nested in inline children such as
            # <Text><Font Italic="yes">...</Font></Text>; reading only .text
            # silently dropped those captions. Whitespace runs (including the
            # newlines/indentation of pretty-printed XML) collapse to one space.
            line = ' '.join(''.join(text_el.itertext()).split())
            if line:
                parts.append(line)

        text = ' '.join(parts)
        if not text:
            continue

        subtitles.append(Subtitle(
            time_in=_parse_timecode(time_in_str, fps),
            time_out=_parse_timecode(time_out_str, fps),
            text=text,
        ))

    return sorted(subtitles, key=lambda s: s.time_in)
+259
View File
@@ -0,0 +1,259 @@
"""faster-whisper speech-to-text → TextCast streamer.
Captures mono audio from an analog ALSA/sounddevice input, runs
faster-whisper offline ASR in a background thread (chunked, every
CHUNK_S seconds), and broadcasts recognised text over the TextCast BLE
broadcast using the same SDU framing as text_multicast.py.
Usage (CLI):
poetry run python -m auracast.faster_whisper_textcast \\
--model tiny.en \\
--device ch1 \\
--transport serial:/dev/ttyAMA3,1000000,rtscts
"""
from __future__ import annotations
import asyncio
import logging
import os
import queue
import threading
import time
from typing import Optional
import numpy as np
import samplerate
import sounddevice as sd
from auracast import auracast_config, multicast
from auracast.text_multicast import (
SDU_SIZE,
SDU_INTERVAL_US,
_make_text_frame,
_make_idle_frame,
)
log = logging.getLogger('faster_whisper_textcast')
CAPTURE_SAMPLE_RATE = 48_000
WHISPER_SAMPLE_RATE = 16_000
BLOCK_FRAMES_48K = 4800 # 100 ms capture blocks
CHUNK_S = 3.0 # transcribe every N seconds of audio
CAPTION_HOLD_S = 4.0 # keep caption visible after last transcription
SILENCE_RMS = 0.003 # skip transcription if chunk is below this RMS
BROADCAST_NAME = 'LiveCaption'
VALID_MODELS = ['tiny.en', 'base.en', 'small.en', 'tiny', 'base', 'small']
def _tail_to_fit(text: str, max_bytes: int) -> str:
"""Return the tail of *text* that fits in *max_bytes* UTF-8 bytes."""
encoded = text.encode('utf-8')
if len(encoded) <= max_bytes:
return text
tail = encoded[-max_bytes:].decode('utf-8', errors='ignore')
sp = tail.find(' ')
return tail[sp + 1:] if sp != -1 else tail
def _resolve_device(device: str) -> Optional[int]:
"""Return sounddevice index for a name or numeric string, or None for default."""
if not device:
return None
if device.isdigit():
return int(device)
for i, d in enumerate(sd.query_devices()):
if d['name'] == device and d['max_input_channels'] > 0:
return i
log.warning("Device '%s' not found in sounddevice list using default input", device)
return None
async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None:
    """ISO SDU write loop runs at ~10 ms per iteration.

    Each iteration writes one frame to ``bigs['big0']['iso_queue']``: a text
    frame while ``shared['text']`` is non-empty and ``shared['expiry']`` lies
    in the future, otherwise an idle frame. Runs until cancelled.

    Args:
        bigs: Broadcast state; only ``bigs['big0']['iso_queue']`` is used.
        shared: Dict shared with the recognition thread ('text', 'expiry').
        lock: Guards every access to *shared*.
    """
    # NOTE(review): pacing presumably comes from iso_queue.write() applying
    # back-pressure; there is no explicit sleep in this loop - confirm.
    iso_queue = bigs['big0']['iso_queue']
    last_sent: str = ''
    while True:
        now = time.monotonic()
        with lock:
            text: str = shared.get('text', '')
            expiry: float = shared.get('expiry', 0.0)
            active = bool(text) and now < expiry
            if text and not active:
                # Drop the expired caption inside the SAME critical section
                # that judged it expired. Doing this under a second lock
                # acquisition could wipe a fresh caption published by the
                # recognition thread in between.
                shared['text'] = ''

        if active:
            display_text = _tail_to_fit(text, SDU_SIZE - 2)
            if display_text != last_sent:
                log.info("Caption: %s", display_text)
                last_sent = display_text
            frame = _make_text_frame(display_text)
        else:
            if last_sent:
                log.info("Caption cleared")
                last_sent = ''
            frame = _make_idle_frame()

        await iso_queue.write(frame)
def _whisper_thread(
    model_size: str,
    device: str,
    shared: dict,
    lock: threading.Lock,
    stop_event: threading.Event,
) -> None:
    """Blocking audio capture + faster-whisper transcription loop.

    Intended to run in its own thread. Captures one channel at
    CAPTURE_SAMPLE_RATE, resamples it to WHISPER_SAMPLE_RATE, and transcribes
    the audio in fixed chunks of CHUNK_S seconds. Each non-empty transcription
    is published to *shared* under *lock* as ``shared['text']`` together with
    ``shared['expiry']`` (now + CAPTION_HOLD_S).

    Args:
        model_size: Model name passed to ``WhisperModel``.
        device: Input device name or index string, resolved by _resolve_device.
        shared: Dict shared with the ISO write loop.
        lock: Guards every write to *shared*.
        stop_event: Set by the owner to make the loop (and the audio callback)
            stop.

    Returns early (after logging an error) when faster-whisper is not
    installed. Exceptions raised while the input stream is open are logged
    and end the thread.
    """
    # Imported lazily so the module can be imported without faster-whisper.
    try:
        from faster_whisper import WhisperModel  # type: ignore
    except ImportError:
        log.error("faster-whisper is not installed. Run: poetry add faster-whisper")
        return
    log.info("Loading faster-whisper model '%s' (int8, CPU) …", model_size)
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    log.info("Model '%s' loaded.", model_size)
    # Hands resampled blocks from the audio callback to the loop below.
    audio_q: queue.Queue = queue.Queue()
    resampler = samplerate.Resampler('sinc_fastest', channels=1)
    ratio = WHISPER_SAMPLE_RATE / CAPTURE_SAMPLE_RATE
    # Number of WHISPER_SAMPLE_RATE samples per transcription chunk. CHUNK_S is
    # read here, so changes made after this point do not affect this thread.
    chunk_frames = int(CHUNK_S * WHISPER_SAMPLE_RATE)
    audio_buffer = np.zeros(0, dtype=np.float32)
    dev_idx = _resolve_device(device)

    def _cb(indata: np.ndarray, frames: int, time_info, status) -> None:
        # sounddevice input callback: resample the first channel and queue it.
        if status:
            log.warning("Audio status: %s", status)
        if stop_event.is_set():
            # Ask sounddevice to stop invoking the callback.
            raise sd.CallbackStop()
        mono = indata[:, 0].astype(np.float32)
        downsampled = resampler.process(mono, ratio, end_of_input=False)
        audio_q.put(downsampled.copy())

    try:
        with sd.InputStream(
            samplerate=CAPTURE_SAMPLE_RATE,
            blocksize=BLOCK_FRAMES_48K,
            device=dev_idx,
            dtype='float32',
            channels=1,
            callback=_cb,
        ):
            log.info("WhisperCast listening on device '%s' (idx=%s) …", device, dev_idx)
            while not stop_event.is_set():
                # The short timeout keeps the loop responsive to stop_event
                # even when no audio arrives.
                try:
                    chunk = audio_q.get(timeout=0.2)
                    audio_buffer = np.concatenate([audio_buffer, chunk])
                except queue.Empty:
                    continue
                # Keep accumulating until a full chunk is available.
                if len(audio_buffer) < chunk_frames:
                    continue
                pcm = audio_buffer[:chunk_frames].copy()
                audio_buffer = audio_buffer[chunk_frames:]
                # Cheap energy gate: skip the model entirely for quiet chunks.
                rms = float(np.sqrt(np.mean(pcm ** 2)))
                if rms < SILENCE_RMS:
                    continue
                t0 = time.monotonic()
                # NOTE(review): language is fixed to "en" even though
                # VALID_MODELS also lists multilingual model names - confirm
                # this is intended.
                segments, _ = model.transcribe(
                    pcm,
                    beam_size=1,
                    language="en",
                    vad_filter=True,
                    vad_parameters={"min_silence_duration_ms": 300},
                )
                text = ' '.join(s.text.strip() for s in segments).strip()
                elapsed = time.monotonic() - t0
                if text:
                    log.info("Transcribed (%.2fs): %s", elapsed, text)
                    # Publish text and expiry together so the reader never
                    # sees one without the other.
                    with lock:
                        shared['text'] = text
                        shared['expiry'] = time.monotonic() + CAPTION_HOLD_S
                else:
                    log.debug("Silent chunk skipped (rms=%.4f, took=%.2fs)", rms, elapsed)
    except Exception as exc:
        log.error("WhisperCast thread error: %s", exc, exc_info=True)
async def broadcast_whisper(
    transport: str,
    model_size: str = 'tiny.en',
    device: str = 'ch1',
) -> None:
    """Start a faster-whisper → TextCast broadcast. Runs until cancelled.

    Sets up the broadcast, starts the recognition thread and then runs the ISO
    write loop on the calling event loop.

    Args:
        transport: HCI transport string placed in the Auracast config.
        model_size: One of VALID_MODELS.
        device: Input device name or index string for the recognition thread.

    Raises:
        ValueError: If *model_size* is not in VALID_MODELS.
        asyncio.CancelledError: Re-raised after the recognition thread has
            been asked to stop.
    """
    if model_size not in VALID_MODELS:
        raise ValueError(f"Unknown model '{model_size}'. Valid: {VALID_MODELS}")

    config = auracast_config.AuracastConfigGroup(
        bigs=[
            auracast_config.AuracastBigConfig(
                name=BROADCAST_NAME,
                program_info='Live Captions',
                language='eng',
                audio_source='file:dummy',
                iso_que_len=4,
            ),
        ],
        auracast_sampling_rate_hz=16000,
        octets_per_frame=SDU_SIZE,
        frame_duration_us=SDU_INTERVAL_US,
        presentation_delay_us=40_000,
        qos_config=auracast_config.AuracastQosRobust(),
        transport=transport,
    )

    # State shared between the recognition thread (writer) and the ISO write
    # loop (reader); every access goes through `lock`.
    shared: dict = {'text': '', 'expiry': 0.0}
    lock = threading.Lock()
    stop_event = threading.Event()

    async with multicast.create_device(config) as ble_device:
        bigs = await multicast.init_broadcast(ble_device, config, config.bigs)
        t = threading.Thread(
            target=_whisper_thread,
            args=(model_size, device, shared, lock, stop_event),
            daemon=True,
        )
        t.start()
        log.info("WhisperCast started (device=%s, model=%s)", device, model_size)
        try:
            await _iso_write_loop(bigs, shared, lock)
        except asyncio.CancelledError:
            log.info("WhisperCast cancelled shutting down")
            raise
        finally:
            # Stop the worker on EVERY exit path. Previously only cancellation
            # did this, so any other failure of the write loop left the thread
            # running and holding the audio input.
            stop_event.set()
            t.join(timeout=5.0)
def main() -> None:
    """Command-line entry point: parse arguments and run the broadcast.

    ``--chunk`` overwrites the module-level CHUNK_S because the recognition
    thread reads that global when it starts.
    """
    global CHUNK_S
    import argparse

    parser = argparse.ArgumentParser(description='faster-whisper → Auracast TextCast')
    parser.add_argument(
        '--model', default='tiny.en', choices=VALID_MODELS,
        help='Whisper model size (default: tiny.en)',
    )
    parser.add_argument('--device', default='ch1',
                        help='sounddevice input name or index (default: ch1)')
    parser.add_argument(
        '--transport',
        default=os.environ.get('AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'),
        help='Bumble HCI transport string',
    )
    parser.add_argument('--chunk', type=float, default=CHUNK_S,
                        help=f'Seconds per transcription chunk (default: {CHUNK_S})')
    args = parser.parse_args()

    # A zero or negative chunk length yields a zero-sample chunk, so the
    # worker would spin transcribing empty buffers. Reject it up front.
    if args.chunk <= 0:
        parser.error('--chunk must be greater than 0')

    CHUNK_S = args.chunk
    multicast.run_async(broadcast_whisper(args.transport, args.model, args.device))
if __name__ == '__main__':
main()
+157 -12
View File
@@ -141,6 +141,9 @@ except Exception:
# Define is_streaming early from the fetched status for use throughout the UI
is_streaming = bool(saved_settings.get("is_streaming", False))
textcast_is_streaming = bool(saved_settings.get("textcast_is_streaming", False))
voskcast_is_streaming = bool(saved_settings.get("voskcast_is_streaming", False))
whispercast_is_streaming = bool(saved_settings.get("whispercast_is_streaming", False))
# Extract secondary status, if provided by the backend /status endpoint.
secondary_status = saved_settings.get("secondary") or {}
@@ -185,6 +188,9 @@ options = [
"Demo",
"Analog",
"Network - Dante",
"TextCast",
"VoskCast",
"WhisperCast",
]
saved_audio_mode = saved_settings.get("audio_mode", "Demo")
if saved_audio_mode not in options:
@@ -196,7 +202,7 @@ audio_mode = st.selectbox(
"Audio Mode",
options,
index=options.index(saved_audio_mode) if saved_audio_mode in options else options.index("Demo"),
disabled=is_streaming,
disabled=is_streaming or textcast_is_streaming or voskcast_is_streaming or whispercast_is_streaming,
help=(
"Select the audio input source. Choose 'USB' for a connected USB audio device (via PipeWire), "
"'Network' (AES67) for network RTP/AES67 sources, "
@@ -226,11 +232,94 @@ else:
running_mode = backend_mode_mapped if (is_streaming and backend_mode_mapped) else audio_mode
# Start/Stop buttons and status (moved to top)
if audio_mode == "Demo":
if audio_mode == "TextCast":
start_stream, stop_stream = render_stream_controls(textcast_is_streaming, "Start TextCast", "Stop TextCast", "TextCast", False)
elif audio_mode == "VoskCast":
start_stream, stop_stream = render_stream_controls(voskcast_is_streaming, "Start VoskCast", "Stop VoskCast", "VoskCast", False)
elif audio_mode == "Demo":
start_stream, stop_stream = render_stream_controls(is_streaming, "Start Demo", "Stop Demo", running_mode, secondary_is_streaming)
else:
start_stream, stop_stream = render_stream_controls(is_streaming, "Start Auracast", "Stop Auracast", running_mode, secondary_is_streaming)
# TextCast: DCP XML file uploader
if audio_mode == "TextCast":
st.markdown("#### DCP Subtitle File")
dcp_file = st.file_uploader(
"Upload DCP XML subtitle file (.xml)",
type=["xml"],
disabled=textcast_is_streaming,
help="Upload a DCP-compliant subtitle XML file. Subtitles will be broadcast over Auracast.",
)
if dcp_file is not None:
content = dcp_file.read().decode("utf-8", errors="replace")
st.session_state['_textcast_dcp_content'] = content
st.session_state['_textcast_dcp_name'] = dcp_file.name
st.success(f"Loaded: {dcp_file.name} ({len(content):,} bytes)")
elif st.session_state.get('_textcast_dcp_name'):
st.info(f"Using previously uploaded file: {st.session_state['_textcast_dcp_name']}")
else:
st.warning("No subtitle file loaded. Upload a DCP XML file or use the sample below.")
if st.button("Load sample subtitle file", disabled=textcast_is_streaming):
import os as _os
_sample = _os.path.abspath(_os.path.join(
_os.path.dirname(__file__), '..', 'testdata', 'sample_subtitles.xml'))
try:
with open(_sample, 'r', encoding='utf-8') as _f:
_content = _f.read()
st.session_state['_textcast_dcp_content'] = _content
st.session_state['_textcast_dcp_name'] = 'sample_subtitles.xml'
st.rerun()
except Exception as _e:
st.error(f"Could not load sample: {_e}")
# WhisperCast: model size + input device
if audio_mode == "WhisperCast":
st.markdown("#### Live Speech Recognition (faster-whisper)")
_whisper_default_model = saved_settings.get("whispercast_model", "tiny.en")
_whisper_default_device = saved_settings.get("whispercast_device", "ch1")
col_wm, col_wd = st.columns([2, 1])
with col_wm:
whisper_model_size = st.selectbox(
"Whisper Model",
["tiny.en", "base.en", "small.en"],
index=["tiny.en", "base.en", "small.en"].index(_whisper_default_model)
if _whisper_default_model in ["tiny.en", "base.en", "small.en"] else 0,
disabled=whispercast_is_streaming,
help="tiny.en (~39 MB, ~3-5s latency), base.en (~74 MB, ~5-8s latency)",
)
with col_wd:
whisper_device = st.selectbox(
"Input",
["ch1", "ch2"],
index=0 if _whisper_default_device == "ch1" else 1,
disabled=whispercast_is_streaming,
help="Analog input channel",
)
st.caption("Model downloads automatically on first use. Each sentence appears after ~3s of speech.")
# VoskCast: model path + input device
if audio_mode == "VoskCast":
st.markdown("#### Live Speech Recognition (Vosk)")
_vosk_default_model = saved_settings.get("voskcast_model") or os.environ.get("VOSK_MODEL_PATH", "~/vosk-model-en-us")
_vosk_default_device = saved_settings.get("voskcast_device", "ch1")
col_model, col_dev = st.columns([3, 1])
with col_model:
vosk_model_path = st.text_input(
"Vosk Model Path",
value=_vosk_default_model,
disabled=voskcast_is_streaming,
help="Local path to the Vosk model directory. Download from https://alphacephei.com/vosk/models",
)
with col_dev:
vosk_device = st.selectbox(
"Input",
["ch1", "ch2"],
index=0 if _vosk_default_device == "ch1" else 1,
disabled=voskcast_is_streaming,
help="Analog input channel (ch1 = left, ch2 = right)",
)
st.caption("Partial results appear immediately; final results are held for 4 s then cleared.")
# Analog gain control (only for Analog mode, placed below start button)
analog_gain_db_left = 0 # default (dB)
analog_gain_db_right = 0 # default (dB)
@@ -1793,22 +1882,78 @@ else:
if stop_stream:
st.session_state['stream_started'] = False
try:
r = requests.post(f"{BACKEND_URL}/stop_audio").json()
if audio_mode == "Demo":
st.session_state['demo_stream_started'] = False
if r['was_running']:
if audio_mode == "TextCast":
r = requests.post(f"{BACKEND_URL}/stop_textcast").json()
elif audio_mode == "VoskCast":
r = requests.post(f"{BACKEND_URL}/stop_voskcast").json()
elif audio_mode == "WhisperCast":
r = requests.post(f"{BACKEND_URL}/stop_whispercast").json()
else:
r = requests.post(f"{BACKEND_URL}/stop_audio").json()
if audio_mode == "Demo":
st.session_state['demo_stream_started'] = False
if r.get('was_running'):
is_stopped = True
except Exception as e:
st.error(f"Error: {e}")
if start_stream:
# Always send stop to ensure backend is in a clean state, regardless of current status
r = requests.post(f"{BACKEND_URL}/stop_audio").json()
# Small pause lets backend fully release audio devices before re-init
time.sleep(1)
if audio_mode == "TextCast":
uploaded = st.session_state.get('_textcast_dcp_content')
if not uploaded:
st.error("Upload a DCP XML file first.")
else:
try:
ru = requests.post(f"{BACKEND_URL}/upload_dcp", json={"xml": uploaded})
if not ru.ok:
st.error(f"Upload failed: {ru.text}")
else:
rs = requests.post(f"{BACKEND_URL}/start_textcast")
if rs.ok:
st.success("TextCast started.")
st.rerun()
else:
st.error(f"Start failed: {rs.text}")
except Exception as e:
st.error(f"Error: {e}")
if audio_mode == "Demo":
elif audio_mode == "VoskCast":
try:
rs = requests.post(
f"{BACKEND_URL}/start_voskcast",
json={"model": vosk_model_path, "device": vosk_device},
)
if rs.ok:
st.success("VoskCast started.")
st.rerun()
else:
st.error(f"Start failed: {rs.text}")
except Exception as e:
st.error(f"Error: {e}")
elif audio_mode == "WhisperCast":
try:
rs = requests.post(
f"{BACKEND_URL}/start_whispercast",
json={"model": whisper_model_size, "device": whisper_device},
)
if rs.ok:
st.success("WhisperCast started.")
st.rerun()
else:
st.error(f"Start failed: {rs.text}")
except Exception as e:
st.error(f"Error: {e}")
else:
# Always send stop to ensure backend is in a clean state, regardless of current status
r = requests.post(f"{BACKEND_URL}/stop_audio").json()
# Small pause lets backend fully release audio devices before re-init
time.sleep(1)
if audio_mode == "Demo":
demo_cfg = demo_stream_map[demo_selected]
q = QUALITY_MAP[demo_cfg['quality']]
@@ -2025,7 +2170,7 @@ if start_stream:
st.error(f"Failed to initialize Dante Radio 2: {r2.text}")
except Exception as e:
st.error(f"Error while starting Dante radios: {e}")
if audio_mode not in ("Demo", "Analog", "Network - Dante"):
if audio_mode not in ("Demo", "Analog", "Network - Dante", "VoskCast", "WhisperCast", "TextCast"):
# USB/Network: single config as before, using shared controls
q = QUALITY_MAP[quality]
config = auracast_config.AuracastConfigGroup(
+221
View File
@@ -209,6 +209,16 @@ multicaster1: multicast_control.Multicaster | None = None
multicaster2: multicast_control.Multicaster | None = None
_stream_lock = asyncio.Lock() # serialize initialize/stop_audio on API side
# TextCast state
_textcast_task: asyncio.Task | None = None
DCP_UPLOAD_PATH = os.path.join(os.path.dirname(__file__), 'uploaded_subtitles.xml')
# VoskCast state
_voskcast_task: asyncio.Task | None = None
# WhisperCast state
_whispercast_task: asyncio.Task | None = None
# BLE / audio event loop set in __main__ before uvicorn starts.
# All coroutines that touch Bumble objects or the audio pipeline MUST run
# on this loop. HTTP handlers call _on_ble_loop() to cross into it.
@@ -705,6 +715,208 @@ async def _stop_audio_impl():
log.error("Exception in /stop_audio: %s", traceback.format_exc())
raise HTTPException(status_code=500, detail=str(e))
@app.post("/upload_dcp")
async def upload_dcp(payload: dict):
    """Save DCP XML content for TextCast. Body: {"xml": "<DCSubtitle>..."}

    Writes the text to DCP_UPLOAD_PATH, replacing any previous upload.

    Returns:
        ``{"status": "ok", "path": DCP_UPLOAD_PATH}`` on success.

    Raises:
        HTTPException: 400 if "xml" is missing, not a string, or blank;
            500 if the file cannot be written.
    """
    xml_content = payload.get("xml", "")
    if not isinstance(xml_content, str):
        # A non-string value used to fail on .strip() and surface as a 500.
        raise HTTPException(status_code=400, detail='Field "xml" must be a string')
    if not xml_content.strip():
        raise HTTPException(status_code=400, detail="Empty XML content")
    try:
        with open(DCP_UPLOAD_PATH, 'w', encoding='utf-8') as f:
            f.write(xml_content)
    except (OSError, UnicodeError) as e:
        # Filesystem failures and text that cannot be encoded as UTF-8.
        raise HTTPException(status_code=500, detail=str(e)) from e
    log.info("DCP XML saved to %s (%d bytes)", DCP_UPLOAD_PATH, len(xml_content))
    return {"status": "ok", "path": DCP_UPLOAD_PATH}
@app.post("/start_textcast")
async def start_textcast():
    """Start text-over-Auracast broadcast using the uploaded DCP XML file.

    Thin HTTP wrapper: the work is done by _start_textcast_impl(), whose
    result is returned unchanged.
    """
    # Hand the coroutine to _on_ble_loop() so it runs on the BLE/audio loop.
    return await _on_ble_loop(_start_textcast_impl())
async def _start_textcast_impl():
    """Stop whatever is running, then start the TextCast task.

    Schedules ``broadcast_text(DCP_UPLOAD_PATH, TRANSPORT1)`` as a background
    task on the current loop, persists the new stream settings and turns the
    LED on.

    Returns:
        ``{"status": "started"}``. The task is only scheduled here; failures
        inside it are not reported through this return value.

    Raises:
        HTTPException: 400 if no DCP file has been uploaded yet.
    """
    global _textcast_task
    if not os.path.exists(DCP_UPLOAD_PATH):
        raise HTTPException(status_code=400, detail="No DCP file uploaded. Use /upload_dcp first.")

    # Stop every other producer first, matching _start_whispercast_impl().
    # Previously a running VoskCast/WhisperCast task was left alive here and
    # would have competed with TextCast for the same transport.
    await _stop_all()
    await _stop_voskcast_impl()
    await _stop_whispercast_impl()
    await _stop_textcast_impl()

    # Imported lazily, as in the other *_impl starters.
    from auracast.text_multicast import broadcast_text
    _textcast_task = asyncio.get_running_loop().create_task(
        broadcast_text(DCP_UPLOAD_PATH, TRANSPORT1)
    )

    settings = {
        'is_streaming': True,
        'audio_mode': 'TextCast',
        'textcast_is_streaming': True,
        'timestamp': datetime.utcnow().isoformat(),
    }
    save_stream_settings(settings)
    _led_on()
    log.info("TextCast started (DCP: %s)", DCP_UPLOAD_PATH)
    return {"status": "started"}
@app.post("/stop_textcast")
async def stop_textcast():
    """Stop an active TextCast broadcast.

    Thin HTTP wrapper around _stop_textcast_impl(); returns its result
    (a dict with "status" and "was_running").
    """
    # Hand the coroutine to _on_ble_loop() so it runs on the BLE/audio loop.
    return await _on_ble_loop(_stop_textcast_impl())
async def _stop_textcast_impl():
    """Cancel the TextCast task (if any) and mark TextCast as stopped.

    Safe to call when nothing is running. The LED is always switched off;
    persisted settings are only touched when their ``audio_mode`` is
    ``'TextCast'``.

    Returns:
        ``{"status": "stopped", "was_running": bool}`` where ``was_running``
        tells whether a live task was actually cancelled.
    """
    global _textcast_task
    was_running = False
    if _textcast_task is not None and not _textcast_task.done():
        was_running = True
        _textcast_task.cancel()
        try:
            # Give the task a bounded amount of time to unwind.
            await asyncio.wait_for(asyncio.shield(_textcast_task), timeout=3.0)
        except asyncio.CancelledError:
            # Expected outcome of cancelling the task.
            pass
        except asyncio.TimeoutError:
            log.warning("TextCast task did not finish within 3 s of cancellation")
        except Exception:
            # Best effort: shutdown continues, but the failure is no longer
            # swallowed silently.
            log.exception("TextCast task raised while shutting down")
    _textcast_task = None
    _led_off()

    settings = load_stream_settings() or {}
    if settings.get('audio_mode') == 'TextCast':
        settings['is_streaming'] = False
        settings['textcast_is_streaming'] = False
        settings['timestamp'] = datetime.utcnow().isoformat()
        save_stream_settings(settings)
    log.info("TextCast stopped")
    return {"status": "stopped", "was_running": was_running}
@app.post("/start_voskcast")
async def start_voskcast(body: dict = {}):
    """Start Vosk STT → TextCast. Body (optional): {"model": "...", "device": "ch1"}

    Thin HTTP wrapper around _start_voskcast_impl(); returns its result.
    """
    # The shared {} default is only read (via .get) by _start_voskcast_impl.
    # Hand the coroutine to _on_ble_loop() so it runs on the BLE/audio loop.
    return await _on_ble_loop(_start_voskcast_impl(body))
async def _start_voskcast_impl(body: dict) -> dict:
    """Stop whatever is running, then start the VoskCast task.

    Args:
        body: Request body. ``'model'`` falls back to DEFAULT_MODEL_PATH when
            missing or empty; ``'device'`` defaults to ``'ch1'``.

    Returns:
        ``{"status": "started"}``. The task is only scheduled here; failures
        inside it are not reported through this return value.
    """
    global _voskcast_task
    # Imported lazily so the Vosk module is only loaded when actually used.
    from auracast.vosk_textcast import broadcast_vosk, DEFAULT_MODEL_PATH

    model = body.get('model') or DEFAULT_MODEL_PATH
    device = body.get('device', 'ch1')

    # Stop every other producer first, matching _start_whispercast_impl().
    # Previously a running WhisperCast task was left alive here.
    await _stop_all()
    await _stop_textcast_impl()
    await _stop_whispercast_impl()
    await _stop_voskcast_impl()

    _voskcast_task = asyncio.get_running_loop().create_task(
        broadcast_vosk(TRANSPORT1, model, device)
    )

    settings = {
        'is_streaming': True,
        'audio_mode': 'VoskCast',
        'voskcast_is_streaming': True,
        'voskcast_device': device,
        'voskcast_model': model,
        'timestamp': datetime.utcnow().isoformat(),
    }
    save_stream_settings(settings)
    _led_on()
    log.info("VoskCast started (device=%s, model=%s)", device, model)
    return {"status": "started"}
@app.post("/stop_voskcast")
async def stop_voskcast():
    """Stop an active VoskCast broadcast.

    Thin HTTP wrapper around _stop_voskcast_impl(); returns its result
    (a dict with "status" and "was_running").
    """
    # Hand the coroutine to _on_ble_loop() so it runs on the BLE/audio loop.
    return await _on_ble_loop(_stop_voskcast_impl())
async def _stop_voskcast_impl() -> dict:
    """Cancel the VoskCast task (if any) and mark VoskCast as stopped.

    Safe to call when nothing is running. The LED is always switched off;
    persisted settings are only touched when their ``audio_mode`` is
    ``'VoskCast'``.

    Returns:
        ``{"status": "stopped", "was_running": bool}`` where ``was_running``
        tells whether a live task was actually cancelled.
    """
    global _voskcast_task
    was_running = False
    if _voskcast_task is not None and not _voskcast_task.done():
        was_running = True
        _voskcast_task.cancel()
        try:
            # Give the task a bounded amount of time to unwind.
            await asyncio.wait_for(asyncio.shield(_voskcast_task), timeout=4.0)
        except asyncio.CancelledError:
            # Expected outcome of cancelling the task.
            pass
        except asyncio.TimeoutError:
            log.warning("VoskCast task did not finish within 4 s of cancellation")
        except Exception:
            # Best effort: shutdown continues, but the failure is no longer
            # swallowed silently.
            log.exception("VoskCast task raised while shutting down")
    _voskcast_task = None
    _led_off()

    settings = load_stream_settings() or {}
    if settings.get('audio_mode') == 'VoskCast':
        settings['is_streaming'] = False
        settings['voskcast_is_streaming'] = False
        settings['timestamp'] = datetime.utcnow().isoformat()
        save_stream_settings(settings)
    log.info("VoskCast stopped")
    return {"status": "stopped", "was_running": was_running}
@app.post("/start_whispercast")
async def start_whispercast(body: dict = {}):
    """Start faster-whisper → TextCast. Body (optional): {"model": "tiny.en", "device": "ch1"}

    Thin HTTP wrapper around _start_whispercast_impl(); returns its result.
    """
    # The shared {} default is only read (via .get) by _start_whispercast_impl.
    # Hand the coroutine to _on_ble_loop() so it runs on the BLE/audio loop.
    return await _on_ble_loop(_start_whispercast_impl(body))
async def _start_whispercast_impl(body: dict) -> dict:
    """Stop whatever is running, then start the WhisperCast task.

    Args:
        body: Request body. ``'model'`` defaults to ``'tiny.en'`` and
            ``'device'`` to ``'ch1'``.

    Returns:
        ``{"status": "started"}``. The task is only scheduled here; failures
        inside it are not reported through this return value.

    Raises:
        HTTPException: 400 if the requested model is not one of VALID_MODELS.
    """
    global _whispercast_task
    # Imported lazily so the Whisper module is only loaded when actually used.
    from auracast.faster_whisper_textcast import VALID_MODELS, broadcast_whisper

    model = body.get('model', 'tiny.en')
    device = body.get('device', 'ch1')

    # broadcast_whisper() rejects unknown models, but only inside the
    # background task - the API would still answer "started". Validate before
    # tearing down any running stream.
    if model not in VALID_MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown model '{model}'. Valid: {VALID_MODELS}",
        )

    await _stop_all()
    await _stop_textcast_impl()
    await _stop_voskcast_impl()
    await _stop_whispercast_impl()

    _whispercast_task = asyncio.get_running_loop().create_task(
        broadcast_whisper(TRANSPORT1, model, device)
    )

    settings = {
        'is_streaming': True,
        'audio_mode': 'WhisperCast',
        'whispercast_is_streaming': True,
        'whispercast_device': device,
        'whispercast_model': model,
        'timestamp': datetime.utcnow().isoformat(),
    }
    save_stream_settings(settings)
    _led_on()
    log.info("WhisperCast started (device=%s, model=%s)", device, model)
    return {"status": "started"}
@app.post("/stop_whispercast")
async def stop_whispercast():
    """Stop an active WhisperCast broadcast.

    Thin HTTP wrapper around _stop_whispercast_impl(); returns its result
    (a dict with "status" and "was_running").
    """
    # Hand the coroutine to _on_ble_loop() so it runs on the BLE/audio loop.
    return await _on_ble_loop(_stop_whispercast_impl())
async def _stop_whispercast_impl() -> dict:
    """Cancel the WhisperCast task (if any) and mark WhisperCast as stopped.

    Safe to call when nothing is running. The LED is always switched off;
    persisted settings are only touched when their ``audio_mode`` is
    ``'WhisperCast'``.

    Returns:
        ``{"status": "stopped", "was_running": bool}`` where ``was_running``
        tells whether a live task was actually cancelled.
    """
    global _whispercast_task
    was_running = False
    if _whispercast_task is not None and not _whispercast_task.done():
        was_running = True
        _whispercast_task.cancel()
        try:
            # Give the task a bounded amount of time to unwind.
            await asyncio.wait_for(asyncio.shield(_whispercast_task), timeout=5.0)
        except asyncio.CancelledError:
            # Expected outcome of cancelling the task.
            pass
        except asyncio.TimeoutError:
            log.warning("WhisperCast task did not finish within 5 s of cancellation")
        except Exception:
            # Best effort: shutdown continues, but the failure is no longer
            # swallowed silently.
            log.exception("WhisperCast task raised while shutting down")
    _whispercast_task = None
    _led_off()

    settings = load_stream_settings() or {}
    if settings.get('audio_mode') == 'WhisperCast':
        settings['is_streaming'] = False
        settings['whispercast_is_streaming'] = False
        settings['timestamp'] = datetime.utcnow().isoformat()
        save_stream_settings(settings)
    log.info("WhisperCast stopped")
    return {"status": "stopped", "was_running": was_running}
@app.post("/adc_gain")
async def set_adc_gain(payload: dict):
"""Set ADC gain in dB for left and right channels without restarting the stream.
@@ -763,6 +975,15 @@ async def get_status():
status["secondary"] = secondary
status["secondary_is_streaming"] = bool(secondary.get("is_streaming", False))
status["led_enabled"] = _LED_ENABLED
status["textcast_is_streaming"] = (
_textcast_task is not None and not _textcast_task.done()
)
status["voskcast_is_streaming"] = (
_voskcast_task is not None and not _voskcast_task.done()
)
status["whispercast_is_streaming"] = (
_whispercast_task is not None and not _whispercast_task.done()
)
return status
+71
View File
@@ -0,0 +1,71 @@
<?xml version="1.0" encoding="UTF-8"?>
<DCSubtitle Version="1.0">
<SubtitleID>a1b2c3d4-e5f6-7890-abcd-ef1234567890</SubtitleID>
<MovieTitle>Sample TextCast Subtitles</MovieTitle>
<ReelNumber>1</ReelNumber>
<Language>en</Language>
<LoadFont Id="Font1" URI="Arial.ttf"/>
<Font Id="Font1" Color="FFFFFFFF" Effect="none" Size="42" Italic="no">
<Subtitle SpotNumber="1" TimeIn="00:00:02:00" TimeOut="00:00:05:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">Welcome to TextCast.</Text>
</Subtitle>
<Subtitle SpotNumber="2" TimeIn="00:00:06:00" TimeOut="00:00:09:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">Text transmitted over Auracast BLE.</Text>
</Subtitle>
<Subtitle SpotNumber="3" TimeIn="00:00:10:00" TimeOut="00:00:13:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">No LC3 audio codec involved.</Text>
</Subtitle>
<Subtitle SpotNumber="4" TimeIn="00:00:14:00" TimeOut="00:00:17:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">Raw ISO SDUs carry UTF-8 text.</Text>
</Subtitle>
<Subtitle SpotNumber="5" TimeIn="00:00:18:00" TimeOut="00:00:21:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">100 frames per second at 64 bytes.</Text>
</Subtitle>
<Subtitle SpotNumber="6" TimeIn="00:00:22:00" TimeOut="00:00:25:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">Scrolling display on SH1106 OLED.</Text>
</Subtitle>
<Subtitle SpotNumber="7" TimeIn="00:00:26:00" TimeOut="00:00:29:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">Each new line scrolls up the screen.</Text>
</Subtitle>
<Subtitle SpotNumber="8" TimeIn="00:00:30:00" TimeOut="00:00:33:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">The quick brown fox jumps over</Text>
</Subtitle>
<Subtitle SpotNumber="9" TimeIn="00:00:34:00" TimeOut="00:00:37:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">the lazy dog.</Text>
</Subtitle>
<Subtitle SpotNumber="10" TimeIn="00:00:38:00" TimeOut="00:00:41:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">Speech-to-text output goes here.</Text>
</Subtitle>
<Subtitle SpotNumber="11" TimeIn="00:00:42:00" TimeOut="00:00:45:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">Latency is dominated by BLE BIG.</Text>
</Subtitle>
<Subtitle SpotNumber="12" TimeIn="00:00:46:00" TimeOut="00:00:49:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">Typical end-to-end: under 50 ms.</Text>
</Subtitle>
<Subtitle SpotNumber="13" TimeIn="00:00:50:00" TimeOut="00:00:53:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">One transmitter, many receivers.</Text>
</Subtitle>
<Subtitle SpotNumber="14" TimeIn="00:00:54:00" TimeOut="00:00:57:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">Built on Bumble and Zephyr RTOS.</Text>
</Subtitle>
<Subtitle SpotNumber="15" TimeIn="00:00:58:00" TimeOut="00:01:01:00" FadeUpTime="0" FadeDownTime="0">
<Text HAlign="center" VAlign="bottom">End of demonstration. Thank you.</Text>
</Subtitle>
</Font>
</DCSubtitle>
+155
View File
@@ -0,0 +1,155 @@
"""Text-over-Auracast transmitter.
Reads a DCP XML subtitle file and broadcasts each subtitle as raw ISO SDUs.
No LC3 encoding is used. The BIG is advertised with codec_id=LC3 (required
for BAP sync) but the SDU payload is plain UTF-8 text with a magic header.
Frame format (SDU_SIZE bytes total):
Byte 0 : TEXT_MAGIC (0xAA) identifies this as a text SDU
Byte 1 : text length N 0 means idle/clear
Bytes 2..N+1: UTF-8 text
Bytes N+2.. : zero padding to SDU_SIZE
Usage:
poetry run python -m auracast.text_multicast \\
--dcp ./auracast/testdata/sample_subtitles.xml \\
--transport serial:/dev/ttyAMA3,1000000,rtscts
"""
from __future__ import annotations
import argparse
import asyncio
import logging
import os
from auracast import auracast_config, multicast
from auracast.dcp_parser import parse_dcp_xml
TEXT_MAGIC = 0xAA
SDU_SIZE = 64 # octets_per_frame; 62 usable text bytes per frame
SDU_INTERVAL_US = 10_000 # 10 ms → 100 SDUs/sec
BROADCAST_NAME = 'TextCast'
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s %(levelname)s %(name)s: %(message)s',
)
log = logging.getLogger('text_multicast')
def _make_text_frame(text: str) -> bytes:
    """Encode a subtitle string into a fixed-size TEXT SDU.

    Layout: ``TEXT_MAGIC``, one length byte, the UTF-8 text, then zero padding
    up to ``SDU_SIZE`` bytes.

    Text longer than ``SDU_SIZE - 2`` bytes is truncated at a character
    boundary, so the payload is always valid UTF-8.

    Args:
        text: Subtitle text to transmit.

    Returns:
        Exactly ``SDU_SIZE`` bytes.
    """
    max_text_len = SDU_SIZE - 2  # room left after the magic and length bytes
    text_bytes = text.encode('utf-8')
    if len(text_bytes) > max_text_len:
        # A plain byte slice can cut a multi-byte character in half and leave
        # an undecodable fragment at the end. Decoding with errors='ignore'
        # drops that fragment; re-encoding yields the clean prefix.
        text_bytes = (
            text_bytes[:max_text_len]
            .decode('utf-8', errors='ignore')
            .encode('utf-8')
        )
    frame = bytes([TEXT_MAGIC, len(text_bytes)]) + text_bytes
    return frame + bytes(SDU_SIZE - len(frame))
def _make_idle_frame() -> bytes:
    """Build the idle SDU: SDU_SIZE zero bytes.

    Byte 0 is 0 rather than TEXT_MAGIC, which signals 'no active subtitle'.
    """
    return b'\x00' * SDU_SIZE
async def _text_stream(bigs: dict, subtitles: list, loop: bool = True) -> None:
    """Main text streaming loop.

    Writes one SDU per iteration to ``bigs['big0']['iso_queue']``. Subtitle
    timing is derived from the frame counter: frame N is treated as
    N * SDU_INTERVAL_US microseconds into the pass. A text frame is sent while
    a subtitle is active, an idle frame otherwise.

    Args:
        bigs: Broadcast state; only ``bigs['big0']['iso_queue']`` is used.
        subtitles: Cues with ``time_in`` / ``time_out`` / ``text`` attributes,
            sorted by ``time_in``.
        loop: When True (default) the subtitle list repeats indefinitely after
            a 2 s gap; when False the function returns after one pass.
    """
    # NOTE(review): real-time pacing presumably comes from iso_queue.write()
    # applying back-pressure; there is no explicit sleep here - confirm.
    iso_queue = bigs['big0']['iso_queue']
    frame_interval_s = SDU_INTERVAL_US / 1_000_000
    frame_count = 0
    sub_idx = 0
    n = len(subtitles)
    last_log_sub = -1
    loop_count = 0

    # Total duration of one pass: latest subtitle end + 2 s gap before restart.
    # max() rather than subtitles[-1]: the list is sorted by time_in, so with
    # overlapping cues the last entry is not necessarily the last to end.
    _loop_gap_s = 2.0
    _pass_duration_s = (
        max(s.time_out for s in subtitles) + _loop_gap_s if n > 0 else 0.0
    )

    log.info("Streaming %d subtitle(s) (loop=%s). Press Ctrl-C to stop.", n, loop)

    if n == 0 and not loop:
        # Nothing to play once; without this guard the loop below would send
        # idle frames forever because the end-of-pass check requires n > 0.
        log.info("No subtitles to transmit. Exiting.")
        return

    while True:
        now_s = frame_count * frame_interval_s

        # Advance past subtitles whose time_out has passed
        while sub_idx < n and now_s >= subtitles[sub_idx].time_out:
            sub_idx += 1

        # Determine what to send
        if sub_idx < n and now_s >= subtitles[sub_idx].time_in:
            frame = _make_text_frame(subtitles[sub_idx].text)
            if sub_idx != last_log_sub:
                # Log each subtitle once, when it first becomes active.
                log.info("[loop %d %05.1fs] %s", loop_count, now_s, subtitles[sub_idx].text)
                last_log_sub = sub_idx
        else:
            frame = _make_idle_frame()

        await iso_queue.write(frame)
        frame_count += 1

        # End of pass
        if n > 0 and now_s >= _pass_duration_s:
            if loop:
                loop_count += 1
                log.info("Loop %d complete restarting.", loop_count)
                frame_count = 0
                sub_idx = 0
                last_log_sub = -1
            else:
                log.info("All subtitles transmitted. Exiting.")
                break
async def broadcast_text(dcp_path: str, transport: str, loop: bool = True) -> None:
    """Parse *dcp_path* and broadcast its subtitles over a TextCast BIG.

    Logs an error and returns without opening the device when the file yields
    no subtitles. Otherwise sets up the broadcast and hands control to
    _text_stream() with the given *loop* flag.
    """
    cues = parse_dcp_xml(dcp_path)
    if not cues:
        log.error("No subtitles found in %s", dcp_path)
        return
    log.info("Loaded %d subtitle(s) from %s", len(cues), dcp_path)

    # audio_source is a placeholder: the payload comes from _text_stream().
    big_cfg = auracast_config.AuracastBigConfig(
        name=BROADCAST_NAME,
        program_info='Text Broadcast',
        language='eng',
        audio_source='file:dummy',
        iso_que_len=4,
    )
    group_cfg = auracast_config.AuracastConfigGroup(
        bigs=[big_cfg],
        auracast_sampling_rate_hz=16000,
        octets_per_frame=SDU_SIZE,
        frame_duration_us=SDU_INTERVAL_US,
        presentation_delay_us=40_000,
        qos_config=auracast_config.AuracastQosRobust(),
        transport=transport,
    )

    async with multicast.create_device(group_cfg) as ble_device:
        active_bigs = await multicast.init_broadcast(ble_device, group_cfg, group_cfg.bigs)
        await _text_stream(active_bigs, cues, loop=loop)
def main() -> None:
    """Command-line entry point for the subtitle text transmitter.

    Parses ``--dcp``, ``--transport`` and ``--no-loop`` and runs
    ``broadcast_text`` through ``multicast.run_async``.
    """
    # The environment variable wins over the built-in serial default.
    fallback_transport = os.environ.get(
        'AURACAST_TRANSPORT',
        'serial:/dev/ttyAMA3,1000000,rtscts',
    )

    cli = argparse.ArgumentParser(description='Auracast text (subtitle) transmitter')
    cli.add_argument('--dcp', required=True, help='Path to DCP XML subtitle file')
    cli.add_argument(
        '--transport',
        default=fallback_transport,
        help='Bumble HCI transport string (default: $AURACAST_TRANSPORT or ttyAMA3)',
    )
    cli.add_argument(
        '--no-loop',
        action='store_true',
        help='Play subtitles once and exit instead of looping indefinitely',
    )
    opts = cli.parse_args()

    keep_looping = not opts.no_loop
    multicast.run_async(broadcast_text(opts.dcp, opts.transport, loop=keep_looping))


if __name__ == '__main__':
    main()
+270
View File
@@ -0,0 +1,270 @@
"""Vosk speech-to-text → TextCast streamer.
Captures mono audio from an analog ALSA/sounddevice input, runs Vosk
offline ASR in a background thread, and broadcasts recognised text over
the TextCast BLE broadcast using the same SDU framing as text_multicast.py.
Usage (CLI):
poetry run python -m auracast.vosk_textcast \\
--model /path/to/vosk-model-en-us \\
--device ch1 \\
--transport serial:/dev/ttyAMA3,1000000,rtscts
Environment:
VOSK_MODEL_PATH default Vosk model directory
AURACAST_TRANSPORT default HCI transport string
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import os
import threading
import time
from typing import Optional
import numpy as np
import samplerate
import sounddevice as sd
from auracast import auracast_config, multicast
from auracast.text_multicast import (
SDU_SIZE,
SDU_INTERVAL_US,
_make_text_frame,
_make_idle_frame,
)
log = logging.getLogger('vosk_textcast')

# Sample rate handed to the Vosk recogniser (Vosk models expect 16 kHz).
VOSK_SAMPLE_RATE = 16_000
# Sample rate requested from the capture hardware (always 48 kHz).
CAPTURE_SAMPLE_RATE = 48_000
# Capture block size: 100 ms at 48 kHz (4800 frames), i.e. 1600 frames at 16 kHz.
BLOCK_FRAMES_48K = CAPTURE_SAMPLE_RATE // 10
# Seconds a caption stays visible after the last recognised speech.
CAPTION_HOLD_S = 4.0
# Name used for the broadcast in the BIG configuration.
BROADCAST_NAME = 'LiveCaption'
# Model directory: $VOSK_MODEL_PATH when set, otherwise a directory in $HOME.
DEFAULT_MODEL_PATH = os.getenv(
    'VOSK_MODEL_PATH',
    os.path.expanduser('~/vosk-model-en-us'),
)
def _tail_to_fit(text: str, max_bytes: int) -> str:
"""Return the tail of *text* that fits in *max_bytes* UTF-8 bytes."""
encoded = text.encode('utf-8')
if len(encoded) <= max_bytes:
return text
tail = encoded[-max_bytes:].decode('utf-8', errors='ignore')
sp = tail.find(' ')
return tail[sp + 1:] if sp != -1 else tail
def _new_words(old: str, new: str) -> str:
"""Return the words appended to *new* beyond the shared prefix with *old*.
If *new* doesn't start with *old* (different utterance), return *new* in full.
"""
old_words = old.split()
new_words = new.split()
if new_words[:len(old_words)] == old_words:
extra = new_words[len(old_words):]
return ' '.join(extra)
return new
def _resolve_device(device: str) -> Optional[int]:
"""Return sounddevice index for a name or numeric string, or None for default."""
if not device:
return None
if device.isdigit():
return int(device)
for i, d in enumerate(sd.query_devices()):
if d['name'] == device and d['max_input_channels'] > 0:
return i
log.warning("Device '%s' not found in sounddevice list using default input", device)
return None
async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None:
    """Write caption or idle SDUs to the ISO queue forever.

    Each iteration reads the current caption from *shared*, builds either a
    text frame (caption present and not yet expired) or an idle frame, and
    awaits ``iso_queue.write``. Pacing comes from that await (roughly one
    iteration per 10 ms, flow-controlled by the BLE controller).

    Args:
        bigs: Broadcast state; ``bigs['big0']['iso_queue']`` is written to.
        shared: Dict with ``'text'`` (str) and ``'expiry'`` (monotonic
            seconds), updated by the recognition thread.
        lock: Guards every access to *shared*.

    The function never returns normally; it ends when the task is cancelled
    or ``iso_queue.write`` raises.
    """
    iso_queue = bigs['big0']['iso_queue']
    # Text budget per SDU; the 2 reserved bytes are presumably frame header
    # written by _make_text_frame (confirm in text_multicast).
    max_text_bytes = SDU_SIZE - 2
    last_sent: str = ''
    while True:
        now = time.monotonic()
        with lock:
            text: str = shared.get('text', '')
            expiry: float = shared.get('expiry', 0.0)
            live = bool(text) and now < expiry
            if text and not live:
                # Expire the caption inside the SAME critical section as the
                # check. Clearing under a second lock acquisition could wipe
                # a fresh caption published by the recognition thread in
                # between.
                shared['text'] = ''

        if live:
            display_text = _tail_to_fit(text, max_text_bytes)
            if display_text != last_sent:
                log.info("Caption: %s", display_text)
                last_sent = display_text
            frame = _make_text_frame(display_text)
        else:
            if last_sent:
                # Log only on the transition from showing text to idle.
                log.info("Caption cleared")
                last_sent = ''
            frame = _make_idle_frame()

        await iso_queue.write(frame)
def _vosk_thread(
    model_path: str,
    device: str,
    shared: dict,
    lock: threading.Lock,
    stop_event: threading.Event,
) -> None:
    """Blocking audio capture + Vosk recognition loop for a background thread.

    Opens a 48 kHz mono input stream, resamples each block to 16 kHz, feeds
    it to a Vosk recogniser and publishes recognised text (partial or final)
    into *shared* together with an expiry time.

    Args:
        model_path: Directory of the Vosk model to load.
        device: Input device name or numeric string (see ``_resolve_device``).
        shared: Dict receiving ``'text'`` and ``'expiry'`` updates.
        lock: Guards every write to *shared*.
        stop_event: Set by the owner to stop capturing; the function returns
            once it is set.

    Returns early (after logging an error) when the ``vosk`` package cannot
    be imported. Errors raised while the input stream is open are logged and
    not re-raised.
    """
    try:
        from vosk import KaldiRecognizer, Model  # type: ignore
    except ImportError:
        log.error("vosk is not installed. Run: poetry add vosk")
        return

    log.info("Loading Vosk model from %s", model_path)
    model = Model(model_path)
    rec = KaldiRecognizer(model, VOSK_SAMPLE_RATE)
    rec.SetMaxAlternatives(0)
    rec.SetWords(False)

    resampler = samplerate.Resampler('sinc_fastest', channels=1)
    ratio = VOSK_SAMPLE_RATE / CAPTURE_SAMPLE_RATE
    dev_idx = _resolve_device(device)
    max_text_bytes = SDU_SIZE - 2
    last_word_count = 0  # word count of the last partial sent to display

    def _publish(text: str) -> None:
        """Store *text* (shortened to fit one SDU) and refresh its expiry."""
        with lock:
            shared['text'] = _tail_to_fit(text, max_text_bytes)
            shared['expiry'] = time.monotonic() + CAPTION_HOLD_S

    def _cb(indata: np.ndarray, frames: int, time_info, status) -> None:
        """sounddevice callback: recognise one captured block."""
        nonlocal last_word_count
        if status:
            log.warning("Audio status: %s", status)
        if stop_event.is_set():
            raise sd.CallbackStop()

        # Resample 48 kHz → 16 kHz
        mono = indata[:, 0].astype(np.float32)
        downsampled = resampler.process(mono, ratio, end_of_input=False)
        # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
        # around when cast to int16 and turn into a full-scale click.
        pcm16 = (np.clip(downsampled, -1.0, 1.0) * 32767).astype(np.int16).tobytes()

        if rec.AcceptWaveform(pcm16):
            final_text = json.loads(rec.Result()).get('text', '').strip()
            if final_text:
                log.info("Final: %s", final_text)
                _publish(final_text)
            # Reset for every finalized utterance (even an empty one) so the
            # partials of the next sentence are not compared against a stale
            # count.
            last_word_count = 0
            return

        partial_text = json.loads(rec.PartialResult()).get('partial', '').strip()
        if not partial_text:
            return
        wc = len(partial_text.split())
        if wc > last_word_count:  # new word arrived
            last_word_count = wc
            _publish(partial_text)

    try:
        with sd.InputStream(
            samplerate=CAPTURE_SAMPLE_RATE,
            blocksize=BLOCK_FRAMES_48K,
            device=dev_idx,
            dtype='float32',
            channels=1,
            callback=_cb,
        ):
            log.info("Vosk listening on device '%s' (idx=%s) …", device, dev_idx)
            # Recognition happens in the callback; this thread only waits.
            stop_event.wait()
    except Exception as exc:
        log.error("Vosk audio thread error: %s", exc, exc_info=True)
async def broadcast_vosk(
    transport: str,
    model_path: str = DEFAULT_MODEL_PATH,
    device: str = 'ch1',
) -> None:
    """Start a Vosk STT → TextCast broadcast. Runs until cancelled.

    Args:
        transport: HCI transport string placed into the broadcast config.
        model_path: Vosk model directory; ``~`` is expanded.
        device: Input device name or numeric string for the capture thread.

    Raises:
        FileNotFoundError: If *model_path* does not exist.
        asyncio.CancelledError: Re-raised after cleanup when the task is
            cancelled.
    """
    model_path = os.path.expanduser(model_path)
    if not os.path.exists(model_path):
        raise FileNotFoundError(
            f"Vosk model not found at '{model_path}'. "
            "Download from https://alphacephei.com/vosk/models and set VOSK_MODEL_PATH."
        )

    config = auracast_config.AuracastConfigGroup(
        bigs=[
            auracast_config.AuracastBigConfig(
                name=BROADCAST_NAME,
                program_info='Live Captions',
                language='eng',
                audio_source='file:dummy',
                iso_que_len=4,
            ),
        ],
        auracast_sampling_rate_hz=16000,
        octets_per_frame=SDU_SIZE,
        frame_duration_us=SDU_INTERVAL_US,
        presentation_delay_us=40_000,
        qos_config=auracast_config.AuracastQosRobust(),
        transport=transport,
    )

    # State shared between the capture thread (writer) and the ISO write
    # loop (reader); every access goes through `lock`.
    shared: dict = {'text': '', 'expiry': 0.0}
    lock = threading.Lock()
    stop_event = threading.Event()

    async with multicast.create_device(config) as ble_device:
        bigs = await multicast.init_broadcast(ble_device, config, config.bigs)
        t = threading.Thread(
            target=_vosk_thread,
            args=(model_path, device, shared, lock, stop_event),
            daemon=True,
        )
        t.start()
        log.info("VoskCast started (device=%s, model=%s)", device, model_path)
        try:
            await _iso_write_loop(bigs, shared, lock)
        except asyncio.CancelledError:
            log.info("VoskCast cancelled shutting down")
            raise
        finally:
            # Stop the capture thread on EVERY exit path, not only on
            # cancellation; otherwise an error in the write loop would leave
            # the audio stream open and the recogniser running.
            stop_event.set()
            t.join(timeout=3.0)
def main() -> None:
    """Command-line entry point for the Vosk live-caption broadcaster.

    Parses ``--model``, ``--device`` and ``--transport`` and runs
    ``broadcast_vosk`` through ``multicast.run_async``.
    """
    # The environment variable wins over the built-in serial default.
    fallback_transport = os.environ.get(
        'AURACAST_TRANSPORT',
        'serial:/dev/ttyAMA3,1000000,rtscts',
    )

    cli = argparse.ArgumentParser(description='Vosk STT → Auracast TextCast')
    cli.add_argument(
        '--model',
        default=DEFAULT_MODEL_PATH,
        help=f'Path to Vosk model directory (default: {DEFAULT_MODEL_PATH})',
    )
    cli.add_argument(
        '--device',
        default='ch1',
        help='sounddevice input device name or index (default: ch1)',
    )
    cli.add_argument(
        '--transport',
        default=fallback_transport,
        help='Bumble HCI transport string',
    )
    opts = cli.parse_args()

    caption_job = broadcast_vosk(opts.transport, opts.model, opts.device)
    multicast.run_async(caption_job)


if __name__ == '__main__':
    main()