Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 53fd074d22 | |||
| d1471fae79 |
+3
-1
@@ -19,7 +19,9 @@ dependencies = [
|
||||
"smbus2 (>=0.5.0,<0.6.0)",
|
||||
"samplerate (>=0.2.2,<0.3.0)",
|
||||
"rpi-gpio (>=0.7.1,<0.8.0)",
|
||||
"pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc"
|
||||
"pyalsaaudio @ git+ssh://git@gitea.summitwave.work:222/auracaster/sw_pyalsaaudio.git@b3d11582e03df6929b2e7acbaa1306afc7b8a6bc",
|
||||
"vosk (>=0.3.45)",
|
||||
"faster-whisper (>=1.0.0)"
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
"""DCP XML subtitle file parser (Interop and SMPTE 428-7 formats).
|
||||
|
||||
Timecode format: HH:MM:SS:FF (frame-based, default 24 fps)
|
||||
HH:MM:SS.mmm (millisecond decimal, also accepted)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
class Subtitle:
    """One timed caption as returned by the DCP subtitle parser."""

    # Moment the caption becomes visible, in seconds.
    time_in: float
    # Moment the caption disappears, in seconds.
    time_out: float
    # Caption text to display between time_in and time_out.
    text: str
|
||||
|
||||
|
||||
def _parse_timecode(tc: str, fps: int = 24) -> float:
|
||||
"""Parse a DCP timecode string to float seconds."""
|
||||
# HH:MM:SS:FF
|
||||
m = re.match(r'^(\d+):(\d+):(\d+):(\d+)$', tc.strip())
|
||||
if m:
|
||||
h, mi, s, f = int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4))
|
||||
return h * 3600 + mi * 60 + s + f / fps
|
||||
|
||||
# HH:MM:SS.mmm
|
||||
m = re.match(r'^(\d+):(\d+):(\d+)\.(\d+)$', tc.strip())
|
||||
if m:
|
||||
h, mi, s = int(m.group(1)), int(m.group(2)), int(m.group(3))
|
||||
frac = float('0.' + m.group(4))
|
||||
return h * 3600 + mi * 60 + s + frac
|
||||
|
||||
raise ValueError(f"Unrecognized DCP timecode: {tc!r}")
|
||||
|
||||
|
||||
def parse_dcp_xml(path: str, fps: int = 24) -> List[Subtitle]:
    """Parse a DCP XML subtitle file and return a time-sorted list of Subtitles.

    Every ``Subtitle`` element that carries both ``TimeIn`` and ``TimeOut``
    attributes and at least one non-empty ``Text`` child yields one entry.
    Multiple ``Text`` children of the same subtitle are joined with a space.

    Args:
        path: Location of the XML file to read.
        fps: Frame rate forwarded to ``_parse_timecode`` for frame-based
            timecodes.

    Returns:
        Subtitles ordered by ascending ``time_in``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If a ``TimeIn``/``TimeOut`` value is not a recognised
            timecode.
    """
    root = ET.parse(path).getroot()

    # Pick up the root's "{namespace}" prefix (empty when the document has
    # none) so the lookups below match namespaced and plain documents alike.
    ns_match = re.match(r'\{(.+?)\}', root.tag)
    ns = ns_match.group(0) if ns_match else ''

    subtitles: List[Subtitle] = []

    for subtitle_el in root.iter(f'{ns}Subtitle'):
        time_in_str = subtitle_el.get('TimeIn', '')
        time_out_str = subtitle_el.get('TimeOut', '')
        if not time_in_str or not time_out_str:
            continue

        parts: List[str] = []
        for text_el in subtitle_el.iter(f'{ns}Text'):
            # itertext() also collects text wrapped in inline children
            # (e.g. <Font Italic="yes">...</Font>); .text alone would stop at
            # the first child element and silently drop the rest of the line.
            t = ''.join(text_el.itertext()).strip()
            if t:
                parts.append(t)

        text = ' '.join(parts)
        if not text:
            continue

        subtitles.append(Subtitle(
            time_in=_parse_timecode(time_in_str, fps),
            time_out=_parse_timecode(time_out_str, fps),
            text=text,
        ))

    return sorted(subtitles, key=lambda s: s.time_in)
|
||||
@@ -0,0 +1,259 @@
|
||||
"""faster-whisper speech-to-text → TextCast streamer.
|
||||
|
||||
Captures mono audio from an analog ALSA/sounddevice input, runs
|
||||
faster-whisper offline ASR in a background thread (chunked, every
|
||||
CHUNK_S seconds), and broadcasts recognised text over the TextCast BLE
|
||||
broadcast using the same SDU framing as text_multicast.py.
|
||||
|
||||
Usage (CLI):
|
||||
poetry run python -m auracast.faster_whisper_textcast \\
|
||||
--model tiny.en \\
|
||||
--device ch1 \\
|
||||
--transport serial:/dev/ttyAMA3,1000000,rtscts
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import samplerate
|
||||
import sounddevice as sd
|
||||
|
||||
from auracast import auracast_config, multicast
|
||||
from auracast.text_multicast import (
|
||||
SDU_SIZE,
|
||||
SDU_INTERVAL_US,
|
||||
_make_text_frame,
|
||||
_make_idle_frame,
|
||||
)
|
||||
|
||||
log = logging.getLogger('faster_whisper_textcast')

# --- Audio capture / resampling ---------------------------------------------
# Rate requested from the input stream.
CAPTURE_SAMPLE_RATE = 48_000
# Rate the captured audio is resampled to before transcription.
WHISPER_SAMPLE_RATE = 16_000
# Capture block size in frames: 4800 frames at 48 kHz = 100 ms.
BLOCK_FRAMES_48K = 4800

# --- Transcription / caption timing -----------------------------------------
# Amount of audio (seconds) collected before each transcription pass.
CHUNK_S = 3.0
# How long (seconds) a caption stays visible after the last transcription.
CAPTION_HOLD_S = 4.0
# Chunks whose RMS level is below this value are not transcribed.
SILENCE_RMS = 0.003

# Name given to the broadcast.
BROADCAST_NAME = 'LiveCaption'

# Model identifiers accepted by broadcast_whisper() and the CLI.
VALID_MODELS = ['tiny.en', 'base.en', 'small.en', 'tiny', 'base', 'small']
|
||||
|
||||
|
||||
def _tail_to_fit(text: str, max_bytes: int) -> str:
|
||||
"""Return the tail of *text* that fits in *max_bytes* UTF-8 bytes."""
|
||||
encoded = text.encode('utf-8')
|
||||
if len(encoded) <= max_bytes:
|
||||
return text
|
||||
tail = encoded[-max_bytes:].decode('utf-8', errors='ignore')
|
||||
sp = tail.find(' ')
|
||||
return tail[sp + 1:] if sp != -1 else tail
|
||||
|
||||
|
||||
def _resolve_device(device: str) -> Optional[int]:
|
||||
"""Return sounddevice index for a name or numeric string, or None for default."""
|
||||
if not device:
|
||||
return None
|
||||
if device.isdigit():
|
||||
return int(device)
|
||||
for i, d in enumerate(sd.query_devices()):
|
||||
if d['name'] == device and d['max_input_channels'] > 0:
|
||||
return i
|
||||
log.warning("Device '%s' not found in sounddevice list – using default input", device)
|
||||
return None
|
||||
|
||||
|
||||
async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None:
    """Continuously write caption (or idle) SDUs to the first BIG's ISO queue.

    Each iteration reads the current caption from *shared* and writes either
    a text frame (caption present and not yet expired) or an idle frame.
    The loop never returns; it ends only by cancellation or by an exception
    from the queue. The original note states ~10 ms per iteration;
    presumably that pacing comes from ``iso_queue.write`` - confirm.

    Args:
        bigs: Broadcast state; ``bigs['big0']['iso_queue']`` is written to.
        shared: Dict with ``'text'`` (str) and ``'expiry'`` (monotonic
            seconds) published by the transcription thread.
        lock: Guards every access to *shared*.
    """
    iso_queue = bigs['big0']['iso_queue']
    last_sent: str = ''

    while True:
        now = time.monotonic()
        with lock:
            text: str = shared.get('text', '')
            expiry: float = shared.get('expiry', 0.0)
            active = bool(text) and now < expiry
            if text and not active:
                # Clear the expired caption inside the SAME critical section
                # as the read. Doing it under a second acquisition would let
                # a caption published in between be wiped before it was ever
                # shown.
                shared['text'] = ''

        if active:
            display_text = _tail_to_fit(text, SDU_SIZE - 2)
            if display_text != last_sent:
                log.info("Caption: %s", display_text)
                last_sent = display_text
            frame = _make_text_frame(display_text)
        else:
            if last_sent:
                log.info("Caption cleared")
                last_sent = ''
            frame = _make_idle_frame()

        await iso_queue.write(frame)
|
||||
|
||||
|
||||
def _whisper_thread(
    model_size: str,
    device: str,
    shared: dict,
    lock: threading.Lock,
    stop_event: threading.Event,
) -> None:
    """Capture audio and publish faster-whisper transcriptions (blocking).

    Loads the model, opens an input stream, accumulates resampled audio
    until ``CHUNK_S`` seconds are available, transcribes that chunk and
    stores the resulting text plus an expiry time in *shared*.

    Args:
        model_size: Model identifier handed to ``WhisperModel``.
        device: Input device name or index string (see ``_resolve_device``).
        shared: Dict receiving ``'text'`` and ``'expiry'``.
        lock: Guards writes to *shared*.
        stop_event: Set by the owner to make the loop and the audio
            callback stop.
    """
    try:
        from faster_whisper import WhisperModel  # type: ignore
    except ImportError:
        log.error("faster-whisper is not installed. Run: poetry add faster-whisper")
        return

    log.info("Loading faster-whisper model '%s' (int8, CPU) …", model_size)
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    log.info("Model '%s' loaded.", model_size)

    blocks: queue.Queue = queue.Queue()
    resampler = samplerate.Resampler('sinc_fastest', channels=1)
    ratio = WHISPER_SAMPLE_RATE / CAPTURE_SAMPLE_RATE
    chunk_frames = int(CHUNK_S * WHISPER_SAMPLE_RATE)
    pending = np.zeros(0, dtype=np.float32)

    dev_idx = _resolve_device(device)

    def _on_audio(indata: np.ndarray, frames: int, time_info, status) -> None:
        # Stream callback: resample the first channel and queue it.
        if status:
            log.warning("Audio status: %s", status)
        if stop_event.is_set():
            raise sd.CallbackStop()
        first_channel = indata[:, 0].astype(np.float32)
        resampled = resampler.process(first_channel, ratio, end_of_input=False)
        blocks.put(resampled.copy())

    try:
        stream = sd.InputStream(
            samplerate=CAPTURE_SAMPLE_RATE,
            blocksize=BLOCK_FRAMES_48K,
            device=dev_idx,
            dtype='float32',
            channels=1,
            callback=_on_audio,
        )
        with stream:
            log.info("WhisperCast listening on device '%s' (idx=%s) …", device, dev_idx)
            while not stop_event.is_set():
                # Short timeout so stop_event is re-checked regularly.
                try:
                    block = blocks.get(timeout=0.2)
                except queue.Empty:
                    continue
                pending = np.concatenate([pending, block])

                if len(pending) < chunk_frames:
                    continue

                # Take exactly one chunk; the remainder stays buffered.
                pcm = pending[:chunk_frames].copy()
                pending = pending[chunk_frames:]

                rms = float(np.sqrt(np.mean(pcm ** 2)))
                if rms < SILENCE_RMS:
                    continue

                started = time.monotonic()
                segments, _ = model.transcribe(
                    pcm,
                    beam_size=1,
                    language="en",
                    vad_filter=True,
                    vad_parameters={"min_silence_duration_ms": 300},
                )
                text = ' '.join(seg.text.strip() for seg in segments).strip()
                elapsed = time.monotonic() - started

                if not text:
                    log.debug("Silent chunk skipped (rms=%.4f, took=%.2fs)", rms, elapsed)
                    continue

                log.info("Transcribed (%.2fs): %s", elapsed, text)
                with lock:
                    shared['text'] = text
                    shared['expiry'] = time.monotonic() + CAPTION_HOLD_S

    except Exception as exc:
        log.error("WhisperCast thread error: %s", exc, exc_info=True)
|
||||
|
||||
|
||||
async def broadcast_whisper(
    transport: str,
    model_size: str = 'tiny.en',
    device: str = 'ch1',
) -> None:
    """Start a faster-whisper → TextCast broadcast.  Runs until cancelled.

    Creates the BLE device and broadcast, starts the capture/transcription
    thread and then runs the ISO write loop. The thread is signalled to
    stop and joined whenever the write loop exits, whether by cancellation
    or by an error.

    Args:
        transport: HCI transport string placed in the broadcast config.
        model_size: One of ``VALID_MODELS``.
        device: Input device name or index string for audio capture.

    Raises:
        ValueError: If *model_size* is not in ``VALID_MODELS``.
        asyncio.CancelledError: Re-raised after shutdown when cancelled.
    """
    if model_size not in VALID_MODELS:
        raise ValueError(f"Unknown model '{model_size}'. Valid: {VALID_MODELS}")

    config = auracast_config.AuracastConfigGroup(
        bigs=[
            auracast_config.AuracastBigConfig(
                name=BROADCAST_NAME,
                program_info='Live Captions',
                language='eng',
                audio_source='file:dummy',
                iso_que_len=4,
            ),
        ],
        auracast_sampling_rate_hz=16000,
        octets_per_frame=SDU_SIZE,
        frame_duration_us=SDU_INTERVAL_US,
        presentation_delay_us=40_000,
        qos_config=auracast_config.AuracastQosRobust(),
        transport=transport,
    )

    shared: dict = {'text': '', 'expiry': 0.0}
    lock = threading.Lock()
    stop_event = threading.Event()

    async with multicast.create_device(config) as ble_device:
        bigs = await multicast.init_broadcast(ble_device, config, config.bigs)

        t = threading.Thread(
            target=_whisper_thread,
            args=(model_size, device, shared, lock, stop_event),
            daemon=True,
        )
        t.start()
        log.info("WhisperCast started (device=%s, model=%s)", device, model_size)

        try:
            await _iso_write_loop(bigs, shared, lock)
        except asyncio.CancelledError:
            log.info("WhisperCast cancelled – shutting down")
            raise
        finally:
            # Stop the capture thread on EVERY exit path; previously an
            # error in the write loop left it recording and transcribing
            # with nobody consuming the result.
            stop_event.set()
            # Bounded wait: the thread may be in the middle of a
            # transcription pass when it is asked to stop.
            t.join(timeout=5.0)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: parse arguments and run the broadcast.

    ``--chunk`` overrides the module-level ``CHUNK_S`` before the broadcast
    is started, so the transcription thread picks up the new value.
    """
    global CHUNK_S
    import argparse

    def _positive_seconds(raw: str) -> float:
        """argparse converter that accepts only finite values greater than 0."""
        value = float(raw)  # a ValueError here is reported by argparse
        # The chunk length is turned into a frame count by the transcription
        # thread; zero, negative, NaN or infinite values give it nothing
        # sensible to work with, so reject them at the command line.
        if not 0 < value < float('inf'):
            raise argparse.ArgumentTypeError(
                f"must be a positive number of seconds, got {raw!r}"
            )
        return value

    parser = argparse.ArgumentParser(description='faster-whisper → Auracast TextCast')
    parser.add_argument(
        '--model', default='tiny.en', choices=VALID_MODELS,
        help='Whisper model size (default: tiny.en)',
    )
    parser.add_argument('--device', default='ch1',
                        help='sounddevice input name or index (default: ch1)')
    parser.add_argument(
        '--transport',
        default=os.environ.get('AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'),
        help='Bumble HCI transport string',
    )
    parser.add_argument('--chunk', type=_positive_seconds, default=CHUNK_S,
                        help=f'Seconds per transcription chunk (default: {CHUNK_S})')
    args = parser.parse_args()
    CHUNK_S = args.chunk
    multicast.run_async(broadcast_whisper(args.transport, args.model, args.device))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -141,6 +141,9 @@ except Exception:
|
||||
|
||||
# Define is_streaming early from the fetched status for use throughout the UI
|
||||
is_streaming = bool(saved_settings.get("is_streaming", False))
|
||||
textcast_is_streaming = bool(saved_settings.get("textcast_is_streaming", False))
|
||||
voskcast_is_streaming = bool(saved_settings.get("voskcast_is_streaming", False))
|
||||
whispercast_is_streaming = bool(saved_settings.get("whispercast_is_streaming", False))
|
||||
|
||||
# Extract secondary status, if provided by the backend /status endpoint.
|
||||
secondary_status = saved_settings.get("secondary") or {}
|
||||
@@ -185,6 +188,9 @@ options = [
|
||||
"Demo",
|
||||
"Analog",
|
||||
"Network - Dante",
|
||||
"TextCast",
|
||||
"VoskCast",
|
||||
"WhisperCast",
|
||||
]
|
||||
saved_audio_mode = saved_settings.get("audio_mode", "Demo")
|
||||
if saved_audio_mode not in options:
|
||||
@@ -196,7 +202,7 @@ audio_mode = st.selectbox(
|
||||
"Audio Mode",
|
||||
options,
|
||||
index=options.index(saved_audio_mode) if saved_audio_mode in options else options.index("Demo"),
|
||||
disabled=is_streaming,
|
||||
disabled=is_streaming or textcast_is_streaming or voskcast_is_streaming or whispercast_is_streaming,
|
||||
help=(
|
||||
"Select the audio input source. Choose 'USB' for a connected USB audio device (via PipeWire), "
|
||||
"'Network' (AES67) for network RTP/AES67 sources, "
|
||||
@@ -226,11 +232,94 @@ else:
|
||||
running_mode = backend_mode_mapped if (is_streaming and backend_mode_mapped) else audio_mode
|
||||
|
||||
# Start/Stop buttons and status (moved to top)
|
||||
if audio_mode == "Demo":
|
||||
if audio_mode == "TextCast":
|
||||
start_stream, stop_stream = render_stream_controls(textcast_is_streaming, "Start TextCast", "Stop TextCast", "TextCast", False)
|
||||
elif audio_mode == "VoskCast":
|
||||
start_stream, stop_stream = render_stream_controls(voskcast_is_streaming, "Start VoskCast", "Stop VoskCast", "VoskCast", False)
|
||||
elif audio_mode == "Demo":
|
||||
start_stream, stop_stream = render_stream_controls(is_streaming, "Start Demo", "Stop Demo", running_mode, secondary_is_streaming)
|
||||
else:
|
||||
start_stream, stop_stream = render_stream_controls(is_streaming, "Start Auracast", "Stop Auracast", running_mode, secondary_is_streaming)
|
||||
|
||||
# TextCast: DCP XML file uploader
|
||||
if audio_mode == "TextCast":
|
||||
st.markdown("#### DCP Subtitle File")
|
||||
dcp_file = st.file_uploader(
|
||||
"Upload DCP XML subtitle file (.xml)",
|
||||
type=["xml"],
|
||||
disabled=textcast_is_streaming,
|
||||
help="Upload a DCP-compliant subtitle XML file. Subtitles will be broadcast over Auracast.",
|
||||
)
|
||||
if dcp_file is not None:
|
||||
content = dcp_file.read().decode("utf-8", errors="replace")
|
||||
st.session_state['_textcast_dcp_content'] = content
|
||||
st.session_state['_textcast_dcp_name'] = dcp_file.name
|
||||
st.success(f"Loaded: {dcp_file.name} ({len(content):,} bytes)")
|
||||
elif st.session_state.get('_textcast_dcp_name'):
|
||||
st.info(f"Using previously uploaded file: {st.session_state['_textcast_dcp_name']}")
|
||||
else:
|
||||
st.warning("No subtitle file loaded. Upload a DCP XML file or use the sample below.")
|
||||
if st.button("Load sample subtitle file", disabled=textcast_is_streaming):
|
||||
import os as _os
|
||||
_sample = _os.path.abspath(_os.path.join(
|
||||
_os.path.dirname(__file__), '..', 'testdata', 'sample_subtitles.xml'))
|
||||
try:
|
||||
with open(_sample, 'r', encoding='utf-8') as _f:
|
||||
_content = _f.read()
|
||||
st.session_state['_textcast_dcp_content'] = _content
|
||||
st.session_state['_textcast_dcp_name'] = 'sample_subtitles.xml'
|
||||
st.rerun()
|
||||
except Exception as _e:
|
||||
st.error(f"Could not load sample: {_e}")
|
||||
|
||||
# WhisperCast: model size + input device
|
||||
if audio_mode == "WhisperCast":
|
||||
st.markdown("#### Live Speech Recognition (faster-whisper)")
|
||||
_whisper_default_model = saved_settings.get("whispercast_model", "tiny.en")
|
||||
_whisper_default_device = saved_settings.get("whispercast_device", "ch1")
|
||||
col_wm, col_wd = st.columns([2, 1])
|
||||
with col_wm:
|
||||
whisper_model_size = st.selectbox(
|
||||
"Whisper Model",
|
||||
["tiny.en", "base.en", "small.en"],
|
||||
index=["tiny.en", "base.en", "small.en"].index(_whisper_default_model)
|
||||
if _whisper_default_model in ["tiny.en", "base.en", "small.en"] else 0,
|
||||
disabled=whispercast_is_streaming,
|
||||
help="tiny.en (~39 MB, ~3-5s latency), base.en (~74 MB, ~5-8s latency)",
|
||||
)
|
||||
with col_wd:
|
||||
whisper_device = st.selectbox(
|
||||
"Input",
|
||||
["ch1", "ch2"],
|
||||
index=0 if _whisper_default_device == "ch1" else 1,
|
||||
disabled=whispercast_is_streaming,
|
||||
help="Analog input channel",
|
||||
)
|
||||
st.caption("Model downloads automatically on first use. Each sentence appears after ~3s of speech.")
|
||||
|
||||
# VoskCast: model path + input device
|
||||
if audio_mode == "VoskCast":
|
||||
st.markdown("#### Live Speech Recognition (Vosk)")
|
||||
_vosk_default_model = saved_settings.get("voskcast_model") or os.environ.get("VOSK_MODEL_PATH", "~/vosk-model-en-us")
|
||||
_vosk_default_device = saved_settings.get("voskcast_device", "ch1")
|
||||
col_model, col_dev = st.columns([3, 1])
|
||||
with col_model:
|
||||
vosk_model_path = st.text_input(
|
||||
"Vosk Model Path",
|
||||
value=_vosk_default_model,
|
||||
disabled=voskcast_is_streaming,
|
||||
help="Local path to the Vosk model directory. Download from https://alphacephei.com/vosk/models",
|
||||
)
|
||||
with col_dev:
|
||||
vosk_device = st.selectbox(
|
||||
"Input",
|
||||
["ch1", "ch2"],
|
||||
index=0 if _vosk_default_device == "ch1" else 1,
|
||||
disabled=voskcast_is_streaming,
|
||||
help="Analog input channel (ch1 = left, ch2 = right)",
|
||||
)
|
||||
st.caption("Partial results appear immediately; final results are held for 4 s then cleared.")
|
||||
|
||||
# Analog gain control (only for Analog mode, placed below start button)
|
||||
analog_gain_db_left = 0 # default (dB)
|
||||
analog_gain_db_right = 0 # default (dB)
|
||||
@@ -1793,22 +1882,78 @@ else:
|
||||
if stop_stream:
|
||||
st.session_state['stream_started'] = False
|
||||
try:
|
||||
r = requests.post(f"{BACKEND_URL}/stop_audio").json()
|
||||
if audio_mode == "Demo":
|
||||
st.session_state['demo_stream_started'] = False
|
||||
if r['was_running']:
|
||||
if audio_mode == "TextCast":
|
||||
r = requests.post(f"{BACKEND_URL}/stop_textcast").json()
|
||||
elif audio_mode == "VoskCast":
|
||||
r = requests.post(f"{BACKEND_URL}/stop_voskcast").json()
|
||||
elif audio_mode == "WhisperCast":
|
||||
r = requests.post(f"{BACKEND_URL}/stop_whispercast").json()
|
||||
else:
|
||||
r = requests.post(f"{BACKEND_URL}/stop_audio").json()
|
||||
if audio_mode == "Demo":
|
||||
st.session_state['demo_stream_started'] = False
|
||||
if r.get('was_running'):
|
||||
is_stopped = True
|
||||
except Exception as e:
|
||||
st.error(f"Error: {e}")
|
||||
|
||||
|
||||
if start_stream:
|
||||
# Always send stop to ensure backend is in a clean state, regardless of current status
|
||||
r = requests.post(f"{BACKEND_URL}/stop_audio").json()
|
||||
# Small pause lets backend fully release audio devices before re-init
|
||||
time.sleep(1)
|
||||
if audio_mode == "TextCast":
|
||||
uploaded = st.session_state.get('_textcast_dcp_content')
|
||||
if not uploaded:
|
||||
st.error("Upload a DCP XML file first.")
|
||||
else:
|
||||
try:
|
||||
ru = requests.post(f"{BACKEND_URL}/upload_dcp", json={"xml": uploaded})
|
||||
if not ru.ok:
|
||||
st.error(f"Upload failed: {ru.text}")
|
||||
else:
|
||||
rs = requests.post(f"{BACKEND_URL}/start_textcast")
|
||||
if rs.ok:
|
||||
st.success("TextCast started.")
|
||||
st.rerun()
|
||||
else:
|
||||
st.error(f"Start failed: {rs.text}")
|
||||
except Exception as e:
|
||||
st.error(f"Error: {e}")
|
||||
|
||||
if audio_mode == "Demo":
|
||||
elif audio_mode == "VoskCast":
|
||||
try:
|
||||
rs = requests.post(
|
||||
f"{BACKEND_URL}/start_voskcast",
|
||||
json={"model": vosk_model_path, "device": vosk_device},
|
||||
)
|
||||
if rs.ok:
|
||||
st.success("VoskCast started.")
|
||||
st.rerun()
|
||||
else:
|
||||
st.error(f"Start failed: {rs.text}")
|
||||
except Exception as e:
|
||||
st.error(f"Error: {e}")
|
||||
|
||||
elif audio_mode == "WhisperCast":
|
||||
try:
|
||||
rs = requests.post(
|
||||
f"{BACKEND_URL}/start_whispercast",
|
||||
json={"model": whisper_model_size, "device": whisper_device},
|
||||
)
|
||||
if rs.ok:
|
||||
st.success("WhisperCast started.")
|
||||
st.rerun()
|
||||
else:
|
||||
st.error(f"Start failed: {rs.text}")
|
||||
except Exception as e:
|
||||
st.error(f"Error: {e}")
|
||||
|
||||
else:
|
||||
|
||||
# Always send stop to ensure backend is in a clean state, regardless of current status
|
||||
r = requests.post(f"{BACKEND_URL}/stop_audio").json()
|
||||
# Small pause lets backend fully release audio devices before re-init
|
||||
time.sleep(1)
|
||||
|
||||
if audio_mode == "Demo":
|
||||
demo_cfg = demo_stream_map[demo_selected]
|
||||
q = QUALITY_MAP[demo_cfg['quality']]
|
||||
|
||||
@@ -2025,7 +2170,7 @@ if start_stream:
|
||||
st.error(f"Failed to initialize Dante Radio 2: {r2.text}")
|
||||
except Exception as e:
|
||||
st.error(f"Error while starting Dante radios: {e}")
|
||||
if audio_mode not in ("Demo", "Analog", "Network - Dante"):
|
||||
if audio_mode not in ("Demo", "Analog", "Network - Dante", "VoskCast", "WhisperCast", "TextCast"):
|
||||
# USB/Network: single config as before, using shared controls
|
||||
q = QUALITY_MAP[quality]
|
||||
config = auracast_config.AuracastConfigGroup(
|
||||
|
||||
@@ -209,6 +209,16 @@ multicaster1: multicast_control.Multicaster | None = None
|
||||
multicaster2: multicast_control.Multicaster | None = None
|
||||
_stream_lock = asyncio.Lock() # serialize initialize/stop_audio on API side
|
||||
|
||||
# TextCast state
|
||||
_textcast_task: asyncio.Task | None = None
|
||||
DCP_UPLOAD_PATH = os.path.join(os.path.dirname(__file__), 'uploaded_subtitles.xml')
|
||||
|
||||
# VoskCast state
|
||||
_voskcast_task: asyncio.Task | None = None
|
||||
|
||||
# WhisperCast state
|
||||
_whispercast_task: asyncio.Task | None = None
|
||||
|
||||
# BLE / audio event loop – set in __main__ before uvicorn starts.
|
||||
# All coroutines that touch Bumble objects or the audio pipeline MUST run
|
||||
# on this loop. HTTP handlers call _on_ble_loop() to cross into it.
|
||||
@@ -705,6 +715,208 @@ async def _stop_audio_impl():
|
||||
log.error("Exception in /stop_audio: %s", traceback.format_exc())
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/upload_dcp")
async def upload_dcp(payload: dict):
    """Save DCP XML content for TextCast. Body: {"xml": "<DCSubtitle>..."}

    Returns:
        ``{"status": "ok", "path": DCP_UPLOAD_PATH}`` on success.

    Raises:
        HTTPException: 400 if ``"xml"`` is missing, not a string, or blank;
            500 if the file cannot be written.
    """
    xml_content = payload.get("xml", "")
    # A non-string value (number, list, null) would otherwise fail on
    # .strip() and surface as an opaque 500 instead of a client error.
    if not isinstance(xml_content, str):
        raise HTTPException(status_code=400, detail='"xml" must be a string')
    if not xml_content.strip():
        raise HTTPException(status_code=400, detail="Empty XML content")
    try:
        with open(DCP_UPLOAD_PATH, 'w', encoding='utf-8') as f:
            f.write(xml_content)
        log.info("DCP XML saved to %s (%d bytes)", DCP_UPLOAD_PATH, len(xml_content))
        return {"status": "ok", "path": DCP_UPLOAD_PATH}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
|
||||
@app.post("/start_textcast")
async def start_textcast():
    """Start text-over-Auracast broadcast using the uploaded DCP XML file.

    The actual work is done by ``_start_textcast_impl``, executed through
    ``_on_ble_loop``.
    """
    outcome = await _on_ble_loop(_start_textcast_impl())
    return outcome
|
||||
|
||||
|
||||
async def _start_textcast_impl():
    """Stop whatever is broadcasting, then start TextCast from the uploaded file.

    Returns:
        ``{"status": "started"}`` once the broadcast task has been created.

    Raises:
        HTTPException: 400 if no DCP file has been uploaded yet.
    """
    global _textcast_task
    if not os.path.exists(DCP_UPLOAD_PATH):
        raise HTTPException(status_code=400, detail="No DCP file uploaded. Use /upload_dcp first.")

    # Stop every other broadcaster first. VoskCast and WhisperCast are given
    # the same TRANSPORT1 as TextCast, so they are stopped here as well,
    # matching what _start_whispercast_impl does.
    await _stop_all()
    await _stop_textcast_impl()
    await _stop_voskcast_impl()
    await _stop_whispercast_impl()

    from auracast.text_multicast import broadcast_text
    _textcast_task = asyncio.get_running_loop().create_task(
        broadcast_text(DCP_UPLOAD_PATH, TRANSPORT1)
    )

    settings = {
        'is_streaming': True,
        'audio_mode': 'TextCast',
        'textcast_is_streaming': True,
        'timestamp': datetime.utcnow().isoformat(),
    }
    save_stream_settings(settings)
    _led_on()
    log.info("TextCast started (DCP: %s)", DCP_UPLOAD_PATH)
    return {"status": "started"}
|
||||
|
||||
|
||||
@app.post("/stop_textcast")
async def stop_textcast():
    """Stop an active TextCast broadcast.

    Delegates to ``_stop_textcast_impl``, executed through ``_on_ble_loop``.
    """
    outcome = await _on_ble_loop(_stop_textcast_impl())
    return outcome
|
||||
|
||||
|
||||
async def _stop_textcast_impl():
    """Cancel the TextCast task (if any) and record TextCast as stopped.

    Safe to call when nothing is running. The saved stream settings are
    only rewritten when they currently describe TextCast.

    Returns:
        ``{"status": "stopped", "was_running": bool}``; ``was_running``
        tells whether an unfinished task was cancelled by this call.
    """
    global _textcast_task
    was_running = False
    if _textcast_task is not None and not _textcast_task.done():
        was_running = True
        _textcast_task.cancel()
        try:
            # shield(): if the timeout fires, wait_for cancels only the
            # shield wrapper, not the task that is still cleaning up.
            await asyncio.wait_for(asyncio.shield(_textcast_task), timeout=3.0)
        except asyncio.CancelledError:
            pass  # expected result of the cancel() above
        except asyncio.TimeoutError:
            log.warning("TextCast task did not finish within 3 s after cancellation")
        except Exception as exc:
            # Best-effort stop: report the failure but do not fail the request.
            log.warning("TextCast task ended with an error during shutdown: %s", exc)
    _textcast_task = None
    _led_off()
    settings = load_stream_settings() or {}
    if settings.get('audio_mode') == 'TextCast':
        settings['is_streaming'] = False
        settings['textcast_is_streaming'] = False
        settings['timestamp'] = datetime.utcnow().isoformat()
        save_stream_settings(settings)
    log.info("TextCast stopped")
    return {"status": "stopped", "was_running": was_running}
|
||||
|
||||
|
||||
@app.post("/start_voskcast")
async def start_voskcast(body: dict | None = None):
    """Start Vosk STT → TextCast. Body (optional): {"model": "...", "device": "ch1"}

    A missing body is treated as an empty dict, so the implementation's
    defaults apply.
    """
    # None instead of a shared mutable {} default; normalise before use.
    return await _on_ble_loop(_start_voskcast_impl(body or {}))
|
||||
|
||||
|
||||
async def _start_voskcast_impl(body: dict) -> dict:
    """Stop whatever is broadcasting, then start VoskCast.

    Args:
        body: Request options. ``'model'`` falls back to
            ``DEFAULT_MODEL_PATH`` when missing or empty; ``'device'``
            defaults to ``'ch1'``.

    Returns:
        ``{"status": "started"}`` once the broadcast task has been created.
    """
    global _voskcast_task
    from auracast.vosk_textcast import broadcast_vosk, DEFAULT_MODEL_PATH

    model = body.get('model') or DEFAULT_MODEL_PATH
    device = body.get('device', 'ch1')

    # Every broadcaster is started on TRANSPORT1, so all of them, including
    # WhisperCast, must be stopped before this one starts.
    await _stop_all()
    await _stop_textcast_impl()
    await _stop_voskcast_impl()
    await _stop_whispercast_impl()

    _voskcast_task = asyncio.get_running_loop().create_task(
        broadcast_vosk(TRANSPORT1, model, device)
    )

    settings = {
        'is_streaming': True,
        'audio_mode': 'VoskCast',
        'voskcast_is_streaming': True,
        'voskcast_device': device,
        'voskcast_model': model,
        'timestamp': datetime.utcnow().isoformat(),
    }
    save_stream_settings(settings)
    _led_on()
    log.info("VoskCast started (device=%s, model=%s)", device, model)
    return {"status": "started"}
|
||||
|
||||
|
||||
@app.post("/stop_voskcast")
async def stop_voskcast():
    """Stop an active VoskCast broadcast.

    Delegates to ``_stop_voskcast_impl``, executed through ``_on_ble_loop``.
    """
    outcome = await _on_ble_loop(_stop_voskcast_impl())
    return outcome
|
||||
|
||||
|
||||
async def _stop_voskcast_impl() -> dict:
    """Cancel the VoskCast task (if any) and record VoskCast as stopped.

    Safe to call when nothing is running. The saved stream settings are
    only rewritten when they currently describe VoskCast.

    Returns:
        ``{"status": "stopped", "was_running": bool}``; ``was_running``
        tells whether an unfinished task was cancelled by this call.
    """
    global _voskcast_task
    was_running = False
    if _voskcast_task is not None and not _voskcast_task.done():
        was_running = True
        _voskcast_task.cancel()
        try:
            # shield(): if the timeout fires, wait_for cancels only the
            # shield wrapper, not the task that is still cleaning up.
            await asyncio.wait_for(asyncio.shield(_voskcast_task), timeout=4.0)
        except asyncio.CancelledError:
            pass  # expected result of the cancel() above
        except asyncio.TimeoutError:
            log.warning("VoskCast task did not finish within 4 s after cancellation")
        except Exception as exc:
            # Best-effort stop: report the failure but do not fail the request.
            log.warning("VoskCast task ended with an error during shutdown: %s", exc)
    _voskcast_task = None
    _led_off()
    settings = load_stream_settings() or {}
    if settings.get('audio_mode') == 'VoskCast':
        settings['is_streaming'] = False
        settings['voskcast_is_streaming'] = False
        settings['timestamp'] = datetime.utcnow().isoformat()
        save_stream_settings(settings)
    log.info("VoskCast stopped")
    return {"status": "stopped", "was_running": was_running}
|
||||
|
||||
|
||||
@app.post("/start_whispercast")
async def start_whispercast(body: dict | None = None):
    """Start faster-whisper → TextCast. Body (optional): {"model": "tiny.en", "device": "ch1"}

    A missing body is treated as an empty dict, so the implementation's
    defaults apply.
    """
    # None instead of a shared mutable {} default; normalise before use.
    return await _on_ble_loop(_start_whispercast_impl(body or {}))
|
||||
|
||||
|
||||
async def _start_whispercast_impl(body: dict) -> dict:
    """Stop whatever is broadcasting, then start WhisperCast.

    Args:
        body: Request options. ``'model'`` defaults to ``'tiny.en'`` and
            ``'device'`` to ``'ch1'``.

    Returns:
        ``{"status": "started"}`` once the broadcast task has been created.

    Raises:
        HTTPException: 400 if the requested model is not in ``VALID_MODELS``.
    """
    global _whispercast_task
    from auracast.faster_whisper_textcast import VALID_MODELS, broadcast_whisper

    model = body.get('model', 'tiny.en')
    device = body.get('device', 'ch1')

    # broadcast_whisper() rejects unknown models with ValueError, but it
    # would do so inside the background task: the request would still
    # report "started". Check up front, before any running stream is
    # torn down.
    if model not in VALID_MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown model '{model}'. Valid: {VALID_MODELS}",
        )

    await _stop_all()
    await _stop_textcast_impl()
    await _stop_voskcast_impl()
    await _stop_whispercast_impl()

    _whispercast_task = asyncio.get_running_loop().create_task(
        broadcast_whisper(TRANSPORT1, model, device)
    )

    settings = {
        'is_streaming': True,
        'audio_mode': 'WhisperCast',
        'whispercast_is_streaming': True,
        'whispercast_device': device,
        'whispercast_model': model,
        'timestamp': datetime.utcnow().isoformat(),
    }
    save_stream_settings(settings)
    _led_on()
    log.info("WhisperCast started (device=%s, model=%s)", device, model)
    return {"status": "started"}
|
||||
|
||||
|
||||
@app.post("/stop_whispercast")
async def stop_whispercast():
    """Stop an active WhisperCast broadcast.

    Delegates to ``_stop_whispercast_impl``, executed through
    ``_on_ble_loop``.
    """
    outcome = await _on_ble_loop(_stop_whispercast_impl())
    return outcome
|
||||
|
||||
|
||||
async def _stop_whispercast_impl() -> dict:
    """Cancel the WhisperCast task (if any) and record WhisperCast as stopped.

    Safe to call when nothing is running. The saved stream settings are
    only rewritten when they currently describe WhisperCast.

    Returns:
        ``{"status": "stopped", "was_running": bool}``; ``was_running``
        tells whether an unfinished task was cancelled by this call.
    """
    global _whispercast_task
    was_running = False
    if _whispercast_task is not None and not _whispercast_task.done():
        was_running = True
        _whispercast_task.cancel()
        try:
            # shield(): if the timeout fires, wait_for cancels only the
            # shield wrapper, not the task that is still cleaning up.
            await asyncio.wait_for(asyncio.shield(_whispercast_task), timeout=5.0)
        except asyncio.CancelledError:
            pass  # expected result of the cancel() above
        except asyncio.TimeoutError:
            log.warning("WhisperCast task did not finish within 5 s after cancellation")
        except Exception as exc:
            # Best-effort stop: report the failure but do not fail the request.
            log.warning("WhisperCast task ended with an error during shutdown: %s", exc)
    _whispercast_task = None
    _led_off()
    settings = load_stream_settings() or {}
    if settings.get('audio_mode') == 'WhisperCast':
        settings['is_streaming'] = False
        settings['whispercast_is_streaming'] = False
        settings['timestamp'] = datetime.utcnow().isoformat()
        save_stream_settings(settings)
    log.info("WhisperCast stopped")
    return {"status": "stopped", "was_running": was_running}
|
||||
|
||||
|
||||
@app.post("/adc_gain")
|
||||
async def set_adc_gain(payload: dict):
|
||||
"""Set ADC gain in dB for left and right channels without restarting the stream.
|
||||
@@ -763,6 +975,15 @@ async def get_status():
|
||||
status["secondary"] = secondary
|
||||
status["secondary_is_streaming"] = bool(secondary.get("is_streaming", False))
|
||||
status["led_enabled"] = _LED_ENABLED
|
||||
status["textcast_is_streaming"] = (
|
||||
_textcast_task is not None and not _textcast_task.done()
|
||||
)
|
||||
status["voskcast_is_streaming"] = (
|
||||
_voskcast_task is not None and not _voskcast_task.done()
|
||||
)
|
||||
status["whispercast_is_streaming"] = (
|
||||
_whispercast_task is not None and not _whispercast_task.done()
|
||||
)
|
||||
|
||||
return status
|
||||
|
||||
|
||||
+71
@@ -0,0 +1,71 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<DCSubtitle Version="1.0">
|
||||
<SubtitleID>a1b2c3d4-e5f6-7890-abcd-ef1234567890</SubtitleID>
|
||||
<MovieTitle>Sample TextCast Subtitles</MovieTitle>
|
||||
<ReelNumber>1</ReelNumber>
|
||||
<Language>en</Language>
|
||||
<LoadFont Id="Font1" URI="Arial.ttf"/>
|
||||
<Font Id="Font1" Color="FFFFFFFF" Effect="none" Size="42" Italic="no">
|
||||
|
||||
<Subtitle SpotNumber="1" TimeIn="00:00:02:00" TimeOut="00:00:05:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">Welcome to TextCast.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="2" TimeIn="00:00:06:00" TimeOut="00:00:09:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">Text transmitted over Auracast BLE.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="3" TimeIn="00:00:10:00" TimeOut="00:00:13:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">No LC3 audio codec involved.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="4" TimeIn="00:00:14:00" TimeOut="00:00:17:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">Raw ISO SDUs carry UTF-8 text.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="5" TimeIn="00:00:18:00" TimeOut="00:00:21:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">100 frames per second at 40 bytes.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="6" TimeIn="00:00:22:00" TimeOut="00:00:25:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">Scrolling display on SH1106 OLED.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="7" TimeIn="00:00:26:00" TimeOut="00:00:29:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">Each new line scrolls up the screen.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="8" TimeIn="00:00:30:00" TimeOut="00:00:33:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">The quick brown fox jumps over</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="9" TimeIn="00:00:34:00" TimeOut="00:00:37:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">the lazy dog.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="10" TimeIn="00:00:38:00" TimeOut="00:00:41:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">Speech-to-text output goes here.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="11" TimeIn="00:00:42:00" TimeOut="00:00:45:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">Latency is dominated by BLE BIG.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="12" TimeIn="00:00:46:00" TimeOut="00:00:49:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">Typical end-to-end: under 50 ms.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="13" TimeIn="00:00:50:00" TimeOut="00:00:53:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">One transmitter, many receivers.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="14" TimeIn="00:00:54:00" TimeOut="00:00:57:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">Built on Bumble and Zephyr RTOS.</Text>
|
||||
</Subtitle>
|
||||
|
||||
<Subtitle SpotNumber="15" TimeIn="00:00:58:00" TimeOut="00:01:01:00" FadeUpTime="0" FadeDownTime="0">
|
||||
<Text HAlign="center" VAlign="bottom">End of demonstration. Thank you.</Text>
|
||||
</Subtitle>
|
||||
|
||||
</Font>
|
||||
</DCSubtitle>
|
||||
@@ -0,0 +1,155 @@
|
||||
"""Text-over-Auracast transmitter.
|
||||
|
||||
Reads a DCP XML subtitle file and broadcasts each subtitle as raw ISO SDUs.
|
||||
No LC3 encoding is used. The BIG is advertised with codec_id=LC3 (required
|
||||
for BAP sync) but the SDU payload is plain UTF-8 text with a magic header.
|
||||
|
||||
Frame format (SDU_SIZE bytes total):
|
||||
Byte 0 : TEXT_MAGIC (0xAA) – identifies this as a text SDU
|
||||
Byte 1 : text length N – 0 means idle/clear
|
||||
Bytes 2..N+1: UTF-8 text
|
||||
Bytes N+2.. : zero padding to SDU_SIZE
|
||||
|
||||
Usage:
|
||||
poetry run python -m auracast.text_multicast \\
|
||||
--dcp ./auracast/testdata/sample_subtitles.xml \\
|
||||
--transport serial:/dev/ttyAMA3,1000000,rtscts
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
|
||||
from auracast import auracast_config, multicast
|
||||
from auracast.dcp_parser import parse_dcp_xml
|
||||
|
||||
# --- Text SDU framing parameters ---------------------------------------------
TEXT_MAGIC = 0xAA          # first byte of every text SDU
SDU_SIZE = 64              # octets_per_frame; 62 usable text bytes per frame
SDU_INTERVAL_US = 10_000   # 10 ms → 100 SDUs/sec
BROADCAST_NAME = 'TextCast'

# --- Logging -----------------------------------------------------------------
_LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
log = logging.getLogger('text_multicast')
|
||||
|
||||
|
||||
def _make_text_frame(text: str) -> bytes:
    """Encode a subtitle string into a fixed-size TEXT SDU.

    Layout: ``TEXT_MAGIC``, payload length, UTF-8 payload, zero padding up to
    ``SDU_SIZE`` bytes. Text whose encoding exceeds ``SDU_SIZE - 2`` bytes is
    truncated on a code-point boundary so the payload is always valid UTF-8.

    Args:
        text: Subtitle text to transmit.

    Returns:
        A frame of exactly ``SDU_SIZE`` bytes.
    """
    budget = SDU_SIZE - 2
    payload = text.encode('utf-8')
    if len(payload) > budget:
        # A raw byte slice can end inside a multi-byte character; decoding
        # with errors='ignore' drops that dangling fragment before re-encoding.
        payload = payload[:budget].decode('utf-8', errors='ignore').encode('utf-8')
    header = bytes([TEXT_MAGIC, len(payload)])
    return (header + payload).ljust(SDU_SIZE, b'\x00')
|
||||
|
||||
|
||||
def _make_idle_frame() -> bytes:
    """Build the idle SDU: ``SDU_SIZE`` zero bytes.

    The magic byte is therefore 0, which signals 'no active subtitle'.
    """
    idle_length = SDU_SIZE
    return bytes(idle_length)
|
||||
|
||||
|
||||
async def _text_stream(bigs: dict, subtitles: list, loop: bool = True) -> None:
    """Main text streaming loop.

    Writes one SDU per iteration to ``bigs['big0']['iso_queue']``. Subtitle
    timing is derived from the frame counter: frame N is treated as
    N × ``SDU_INTERVAL_US``. Pacing is expected to come from the awaited queue
    write (the original author notes it is flow-controlled by the BLE
    controller).

    Args:
        bigs: Mapping that holds the ISO queue under ``['big0']['iso_queue']``.
        subtitles: Objects exposing ``time_in``, ``time_out`` (seconds) and
            ``text``.
        loop: When True (default) the subtitle list repeats indefinitely;
            when False the function returns after one pass.
    """
    iso_queue = bigs['big0']['iso_queue']
    n = len(subtitles)

    # Without this guard an empty list with loop=False would stream idle
    # frames forever, because the end-of-pass check requires n > 0.
    if n == 0 and not loop:
        log.info("No subtitles to transmit. Exiting.")
        return

    frame_interval_s = SDU_INTERVAL_US / 1_000_000
    # One pass lasts until the latest time_out plus a 2 s gap before restart.
    # max() is used so an earlier subtitle that ends later than the last one
    # is not cut short.
    loop_gap_s = 2.0
    pass_duration_s = (max(s.time_out for s in subtitles) + loop_gap_s) if n else 0.0

    frame_count = 0
    sub_idx = 0
    last_log_sub = -1
    loop_count = 0

    log.info("Streaming %d subtitle(s) (loop=%s). Press Ctrl-C to stop.", n, loop)

    while True:
        now_s = frame_count * frame_interval_s

        # Advance past subtitles whose time_out has passed
        while sub_idx < n and now_s >= subtitles[sub_idx].time_out:
            sub_idx += 1

        # Determine what to send
        if sub_idx < n and now_s >= subtitles[sub_idx].time_in:
            current = subtitles[sub_idx]
            frame = _make_text_frame(current.text)
            if sub_idx != last_log_sub:
                # Log each subtitle once, when it first becomes active.
                log.info("[loop %d %05.1fs] %s", loop_count, now_s, current.text)
                last_log_sub = sub_idx
        else:
            frame = _make_idle_frame()

        await iso_queue.write(frame)
        frame_count += 1

        # End of pass
        if n > 0 and now_s >= pass_duration_s:
            if not loop:
                log.info("All subtitles transmitted. Exiting.")
                break
            loop_count += 1
            log.info("Loop %d complete – restarting.", loop_count)
            frame_count = 0
            sub_idx = 0
            last_log_sub = -1
|
||||
|
||||
|
||||
async def broadcast_text(dcp_path: str, transport: str, loop: bool = True) -> None:
    """Parse *dcp_path* and broadcast its subtitles as text SDUs.

    Logs an error and returns without opening the device when the file yields
    no subtitles. Otherwise a single-BIG configuration is built, the device is
    created over *transport*, and ``_text_stream`` runs on the resulting BIGs.

    Args:
        dcp_path: Path to the DCP XML subtitle file.
        transport: HCI transport string passed into the config group.
        loop: Forwarded to ``_text_stream``.
    """
    subtitles = parse_dcp_xml(dcp_path)
    if not subtitles:
        log.error("No subtitles found in %s", dcp_path)
        return
    log.info("Loaded %d subtitle(s) from %s", len(subtitles), dcp_path)

    text_big = auracast_config.AuracastBigConfig(
        name=BROADCAST_NAME,
        program_info='Text Broadcast',
        language='eng',
        audio_source='file:dummy',  # not used – streamer loop is replaced
        iso_que_len=4,
    )
    config = auracast_config.AuracastConfigGroup(
        bigs=[text_big],
        auracast_sampling_rate_hz=16000,
        octets_per_frame=SDU_SIZE,
        frame_duration_us=SDU_INTERVAL_US,
        presentation_delay_us=40_000,
        qos_config=auracast_config.AuracastQosRobust(),
        transport=transport,
    )

    async with multicast.create_device(config) as device:
        active_bigs = await multicast.init_broadcast(device, config, config.bigs)
        await _text_stream(active_bigs, subtitles, loop=loop)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the text (subtitle) transmitter.

    Parses ``--dcp``, ``--transport`` and ``--no-loop`` and runs
    ``broadcast_text`` through ``multicast.run_async``.
    """
    transport_default = os.environ.get(
        'AURACAST_TRANSPORT',
        'serial:/dev/ttyAMA3,1000000,rtscts',
    )

    cli = argparse.ArgumentParser(description='Auracast text (subtitle) transmitter')
    cli.add_argument('--dcp', required=True, help='Path to DCP XML subtitle file')
    cli.add_argument(
        '--transport',
        default=transport_default,
        help='Bumble HCI transport string (default: $AURACAST_TRANSPORT or ttyAMA3)',
    )
    cli.add_argument(
        '--no-loop',
        action='store_true',
        help='Play subtitles once and exit instead of looping indefinitely',
    )

    opts = cli.parse_args()
    keep_looping = not opts.no_loop
    multicast.run_async(broadcast_text(opts.dcp, opts.transport, loop=keep_looping))


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,270 @@
|
||||
"""Vosk speech-to-text → TextCast streamer.
|
||||
|
||||
Captures mono audio from an analog ALSA/sounddevice input, runs Vosk
|
||||
offline ASR in a background thread, and broadcasts recognised text over
|
||||
the TextCast BLE broadcast using the same SDU framing as text_multicast.py.
|
||||
|
||||
Usage (CLI):
|
||||
poetry run python -m auracast.vosk_textcast \\
|
||||
--model /path/to/vosk-model-en-us \\
|
||||
--device ch1 \\
|
||||
--transport serial:/dev/ttyAMA3,1000000,rtscts
|
||||
|
||||
Environment:
|
||||
VOSK_MODEL_PATH – default Vosk model directory
|
||||
AURACAST_TRANSPORT – default HCI transport string
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import samplerate
|
||||
import sounddevice as sd
|
||||
|
||||
from auracast import auracast_config, multicast
|
||||
from auracast.text_multicast import (
|
||||
SDU_SIZE,
|
||||
SDU_INTERVAL_US,
|
||||
_make_text_frame,
|
||||
_make_idle_frame,
|
||||
)
|
||||
|
||||
log = logging.getLogger('vosk_textcast')

# --- Audio / recognition parameters ------------------------------------------
VOSK_SAMPLE_RATE = 16_000      # Vosk models expect 16 kHz
CAPTURE_SAMPLE_RATE = 48_000   # Hardware capture rate (always 48 kHz)
BLOCK_FRAMES_48K = 4800        # 100 ms blocks at 48 kHz → 1600 frames at 16 kHz
CAPTION_HOLD_S = 4.0           # Keep caption visible N seconds after last speech
BROADCAST_NAME = 'LiveCaption'

# Model directory: $VOSK_MODEL_PATH wins, otherwise a folder in the user's home.
_FALLBACK_MODEL_DIR = os.path.expanduser('~/vosk-model-en-us')
DEFAULT_MODEL_PATH = os.environ.get('VOSK_MODEL_PATH', _FALLBACK_MODEL_DIR)
|
||||
|
||||
|
||||
def _tail_to_fit(text: str, max_bytes: int) -> str:
    """Return the tail of *text* that fits in *max_bytes* UTF-8 bytes.

    The result never starts with a partial character or a partial word: when
    the cut lands inside a word, everything up to and including the first
    space of the tail is dropped. If the tail contains no space at all it is
    returned as-is.

    Args:
        text: Source text.
        max_bytes: Byte budget; values <= 0 yield an empty string.

    Returns:
        A suffix of *text* whose UTF-8 encoding is at most *max_bytes* long.
    """
    if max_bytes <= 0:
        # encoded[-0:] would be the WHOLE string, so handle this explicitly.
        return ''
    encoded = text.encode('utf-8')
    if len(encoded) <= max_bytes:
        return text

    cut = len(encoded) - max_bytes
    # errors='ignore' discards continuation bytes of a character split by the cut.
    tail = encoded[cut:].decode('utf-8', errors='ignore')

    # 0x20 never occurs inside a multi-byte sequence, so a space right before
    # the cut means the tail already starts on a word boundary: keep all of it.
    if encoded[cut - 1:cut] == b' ':
        return tail

    sp = tail.find(' ')
    return tail[sp + 1:] if sp != -1 else tail
|
||||
|
||||
|
||||
def _new_words(old: str, new: str) -> str:
    """Return the words *new* adds after the word-prefix it shares with *old*.

    Both strings are split on whitespace. When the words of *old* form a
    prefix of the words of *new*, the remaining words are returned joined by
    single spaces; otherwise (a different utterance) *new* is returned
    unchanged.
    """
    previous = old.split()
    current = new.split()
    shared = len(previous)
    if current[:shared] != previous:
        return new
    return ' '.join(current[shared:])
|
||||
|
||||
|
||||
def _resolve_device(device: str) -> Optional[int]:
    """Map a device name or numeric string to a sounddevice index.

    An empty value yields None (default input). An all-digit string is
    returned as an int. Otherwise the first entry of ``sd.query_devices()``
    with an identical name and at least one input channel wins; if none
    matches, a warning is logged and None is returned.
    """
    if not device:
        return None
    if device.isdigit():
        return int(device)

    candidates = (
        idx
        for idx, info in enumerate(sd.query_devices())
        if info['name'] == device and info['max_input_channels'] > 0
    )
    found = next(candidates, None)
    if found is None:
        log.warning("Device '%s' not found in sounddevice list – using default input", device)
    return found
|
||||
|
||||
|
||||
async def _iso_write_loop(bigs: dict, shared: dict, lock: threading.Lock) -> None:
    """ISO SDU write loop.

    Each iteration reads the current caption from *shared* and writes one
    frame to ``bigs['big0']['iso_queue']``: a text frame while the caption is
    live, an idle frame otherwise. Runs until cancelled; pacing is expected
    to come from the awaited queue write.

    Args:
        bigs: Mapping that holds the ISO queue under ``['big0']['iso_queue']``.
        shared: Dict with ``'text'`` and ``'expiry'`` (``time.monotonic()``
            deadline), updated by the recognition thread.
        lock: Guards every access to *shared*.
    """
    iso_queue = bigs['big0']['iso_queue']
    max_text_bytes = SDU_SIZE - 2
    last_sent: str = ''

    while True:
        # Read and, if expired, clear in ONE critical section. Clearing in a
        # second lock acquisition could wipe a caption the recognition thread
        # published between the read and the clear.
        with lock:
            now = time.monotonic()
            text: str = shared.get('text', '')
            if text and now >= shared.get('expiry', 0.0):
                shared['text'] = ''
                text = ''

        if text:
            display_text = _tail_to_fit(text, max_text_bytes)
            if display_text != last_sent:
                log.info("Caption: %s", display_text)
                last_sent = display_text
            frame = _make_text_frame(display_text)
        else:
            if last_sent:
                # Log the transition to idle once.
                log.info("Caption cleared")
                last_sent = ''
            frame = _make_idle_frame()

        await iso_queue.write(frame)
|
||||
|
||||
|
||||
def _vosk_thread(
    model_path: str,
    device: str,
    shared: dict,
    lock: threading.Lock,
    stop_event: threading.Event,
) -> None:
    """Blocking audio capture + Vosk recognition loop. Runs in a daemon thread.

    Opens a ``sounddevice`` input stream at ``CAPTURE_SAMPLE_RATE``, resamples
    each block to ``VOSK_SAMPLE_RATE`` and feeds it to a Vosk recogniser.
    Partial and final results are published through *shared* under *lock*.
    Returns when *stop_event* is set, or early (after logging) when vosk is
    missing, the model cannot be loaded, or the audio stream fails.

    Args:
        model_path: Directory of the Vosk model.
        device: Input device name or numeric string (see ``_resolve_device``).
        shared: Dict receiving ``'text'`` and ``'expiry'``.
        lock: Guards every access to *shared*.
        stop_event: Set by the owner to request shutdown.
    """
    try:
        from vosk import KaldiRecognizer, Model  # type: ignore
    except ImportError:
        log.error("vosk is not installed. Run: poetry add vosk")
        return

    log.info("Loading Vosk model from %s …", model_path)
    try:
        model = Model(model_path)
        rec = KaldiRecognizer(model, VOSK_SAMPLE_RATE)
    except Exception as exc:
        # Log through the module logger (like stream errors below) instead of
        # letting the thread die with an unhandled exception.
        log.error("Failed to load Vosk model: %s", exc, exc_info=True)
        return
    rec.SetMaxAlternatives(0)
    rec.SetWords(False)

    resampler = samplerate.Resampler('sinc_fastest', channels=1)
    ratio = VOSK_SAMPLE_RATE / CAPTURE_SAMPLE_RATE
    dev_idx = _resolve_device(device)
    max_text_bytes = SDU_SIZE - 2
    last_word_count = 0  # word count of last partial sent to display

    def _publish(text: str) -> None:
        """Store *text* (trimmed to the SDU budget) and refresh its expiry."""
        with lock:
            shared['text'] = _tail_to_fit(text, max_text_bytes)
            shared['expiry'] = time.monotonic() + CAPTION_HOLD_S

    def _cb(indata: np.ndarray, frames: int, time_info, status) -> None:
        nonlocal last_word_count
        if status:
            log.warning("Audio status: %s", status)
        if stop_event.is_set():
            raise sd.CallbackStop()

        # Resample 48 kHz → 16 kHz
        mono = indata[:, 0].astype(np.float32)
        downsampled = resampler.process(mono, ratio, end_of_input=False)
        # Clip before the int16 cast: samples outside [-1, 1] (e.g. resampler
        # overshoot) would otherwise overflow instead of saturating.
        pcm16 = (np.clip(downsampled, -1.0, 1.0) * 32767).astype(np.int16).tobytes()

        if rec.AcceptWaveform(pcm16):
            final_text = json.loads(rec.Result()).get('text', '').strip()
            if final_text:
                log.info("Final: %s", final_text)
                _publish(final_text)
            # The utterance is finished either way, so restart the word count;
            # otherwise partials of the next sentence could be suppressed.
            last_word_count = 0
            return

        partial_text = json.loads(rec.PartialResult()).get('partial', '').strip()
        if not partial_text:
            return
        wc = len(partial_text.split())
        if wc > last_word_count:  # new word arrived
            last_word_count = wc
            _publish(partial_text)

    try:
        with sd.InputStream(
            samplerate=CAPTURE_SAMPLE_RATE,
            blocksize=BLOCK_FRAMES_48K,
            device=dev_idx,
            dtype='float32',
            channels=1,
            callback=_cb,
        ):
            log.info("Vosk listening on device '%s' (idx=%s) …", device, dev_idx)
            stop_event.wait()
    except Exception as exc:
        log.error("Vosk audio thread error: %s", exc, exc_info=True)
|
||||
|
||||
|
||||
async def broadcast_vosk(
    transport: str,
    model_path: str = DEFAULT_MODEL_PATH,
    device: str = 'ch1',
) -> None:
    """Start a Vosk STT → TextCast broadcast. Runs until cancelled.

    Args:
        transport: HCI transport string passed into the config group.
        model_path: Vosk model directory (``~`` is expanded).
        device: Input device name or numeric string for the capture thread.

    Raises:
        FileNotFoundError: If *model_path* does not exist.
    """
    model_path = os.path.expanduser(model_path)
    if not os.path.exists(model_path):
        raise FileNotFoundError(
            f"Vosk model not found at '{model_path}'. "
            "Download from https://alphacephei.com/vosk/models and set VOSK_MODEL_PATH."
        )

    config = auracast_config.AuracastConfigGroup(
        bigs=[
            auracast_config.AuracastBigConfig(
                name=BROADCAST_NAME,
                program_info='Live Captions',
                language='eng',
                audio_source='file:dummy',
                iso_que_len=4,
            ),
        ],
        auracast_sampling_rate_hz=16000,
        octets_per_frame=SDU_SIZE,
        frame_duration_us=SDU_INTERVAL_US,
        presentation_delay_us=40_000,
        qos_config=auracast_config.AuracastQosRobust(),
        transport=transport,
    )

    # State shared between the recognition thread (writer) and the ISO loop.
    shared: dict = {'text': '', 'expiry': 0.0}
    lock = threading.Lock()
    stop_event = threading.Event()

    async with multicast.create_device(config) as ble_device:
        bigs = await multicast.init_broadcast(ble_device, config, config.bigs)

        t = threading.Thread(
            target=_vosk_thread,
            args=(model_path, device, shared, lock, stop_event),
            daemon=True,
        )
        t.start()
        log.info("VoskCast started (device=%s, model=%s)", device, model_path)

        try:
            await _iso_write_loop(bigs, shared, lock)
        except asyncio.CancelledError:
            log.info("VoskCast cancelled – shutting down")
            raise
        finally:
            # Stop the capture thread on EVERY exit path, not only on
            # cancellation, so a failing ISO loop does not leave it running.
            stop_event.set()
            t.join(timeout=3.0)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the Vosk STT → TextCast streamer.

    Parses ``--model``, ``--device`` and ``--transport`` and runs
    ``broadcast_vosk`` through ``multicast.run_async``.
    """
    transport_default = os.environ.get(
        'AURACAST_TRANSPORT', 'serial:/dev/ttyAMA3,1000000,rtscts'
    )

    cli = argparse.ArgumentParser(description='Vosk STT → Auracast TextCast')
    cli.add_argument(
        '--model',
        default=DEFAULT_MODEL_PATH,
        help=f'Path to Vosk model directory (default: {DEFAULT_MODEL_PATH})',
    )
    cli.add_argument(
        '--device',
        default='ch1',
        help='sounddevice input device name or index (default: ch1)',
    )
    cli.add_argument(
        '--transport',
        default=transport_default,
        help='Bumble HCI transport string',
    )

    opts = cli.parse_args()
    multicast.run_async(broadcast_vosk(opts.transport, opts.model, opts.device))


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user