First good audio with alsaaudio.

This commit is contained in:
Pbopbo
2026-03-18 16:55:55 +01:00
parent e1d717ed5c
commit a605195646
2 changed files with 96 additions and 79 deletions

View File

@@ -17,7 +17,8 @@ dependencies = [
"sounddevice (>=0.5.2,<0.6.0)", "sounddevice (>=0.5.2,<0.6.0)",
"python-dotenv (>=1.1.1,<2.0.0)", "python-dotenv (>=1.1.1,<2.0.0)",
"smbus2 (>=0.5.0,<0.6.0)", "smbus2 (>=0.5.0,<0.6.0)",
"samplerate (>=0.2.2,<0.3.0)" "samplerate (>=0.2.2,<0.3.0)",
"pyalsaaudio (>=0.9.0,<1.0.0)"
] ]
[project.optional-dependencies] [project.optional-dependencies]

View File

@@ -56,7 +56,7 @@ from auracast.utils.webrtc_audio_input import WebRTCAudioInput
# Patch sounddevice.InputStream globally to use low-latency settings # Patch sounddevice.InputStream globally to use low-latency settings
import sounddevice as sd import alsaaudio
from collections import deque from collections import deque
@@ -139,96 +139,112 @@ class AlsaArecordAudioInput(audio_io.AudioInput):
self._proc = None self._proc = None
class ModSoundDeviceAudioInput(audio_io.SoundDeviceAudioInput): class PyAlsaAudioInput(audio_io.ThreadedAudioInput):
"""Patched SoundDeviceAudioInput with low-latency capture and adaptive resampling.""" """PyALSA audio input with callback thread and ring buffer."""
def _open(self): def __init__(self, device, pcm_format: audio_io.PcmFormat):
"""Create RawInputStream with low-latency parameters and initialize ring buffer.""" super().__init__()
dev_info = sd.query_devices(self._device) self._device = str(device) if not isinstance(device, str) else device
hostapis = sd.query_hostapis() if self._device.isdigit():
api_index = dev_info.get('hostapi') self._device = 'default' if self._device == '0' else f'hw:{self._device}'
api_name = hostapis[api_index]['name'] if isinstance(api_index, int) and 0 <= api_index < len(hostapis) else 'unknown' self._pcm_format = pcm_format
pa_ver = sd.get_portaudio_version() self._pcm = None
self._ring_buffer = deque()
self._ring_lock = threading.Lock()
self._running = False
self._callback_thread = None
self._max_buffer_bytes = int(self._pcm_format.sample_rate * 0.1 * 2)
logging.info( def _open(self) -> audio_io.PcmFormat:
"SoundDevice backend=%s device='%s' (id=%s) ch=%s default_low_input_latency=%.4f default_high_input_latency=%.4f portaudio=%s", requested_rate = int(self._pcm_format.sample_rate)
api_name,
dev_info.get('name'), self._pcm = alsaaudio.PCM(
self._device, type=alsaaudio.PCM_CAPTURE,
dev_info.get('max_input_channels'), mode=alsaaudio.PCM_NORMAL,
float(dev_info.get('default_low_input_latency') or 0.0),
float(dev_info.get('default_high_input_latency') or 0.0),
pa_ver[1] if isinstance(pa_ver, tuple) and len(pa_ver) >= 2 else pa_ver,
)
# Create RawInputStream with injected low-latency parameters
# Target ~2 ms blocksize (48 kHz -> 96 frames). For other rates, keep ~2 ms.
_sr = int(self._pcm_format.sample_rate)
self.counter=0
self.max_avail=0
self.logfile_name="available_samples.txt"
self.blocksize = 120
if os.path.exists(self.logfile_name):
os.remove(self.logfile_name)
self._stream = sd.RawInputStream(
samplerate=self._pcm_format.sample_rate,
device=self._device, device=self._device,
channels=self._pcm_format.channels,
dtype='int16',
blocksize=self.blocksize,
latency=0.004,
) )
self._stream.start()
self._pcm.setchannels(1)
self._pcm.setformat(alsaaudio.PCM_FORMAT_S16_LE)
actual_rate = self._pcm.setrate(requested_rate)
self._pcm.setperiodsize(240)
logging.info("PyALSA: device=%s requested=%d actual=%d periodsize=240 (5ms)",
self._device, requested_rate, actual_rate)
if actual_rate != requested_rate:
logging.warning("PyALSA: Sample rate mismatch! requested=%d actual=%d", requested_rate, actual_rate)
self._running = True
self._callback_thread = threading.Thread(target=self._capture_loop, daemon=True)
self._callback_thread.start()
return audio_io.PcmFormat( return audio_io.PcmFormat(
audio_io.PcmFormat.Endianness.LITTLE, audio_io.PcmFormat.Endianness.LITTLE,
audio_io.PcmFormat.SampleType.INT16, audio_io.PcmFormat.SampleType.INT16,
self._pcm_format.sample_rate, actual_rate,
1, 1,
) )
def _capture_loop(self):
first_read = True
while self._running:
try:
length, data = self._pcm.read()
if length > 0:
if first_read:
expected_bytes = 240 * 2 # 240 frames * 2 bytes/sample for mono
logging.info("PyALSA first capture: length=%d bytes=%d expected=%d", length, len(data), expected_bytes)
first_read = False
# If we got stereo data (960 bytes instead of the expected 480), convert to mono
if len(data) == 960: # 240 frames * 2 channels * 2 bytes = stereo
logging.warning("PyALSA: Got stereo data, converting to mono")
pcm_stereo = np.frombuffer(data, dtype=np.int16)
pcm_mono = pcm_stereo[::2] # Take only left channel
data = pcm_mono.tobytes()
with self._ring_lock:
self._ring_buffer.append(data)
total_bytes = sum(len(chunk) for chunk in self._ring_buffer)
while total_bytes > self._max_buffer_bytes:
self._ring_buffer.popleft()
total_bytes = sum(len(chunk) for chunk in self._ring_buffer)
except:
if self._running:
break
def _read(self, frame_size: int) -> bytes: def _read(self, frame_size: int) -> bytes:
"""Read PCM samples from the stream.""" bytes_needed = frame_size * 2
result = b''
while len(result) < bytes_needed:
with self._ring_lock:
if self._ring_buffer:
chunk = self._ring_buffer.popleft()
needed = bytes_needed - len(result)
if len(chunk) <= needed:
result += chunk
else:
result += chunk[:needed]
self._ring_buffer.appendleft(chunk[needed:])
else:
break
if len(result) < bytes_needed:
result += b'\x00' * (bytes_needed - len(result))
return result
#if self.counter % 50 == 0: def _close(self) -> None:
frame_size = frame_size + 1 # consume samples a little faster to avoid latency accumulation self._running = False
if self._callback_thread:
self._callback_thread.join(timeout=1.0)
if self._pcm:
self._pcm.close()
self._pcm = None
pcm_buffer, overflowed = self._stream.read(frame_size) audio_io.SoundDeviceAudioInput = PyAlsaAudioInput
if overflowed:
logging.warning("SoundDeviceAudioInput: overflowed")
n_available = self._stream.read_available
# adapt = n_available > 20
# if adapt:
# pcm_extra, overflowed = self._stream.read(3)
# logging.info('consuming extra samples, available was %d', n_available)
# if overflowed:
# logging.warning("SoundDeviceAudioInput: overflowed")
# out = bytes(pcm_buffer) + bytes(pcm_extra)
# else:
out = bytes(pcm_buffer)
self.max_avail = max(self.max_avail, n_available)
#Diagnostics
#with open(self.logfile_name, "a", encoding="utf-8") as f:
# f.write(f"{n_available}, {adapt}, {round(self._runavg, 2)}, {overflowed}\n")
if self.counter % 500 == 0:
logging.info(
"read available=%d, max=%d, latency:%d",
n_available, self.max_avail, self._stream.latency
)
self.max_avail = 0
self.counter += 1
return out
audio_io.SoundDeviceAudioInput = ModSoundDeviceAudioInput
# modified from bumble # modified from bumble
class ModWaveAudioInput(audio_io.ThreadedAudioInput): class ModWaveAudioInput(audio_io.ThreadedAudioInput):