Tune the adaptive resampler to be nearly inaudible

This commit is contained in:
pstruebi
2025-10-23 18:23:20 +02:00
parent 329510beae
commit 60aa653aeb
2 changed files with 99 additions and 19 deletions

View File

@@ -52,7 +52,36 @@ from auracast.utils.network_audio_receiver import NetworkAudioReceiverUncoded
from auracast.utils.webrtc_audio_input import WebRTCAudioInput
# Instantiate WebRTC audio input for streaming (can be used per-BIG or globally)
# Patch sounddevice.InputStream globally to use low-latency settings
import sounddevice as sd
class ModSoundDeviceAudioInput(audio_io.SoundDeviceAudioInput):
    """Patched SoundDeviceAudioInput that opens a RawInputStream with low-latency parameters.

    Overrides ``_open`` so every capture stream is created with a fixed
    240-sample blocksize and a 10 ms latency hint instead of sounddevice's
    defaults. Installed globally below by reassigning
    ``audio_io.SoundDeviceAudioInput``.
    """
    def _open(self):
        """Open the capture stream with injected low-latency parameters.

        Returns:
            audio_io.PcmFormat: little-endian INT16 at the configured sample
            rate, reported with 2 channels.
        """
        # Create RawInputStream with injected low-latency parameters
        self._stream = sd.RawInputStream(
            samplerate=self._pcm_format.sample_rate,
            device=self._device,
            channels=self._pcm_format.channels,
            dtype='int16',
            blocksize=240,  # Match frame size
            latency=0.010,  # 10 ms latency hint to the audio backend
        )
        self._stream.start()
        logging.info(f"SoundDeviceAudioInput: Opened with blocksize=240, latency=0.010 (10ms)")
        # NOTE(review): the returned format hard-codes 2 channels while the
        # stream above is opened with self._pcm_format.channels — confirm the
        # two always agree (or that downstream code expects stereo here).
        return audio_io.PcmFormat(
            audio_io.PcmFormat.Endianness.LITTLE,
            audio_io.PcmFormat.SampleType.INT16,
            self._pcm_format.sample_rate,
            2,
        )
# Globally install the patched class so all subsequent instantiations of
# audio_io.SoundDeviceAudioInput get the low-latency behavior.
audio_io.SoundDeviceAudioInput = ModSoundDeviceAudioInput
# modified from bumble
class ModWaveAudioInput(audio_io.ThreadedAudioInput):
@@ -618,21 +647,24 @@ class Streamer():
big['encoder'] = encoder
big['precoded'] = False
logging.info("Streaming audio...")
bigs = self.bigs
self.is_streaming = True
# Sample discard stats (clock drift compensation)
sample_rate = big['audio_input']._pcm_format.sample_rate
samples_discarded_total = 0 # Total samples discarded
discard_events = 0 # Number of times we discarded samples
frames_since_last_discard = 999 # Guard: frames since last discard (start high to allow first drop)
enable_drift_compensation = global_config.enable_drift_compensation
discard_guard_frames = sample_rate // 100 # Don't allow discard within this many frames of previous discard
# Calculate threshold based on config (default 2ms)
drift_threshold_ms = global_config.drift_threshold_ms if enable_drift_compensation else 0
drop_threshold_samples = 0
drift_threshold_ms = global_config.drift_threshold_ms if global_config.enable_drift_compensation else 0
drop_threshold_samples = int(sample_rate * drift_threshold_ms / 1000.0)
static_drop_samples = int(sample_rate * 0.0005) # Always drop a static amount of samples
if enable_drift_compensation:
logging.info(f"Clock drift compensation ENABLED: threshold={drift_threshold_ms}ms")
if global_config.enable_drift_compensation:
logging.info(f"Clock drift compensation ENABLED: threshold={drift_threshold_ms}ms, guard={discard_guard_frames} frames")
else:
logging.info("Clock drift compensation DISABLED")
@@ -669,26 +701,41 @@ class Streamer():
# Calculate threshold samples based on sample rate (only once per BIG)
if enable_drift_compensation and drop_threshold_samples == 0:
sample_rate = big['audio_input']._pcm_format.sample_rate
drop_threshold_samples = int(sample_rate * drift_threshold_ms / 1000.0)
logging.info(f"Drift compensation threshold: {drop_threshold_samples} samples ({drift_threshold_ms}ms @ {sample_rate}Hz)")
logging.info(f"Static drop amount: {static_drop_samples} samples (3.0ms @ {sample_rate}Hz)")
# Discard excess samples in buffer if above threshold (clock drift compensation)
if enable_drift_compensation and hasattr(big['audio_input'], '_stream') and big['audio_input']._stream:
sd_buffer_samples = big['audio_input']._stream.read_available
if sd_buffer_samples > drop_threshold_samples:
# Discard ALL remaining samples to bring buffer back down
# Guard: only allow discard if enough frames have passed since last discard
if sd_buffer_samples > drop_threshold_samples and frames_since_last_discard >= discard_guard_frames:
# Always drop a static amount (3ms) for predictable behavior
# This matches the crossfade duration better for smoother transitions
samples_to_drop = static_drop_samples
try:
discarded_data = big['audio_input']._stream.read(sd_buffer_samples)
samples_discarded_total += sd_buffer_samples
discarded_data = await anext(big['audio_input'].frames(samples_to_drop))
samples_discarded_total += samples_to_drop
discard_events += 1
if discard_events % 100 == 0: # Log every 100th discard
logging.warning(
f"Discard #{discard_events}: {sd_buffer_samples} samples ({sd_buffer_samples / big['audio_input']._pcm_format.sample_rate * 1000:.1f} ms) "
f"| total discarded: {samples_discarded_total} samples"
)
# Log every discard event with timing information
sample_rate = big['audio_input']._pcm_format.sample_rate
time_since_last_ms = frames_since_last_discard * 10 # Each frame is 10ms
logging.info(
f"DISCARD #{discard_events}: dropped {samples_to_drop} samples ({samples_to_drop / sample_rate * 1000:.1f}ms) | "
f"buffer was {sd_buffer_samples} samples ({sd_buffer_samples / sample_rate * 1000:.1f}ms) | "
f"since_last={frames_since_last_discard} frames ({time_since_last_ms}ms) | "
f"frame={frame_count}"
)
# Reset guard counter
frames_since_last_discard = 0
# Store how much we dropped for potential adaptive crossfade
big['last_drop_samples'] = samples_to_drop
# Set flag to apply crossfade on next frame
big['apply_crossfade'] = True
except Exception as e:
logging.error(f"Failed to discard samples: {e}")
@@ -704,6 +751,39 @@ class Streamer():
samples = np.frombuffer(pcm_frame, dtype=dtype)
samples = samples.reshape(-1, big['channels']).mean(axis=1)
pcm_frame = samples.astype(dtype).tobytes()
# Apply crossfade if samples were just dropped (drift compensation)
if big.get('apply_crossfade') and big.get('prev_pcm_frame') is not None:
# Crossfade duration: 10ms for smoother transition (was 5ms)
dtype = np.int16 if big['pcm_bit_depth'] == 16 else np.float32
sample_rate = big['audio_input']._pcm_format.sample_rate
crossfade_samples = min(int(sample_rate * 0.010), big['lc3_frame_samples'] // 2)
# Convert frames to numpy arrays (make writable copies)
prev_samples = np.frombuffer(big['prev_pcm_frame'], dtype=dtype).copy()
curr_samples = np.frombuffer(pcm_frame, dtype=dtype).copy()
# Create equal-power crossfade curves (smoother than linear)
# Equal-power maintains perceived loudness during transition
t = np.linspace(0, 1, crossfade_samples)
fade_out = np.cos(t * np.pi / 2) # Cosine fade out
fade_in = np.sin(t * np.pi / 2) # Sine fade in
# Apply crossfade to the beginning of current frame with end of previous frame
if len(prev_samples) >= crossfade_samples and len(curr_samples) >= crossfade_samples:
crossfaded = (
prev_samples[-crossfade_samples:] * fade_out +
curr_samples[:crossfade_samples] * fade_in
).astype(dtype)
# Replace beginning of current frame with crossfaded section
curr_samples[:crossfade_samples] = crossfaded
pcm_frame = curr_samples.tobytes()
big['apply_crossfade'] = False
# Store current frame for potential next crossfade
if enable_drift_compensation:
big['prev_pcm_frame'] = pcm_frame
lc3_frame = big['encoder'].encode(
pcm_frame, num_bytes=big['lc3_bytes_per_frame'], bit_depth=big['pcm_bit_depth']
@@ -711,6 +791,8 @@ class Streamer():
await big['iso_queue'].write(lc3_frame)
frame_count += 1
# Increment guard counter (tracks frames since last discard)
frames_since_last_discard += 1
# Periodic stats logging
now = time.perf_counter()

View File

@@ -52,8 +52,6 @@ if __name__ == "__main__":
os.chdir(os.path.dirname(__file__))
# Load .env located next to this script (only uppercase keys will be referenced)
load_dotenv(dotenv_path='.env')
# Default tight ALSA latency (ms); can be overridden via environment
os.environ.setdefault('ALSA_LATENCY_MSEC', '2')
# List USB ALSA inputs
usb_inputs = get_alsa_usb_inputs()
@@ -112,7 +110,7 @@ if __name__ == "__main__":
auracast_sampling_rate_hz = LC3_SRATE,
octets_per_frame = OCTETS_PER_FRAME,
transport=TRANSPORT1,
enable_drift_compensation=False,
enable_drift_compensation=True,
drift_threshold_ms=2.0
)
config.debug = False