Rename to piper

2026-05-08 14:28:02 +00:00 · 2023-03-26 21:42:04 -05:00
parent 3dfa161ba5
commit 70afec58bc
62 changed files with 348 additions and 207 deletions
--- a/src/python/piper_train/norm_audio/vad.py
+++ b/src/python/piper_train/norm_audio/vad.py
@@ -0,0 +1,54 @@
+import typing
+from pathlib import Path
+
+import numpy as np
+import onnxruntime
+
+
+class SileroVoiceActivityDetector:
+    """Detects speech/silence using Silero VAD.
+
+    https://github.com/snakers4/silero-vad
+
+    The recurrent state (h, c) is carried from one call to the next and is
+    never reset, so pass the chunks of a single audio stream in order.
+    """
+
+    def __init__(self, onnx_path: typing.Union[str, Path]):
+        """Load the ONNX model from onnx_path and zero the recurrent state."""
+        # Thread limits must be passed in SessionOptions at construction time;
+        # setting them on an existing InferenceSession has no effect.
+        opts = onnxruntime.SessionOptions()
+        opts.intra_op_num_threads = 1
+        opts.inter_op_num_threads = 1
+        self.session = onnxruntime.InferenceSession(str(onnx_path), sess_options=opts)
+
+        self._h = np.zeros((2, 1, 64), dtype=np.float32)
+        self._c = np.zeros((2, 1, 64), dtype=np.float32)
+
+    def __call__(self, audio_array: np.ndarray, sample_rate: int = 16000):
+        """Return probability of speech in audio [0-1] and update the state.
+
+        Audio must be 16Khz 16-bit mono PCM shaped (samples,) or (1, samples);
+        samples are cast to float32 as-is. Raises ValueError for more than two
+        dimensions, a batch larger than 1, or any other sample rate.
+        """
+        if audio_array.ndim == 1:
+            audio_array = np.expand_dims(audio_array, 0)  # add batch dimension
+        if audio_array.ndim > 2:
+            raise ValueError(
+                f"Too many dimensions for input audio chunk {audio_array.shape}"
+            )
+        if audio_array.shape[0] > 1:
+            raise ValueError("Onnx model does not support batching")
+        if sample_rate != 16000:
+            raise ValueError("Only 16Khz audio is supported")
+
+        ort_inputs = {
+            "input": audio_array.astype(np.float32),
+            "h0": self._h,
+            "c0": self._c,
+        }
+        out, self._h, self._c = self.session.run(None, ort_inputs)
+        # Drop axis 2, keep index 1 of axis 1 (make output match JIT analog).
+        return out.squeeze(2)[:, 1]