Add speaker id to infer_onnx

2026-06-09 21:22:26 +00:00 · 2023-01-05 23:14:08 -05:00
parent f7234546d8
commit a6d72d7990
1 changed file with 24 additions and 13 deletions
@@ -36,17 +36,17 @@ def main():
    model = onnxruntime.InferenceSession(str(args.model), sess_options=sess_options)
    _LOGGER.info("Loaded model from %s", args.model)

-    text_empty = np.zeros((1, 300), dtype=np.int64)
-    text_lengths_empty = np.array([text_empty.shape[1]], dtype=np.int64)
-    scales = np.array(
-        [args.noise_scale, args.length_scale, args.noise_scale_w],
-        dtype=np.float32,
-    )
-    bias_audio = model.run(
-        None,
-        {"input": text_empty, "input_lengths": text_lengths_empty, "scales": scales},
-    )[0].squeeze((0, 1))
-    bias_spec, _ = transform(bias_audio)
+    # text_empty = np.zeros((1, 300), dtype=np.int64)
+    # text_lengths_empty = np.array([text_empty.shape[1]], dtype=np.int64)
+    # scales = np.array(
+    #     [args.noise_scale, args.length_scale, args.noise_scale_w],
+    #     dtype=np.float32,
+    # )
+    # bias_audio = model.run(
+    #     None,
+    #     {"input": text_empty, "input_lengths": text_lengths_empty, "scales": scales},
+    # )[0].squeeze((0, 1))
+    # bias_spec, _ = transform(bias_audio)

    for i, line in enumerate(sys.stdin):
        line = line.strip()
@@ -57,6 +57,7 @@ def main():
        # utt_id = utt["id"]
        utt_id = str(i)
        phoneme_ids = utt["phoneme_ids"]
+        speaker_id = utt.get("speaker_id")

        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        text_lengths = np.array([text.shape[1]], dtype=np.int64)
@@ -64,12 +65,22 @@ def main():
            [args.noise_scale, args.length_scale, args.noise_scale_w],
            dtype=np.float32,
        )
+        sid = None
+
+        if speaker_id is not None:
+            sid = np.array([speaker_id], dtype=np.int64)

        start_time = time.perf_counter()
        audio = model.run(
-            None, {"input": text, "input_lengths": text_lengths, "scales": scales}
+            None,
+            {
+                "input": text,
+                "input_lengths": text_lengths,
+                "scales": scales,
+                "sid": sid,
+            },
        )[0].squeeze((0, 1))
-        audio = denoise(audio, bias_spec, 10)
+        # audio = denoise(audio, bias_spec, 10)
        audio = audio_float_to_int16(audio.squeeze())
        end_time = time.perf_counter()