Merge branch 'master' of pumpkin.local:Mike/Projects/larynx2

2026-06-01 17:37:01 +00:00 · 2023-04-10 13:16:37 -05:00
parent 3d5d53f07c 1eed98ecd9
commit 49dd00cc45
6 changed files with 177 additions and 9 deletions
@@ -1,6 +1,6 @@
 ![Piper logo](etc/logo.png)

-A fast, local neural text to speech system that is meant to sound good and run reasonably fast on the Raspberry Pi 4.
+A fast, local neural text-to-speech system that sounds great and is optimized for the Raspberry Pi 4.

 ``` sh
 echo 'Welcome to the world of speech synthesis!' | \
@@ -26,15 +26,15 @@
     borderopacity="1.0"
     inkscape:pageopacity="1"
     inkscape:pageshadow="2"
-     inkscape:zoom="1.8469919"
-     inkscape:cx="164.97755"
-     inkscape:cy="48.418276"
+     inkscape:zoom="1.421213"
+     inkscape:cx="-23.774381"
+     inkscape:cy="33.944028"
     inkscape:document-units="mm"
     inkscape:current-layer="layer1"
     inkscape:document-rotation="0"
     showgrid="false"
-     inkscape:window-width="1920"
-     inkscape:window-height="1012"
+     inkscape:window-width="1280"
+     inkscape:window-height="653"
     inkscape:window-x="0"
     inkscape:window-y="0"
     inkscape:window-maximized="1"
@@ -50,7 +50,7 @@
        <dc:format>image/svg+xml</dc:format>
        <dc:type
           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
+        <dc:title />
      </cc:Work>
    </rdf:RDF>
  </metadata>
@@ -144,8 +144,8 @@
      <path
         id="path2257"
         style="fill:#ffffff;stroke:none;stroke-width:0.0999995;stroke-linecap:round"
-         d="m 19.97109,185.20282 10.735834,-6.19836 c 0.21219,-0.12249 0.502502,-0.0141 0.650911,0.24289 l 0.11208,0.19413 c 0.148409,0.25705 0.107331,0.58244 -0.115118,0.68513 -3.765389,1.73827 -7.326841,3.8345 -10.735835,6.19834 -0.201345,0.13962 -0.502495,0.0141 -0.65091,-0.24287 l -0.112081,-0.19413 c -0.148409,-0.25704 -0.09706,-0.56263 0.115117,-0.68513 z"
-         sodipodi:nodetypes="ssssssssss" />
+         d="m 19.523765,185.51136 11.807216,-7.07896 0.647873,1.12215 c -3.765389,1.73827 -8.398223,4.7151 -11.807217,7.07894 l -0.647874,-1.12213 z"
+         sodipodi:nodetypes="sccccs" />
    </g>
  </g>
 </svg>
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import time
+import sys
+
+import torch
+
+_SPEAKER_ID = 0
+
+
+def main() -> None:
+    """Benchmark load time and synthesis real-time factor of a PyTorch model.
+
+    Reads one JSON object per line from stdin (each needs a "phoneme_ids"
+    list and may carry a "speaker_id"), times ``torch.load`` on the model,
+    runs every utterance through ``synthesize`` and writes one JSON object
+    ``{"load_sec": ..., "synthesize_rtf": [...]}`` to stdout.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m", "--model", required=True, help="Path to PyTorch model file"
+    )
+    parser.add_argument("-c", "--config", help="Path to model config file")
+    args = parser.parse_args()
+
+    if not args.config:
+        # Default: the config is expected next to the model as "<model>.json".
+        args.config = f"{args.model}.json"
+
+    with open(args.config, "r", encoding="utf-8") as config_file:
+        config = json.load(config_file)
+
+    sample_rate = config["audio"]["sample_rate"]
+
+    # All input is parsed before any timer starts. Blank lines are skipped so
+    # a stray newline in the JSONL input does not raise JSONDecodeError.
+    utterances = [json.loads(line) for line in sys.stdin if line.strip()]
+
+    # NOTE(review): torch.load unpickles the file, which can execute
+    # arbitrary code -- only benchmark model files from a trusted source.
+    start_time = time.monotonic_ns()
+    model = torch.load(args.model)
+    end_time = time.monotonic_ns()
+
+    model.eval()
+
+    load_sec = (end_time - start_time) / 1e9
+    synthesize_rtf = [
+        synthesize(
+            model,
+            utterance["phoneme_ids"],
+            utterance.get("speaker_id"),
+            sample_rate,
+        )
+        for utterance in utterances
+    ]
+
+    json.dump(
+        {"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
+        sys.stdout,
+    )
+
+
+def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
+    """Run one utterance through the model and return its real-time factor.
+
+    Args:
+        model: Callable invoked as ``model(text, text_lengths, sid)``; element
+            ``[0]`` of its result is taken as the audio tensor.
+        phoneme_ids: Sequence of integer phoneme ids for one utterance.
+        speaker_id: Integer speaker id, or ``None`` to pass ``None`` as sid.
+        sample_rate: Sample rate used to convert sample count to seconds.
+
+    Returns:
+        Inference seconds divided by seconds of audio produced.
+    """
+    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
+    text_lengths = torch.LongTensor([len(phoneme_ids)])
+    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None
+
+    start_time = time.monotonic_ns()
+    # Autograd bookkeeping is pure overhead for a benchmark, so the forward
+    # pass runs with gradient tracking disabled.
+    with torch.no_grad():
+        audio = (
+            model(
+                text,
+                text_lengths,
+                sid,
+            )[0]
+            .detach()
+            .numpy()
+            .squeeze()
+        )
+    end_time = time.monotonic_ns()
+
+    # len(audio) counts array elements, i.e. samples -- not bytes -- so no
+    # bytes-per-sample divisor applies (the former "/ 2" doubled the RTF).
+    # Assumes the squeezed output is a 1-D array of samples -- TODO confirm.
+    audio_sec = len(audio) / sample_rate
+    infer_sec = (end_time - start_time) / 1e9
+
+    return infer_sec / audio_sec
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import time
+import sys
+
+import onnxruntime
+import numpy as np
+
+_NOISE_SCALE = 0.667
+_LENGTH_SCALE = 1.0
+_NOISE_W = 0.8
+_SPEAKER_ID = 0
+
+
+def main() -> None:
+    """Benchmark load time and synthesis real-time factor of an Onnx model.
+
+    Reads one JSON object per line from stdin (each needs a "phoneme_ids"
+    list and may carry a "speaker_id"), times the creation of the
+    ``onnxruntime.InferenceSession``, runs every utterance through
+    ``synthesize`` and writes one JSON object
+    ``{"load_sec": ..., "synthesize_rtf": [...]}`` to stdout.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
+    parser.add_argument("-c", "--config", help="Path to model config file")
+    args = parser.parse_args()
+
+    if not args.config:
+        # Default: the config is expected next to the model as "<model>.json".
+        args.config = f"{args.model}.json"
+
+    with open(args.config, "r", encoding="utf-8") as config_file:
+        config = json.load(config_file)
+
+    sample_rate = config["audio"]["sample_rate"]
+
+    # All input is parsed before any timer starts. Blank lines are skipped so
+    # a stray newline in the JSONL input does not raise JSONDecodeError.
+    utterances = [json.loads(line) for line in sys.stdin if line.strip()]
+
+    start_time = time.monotonic_ns()
+    session = onnxruntime.InferenceSession(args.model)
+    end_time = time.monotonic_ns()
+
+    load_sec = (end_time - start_time) / 1e9
+    synthesize_rtf = [
+        synthesize(
+            session,
+            utterance["phoneme_ids"],
+            utterance.get("speaker_id"),
+            sample_rate,
+        )
+        for utterance in utterances
+    ]
+
+    json.dump(
+        {"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
+        sys.stdout,
+    )
+
+
+def synthesize(session, phoneme_ids, speaker_id, sample_rate) -> float:
+    """Run one utterance through the Onnx session and return its real-time factor.
+
+    Args:
+        session: Object whose ``run(None, feed)`` returns a sequence; element
+            ``[0]`` is taken as the audio array.
+        phoneme_ids: Sequence of integer phoneme ids for one utterance.
+        speaker_id: Integer speaker id, or ``None`` to feed no "sid" input.
+        sample_rate: Sample rate used to convert sample count to seconds.
+
+    Returns:
+        Inference seconds divided by seconds of audio produced.
+    """
+    phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+    phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
+    scales = np.array(
+        [_NOISE_SCALE, _LENGTH_SCALE, _NOISE_W],
+        dtype=np.float32,
+    )
+
+    inputs = {
+        "input": phoneme_ids_array,
+        "input_lengths": phoneme_ids_lengths,
+        "scales": scales,
+    }
+
+    # Only feed "sid" when a speaker was requested: None is not a tensor, so
+    # the key is left out of the feed rather than passed with a None value.
+    if speaker_id is not None:
+        inputs["sid"] = np.array([speaker_id], dtype=np.int64)
+
+    # Synthesize through Onnx
+    start_time = time.monotonic_ns()
+    audio = session.run(None, inputs)[0].squeeze()
+    end_time = time.monotonic_ns()
+
+    # len(audio) counts array elements, i.e. samples -- not bytes -- so no
+    # bytes-per-sample divisor applies (the former "/ 2" doubled the RTF).
+    # Assumes the squeezed output is a 1-D array of samples -- TODO confirm.
+    audio_sec = len(audio) / sample_rate
+    infer_sec = (end_time - start_time) / 1e9
+
+    return infer_sec / audio_sec
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,2 @@
+onnxruntime~=1.11.0
+torch~=1.11.0