Merge branch 'master' of pumpkin.local:Mike/Projects/larynx2

This commit is contained in:
Michael Hansen
2023-04-10 13:16:37 -05:00
6 changed files with 177 additions and 9 deletions
+1 -1
View File
@@ -1,6 +1,6 @@
![Piper logo](etc/logo.png)
A fast, local neural text to speech system that is meant to sound good and run reasonably fast on the Raspberry Pi 4.
A fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4.
``` sh
echo 'Welcome to the world of speech synthesis!' | \
BIN
View File
Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.3 KiB

After

Width:  |  Height:  |  Size: 8.3 KiB

+8 -8
View File
@@ -26,15 +26,15 @@
borderopacity="1.0"
inkscape:pageopacity="1"
inkscape:pageshadow="2"
inkscape:zoom="1.8469919"
inkscape:cx="164.97755"
inkscape:cy="48.418276"
inkscape:zoom="1.421213"
inkscape:cx="-23.774381"
inkscape:cy="33.944028"
inkscape:document-units="mm"
inkscape:current-layer="layer1"
inkscape:document-rotation="0"
showgrid="false"
inkscape:window-width="1920"
inkscape:window-height="1012"
inkscape:window-width="1280"
inkscape:window-height="653"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="1"
@@ -50,7 +50,7 @@
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
<dc:title />
</cc:Work>
</rdf:RDF>
</metadata>
@@ -144,8 +144,8 @@
<path
id="path2257"
style="fill:#ffffff;stroke:none;stroke-width:0.0999995;stroke-linecap:round"
d="m 19.97109,185.20282 10.735834,-6.19836 c 0.21219,-0.12249 0.502502,-0.0141 0.650911,0.24289 l 0.11208,0.19413 c 0.148409,0.25705 0.107331,0.58244 -0.115118,0.68513 -3.765389,1.73827 -7.326841,3.8345 -10.735835,6.19834 -0.201345,0.13962 -0.502495,0.0141 -0.65091,-0.24287 l -0.112081,-0.19413 c -0.148409,-0.25704 -0.09706,-0.56263 0.115117,-0.68513 z"
sodipodi:nodetypes="ssssssssss" />
d="m 19.523765,185.51136 11.807216,-7.07896 0.647873,1.12215 c -3.765389,1.73827 -8.398223,4.7151 -11.807217,7.07894 l -0.647874,-1.12213 z"
sodipodi:nodetypes="sccccs" />
</g>
</g>
</svg>

Before

Width:  |  Height:  |  Size: 8.6 KiB

After

Width:  |  Height:  |  Size: 8.3 KiB

+78
View File
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
import argparse
import json
import time
import sys
import torch
# Default speaker id. NOTE(review): not referenced anywhere in the visible
# script -- each utterance supplies its own optional "speaker_id" instead.
_SPEAKER_ID = 0
def main() -> None:
    """Benchmark PyTorch model load time and per-utterance synthesis speed.

    Command-line arguments:
        -m/--model   path to the model file passed to ``torch.load`` (required)
        -c/--config  path to the JSON config; defaults to ``<model>.json``

    Input (stdin): one JSON object per line. Each object must contain
    ``phoneme_ids`` and may contain ``speaker_id``. Blank lines are ignored.

    Output (stdout): a single JSON object with ``load_sec`` (seconds spent in
    ``torch.load``) and ``synthesize_rtf`` (one value returned by
    ``synthesize`` per utterance, in input order).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", required=True, help="Path to PyTorch model file"
    )
    parser.add_argument("-c", "--config", help="Path to model config file")
    args = parser.parse_args()

    if not args.config:
        # Fall back to the config stored next to the model as "<model>.json".
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # All input is read up front, before any timer is started. Blank lines
    # (e.g. a trailing newline) are skipped instead of crashing json.loads.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only benchmark model files from a trusted source.
    start_time = time.monotonic_ns()
    model = torch.load(args.model)
    end_time = time.monotonic_ns()

    # Switching to eval mode is deliberately outside the timed region.
    model.eval()

    load_sec = (end_time - start_time) / 1e9

    synthesize_rtf = [
        synthesize(
            model,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),  # None when the key is absent
            sample_rate,
        )
        for utterance in utterances
    ]

    json.dump(
        {"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
        sys.stdout,
    )
def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one utterance through ``model`` and return its real-time factor.

    Args:
        model: Callable invoked as ``model(text, text_lengths, sid)``; the
            first element of its result is taken as the audio tensor.
        phoneme_ids: Sequence of integer phoneme ids for one utterance.
        speaker_id: Integer speaker id, or ``None`` to pass ``sid=None``.
        sample_rate: Audio sample rate used to convert length to seconds.

    Returns:
        Inference wall-clock seconds divided by the computed audio seconds.

    Raises:
        ZeroDivisionError: If the model produces zero-length audio.
    """
    # Add a leading batch dimension of size 1.
    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
    text_lengths = torch.LongTensor([len(phoneme_ids)])
    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None

    # Inference only: disable autograd so no graph is recorded during the
    # timed forward pass (the original only detached the result afterwards).
    with torch.no_grad():
        start_time = time.monotonic_ns()
        audio = (
            model(
                text,
                text_lengths,
                sid,
            )[0]
            .detach()
            .numpy()
            .squeeze()
        )
        end_time = time.monotonic_ns()

    # NOTE(review): len(audio) is halved before converting to seconds. If
    # `audio` holds one element per sample, this halves the duration and
    # doubles the reported factor -- confirm the intended unit. Kept as-is so
    # results stay comparable with earlier runs.
    audio_sec = (len(audio) / 2) / sample_rate
    infer_sec = (end_time - start_time) / 1e9

    return infer_sec / audio_sec
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
+88
View File
@@ -0,0 +1,88 @@
#!/usr/bin/env python3
import argparse
import json
import time
import sys
import onnxruntime
import numpy as np
# The three values below are packed, in this order, into the float32 "scales"
# input that synthesize() feeds to the Onnx session.
# NOTE(review): presumably sampling-noise, speaking-rate and duration-noise
# controls of the exported model -- confirm against the model export code.
_NOISE_SCALE = 0.667
_LENGTH_SCALE = 1.0
_NOISE_W = 0.8
# Default speaker id. NOTE(review): not referenced anywhere in the visible
# script -- each utterance supplies its own optional "speaker_id" instead.
_SPEAKER_ID = 0
def main() -> None:
    """Benchmark Onnx model load time and per-utterance synthesis speed.

    Command-line arguments:
        -m/--model   path to the Onnx model file (required)
        -c/--config  path to the JSON config; defaults to ``<model>.json``

    Input (stdin): one JSON object per line. Each object must contain
    ``phoneme_ids`` and may contain ``speaker_id``. Blank lines are ignored.

    Output (stdout): a single JSON object with ``load_sec`` (seconds spent
    creating the ``onnxruntime.InferenceSession``) and ``synthesize_rtf``
    (one value returned by ``synthesize`` per utterance, in input order).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    args = parser.parse_args()

    if not args.config:
        # Fall back to the config stored next to the model as "<model>.json".
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # All input is read up front, before any timer is started. Blank lines
    # (e.g. a trailing newline) are skipped instead of crashing json.loads.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    # Only session construction is counted as load time.
    start_time = time.monotonic_ns()
    session = onnxruntime.InferenceSession(args.model)
    end_time = time.monotonic_ns()

    load_sec = (end_time - start_time) / 1e9

    synthesize_rtf = [
        synthesize(
            session,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),  # None when the key is absent
            sample_rate,
        )
        for utterance in utterances
    ]

    json.dump(
        {"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
        sys.stdout,
    )
def synthesize(session, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one utterance through an Onnx session and return its real-time factor.

    Args:
        session: Object with a ``run(output_names, input_feed)`` method; the
            first element of its result is taken as the audio array.
        phoneme_ids: Sequence of integer phoneme ids for one utterance.
        speaker_id: Integer speaker id, or ``None`` to feed no ``"sid"`` input.
        sample_rate: Audio sample rate used to convert length to seconds.

    Returns:
        Inference wall-clock seconds divided by the computed audio seconds.

    Raises:
        ZeroDivisionError: If the model produces zero-length audio.
    """
    # Add a leading batch dimension of size 1.
    phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
    phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
    scales = np.array(
        [_NOISE_SCALE, _LENGTH_SCALE, _NOISE_W],
        dtype=np.float32,
    )

    inputs = {
        "input": phoneme_ids_array,
        "input_lengths": phoneme_ids_lengths,
        "scales": scales,
    }

    # Feed "sid" only when a speaker id was given. The original always passed
    # "sid": None, which depends on the runtime silently dropping None feeds.
    if speaker_id is not None:
        inputs["sid"] = np.array([speaker_id], dtype=np.int64)

    # Synthesize through Onnx; only session.run (plus squeeze) is timed.
    start_time = time.monotonic_ns()
    audio = session.run(None, inputs)[0].squeeze()
    end_time = time.monotonic_ns()

    # NOTE(review): len(audio) is halved before converting to seconds. If
    # `audio` holds one element per sample, this halves the duration and
    # doubles the reported factor -- confirm the intended unit. Kept as-is so
    # results stay comparable with earlier runs.
    audio_sec = (len(audio) / 2) / sample_rate
    infer_sec = (end_time - start_time) / 1e9

    return infer_sec / audio_sec
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
+2
View File
@@ -0,0 +1,2 @@
onnxruntime~=1.11.0
torch~=1.11.0