mirror of
https://github.com/pstrueb/piper.git
synced 2026-06-01 17:37:01 +00:00
Merge branch 'master' of pumpkin.local:Mike/Projects/larynx2
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||

|
||||
|
||||
A fast, local neural text to speech system that is meant to sound good and run reasonably fast on the Raspberry Pi 4.
|
||||
A fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4.
|
||||
|
||||
``` sh
|
||||
echo 'Welcome to the world of speech synthesis!' | \
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 8.3 KiB After Width: | Height: | Size: 8.3 KiB |
+8
-8
@@ -26,15 +26,15 @@
|
||||
borderopacity="1.0"
|
||||
inkscape:pageopacity="1"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:zoom="1.8469919"
|
||||
inkscape:cx="164.97755"
|
||||
inkscape:cy="48.418276"
|
||||
inkscape:zoom="1.421213"
|
||||
inkscape:cx="-23.774381"
|
||||
inkscape:cy="33.944028"
|
||||
inkscape:document-units="mm"
|
||||
inkscape:current-layer="layer1"
|
||||
inkscape:document-rotation="0"
|
||||
showgrid="false"
|
||||
inkscape:window-width="1920"
|
||||
inkscape:window-height="1012"
|
||||
inkscape:window-width="1280"
|
||||
inkscape:window-height="653"
|
||||
inkscape:window-x="0"
|
||||
inkscape:window-y="0"
|
||||
inkscape:window-maximized="1"
|
||||
@@ -50,7 +50,7 @@
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
<dc:title />
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
@@ -144,8 +144,8 @@
|
||||
<path
|
||||
id="path2257"
|
||||
style="fill:#ffffff;stroke:none;stroke-width:0.0999995;stroke-linecap:round"
|
||||
d="m 19.97109,185.20282 10.735834,-6.19836 c 0.21219,-0.12249 0.502502,-0.0141 0.650911,0.24289 l 0.11208,0.19413 c 0.148409,0.25705 0.107331,0.58244 -0.115118,0.68513 -3.765389,1.73827 -7.326841,3.8345 -10.735835,6.19834 -0.201345,0.13962 -0.502495,0.0141 -0.65091,-0.24287 l -0.112081,-0.19413 c -0.148409,-0.25704 -0.09706,-0.56263 0.115117,-0.68513 z"
|
||||
sodipodi:nodetypes="ssssssssss" />
|
||||
d="m 19.523765,185.51136 11.807216,-7.07896 0.647873,1.12215 c -3.765389,1.73827 -8.398223,4.7151 -11.807217,7.07894 l -0.647874,-1.12213 z"
|
||||
sodipodi:nodetypes="sccccs" />
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
||||
|
Before Width: | Height: | Size: 8.6 KiB After Width: | Height: | Size: 8.3 KiB |
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
|
||||
import torch
|
||||
|
||||
# Default speaker id. Nothing in the visible part of this script reads this
# constant; speaker ids are taken from each utterance's "speaker_id" key.
_SPEAKER_ID = 0
|
||||
|
||||
|
||||
def main() -> None:
    """Benchmark model loading and synthesis for a PyTorch model.

    Reads one JSON object per line from stdin. Each object must contain
    "phoneme_ids" and may contain "speaker_id". The model named by --model is
    loaded with torch.load and switched to eval mode, every utterance is
    synthesized once, and a JSON object with "load_sec" (model load time in
    seconds) and "synthesize_rtf" (one value per utterance, as returned by
    synthesize) is written to stdout.

    If --config is not given, the config path defaults to "<model>.json".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", required=True, help="Path to PyTorch model file"
    )
    parser.add_argument("-c", "--config", help="Path to model config file")
    args = parser.parse_args()

    if not args.config:
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # Skip blank lines (e.g. a trailing newline in piped input) instead of
    # failing in json.loads.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    # NOTE: torch.load unpickles the file, which can execute arbitrary code.
    # Only benchmark model files from a trusted source.
    start_time = time.monotonic_ns()
    model = torch.load(args.model)
    end_time = time.monotonic_ns()

    model.eval()

    load_sec = (end_time - start_time) / 1e9
    synthesize_rtf = [
        synthesize(
            model,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    json.dump(
        {"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
        sys.stdout,
    )
|
||||
|
||||
|
||||
def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
    """Synthesize one utterance and return its real-time factor.

    Args:
        model: Callable invoked as model(text, text_lengths, sid); the first
            element of its return value is converted to a NumPy array and
            used as the audio.
        phoneme_ids: Sequence of integer phoneme ids for one utterance.
        speaker_id: Integer speaker id, or None to pass sid=None to the model.
        sample_rate: Audio sample rate used to convert the sample count into
            seconds.

    Returns:
        Inference time divided by audio duration (lower is faster).
    """
    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
    text_lengths = torch.LongTensor([len(phoneme_ids)])
    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None

    start_time = time.monotonic_ns()
    # Inference only: disabling autograd keeps graph bookkeeping out of the
    # timed region.
    with torch.no_grad():
        audio = model(text, text_lengths, sid)[0].detach().numpy().squeeze()
    end_time = time.monotonic_ns()

    # len(audio) counts array elements (samples), not bytes, so it is divided
    # by the sample rate directly. The previous extra "/ 2" halved the audio
    # duration and doubled the reported real-time factor.
    audio_sec = len(audio) / sample_rate
    infer_sec = (end_time - start_time) / 1e9

    return infer_sec / audio_sec
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
|
||||
import onnxruntime
|
||||
import numpy as np
|
||||
|
||||
# Synthesis settings. The first three constants are packed, in this order,
# into the float32 "scales" array that synthesize() feeds to the ONNX session.
# _SPEAKER_ID is not read anywhere in the visible part of this script.
_NOISE_SCALE = 0.667
|
||||
_LENGTH_SCALE = 1.0
|
||||
_NOISE_W = 0.8
|
||||
_SPEAKER_ID = 0
|
||||
|
||||
|
||||
def main() -> None:
    """Benchmark model loading and synthesis for an ONNX model.

    Reads one JSON object per line from stdin. Each object must contain
    "phoneme_ids" and may contain "speaker_id". An onnxruntime
    InferenceSession is created for --model, every utterance is synthesized
    once, and a JSON object with "load_sec" (session creation time in
    seconds) and "synthesize_rtf" (one value per utterance, as returned by
    synthesize) is written to stdout.

    If --config is not given, the config path defaults to "<model>.json".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    args = parser.parse_args()

    if not args.config:
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # Skip blank lines (e.g. a trailing newline in piped input) instead of
    # failing in json.loads.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    start_time = time.monotonic_ns()
    session = onnxruntime.InferenceSession(args.model)
    end_time = time.monotonic_ns()

    load_sec = (end_time - start_time) / 1e9
    synthesize_rtf = [
        synthesize(
            session,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    json.dump(
        {"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
        sys.stdout,
    )
|
||||
|
||||
|
||||
def synthesize(session, phoneme_ids, speaker_id, sample_rate) -> float:
    """Synthesize one utterance through ONNX and return its real-time factor.

    Args:
        session: Object with an onnxruntime-style run(output_names, feed)
            method; the first output is squeezed and used as the audio.
        phoneme_ids: Sequence of integer phoneme ids for one utterance.
        speaker_id: Integer speaker id, or None to omit the "sid" input.
        sample_rate: Audio sample rate used to convert the sample count into
            seconds.

    Returns:
        Inference time divided by audio duration (lower is faster).
    """
    phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
    phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
    scales = np.array(
        [_NOISE_SCALE, _LENGTH_SCALE, _NOISE_W],
        dtype=np.float32,
    )

    inputs = {
        "input": phoneme_ids_array,
        "input_lengths": phoneme_ids_lengths,
        "scales": scales,
    }

    # Only feed "sid" when a speaker id was given, rather than relying on the
    # runtime to ignore a None entry in the feed.
    if speaker_id is not None:
        inputs["sid"] = np.array([speaker_id], dtype=np.int64)

    # Synthesize through Onnx
    start_time = time.monotonic_ns()
    audio = session.run(None, inputs)[0].squeeze()
    end_time = time.monotonic_ns()

    # len(audio) counts array elements (samples), not bytes, so it is divided
    # by the sample rate directly. The previous extra "/ 2" halved the audio
    # duration and doubled the reported real-time factor.
    audio_sec = len(audio) / sample_rate
    infer_sec = (end_time - start_time) / 1e9

    return infer_sec / audio_sec
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,2 @@
|
||||
onnxruntime~=1.11.0
|
||||
torch~=1.11.0
|
||||
Reference in New Issue
Block a user