Add benchmark

This commit is contained in:
Michael Hansen
2023-04-09 20:15:05 -05:00
parent cf37ad7e22
commit 1eed98ecd9
3 changed files with 168 additions and 0 deletions
+78
View File
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
import argparse
import json
import time
import sys
import torch
# Not referenced by the code visible in this script: each utterance supplies
# its own optional "speaker_id" instead.
_SPEAKER_ID = 0
def main() -> None:
    """Benchmark a PyTorch model: time the load, then every utterance.

    Reads one JSON object per line from stdin. Each object must contain a
    "phoneme_ids" entry and may contain a "speaker_id" entry. Writes a single
    JSON object to stdout with:

    * "load_sec": seconds spent inside ``torch.load``
    * "synthesize_rtf": one real-time factor per utterance, in input order
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", required=True, help="Path to PyTorch model file"
    )
    parser.add_argument("-c", "--config", help="Path to model config file")
    args = parser.parse_args()

    if not args.config:
        # Default to the config that sits next to the model: <model>.json
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # Parse all input up front so JSON decoding never lands in a timed region.
    # Blank lines (e.g. a trailing newline) are skipped instead of crashing
    # json.loads.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only benchmark model files from a trusted source.
    start_time = time.monotonic_ns()
    model = torch.load(args.model)
    end_time = time.monotonic_ns()

    # Switching to eval mode is deliberately outside the timed load.
    model.eval()

    load_sec = (end_time - start_time) / 1e9

    synthesize_rtf = [
        synthesize(
            model,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    json.dump(
        {"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
        sys.stdout,
    )
def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one inference and return its real-time factor.

    Args:
        model: Callable invoked as ``model(text, text_lengths, sid)``; the
            first element of its result is taken as the audio tensor.
        phoneme_ids: Sequence of integer phoneme ids for one utterance.
        speaker_id: Integer speaker id, or ``None`` to pass no speaker.
        sample_rate: Audio sample rate in Hz, used to turn the number of
            generated samples into seconds.

    Returns:
        Inference wall-clock seconds divided by seconds of audio produced.
    """
    # Batch of one: shape (1, len(phoneme_ids)).
    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
    text_lengths = torch.LongTensor([len(phoneme_ids)])
    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None

    start_time = time.monotonic_ns()
    # This is pure inference, so skip autograd bookkeeping while timing.
    with torch.no_grad():
        audio = (
            model(
                text,
                text_lengths,
                sid,
            )[0]
            .detach()
            .numpy()
            .squeeze()
        )
    end_time = time.monotonic_ns()

    # Assumes the output holds one element per audio sample (not bytes), so
    # the duration is sample count / sample rate. The previous extra "/ 2"
    # halved the duration and doubled the reported real-time factor.
    # ``size`` also works if squeeze() collapses the result to a 0-d array.
    audio_sec = audio.size / sample_rate
    infer_sec = (end_time - start_time) / 1e9

    return infer_sec / audio_sec
if __name__ == "__main__":
    # Run the benchmark only when executed as a script, not when imported.
    main()
+88
View File
@@ -0,0 +1,88 @@
#!/usr/bin/env python3
import argparse
import json
import time
import sys
import onnxruntime
import numpy as np
# Packed, in this order, into the float32 "scales" input that synthesize()
# feeds to the ONNX session.
_NOISE_SCALE = 0.667
_LENGTH_SCALE = 1.0
_NOISE_W = 0.8
# Not referenced by the code visible in this script: each utterance supplies
# its own optional "speaker_id" instead.
_SPEAKER_ID = 0
def main() -> None:
    """Benchmark an ONNX model: time session creation, then every utterance.

    Reads one JSON object per line from stdin. Each object must contain a
    "phoneme_ids" entry and may contain a "speaker_id" entry. Writes a single
    JSON object to stdout with:

    * "load_sec": seconds spent constructing the ``InferenceSession``
    * "synthesize_rtf": one real-time factor per utterance, in input order
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    args = parser.parse_args()

    if not args.config:
        # Default to the config that sits next to the model: <model>.json
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # Parse all input up front so JSON decoding never lands in a timed region.
    # Blank lines (e.g. a trailing newline) are skipped instead of crashing
    # json.loads.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    start_time = time.monotonic_ns()
    session = onnxruntime.InferenceSession(args.model)
    end_time = time.monotonic_ns()

    load_sec = (end_time - start_time) / 1e9

    synthesize_rtf = [
        synthesize(
            session,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    json.dump(
        {"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
        sys.stdout,
    )
def synthesize(session, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one inference through the ONNX session and return its real-time factor.

    Args:
        session: Object whose ``run(None, feed)`` returns a sequence; the
            first element is taken as the audio array.
        phoneme_ids: Sequence of integer phoneme ids for one utterance.
        speaker_id: Integer speaker id, or ``None`` to feed no speaker.
        sample_rate: Audio sample rate in Hz, used to turn the number of
            generated samples into seconds.

    Returns:
        Inference wall-clock seconds divided by seconds of audio produced.
    """
    # Batch of one: shape (1, len(phoneme_ids)).
    phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
    phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
    scales = np.array(
        [_NOISE_SCALE, _LENGTH_SCALE, _NOISE_W],
        dtype=np.float32,
    )

    feed = {
        "input": phoneme_ids_array,
        "input_lengths": phoneme_ids_lengths,
        "scales": scales,
    }

    if speaker_id is not None:
        # Only feed "sid" when a speaker was requested, so the feed never
        # carries a None value.
        feed["sid"] = np.array([speaker_id], dtype=np.int64)

    # Synthesize through Onnx
    start_time = time.monotonic_ns()
    audio = session.run(None, feed)[0].squeeze()
    end_time = time.monotonic_ns()

    # Assumes the output holds one element per audio sample (not bytes), so
    # the duration is sample count / sample rate. The previous extra "/ 2"
    # halved the duration and doubled the reported real-time factor.
    # ``size`` also works if squeeze() collapses the result to a 0-d array.
    audio_sec = audio.size / sample_rate
    infer_sec = (end_time - start_time) / 1e9

    return infer_sec / audio_sec
if __name__ == "__main__":
    # Run the benchmark only when executed as a script, not when imported.
    main()
+2
View File
@@ -0,0 +1,2 @@
onnxruntime~=1.11.0
torch~=1.11.0