More benchmarks

This commit is contained in:
Michael Hansen
2023-04-10 14:37:34 -05:00
parent 082e64a53c
commit c945c141f1
4 changed files with 120 additions and 12 deletions

View File

@@ -6,13 +6,13 @@ import sys
import torch
_SPEAKER_ID = 0
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
parser.add_argument("-c", "--config", help="Path to model config file")
parser.add_argument(
"-m", "--model", required=True, help="Path to generator file (.pt)"
)
parser.add_argument("-c", "--config", help="Path to model config file (.json)")
args = parser.parse_args()
if not args.config:

View File

@@ -2,6 +2,7 @@
import argparse
import json
import time
import statistics
import sys
import onnxruntime
@@ -10,13 +11,14 @@ import numpy as np
_NOISE_SCALE = 0.667
_LENGTH_SCALE = 1.0
_NOISE_W = 0.8
_SPEAKER_ID = 0
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
parser.add_argument("-c", "--config", help="Path to model config file")
parser.add_argument(
"-m", "--model", required=True, help="Path to Onnx model file (.onnx)"
)
parser.add_argument("-c", "--config", help="Path to model config file (.json)")
args = parser.parse_args()
if not args.config:
@@ -29,7 +31,25 @@ def main() -> None:
utterances = [json.loads(line) for line in sys.stdin]
start_time = time.monotonic_ns()
session = onnxruntime.InferenceSession(args.model)
session_options = onnxruntime.SessionOptions()
session_options.graph_optimization_level = (
onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
)
# session_options.enable_cpu_mem_arena = False
# session_options.enable_mem_pattern = False
session_options.enable_mem_reuse = False
# session_options.enable_profiling = False
# session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
# session_options.execution_order = onnxruntime.ExecutionOrder.PRIORITY_BASED
session = onnxruntime.InferenceSession(
args.model,
sess_options=session_options,
)
# session.intra_op_num_threads = 1
# session.inter_op_num_threads = 1
end_time = time.monotonic_ns()
load_sec = (end_time - start_time) / 1e9
@@ -47,7 +67,12 @@ def main() -> None:
)
json.dump(
{"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
{
"load_sec": load_sec,
"rtf_mean": statistics.mean(synthesize_rtf),
"rtf_stdev": statistics.stdev(synthesize_rtf),
"rtfs": synthesize_rtf,
},
sys.stdout,
)

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
import argparse
import json
import time
import sys
import torch
# Default inference hyperparameters passed to the model (names match the
# Onnx benchmark script; presumably standard VITS synthesis settings —
# verify exact semantics against the voice model's documentation).
_NOISE_SCALE = 0.667  # noise added during inference
_LENGTH_SCALE = 1.0  # phoneme duration multiplier (1.0 = normal speed)
_NOISE_W = 0.8  # noise applied to phoneme durations
def main() -> None:
    """Benchmark a Torchscript Piper voice model.

    Reads utterances as JSON lines from stdin (each with "phoneme_ids" and
    an optional "speaker_id"), loads the Torchscript model, synthesizes each
    utterance, and writes a JSON report to stdout containing the model load
    time and real-time factors (RTF).

    The report keeps the original "synthesize_rtf" list for backward
    compatibility and additionally reports "rtf_mean"/"rtf_stdev" so the
    output matches the Onnx benchmark script.
    """
    # Local import: only needed here for the summary statistics.
    import statistics

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", required=True, help="Path to Torchscript file (.ts)"
    )
    parser.add_argument("-c", "--config", help="Path to model config file (.json)")
    args = parser.parse_args()

    if not args.config:
        # Default: config sits next to the model as "<model>.json"
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)
        sample_rate = config["audio"]["sample_rate"]

    # Read all utterances up front so stdin parsing is not timed.
    utterances = [json.loads(line) for line in sys.stdin]

    # Time only the model load; eval() is intentionally outside the window,
    # matching the original measurement.
    start_time = time.monotonic_ns()
    model = torch.jit.load(args.model)
    end_time = time.monotonic_ns()
    model.eval()
    load_sec = (end_time - start_time) / 1e9

    synthesize_rtf = [
        synthesize(
            model,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    json.dump(
        {
            "load_sec": load_sec,
            # Original key, kept for backward compatibility with consumers.
            "synthesize_rtf": synthesize_rtf,
            # Summary stats, consistent with the Onnx benchmark output.
            # mean() raises on empty input and stdev() needs >= 2 samples,
            # so guard both (e.g. benchmark run with 0 or 1 utterances).
            "rtf_mean": (
                statistics.mean(synthesize_rtf) if synthesize_rtf else 0.0
            ),
            "rtf_stdev": (
                statistics.stdev(synthesize_rtf)
                if len(synthesize_rtf) > 1
                else 0.0
            ),
        },
        sys.stdout,
    )
def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one inference and return its real-time factor (RTF).

    RTF = inference seconds / seconds of audio produced; lower is faster
    (RTF < 1 means faster than real time).

    Args:
        model: Torchscript model called as
            model(text, text_lengths, sid, noise_scale, length_scale, noise_w),
            whose first output is the audio tensor.
        phoneme_ids: sequence of int phoneme ids for one utterance.
        speaker_id: optional int speaker id (None for single-speaker models).
        sample_rate: output sample rate in Hz.

    Returns:
        The RTF for this single inference as a float.
    """
    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
    text_lengths = torch.LongTensor([len(phoneme_ids)])
    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None

    start_time = time.monotonic_ns()
    # no_grad: this is pure inference; avoid timing autograd bookkeeping.
    with torch.no_grad():
        audio = (
            model(
                text,
                text_lengths,
                sid,
                torch.FloatTensor([_NOISE_SCALE]),
                torch.FloatTensor([_LENGTH_SCALE]),
                torch.FloatTensor([_NOISE_W]),
            )[0]
            .detach()
            .numpy()
            .squeeze()
        )
    end_time = time.monotonic_ns()

    # BUG FIX: `audio` is a float numpy array of samples, so len(audio) is
    # already the sample count. The original divided by 2 — a leftover from
    # counting 16-bit PCM *bytes* — which halved audio_sec and doubled the
    # reported RTF.
    audio_sec = len(audio) / sample_rate
    infer_sec = (end_time - start_time) / 1e9
    return infer_sec / audio_sec
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    main()