diff --git a/README.md b/README.md
index 63e7ef5..ab03d2b 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@

-A fast, local neural text to speech system that is meant to sound good and run reasonably fast on the Raspberry Pi 4.
+A fast, local neural text-to-speech system that sounds great and is optimized for the Raspberry Pi 4.
``` sh
echo 'Welcome to the world of speech synthesis!' | \
diff --git a/etc/logo.png b/etc/logo.png
index fbb8705..a3ea7d2 100644
Binary files a/etc/logo.png and b/etc/logo.png differ
diff --git a/etc/logo.svg b/etc/logo.svg
index 1943a5e..99c73b5 100644
--- a/etc/logo.svg
+++ b/etc/logo.svg
@@ -26,15 +26,15 @@
borderopacity="1.0"
inkscape:pageopacity="1"
inkscape:pageshadow="2"
- inkscape:zoom="1.8469919"
- inkscape:cx="164.97755"
- inkscape:cy="48.418276"
+ inkscape:zoom="1.421213"
+ inkscape:cx="-23.774381"
+ inkscape:cy="33.944028"
inkscape:document-units="mm"
inkscape:current-layer="layer1"
inkscape:document-rotation="0"
showgrid="false"
- inkscape:window-width="1920"
- inkscape:window-height="1012"
+ inkscape:window-width="1280"
+ inkscape:window-height="653"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="1"
@@ -50,7 +50,7 @@
image/svg+xml
-
+
@@ -144,8 +144,8 @@
+ d="m 19.523765,185.51136 11.807216,-7.07896 0.647873,1.12215 c -3.765389,1.73827 -8.398223,4.7151 -11.807217,7.07894 l -0.647874,-1.12213 z"
+ sodipodi:nodetypes="sccccs" />
diff --git a/src/benchmark/benchmark_generator.py b/src/benchmark/benchmark_generator.py
new file mode 100644
index 0000000..bfb70be
--- /dev/null
+++ b/src/benchmark/benchmark_generator.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import time
+import sys
+
+import torch
+
+_SPEAKER_ID = 0
+
+
+def main() -> None:
+    """Benchmark a PyTorch generator: time the model load and each synthesis.
+
+    Reads one JSON object per line from stdin. Each object must contain
+    "phoneme_ids" and may contain "speaker_id". Writes one JSON object to
+    stdout with "load_sec" (seconds spent in torch.load) and
+    "synthesize_rtf" (one synthesize() result per utterance).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m",
+        "--model",
+        required=True,
+        help="Path to PyTorch generator model file",
+    )
+    parser.add_argument("-c", "--config", help="Path to model config file")
+    args = parser.parse_args()
+
+    if not args.config:
+        # Fall back to "<model path>.json" when no config path is given.
+        args.config = f"{args.model}.json"
+
+    with open(args.config, "r", encoding="utf-8") as config_file:
+        config = json.load(config_file)
+
+    sample_rate = config["audio"]["sample_rate"]
+
+    # Skip blank lines (such as a trailing newline) so they do not make
+    # json.loads raise.
+    utterances = [json.loads(line) for line in sys.stdin if line.strip()]
+
+    # NOTE(review): torch.load unpickles the file, which can execute
+    # arbitrary code; only benchmark model files from a trusted source.
+    start_time = time.monotonic_ns()
+    model = torch.load(args.model)
+    end_time = time.monotonic_ns()
+
+    model.eval()
+
+    load_sec = (end_time - start_time) / 1e9
+    synthesize_rtf = [
+        synthesize(
+            model,
+            utterance["phoneme_ids"],
+            utterance.get("speaker_id"),
+            sample_rate,
+        )
+        for utterance in utterances
+    ]
+
+    json.dump(
+        {"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
+        sys.stdout,
+    )
+
+
+def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
+    """Run one utterance through the model and return infer time / audio time.
+
+    Args:
+        model: Callable invoked as model(text, text_lengths, sid); the first
+            element of its result is used as the audio tensor.
+        phoneme_ids: Sequence of integer phoneme ids for one utterance.
+        speaker_id: Integer speaker id, or None to pass sid=None to the model.
+        sample_rate: Rate used to convert the audio length to seconds.
+
+    Returns:
+        Seconds spent in the model call divided by the computed audio seconds.
+    """
+    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
+    text_lengths = torch.LongTensor([len(phoneme_ids)])
+    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None
+
+    # Inference only: with autograd disabled, no graph bookkeeping is
+    # included in the measured time.
+    with torch.no_grad():
+        start_time = time.monotonic_ns()
+        audio = model(text, text_lengths, sid)[0].detach().numpy().squeeze()
+        end_time = time.monotonic_ns()
+
+    # NOTE(review): len(audio) is halved before dividing by sample_rate. That
+    # would fit a byte count of 16-bit samples, but `audio` here looks like an
+    # array of samples -- confirm the factor of 2 is intended.
+    audio_sec = (len(audio) / 2) / sample_rate
+    infer_sec = (end_time - start_time) / 1e9
+
+    return infer_sec / audio_sec
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/benchmark/benchmark_onnx.py b/src/benchmark/benchmark_onnx.py
new file mode 100644
index 0000000..22426cd
--- /dev/null
+++ b/src/benchmark/benchmark_onnx.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import time
+import sys
+
+import onnxruntime
+import numpy as np
+
+_NOISE_SCALE = 0.667
+_LENGTH_SCALE = 1.0
+_NOISE_W = 0.8
+_SPEAKER_ID = 0
+
+
+def main() -> None:
+    """Benchmark an Onnx model: time session creation and each synthesis.
+
+    Reads one JSON object per line from stdin. Each object must contain
+    "phoneme_ids" and may contain "speaker_id". Writes one JSON object to
+    stdout with "load_sec" (seconds spent creating the InferenceSession) and
+    "synthesize_rtf" (one synthesize() result per utterance).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
+    parser.add_argument("-c", "--config", help="Path to model config file")
+    args = parser.parse_args()
+
+    if not args.config:
+        # Fall back to "<model path>.json" when no config path is given.
+        args.config = f"{args.model}.json"
+
+    with open(args.config, "r", encoding="utf-8") as config_file:
+        config = json.load(config_file)
+
+    sample_rate = config["audio"]["sample_rate"]
+
+    # Skip blank lines (such as a trailing newline) so they do not make
+    # json.loads raise.
+    utterances = [json.loads(line) for line in sys.stdin if line.strip()]
+
+    start_time = time.monotonic_ns()
+    session = onnxruntime.InferenceSession(args.model)
+    end_time = time.monotonic_ns()
+
+    load_sec = (end_time - start_time) / 1e9
+    synthesize_rtf = [
+        synthesize(
+            session,
+            utterance["phoneme_ids"],
+            utterance.get("speaker_id"),
+            sample_rate,
+        )
+        for utterance in utterances
+    ]
+
+    json.dump(
+        {"load_sec": load_sec, "synthesize_rtf": synthesize_rtf},
+        sys.stdout,
+    )
+
+
+def synthesize(session, phoneme_ids, speaker_id, sample_rate) -> float:
+    """Run one utterance through the session and return infer time / audio time.
+
+    Args:
+        session: Object whose run(output_names, input_feed) is called; the
+            first element of its result is used as the audio array.
+        phoneme_ids: Sequence of integer phoneme ids for one utterance.
+        speaker_id: Integer speaker id, or None to leave "sid" out of the feed.
+        sample_rate: Rate used to convert the audio length to seconds.
+
+    Returns:
+        Seconds spent in session.run divided by the computed audio seconds.
+    """
+    phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+    phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
+    scales = np.array(
+        [_NOISE_SCALE, _LENGTH_SCALE, _NOISE_W],
+        dtype=np.float32,
+    )
+
+    inputs = {
+        "input": phoneme_ids_array,
+        "input_lengths": phoneme_ids_lengths,
+        "scales": scales,
+    }
+
+    if speaker_id is not None:
+        # Feed "sid" only when a speaker id was given, rather than passing
+        # None as an input value.
+        inputs["sid"] = np.array([speaker_id], dtype=np.int64)
+
+    # Synthesize through Onnx
+    start_time = time.monotonic_ns()
+    audio = session.run(None, inputs)[0].squeeze()
+    end_time = time.monotonic_ns()
+
+    # NOTE(review): len(audio) is halved before dividing by sample_rate. That
+    # would fit a byte count of 16-bit samples, but `audio` here looks like an
+    # array of samples -- confirm the factor of 2 is intended.
+    audio_sec = (len(audio) / 2) / sample_rate
+    infer_sec = (end_time - start_time) / 1e9
+
+    return infer_sec / audio_sec
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/benchmark/requirements.txt b/src/benchmark/requirements.txt
new file mode 100644
index 0000000..26f8d83
--- /dev/null
+++ b/src/benchmark/requirements.txt
@@ -0,0 +1,2 @@
+onnxruntime~=1.11.0
+torch~=1.11.0