Files
piper/src/python_run/larynx/__init__.py
2023-02-26 23:03:11 -06:00

135 lines
4.1 KiB
Python

import io
import json
import wave
from dataclasses import dataclass
from pathlib import Path
from typing import List, Mapping, Optional, Sequence, Union
import numpy as np
import onnxruntime
from espeak_phonemizer import Phonemizer
_BOS = "^"
_EOS = "$"
_PAD = "_"
@dataclass
class LarynxConfig:
num_symbols: int
num_speakers: int
sample_rate: int
espeak_voice: str
length_scale: float
noise_scale: float
noise_w: float
phoneme_id_map: Mapping[str, Sequence[int]]
class Larynx:
def __init__(
self,
model_path: Union[str, Path],
config_path: Optional[Union[str, Path]] = None,
use_cuda: bool = False,
):
if config_path is None:
config_path = f"{model_path}.json"
self.config = load_config(config_path)
self.phonemizer = Phonemizer(self.config.espeak_voice)
self.model = onnxruntime.InferenceSession(
str(model_path),
sess_options=onnxruntime.SessionOptions(),
providers=None if not use_cuda else ["CUDAExecutionProvider"],
)
def synthesize(
self,
text: str,
speaker_id: Optional[int] = None,
length_scale: Optional[float] = None,
noise_scale: Optional[float] = None,
noise_w: Optional[float] = None,
) -> bytes:
"""Synthesize WAV audio from text."""
if length_scale is None:
length_scale = self.config.length_scale
if noise_scale is None:
noise_scale = self.config.noise_scale
if noise_w is None:
noise_w = self.config.noise_w
phonemes_str = self.phonemizer.phonemize(text)
phonemes = [_BOS] + list(phonemes_str)
phoneme_ids: List[int] = []
for phoneme in phonemes:
phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
phoneme_ids.extend(self.config.phoneme_id_map[_PAD])
phoneme_ids.extend(self.config.phoneme_id_map[_EOS])
phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
scales = np.array(
[noise_scale, length_scale, noise_w],
dtype=np.float32,
)
sid = None
if speaker_id is not None:
sid = np.array([speaker_id], dtype=np.int64)
# Synthesize through Onnx
audio = self.model.run(
None,
{
"input": phoneme_ids_array,
"input_lengths": phoneme_ids_lengths,
"scales": scales,
"sid": sid,
},
)[0].squeeze((0, 1))
audio = audio_float_to_int16(audio.squeeze())
# Convert to WAV
with io.BytesIO() as wav_io:
wav_file: wave.Wave_write = wave.open(wav_io, "wb")
with wav_file:
wav_file.setframerate(self.config.sample_rate)
wav_file.setsampwidth(2)
wav_file.setnchannels(1)
wav_file.writeframes(audio.tobytes())
return wav_io.getvalue()
def load_config(config_path: Union[str, Path]) -> LarynxConfig:
with open(config_path, "r", encoding="utf-8") as config_file:
config_dict = json.load(config_file)
inference = config_dict.get("inference", {})
return LarynxConfig(
num_symbols=config_dict["num_symbols"],
num_speakers=config_dict["num_speakers"],
sample_rate=config_dict["audio"]["sample_rate"],
espeak_voice=config_dict["espeak"]["voice"],
noise_scale=inference.get("noise_scale", 0.667),
length_scale=inference.get("length_scale", 1.0),
noise_w=inference.get("noise_w", 0.8),
phoneme_id_map=config_dict["phoneme_id_map"],
)
def audio_float_to_int16(
audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
"""Normalize audio and convert to int16 range"""
audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
audio_norm = audio_norm.astype("int16")
return audio_norm