mirror of
https://github.com/pstrueb/piper.git
synced 2026-05-22 21:28:01 +00:00
Update Python package
This commit is contained in:
20
README.md
20
README.md
@@ -5,7 +5,7 @@ Piper is used in a [variety of projects](#people-using-piper).
|
|||||||
|
|
||||||
``` sh
|
``` sh
|
||||||
echo 'Welcome to the world of speech synthesis!' | \
|
echo 'Welcome to the world of speech synthesis!' | \
|
||||||
./piper --model en-us-blizzard_lessac-medium.onnx --output_file welcome.wav
|
./piper --model en_US-lessac-medium.onnx --output_file welcome.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
[Listen to voice samples](https://rhasspy.github.io/piper-samples) and check out a [video tutorial by Thorsten Müller](https://youtu.be/rjq5eZoWWSo)
|
[Listen to voice samples](https://rhasspy.github.io/piper-samples) and check out a [video tutorial by Thorsten Müller](https://youtu.be/rjq5eZoWWSo)
|
||||||
@@ -54,7 +54,7 @@ The `MODEL_CARD` file for each voice contains important licensing information. P
|
|||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
Download a release:
|
You can [run Piper with Python](#running-in-python) or download a binary release:
|
||||||
|
|
||||||
* [amd64](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_amd64.tar.gz) (64-bit desktop Linux)
|
* [amd64](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_amd64.tar.gz) (64-bit desktop Linux)
|
||||||
* [arm64](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_arm64.tar.gz) (64-bit Raspberry Pi 4)
|
* [arm64](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_arm64.tar.gz) (64-bit Raspberry Pi 4)
|
||||||
@@ -131,14 +131,22 @@ Pretrained checkpoints are available on [Hugging Face](https://huggingface.co/da
|
|||||||
|
|
||||||
See [src/python_run](src/python_run)
|
See [src/python_run](src/python_run)
|
||||||
|
|
||||||
Run `scripts/setup.sh` to create a virtual environment and install the requirements. Then run:
|
Install with `pip`:
|
||||||
|
|
||||||
``` sh
|
``` sh
|
||||||
echo 'Welcome to the world of speech synthesis!' | scripts/piper \
|
pip install piper-tts
|
||||||
--model /path/to/voice.onnx \
|
```
|
||||||
|
|
||||||
|
and then run:
|
||||||
|
|
||||||
|
``` sh
|
||||||
|
echo 'Welcome to the world of speech synthesis!' | piper \
|
||||||
|
--model en_US-lessac-medium \
|
||||||
--output_file welcome.wav
|
--output_file welcome.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
|
This will automatically download [voice files](https://huggingface.co/rhasspy/piper-voices/tree/v1.0.0) the first time they're used. Use `--data-dir` and `--download-dir` to adjust where voices are found/downloaded.
|
||||||
|
|
||||||
If you'd like to use a GPU, install the `onnxruntime-gpu` package:
|
If you'd like to use a GPU, install the `onnxruntime-gpu` package:
|
||||||
|
|
||||||
|
|
||||||
@@ -146,5 +154,5 @@ If you'd like to use a GPU, install the `onnxruntime-gpu` package:
|
|||||||
.venv/bin/pip3 install onnxruntime-gpu
|
.venv/bin/pip3 install onnxruntime-gpu
|
||||||
```
|
```
|
||||||
|
|
||||||
and then run `scripts/piper` with the `--cuda` argument. You will need to have a functioning CUDA environment, such as what's available in [NVIDIA's PyTorch containers](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch).
|
and then run `piper` with the `--cuda` argument. You will need to have a functioning CUDA environment, such as what's available in [NVIDIA's PyTorch containers](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch).
|
||||||
|
|
||||||
|
|||||||
3
src/python_run/.gitignore
vendored
Normal file
3
src/python_run/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
build/
|
||||||
|
dist/
|
||||||
|
*.egg-info/
|
||||||
2
src/python_run/MANIFEST.in
Normal file
2
src/python_run/MANIFEST.in
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
include requirements.txt
|
||||||
|
include piper/voices.json
|
||||||
@@ -1,147 +1,5 @@
|
|||||||
import io
|
from .voice import PiperVoice
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import wave
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import List, Mapping, Optional, Sequence, Union
|
|
||||||
|
|
||||||
import numpy as np
|
__all__ = [
|
||||||
import onnxruntime
|
"PiperVoice",
|
||||||
from espeak_phonemizer import Phonemizer
|
]
|
||||||
|
|
||||||
_LOGGER = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_BOS = "^"
|
|
||||||
_EOS = "$"
|
|
||||||
_PAD = "_"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class PiperConfig:
|
|
||||||
num_symbols: int
|
|
||||||
num_speakers: int
|
|
||||||
sample_rate: int
|
|
||||||
espeak_voice: str
|
|
||||||
length_scale: float
|
|
||||||
noise_scale: float
|
|
||||||
noise_w: float
|
|
||||||
phoneme_id_map: Mapping[str, Sequence[int]]
|
|
||||||
|
|
||||||
|
|
||||||
class Piper:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
model_path: Union[str, Path],
|
|
||||||
config_path: Optional[Union[str, Path]] = None,
|
|
||||||
use_cuda: bool = False,
|
|
||||||
):
|
|
||||||
if config_path is None:
|
|
||||||
config_path = f"{model_path}.json"
|
|
||||||
|
|
||||||
self.config = load_config(config_path)
|
|
||||||
self.phonemizer = Phonemizer(self.config.espeak_voice)
|
|
||||||
self.model = onnxruntime.InferenceSession(
|
|
||||||
str(model_path),
|
|
||||||
sess_options=onnxruntime.SessionOptions(),
|
|
||||||
providers=["CPUExecutionProvider"]
|
|
||||||
if not use_cuda
|
|
||||||
else ["CUDAExecutionProvider"],
|
|
||||||
)
|
|
||||||
|
|
||||||
def synthesize(
|
|
||||||
self,
|
|
||||||
text: str,
|
|
||||||
speaker_id: Optional[int] = None,
|
|
||||||
length_scale: Optional[float] = None,
|
|
||||||
noise_scale: Optional[float] = None,
|
|
||||||
noise_w: Optional[float] = None,
|
|
||||||
) -> bytes:
|
|
||||||
"""Synthesize WAV audio from text."""
|
|
||||||
if length_scale is None:
|
|
||||||
length_scale = self.config.length_scale
|
|
||||||
|
|
||||||
if noise_scale is None:
|
|
||||||
noise_scale = self.config.noise_scale
|
|
||||||
|
|
||||||
if noise_w is None:
|
|
||||||
noise_w = self.config.noise_w
|
|
||||||
|
|
||||||
phonemes_str = self.phonemizer.phonemize(text)
|
|
||||||
phonemes = [_BOS] + list(phonemes_str)
|
|
||||||
phoneme_ids: List[int] = []
|
|
||||||
|
|
||||||
for phoneme in phonemes:
|
|
||||||
if phoneme in self.config.phoneme_id_map:
|
|
||||||
phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
|
|
||||||
phoneme_ids.extend(self.config.phoneme_id_map[_PAD])
|
|
||||||
else:
|
|
||||||
_LOGGER.warning("No id for phoneme: %s", phoneme)
|
|
||||||
|
|
||||||
phoneme_ids.extend(self.config.phoneme_id_map[_EOS])
|
|
||||||
|
|
||||||
phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
|
|
||||||
phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
|
|
||||||
scales = np.array(
|
|
||||||
[noise_scale, length_scale, noise_w],
|
|
||||||
dtype=np.float32,
|
|
||||||
)
|
|
||||||
|
|
||||||
if (self.config.num_speakers > 1) and (speaker_id is None):
|
|
||||||
# Default speaker
|
|
||||||
speaker_id = 0
|
|
||||||
|
|
||||||
sid = None
|
|
||||||
|
|
||||||
if speaker_id is not None:
|
|
||||||
sid = np.array([speaker_id], dtype=np.int64)
|
|
||||||
|
|
||||||
# Synthesize through Onnx
|
|
||||||
audio = self.model.run(
|
|
||||||
None,
|
|
||||||
{
|
|
||||||
"input": phoneme_ids_array,
|
|
||||||
"input_lengths": phoneme_ids_lengths,
|
|
||||||
"scales": scales,
|
|
||||||
"sid": sid,
|
|
||||||
},
|
|
||||||
)[0].squeeze((0, 1))
|
|
||||||
audio = audio_float_to_int16(audio.squeeze())
|
|
||||||
|
|
||||||
# Convert to WAV
|
|
||||||
with io.BytesIO() as wav_io:
|
|
||||||
wav_file: wave.Wave_write = wave.open(wav_io, "wb")
|
|
||||||
with wav_file:
|
|
||||||
wav_file.setframerate(self.config.sample_rate)
|
|
||||||
wav_file.setsampwidth(2)
|
|
||||||
wav_file.setnchannels(1)
|
|
||||||
wav_file.writeframes(audio.tobytes())
|
|
||||||
|
|
||||||
return wav_io.getvalue()
|
|
||||||
|
|
||||||
|
|
||||||
def load_config(config_path: Union[str, Path]) -> PiperConfig:
|
|
||||||
with open(config_path, "r", encoding="utf-8") as config_file:
|
|
||||||
config_dict = json.load(config_file)
|
|
||||||
inference = config_dict.get("inference", {})
|
|
||||||
|
|
||||||
return PiperConfig(
|
|
||||||
num_symbols=config_dict["num_symbols"],
|
|
||||||
num_speakers=config_dict["num_speakers"],
|
|
||||||
sample_rate=config_dict["audio"]["sample_rate"],
|
|
||||||
espeak_voice=config_dict["espeak"]["voice"],
|
|
||||||
noise_scale=inference.get("noise_scale", 0.667),
|
|
||||||
length_scale=inference.get("length_scale", 1.0),
|
|
||||||
noise_w=inference.get("noise_w", 0.8),
|
|
||||||
phoneme_id_map=config_dict["phoneme_id_map"],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def audio_float_to_int16(
|
|
||||||
audio: np.ndarray, max_wav_value: float = 32767.0
|
|
||||||
) -> np.ndarray:
|
|
||||||
"""Normalize audio and convert to int16 range"""
|
|
||||||
audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
|
|
||||||
audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
|
|
||||||
audio_norm = audio_norm.astype("int16")
|
|
||||||
return audio_norm
|
|
||||||
|
|||||||
@@ -2,10 +2,12 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from functools import partial
|
import wave
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict
|
||||||
|
|
||||||
from . import Piper
|
from . import PiperVoice
|
||||||
|
from .download import ensure_voice_exists, find_voice, get_voices
|
||||||
|
|
||||||
_FILE = Path(__file__)
|
_FILE = Path(__file__)
|
||||||
_DIR = _FILE.parent
|
_DIR = _FILE.parent
|
||||||
@@ -17,33 +19,108 @@ def main() -> None:
|
|||||||
parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
|
parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
|
||||||
parser.add_argument("-c", "--config", help="Path to model config file")
|
parser.add_argument("-c", "--config", help="Path to model config file")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-f", "--output_file", help="Path to output WAV file (default: stdout)"
|
"-f",
|
||||||
|
"--output-file",
|
||||||
|
"--output_file",
|
||||||
|
help="Path to output WAV file (default: stdout)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-d", "--output_dir", help="Path to output directory (default: cwd)"
|
"-d",
|
||||||
|
"--output-dir",
|
||||||
|
"--output_dir",
|
||||||
|
help="Path to output directory (default: cwd)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-raw",
|
||||||
|
"--output_raw",
|
||||||
|
action="store_true",
|
||||||
|
help="Stream raw audio to stdout",
|
||||||
|
)
|
||||||
|
#
|
||||||
parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
|
parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
|
||||||
parser.add_argument("--noise-scale", type=float, help="Generator noise")
|
parser.add_argument(
|
||||||
parser.add_argument("--length-scale", type=float, help="Phoneme length")
|
"--length-scale", "--length_scale", type=float, help="Phoneme length"
|
||||||
parser.add_argument("--noise-w", type=float, help="Phoneme width noise")
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--noise-scale", "--noise_scale", type=float, help="Generator noise"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--noise-w", "--noise_w", type=float, help="Phoneme width noise"
|
||||||
|
)
|
||||||
|
#
|
||||||
parser.add_argument("--cuda", action="store_true", help="Use GPU")
|
parser.add_argument("--cuda", action="store_true", help="Use GPU")
|
||||||
#
|
#
|
||||||
|
parser.add_argument(
|
||||||
|
"--sentence-silence",
|
||||||
|
"--sentence_silence",
|
||||||
|
type=float,
|
||||||
|
default=0.0,
|
||||||
|
help="Seconds of silence after each sentence",
|
||||||
|
)
|
||||||
|
#
|
||||||
|
parser.add_argument(
|
||||||
|
"--data-dir",
|
||||||
|
"--data_dir",
|
||||||
|
action="append",
|
||||||
|
default=[str(Path.cwd())],
|
||||||
|
help="Data directory to check for downloaded models (default: current directory)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--download-dir",
|
||||||
|
"--download_dir",
|
||||||
|
help="Directory to download voices into (default: first data dir)",
|
||||||
|
)
|
||||||
|
#
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--debug", action="store_true", help="Print DEBUG messages to console"
|
"--debug", action="store_true", help="Print DEBUG messages to console"
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
|
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
|
||||||
|
_LOGGER.debug(args)
|
||||||
|
|
||||||
voice = Piper(args.model, config_path=args.config, use_cuda=args.cuda)
|
if not args.download_dir:
|
||||||
synthesize = partial(
|
# Download to first data directory by default
|
||||||
voice.synthesize,
|
args.download_dir = args.data_dir[0]
|
||||||
speaker_id=args.speaker,
|
|
||||||
length_scale=args.length_scale,
|
|
||||||
noise_scale=args.noise_scale,
|
|
||||||
noise_w=args.noise_w,
|
|
||||||
)
|
|
||||||
|
|
||||||
if args.output_dir:
|
# Download voice if file doesn't exist
|
||||||
|
model_path = Path(args.model)
|
||||||
|
if not model_path.exists():
|
||||||
|
# Load voice info
|
||||||
|
voices_info = get_voices()
|
||||||
|
|
||||||
|
# Resolve aliases for backwards compatibility with old voice names
|
||||||
|
aliases_info: Dict[str, Any] = {}
|
||||||
|
for voice_info in voices_info.values():
|
||||||
|
for voice_alias in voice_info.get("aliases", []):
|
||||||
|
aliases_info[voice_alias] = {"_is_alias": True, **voice_info}
|
||||||
|
|
||||||
|
voices_info.update(aliases_info)
|
||||||
|
ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
|
||||||
|
args.model, args.config = find_voice(args.model, args.data_dir)
|
||||||
|
|
||||||
|
# Load voice
|
||||||
|
voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
|
||||||
|
synthesize_args = {
|
||||||
|
"speaker_id": args.speaker,
|
||||||
|
"length_scale": args.length_scale,
|
||||||
|
"noise_scale": args.noise_scale,
|
||||||
|
"noise_w": args.noise_w,
|
||||||
|
"sentence_silence": args.sentence_silence,
|
||||||
|
}
|
||||||
|
|
||||||
|
if args.output_raw:
|
||||||
|
# Read line-by-line
|
||||||
|
for line in sys.stdin:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Write raw audio to stdout as it's produced
|
||||||
|
audio_stream = voice.synthesize_stream_raw(line, **synthesize_args)
|
||||||
|
for audio_bytes in audio_stream:
|
||||||
|
sys.stdout.buffer.write(audio_bytes)
|
||||||
|
sys.stdout.buffer.flush()
|
||||||
|
elif args.output_dir:
|
||||||
output_dir = Path(args.output_dir)
|
output_dir = Path(args.output_dir)
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
@@ -53,21 +130,23 @@ def main() -> None:
|
|||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
wav_bytes = synthesize(line)
|
|
||||||
wav_path = output_dir / f"{time.monotonic_ns()}.wav"
|
wav_path = output_dir / f"{time.monotonic_ns()}.wav"
|
||||||
wav_path.write_bytes(wav_bytes)
|
with wave.open(str(wav_path), "wb") as wav_file:
|
||||||
|
voice.synthesize(line, wav_file, **synthesize_args)
|
||||||
|
|
||||||
_LOGGER.info("Wrote %s", wav_path)
|
_LOGGER.info("Wrote %s", wav_path)
|
||||||
else:
|
else:
|
||||||
# Read entire input
|
# Read entire input
|
||||||
text = sys.stdin.read()
|
text = sys.stdin.read()
|
||||||
wav_bytes = synthesize(text)
|
|
||||||
|
|
||||||
if (not args.output_file) or (args.output_file == "-"):
|
if (not args.output_file) or (args.output_file == "-"):
|
||||||
# Write to stdout
|
# Write to stdout
|
||||||
sys.stdout.buffer.write(wav_bytes)
|
with wave.open(sys.stdout.buffer, "wb") as wav_file:
|
||||||
|
voice.synthesize(text, wav_file, **synthesize_args)
|
||||||
else:
|
else:
|
||||||
with open(args.output_file, "wb") as output_file:
|
# Write to file
|
||||||
output_file.write(wav_bytes)
|
with wave.open(args.output_file, "wb") as wav_file:
|
||||||
|
voice.synthesize(text, wav_file, **synthesize_args)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
53
src/python_run/piper/config.py
Normal file
53
src/python_run/piper/config.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
"""Piper configuration"""
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any, Dict, Mapping, Sequence
|
||||||
|
|
||||||
|
|
||||||
|
class PhonemeType(str, Enum):
    """Kinds of phonemization a voice can be configured with.

    Members are also plain strings, so they compare equal to their values.
    """

    # Phonemes come from espeak-ng
    ESPEAK = "espeak"

    # Phonemes are derived directly from the text
    TEXT = "text"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PiperConfig:
    """Settings for a single Piper voice, as read from its config dictionary."""

    # Number of phonemes
    num_symbols: int

    # Number of speakers
    num_speakers: int

    # Sample rate of output audio
    sample_rate: int

    # Name of espeak-ng voice or alphabet
    espeak_voice: str

    # Default synthesis scales, read from the optional "inference" section
    length_scale: float
    noise_scale: float
    noise_w: float

    # Phoneme -> [id,]
    phoneme_id_map: Mapping[str, Sequence[int]]

    # espeak or text
    phoneme_type: PhonemeType

    @staticmethod
    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
        """Create a PiperConfig from an already-parsed config dictionary.

        The "inference" section and the "phoneme_type" key are optional and
        fall back to defaults; every other key read here is required, so a
        missing one raises KeyError.
        """
        inference_section = config.get("inference", {})

        # Values are looked up in a fixed order so that a config with several
        # missing keys always reports the same one first.
        field_values = {
            "num_symbols": config["num_symbols"],
            "num_speakers": config["num_speakers"],
            "sample_rate": config["audio"]["sample_rate"],
            "noise_scale": inference_section.get("noise_scale", 0.667),
            "length_scale": inference_section.get("length_scale", 1.0),
            "noise_w": inference_section.get("noise_w", 0.8),
            "espeak_voice": config["espeak"]["voice"],
            "phoneme_id_map": config["phoneme_id_map"],
            "phoneme_type": PhonemeType(
                config.get("phoneme_type", PhonemeType.ESPEAK)
            ),
        }

        return PiperConfig(**field_values)
|
||||||
5
src/python_run/piper/const.py
Normal file
5
src/python_run/piper/const.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""Constants"""
|
||||||
|
|
||||||
|
# Special symbols that are looked up in a voice's phoneme id map.
PAD = "_"  # padding (0)
BOS = "^"  # beginning of sentence
EOS = "$"  # end of sentence
|
||||||
120
src/python_run/piper/download.py
Executable file
120
src/python_run/piper/download.py
Executable file
@@ -0,0 +1,120 @@
|
|||||||
|
"""Utility for downloading Piper voices."""
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Iterable, Set, Tuple, Union
|
||||||
|
from urllib.request import urlopen
|
||||||
|
|
||||||
|
from .file_hash import get_file_hash
|
||||||
|
|
||||||
|
# Download URL template; {file} is filled in with a key of a voice's "files" mapping.
URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/{file}"

# Directory that contains this module
_DIR = Path(__file__).parent

_LOGGER = logging.getLogger(__name__)

# File names that are neither verified nor downloaded
_SKIP_FILES = {"MODEL_CARD"}
|
||||||
|
|
||||||
|
|
||||||
|
class VoiceNotFoundError(Exception):
    """Raised when a requested voice name is absent from the voice listing."""
|
||||||
|
|
||||||
|
|
||||||
|
def get_voices() -> Dict[str, Any]:
    """Return the parsed contents of the voices.json file next to this module."""
    listing_path = _DIR / "voices.json"
    listing_text = listing_path.read_text(encoding="utf-8")
    return json.loads(listing_text)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_valid_voice_file(data_file_path: Path, file_info: Dict[str, Any]) -> bool:
    """Return True when an existing file matches its listed size and MD5 digest.

    A mismatch is logged as a warning. The file must already exist.
    """
    expected_size = file_info["size_bytes"]
    actual_size = data_file_path.stat().st_size
    if expected_size != actual_size:
        _LOGGER.warning(
            "Wrong size (expected=%s, actual=%s) for %s",
            expected_size,
            actual_size,
            data_file_path,
        )
        return False

    # Hash only after the cheap size comparison has passed
    expected_hash = file_info["md5_digest"]
    actual_hash = get_file_hash(data_file_path)
    if expected_hash != actual_hash:
        _LOGGER.warning(
            "Wrong hash (expected=%s, actual=%s) for %s",
            expected_hash,
            actual_hash,
            data_file_path,
        )
        return False

    return True


def ensure_voice_exists(
    name: str,
    data_dirs: Iterable[Union[str, Path]],
    download_dir: Union[str, Path],
    voices_info: Dict[str, Any],
):
    """Download the files of a voice unless a usable copy is already present.

    Every data directory is checked for each file listed under the voice's
    "files" entry (files named in _SKIP_FILES are ignored). Files are looked
    up by base name, and verified by size and MD5 digest.

    Nothing is downloaded for a file that is merely absent from some data
    directory as long as at least one data directory holds a complete, valid
    copy of the voice. Previously a file missing from *any* data directory was
    downloaded again, so a voice stored in a later directory was re-fetched
    whenever an earlier directory (e.g. the current directory) lacked it.
    A file with a wrong size or hash in any data directory is still
    re-downloaded, as before.

    Args:
        name: Key of the voice in voices_info.
        data_dirs: Directories to search for existing voice files.
        download_dir: Directory that missing files are downloaded into.
        voices_info: Mapping of voice name to voice information.

    Raises:
        VoiceNotFoundError: name is not a key of voices_info.
        ValueError: the voice lists no files at all.
    """
    # Materialize first: the directories are scanned once per file, and a
    # one-shot iterable would otherwise always look non-empty.
    data_dirs = [Path(data_dir) for data_dir in data_dirs]
    assert data_dirs, "No data dirs"

    if name not in voices_info:
        raise VoiceNotFoundError(name)

    voice_info = voices_info[name]
    voice_files = voice_info["files"]

    if not voice_files:
        raise ValueError(f"Unable to find or download voice: {name}")

    missing_files: Set[str] = set()
    invalid_files: Set[str] = set()
    has_complete_copy = False

    for data_dir in data_dirs:
        dir_is_complete = True

        # Check sizes/hashes
        for file_path, file_info in voice_files.items():
            file_name = Path(file_path).name
            if file_name in _SKIP_FILES:
                continue

            data_file_path = data_dir / file_name
            _LOGGER.debug("Checking %s", data_file_path)
            if not data_file_path.exists():
                _LOGGER.debug("Missing %s", data_file_path)
                missing_files.add(file_path)
                dir_is_complete = False
            elif not _is_valid_voice_file(data_file_path, file_info):
                invalid_files.add(file_path)
                dir_is_complete = False

        has_complete_copy = has_complete_copy or dir_is_complete

    if has_complete_copy:
        # A usable voice exists; only replace files that are known to be bad
        files_to_download = invalid_files
    else:
        files_to_download = missing_files | invalid_files

    # Download missing files
    download_dir = Path(download_dir)

    for file_path in sorted(files_to_download):
        file_name = Path(file_path).name
        file_url = URL_FORMAT.format(file=file_path)
        download_file_path = download_dir / file_name
        download_file_path.parent.mkdir(parents=True, exist_ok=True)

        _LOGGER.debug("Downloading %s to %s", file_url, download_file_path)
        with urlopen(file_url) as response, open(
            download_file_path, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)

        _LOGGER.info("Downloaded %s (%s)", download_file_path, file_url)
|
||||||
|
|
||||||
|
|
||||||
|
def find_voice(name: str, data_dirs: Iterable[Union[str, Path]]) -> Tuple[Path, Path]:
    """Locate the model and config files of a voice.

    Returns the paths ``<name>.onnx`` and ``<name>.onnx.json`` from the first
    data directory in which both exist.

    Raises:
        ValueError: no data directory contains both files.
    """
    for candidate_dir in map(Path, data_dirs):
        model_file = candidate_dir / f"{name}.onnx"
        settings_file = candidate_dir / f"{name}.onnx.json"

        if not (model_file.exists() and settings_file.exists()):
            continue

        return model_file, settings_file

    raise ValueError(f"Missing files for voice {name}")
|
||||||
46
src/python_run/piper/file_hash.py
Normal file
46
src/python_run/piper/file_hash.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_hash(path: Union[str, Path], bytes_per_chunk: int = 8192) -> str:
    """Return the hexadecimal MD5 digest of a file, reading it chunk by chunk."""
    digest = hashlib.md5()

    with open(path, "rb") as stream:
        # An empty read marks the end of the file
        for block in iter(lambda: stream.read(bytes_per_chunk), b""):
            digest.update(block)

    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print a JSON object mapping each given file path to its hash.

    When --dir is given, the keys are the paths relative to that directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file", nargs="+")
    parser.add_argument("--dir", help="Parent directory")
    args = parser.parse_args()

    parent_dir = Path(args.dir) if args.dir else None

    hashes = {}
    for path_str in args.file:
        file_path = Path(path_str)

        # Hash before computing the key so an unreadable file is reported first
        digest = get_file_hash(file_path)

        if parent_dir is None:
            key_path = file_path
        else:
            key_path = file_path.relative_to(parent_dir)

        hashes[str(key_path)] = digest

    json.dump(hashes, sys.stdout)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
12
src/python_run/piper/util.py
Normal file
12
src/python_run/piper/util.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
"""Utilities"""
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The samples are scaled so that the loudest one reaches max_wav_value.
    The peak used for scaling is never taken to be below 0.01, which keeps
    near-silent audio from being amplified without bound.

    Args:
        audio: Array of float samples.
        max_wav_value: Magnitude the peak sample is scaled to.

    Returns:
        Array of the same shape with dtype int16. An empty input yields an
        empty int16 array.
    """
    if audio.size == 0:
        # np.max raises ValueError on an empty array, and there is nothing to scale
        return audio.astype("int16")

    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    audio_norm = audio_norm.astype("int16")
    return audio_norm
|
||||||
177
src/python_run/piper/voice.py
Normal file
177
src/python_run/piper/voice.py
Normal file
@@ -0,0 +1,177 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import wave
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import onnxruntime
|
||||||
|
from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
|
||||||
|
|
||||||
|
from .config import PhonemeType, PiperConfig
|
||||||
|
from .const import BOS, EOS, PAD
|
||||||
|
from .util import audio_float_to_int16
|
||||||
|
|
||||||
|
_LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PiperVoice:
    """A Piper voice: an ONNX inference session paired with its configuration."""

    session: onnxruntime.InferenceSession
    config: PiperConfig

    @staticmethod
    def load(
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ) -> "PiperVoice":
        """Load an ONNX model and config.

        Args:
            model_path: Path to the ONNX model file.
            config_path: Path to the JSON config file; defaults to
                ``<model_path>.json``.
            use_cuda: Use the CUDA execution provider instead of the CPU one.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)

        providers = (
            ["CUDAExecutionProvider"] if use_cuda else ["CPUExecutionProvider"]
        )

        return PiperVoice(
            config=PiperConfig.from_dict(config_dict),
            session=onnxruntime.InferenceSession(
                str(model_path),
                sess_options=onnxruntime.SessionOptions(),
                providers=providers,
            ),
        )

    def phonemize(self, text: str) -> List[List[str]]:
        """Text to phonemes grouped by sentence.

        Raises:
            ValueError: the configured phoneme type is not handled here.
        """
        if self.config.phoneme_type == PhonemeType.ESPEAK:
            if self.config.espeak_voice == "ar":
                # Arabic diacritization
                # https://github.com/mush42/libtashkeel/
                text = tashkeel_run(text)

            return phonemize_espeak(text, self.config.espeak_voice)

        if self.config.phoneme_type == PhonemeType.TEXT:
            return phonemize_codepoints(text)

        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")

    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
        """Phonemes to ids.

        The result is the ids of BOS, then of each known phoneme, then of EOS.
        BOS and every phoneme are each followed by the ids of PAD. Phonemes
        without an entry in the id map are skipped with a warning.
        """
        id_map = self.config.phoneme_id_map
        ids: List[int] = list(id_map[BOS])

        # BOS is padded like any other symbol. The implementation this class
        # replaced treated BOS as the first phoneme and therefore emitted PAD
        # after it; leaving it out changes the id sequence fed to the model.
        ids.extend(id_map[PAD])

        for phoneme in phonemes:
            if phoneme not in id_map:
                _LOGGER.warning("Missing phoneme from id map: %s", phoneme)
                continue

            ids.extend(id_map[phoneme])
            ids.extend(id_map[PAD])

        ids.extend(id_map[EOS])

        return ids

    def synthesize(
        self,
        text: str,
        wav_file: wave.Wave_write,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ):
        """Synthesize WAV audio from text.

        Sets the WAV parameters (configured sample rate, 16-bit, mono) on
        wav_file and writes the audio of each sentence to it as frames.

        Args:
            text: Text to speak.
            wav_file: WAV writer that receives the audio.
            speaker_id: Speaker to use.
            length_scale: Overrides the configured length scale when given.
            noise_scale: Overrides the configured noise scale when given.
            noise_w: Overrides the configured noise width when given.
            sentence_silence: Seconds of silence appended after each sentence.
        """
        wav_file.setframerate(self.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        for audio_bytes in self.synthesize_stream_raw(
            text,
            speaker_id=speaker_id,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            sentence_silence=sentence_silence,
        ):
            wav_file.writeframes(audio_bytes)

    def synthesize_stream_raw(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ) -> Iterable[bytes]:
        """Synthesize raw audio per sentence from text.

        Yields one chunk of 16-bit mono sample bytes per sentence, each
        followed by sentence_silence seconds of zero samples.
        """
        sentence_phonemes = self.phonemize(text)

        # 16-bit mono: two zero bytes per sample of silence
        num_silence_samples = int(sentence_silence * self.config.sample_rate)
        silence_bytes = bytes(num_silence_samples * 2)

        for phonemes in sentence_phonemes:
            phoneme_ids = self.phonemes_to_ids(phonemes)
            yield self.synthesize_ids_to_raw(
                phoneme_ids,
                speaker_id=speaker_id,
                length_scale=length_scale,
                noise_scale=noise_scale,
                noise_w=noise_w,
            ) + silence_bytes

    def synthesize_ids_to_raw(
        self,
        phoneme_ids: List[int],
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize raw audio from phoneme ids.

        Scales that are None fall back to the values in the voice config.
        Returns the synthesized audio as int16 sample bytes.
        """
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        session_inputs = {
            "input": phoneme_ids_array,
            "input_lengths": phoneme_ids_lengths,
            "scales": scales,
        }

        if self.config.num_speakers <= 1:
            # NOTE(review): assumes single-speaker models expose no "sid"
            # input, so a requested speaker id is ignored -- confirm against
            # the exported models.
            speaker_id = None
        elif speaker_id is None:
            # Default speaker
            speaker_id = 0

        # Only feed "sid" when there is a speaker id, rather than feeding None
        if speaker_id is not None:
            session_inputs["sid"] = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        audio = self.session.run(None, session_inputs)[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())

        return audio.tobytes()
|
||||||
3782
src/python_run/piper/voices.json
Normal file
3782
src/python_run/piper/voices.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,2 +1,2 @@
|
|||||||
espeak-phonemizer>=1.1.0,<2
|
piper-phonemize~=1.0.0
|
||||||
onnxruntime~=1.11.0
|
onnxruntime>=1.11.0,<2
|
||||||
|
|||||||
47
src/python_run/setup.py
Normal file
47
src/python_run/setup.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
#!/usr/bin/env python3
"""Packaging script for the piper-tts distribution."""
from pathlib import Path

import setuptools
from setuptools import setup

this_dir = Path(__file__).parent
module_dir = this_dir / "piper"

# Install requirements are read from requirements.txt when it is present.
# Blank lines and comment lines are dropped so that they are not handed to
# setuptools as (invalid) requirement strings.
requirements = []
requirements_path = this_dir / "requirements.txt"
if requirements_path.is_file():
    with open(requirements_path, "r", encoding="utf-8") as requirements_file:
        requirements = [
            line.strip()
            for line in requirements_file
            if line.strip() and not line.lstrip().startswith("#")
        ]

# Non-Python files shipped inside the piper package
data_files = [module_dir / "voices.json"]

# -----------------------------------------------------------------------------

setup(
    name="piper-tts",
    version="1.1.0",
    description="A fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4.",
    url="http://github.com/rhasspy/piper",
    author="Michael Hansen",
    author_email="mike@rhasspy.org",
    license="MIT",
    packages=setuptools.find_packages(),
    # package_data paths are given relative to the package directory
    package_data={"piper": [str(p.relative_to(module_dir)) for p in data_files]},
    entry_points={
        "console_scripts": [
            "piper = piper.__main__:main",
        ]
    },
    install_requires=requirements,
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Topic :: Text Processing :: Linguistic",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    keywords="rhasspy piper tts",
)
|
||||||
Reference in New Issue
Block a user