Update Python package

2026-05-22 21:28:01 +00:00 · 2023-07-27 17:00:44 -05:00
parent eddb39f7e4
commit a9be4c0314
14 changed files with 4368 additions and 176 deletions
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Piper is used in a [variety of projects](#people-using-piper).
 ``` sh
 echo 'Welcome to the world of speech synthesis!' | \
-  ./piper --model en-us-blizzard_lessac-medium.onnx --output_file welcome.wav
+  ./piper --model en_US-lessac-medium.onnx --output_file welcome.wav
 ```
 [Listen to voice samples](https://rhasspy.github.io/piper-samples) and check out a [video tutorial by Thorsten Müller](https://youtu.be/rjq5eZoWWSo)
@@ -54,7 +54,7 @@ The `MODEL_CARD` file for each voice contains important licensing information. P
 ## Installation
-Download a release:
+You can [run Piper with Python](#running-in-python) or download a binary release:
 * [amd64](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_amd64.tar.gz) (64-bit desktop Linux)
 * [arm64](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_arm64.tar.gz) (64-bit Raspberry Pi 4)
@@ -131,14 +131,22 @@ Pretrained checkpoints are available on [Hugging Face](https://huggingface.co/da
 See [src/python_run](src/python_run)
-Run `scripts/setup.sh` to create a virtual environment and install the requirements. Then run:
+Install with `pip`:
 ``` sh
-echo 'Welcome to the world of speech synthesis!' | scripts/piper \
+pip install piper-tts
-  --model /path/to/voice.onnx \
+```
 and then run:
 ``` sh
 echo 'Welcome to the world of speech synthesis!' | piper \
  --model en_US-lessac-medium \
  --output_file welcome.wav
 ```
 This will automatically download [voice files](https://huggingface.co/rhasspy/piper-voices/tree/v1.0.0) the first time they're used. Use `--data-dir` and `--download-dir` to adjust where voices are found/downloaded.
 If you'd like to use a GPU, install the `onnxruntime-gpu` package:
@@ -146,5 +154,5 @@ If you'd like to use a GPU, install the `onnxruntime-gpu` package:
 .venv/bin/pip3 install onnxruntime-gpu
 ```
-and then run `scripts/piper` with the `--cuda` argument. You will need to have a functioning CUDA environment, such as what's available in [NVIDIA's PyTorch containers](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch).
+and then run `piper` with the `--cuda` argument. You will need to have a functioning CUDA environment, such as what's available in [NVIDIA's PyTorch containers](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch).
--- a/src/python_run/.gitignore
+++ b/src/python_run/.gitignore
@@ -0,0 +1,3 @@
 build/
 dist/
 *.egg-info/
--- a/src/python_run/MANIFEST.in
+++ b/src/python_run/MANIFEST.in
@@ -0,0 +1,2 @@
 include requirements.txt
 include piper/voices.json
--- a/src/python_run/piper/init.py
+++ b/src/python_run/piper/init.py
@@ -1,147 +1,5 @@
-import io
+from .voice import PiperVoice
 import json
 import logging
 import wave
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Mapping, Optional, Sequence, Union
-import numpy as np
+__all__ = [
-import onnxruntime
+    "PiperVoice",
-from espeak_phonemizer import Phonemizer
+]
 _LOGGER = logging.getLogger(__name__)
 _BOS = "^"
 _EOS = "$"
 _PAD = "_"
@dataclass
 class PiperConfig:
    """Voice settings parsed from a model's JSON config file by load_config()."""
    # Read from the top-level "num_symbols" key of the config.
    num_symbols: int
    # Read from "num_speakers"; when > 1, synthesize() falls back to speaker 0
    # if no speaker id is given.
    num_speakers: int
    # Read from config["audio"]["sample_rate"]; used as the WAV frame rate.
    sample_rate: int
    # Read from config["espeak"]["voice"]; passed to the Phonemizer.
    espeak_voice: str
    # The three values below are the defaults synthesize() uses when the
    # matching argument is None. They come from the optional "inference"
    # section (fallbacks: length_scale=1.0, noise_scale=0.667, noise_w=0.8).
    length_scale: float
    noise_scale: float
    noise_w: float
    # Maps each phoneme to the list of ids appended for it.
    phoneme_id_map: Mapping[str, Sequence[int]]
 class Piper:
    """Text-to-speech voice backed by an ONNX model and an espeak phonemizer."""

    def __init__(
        self,
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ):
        """Load the voice config, phonemizer and ONNX session.

        When ``config_path`` is omitted the config is read from
        ``<model_path>.json``.
        """
        resolved_config = f"{model_path}.json" if config_path is None else config_path
        self.config = load_config(resolved_config)
        self.phonemizer = Phonemizer(self.config.espeak_voice)
        # Exactly one execution provider is requested: CUDA or CPU.
        provider = "CUDAExecutionProvider" if use_cuda else "CPUExecutionProvider"
        self.model = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=onnxruntime.SessionOptions(),
            providers=[provider],
        )

    def synthesize(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Return the bytes of a 16-bit mono WAV file spoken from ``text``.

        Any scale left as ``None`` falls back to the value in the voice config.
        """
        settings = self.config
        length_scale = settings.length_scale if length_scale is None else length_scale
        noise_scale = settings.noise_scale if noise_scale is None else noise_scale
        noise_w = settings.noise_w if noise_w is None else noise_w

        # Beginning-of-sentence marker first, then one symbol per character
        # of the phonemizer output.
        id_map = settings.phoneme_id_map
        phoneme_ids: List[int] = []
        for symbol in [_BOS, *self.phonemizer.phonemize(text)]:
            if symbol not in id_map:
                _LOGGER.warning("No id for phoneme: %s", symbol)
                continue
            # Every known symbol is followed by the pad id.
            phoneme_ids.extend(id_map[symbol])
            phoneme_ids.extend(id_map[_PAD])
        phoneme_ids.extend(id_map[_EOS])

        # A leading axis of size 1 is added in front of the id sequence.
        ids_batch = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        ids_lengths = np.array([ids_batch.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        if (speaker_id is None) and (settings.num_speakers > 1):
            # Default speaker
            speaker_id = 0
        sid = None if speaker_id is None else np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        outputs = self.model.run(
            None,
            {
                "input": ids_batch,
                "input_lengths": ids_lengths,
                "scales": scales,
                "sid": sid,
            },
        )
        samples = audio_float_to_int16(outputs[0].squeeze((0, 1)).squeeze())

        # Convert to WAV
        with io.BytesIO() as wav_io:
            with wave.open(wav_io, "wb") as wav_file:
                wav_file.setframerate(settings.sample_rate)
                wav_file.setsampwidth(2)
                wav_file.setnchannels(1)
                wav_file.writeframes(samples.tobytes())
            return wav_io.getvalue()
 def load_config(config_path: Union[str, Path]) -> PiperConfig:
    """Read a voice's JSON config file and build a PiperConfig from it.

    The "inference" section is optional; missing scales fall back to
    noise_scale=0.667, length_scale=1.0 and noise_w=0.8.
    """
    with open(config_path, "r", encoding="utf-8") as config_file:
        raw = json.load(config_file)
    inference_section = raw.get("inference", {})
    # Keys are looked up in the same order as the dataclass is populated.
    settings = {
        "num_symbols": raw["num_symbols"],
        "num_speakers": raw["num_speakers"],
        "sample_rate": raw["audio"]["sample_rate"],
        "espeak_voice": raw["espeak"]["voice"],
        "noise_scale": inference_section.get("noise_scale", 0.667),
        "length_scale": inference_section.get("length_scale", 1.0),
        "noise_w": inference_section.get("noise_w", 0.8),
        "phoneme_id_map": raw["phoneme_id_map"],
    }
    return PiperConfig(**settings)
 def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
 ) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The signal is scaled so that its largest absolute sample maps to
    ``max_wav_value``. The peak is floored at 0.01 so that near-silent audio
    is not amplified without bound. An empty input yields an empty int16
    array.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # np.max() raises ValueError on a zero-size array; nothing to scale.
        return audio.astype("int16")
    peak = max(0.01, np.max(np.abs(audio)))
    audio_norm = audio * (max_wav_value / peak)
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    return audio_norm.astype("int16")
--- a/src/python_run/piper/main.py
+++ b/src/python_run/piper/main.py
@@ -2,10 +2,12 @@ import argparse
 import logging
 import sys
 import time
-from functools import partial
+import wave
 from pathlib import Path
 from typing import Any, Dict
-from . import Piper
+from . import PiperVoice
 from .download import ensure_voice_exists, find_voice, get_voices
 _FILE = Path(__file__)
 _DIR = _FILE.parent
@@ -17,33 +19,108 @@ def main() -> None:
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    parser.add_argument(
-        "-f", "--output_file", help="Path to output WAV file (default: stdout)"
+        "-f",
        "--output-file",
        "--output_file",
        help="Path to output WAV file (default: stdout)",
    )
    parser.add_argument(
-        "-d", "--output_dir", help="Path to output directory (default: cwd)"
+        "-d",
        "--output-dir",
        "--output_dir",
        help="Path to output directory (default: cwd)",
    )
    parser.add_argument(
        "--output-raw",
        "--output_raw",
        action="store_true",
        help="Stream raw audio to stdout",
    )
    #
    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
-    parser.add_argument("--noise-scale", type=float, help="Generator noise")
+    parser.add_argument(
-    parser.add_argument("--length-scale", type=float, help="Phoneme length")
+        "--length-scale", "--length_scale", type=float, help="Phoneme length"
-    parser.add_argument("--noise-w", type=float, help="Phoneme width noise")
+    )
    parser.add_argument(
        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
    )
    parser.add_argument(
        "--noise-w", "--noise_w", type=float, help="Phoneme width noise"
    )
    #
    parser.add_argument("--cuda", action="store_true", help="Use GPU")
    #
    parser.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )
    #
    parser.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for downloaded models (default: current directory)",
    )
    parser.add_argument(
        "--download-dir",
        "--download_dir",
        help="Directory to download voices into (default: first data dir)",
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)
-    voice = Piper(args.model, config_path=args.config, use_cuda=args.cuda)
+    if not args.download_dir:
-    synthesize = partial(
+        # Download to first data directory by default
-        voice.synthesize,
+        args.download_dir = args.data_dir[0]
        speaker_id=args.speaker,
        length_scale=args.length_scale,
        noise_scale=args.noise_scale,
        noise_w=args.noise_w,
    )
-    if args.output_dir:
+    # Download voice if file doesn't exist
    model_path = Path(args.model)
    if not model_path.exists():
        # Load voice info
        voices_info = get_voices()
        # Resolve aliases for backwards compatibility with old voice names
        aliases_info: Dict[str, Any] = {}
        for voice_info in voices_info.values():
            for voice_alias in voice_info.get("aliases", []):
                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}
        voices_info.update(aliases_info)
        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
        args.model, args.config = find_voice(args.model, args.data_dir)
    # Load voice
    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
    synthesize_args = {
        "speaker_id": args.speaker,
        "length_scale": args.length_scale,
        "noise_scale": args.noise_scale,
        "noise_w": args.noise_w,
        "sentence_silence": args.sentence_silence,
    }
    if args.output_raw:
        # Read line-by-line
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            # Write raw audio to stdout as it's produced
            audio_stream = voice.synthesize_stream_raw(line, **synthesize_args)
            for audio_bytes in audio_stream:
                sys.stdout.buffer.write(audio_bytes)
                sys.stdout.buffer.flush()
    elif args.output_dir:
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
@@ -53,21 +130,23 @@ def main() -> None:
            if not line:
                continue
            wav_bytes = synthesize(line)
            wav_path = output_dir / f"{time.monotonic_ns()}.wav"
-            wav_path.write_bytes(wav_bytes)
+            with wave.open(str(wav_path), "wb") as wav_file:
                voice.synthesize(line, wav_file, **synthesize_args)
            _LOGGER.info("Wrote %s", wav_path)
    else:
        # Read entire input
        text = sys.stdin.read()
        wav_bytes = synthesize(text)
        if (not args.output_file) or (args.output_file == "-"):
            # Write to stdout
-            sys.stdout.buffer.write(wav_bytes)
+            with wave.open(sys.stdout.buffer, "wb") as wav_file:
                voice.synthesize(text, wav_file, **synthesize_args)
        else:
-            with open(args.output_file, "wb") as output_file:
+            # Write to file
-                output_file.write(wav_bytes)
+            with wave.open(args.output_file, "wb") as wav_file:
                voice.synthesize(text, wav_file, **synthesize_args)
 if __name__ == "__main__":
--- a/src/python_run/piper/config.py
+++ b/src/python_run/piper/config.py
@@ -0,0 +1,53 @@
 """Piper configuration"""
 from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Dict, Mapping, Sequence
 class PhonemeType(str, Enum):
    """Kind of phonemes a voice uses; PiperVoice.phonemize() branches on it."""
    # Phonemes come from phonemize_espeak(); also the default that
    # PiperConfig.from_dict() uses when the config has no "phoneme_type".
    ESPEAK = "espeak"
    # Phonemes come from phonemize_codepoints().
    TEXT = "text"
@dataclass
class PiperConfig:
    """Piper configuration"""

    # Number of phonemes
    num_symbols: int
    # Number of speakers
    num_speakers: int
    # Sample rate of output audio
    sample_rate: int
    # Name of espeak-ng voice or alphabet
    espeak_voice: str
    # Inference defaults, taken from the optional "inference" section
    length_scale: float
    noise_scale: float
    noise_w: float
    # Phoneme -> [id,]
    phoneme_id_map: Mapping[str, Sequence[int]]
    # espeak or text
    phoneme_type: PhonemeType

    @staticmethod
    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
        """Build a PiperConfig from a parsed voice config dictionary.

        Missing inference scales fall back to noise_scale=0.667,
        length_scale=1.0 and noise_w=0.8; a missing "phoneme_type" means
        espeak phonemes.
        """
        inference_section = config.get("inference", {})
        # The literal is evaluated top to bottom, so required keys are looked
        # up in a fixed order.
        values: Dict[str, Any] = {
            "num_symbols": config["num_symbols"],
            "num_speakers": config["num_speakers"],
            "sample_rate": config["audio"]["sample_rate"],
            "noise_scale": inference_section.get("noise_scale", 0.667),
            "length_scale": inference_section.get("length_scale", 1.0),
            "noise_w": inference_section.get("noise_w", 0.8),
            "espeak_voice": config["espeak"]["voice"],
            "phoneme_id_map": config["phoneme_id_map"],
            "phoneme_type": PhonemeType(
                config.get("phoneme_type", PhonemeType.ESPEAK)
            ),
        }
        return PiperConfig(**values)
--- a/src/python_run/piper/const.py
+++ b/src/python_run/piper/const.py
@@ -0,0 +1,5 @@
 """Constants"""
 PAD = "_"  # padding (0)
 BOS = "^"  # beginning of sentence
 EOS = "$"  # end of sentence
--- a/src/python_run/piper/download.py
+++ b/src/python_run/piper/download.py
@@ -0,0 +1,120 @@
 """Utility for downloading Piper voices."""
 import json
 import logging
 import shutil
 from pathlib import Path
 from typing import Any, Dict, Iterable, Set, Tuple, Union
 from urllib.request import urlopen
 from .file_hash import get_file_hash
 URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/{file}"
 _DIR = Path(__file__).parent
 _LOGGER = logging.getLogger(__name__)
 _SKIP_FILES = {"MODEL_CARD"}
 class VoiceNotFoundError(Exception):
    """Raised when a requested voice name is not present in the voices table."""
 def get_voices() -> Dict[str, Any]:
    """Return the voices table parsed from the packaged ``voices.json`` file."""
    voices_path = _DIR / "voices.json"
    return json.loads(voices_path.read_text(encoding="utf-8"))
 def _voice_file_is_valid(data_file_path: Path, file_info: Dict[str, Any]) -> bool:
    """Return True if the file exists and matches the expected size and MD5."""
    _LOGGER.debug("Checking %s", data_file_path)
    if not data_file_path.exists():
        _LOGGER.debug("Missing %s", data_file_path)
        return False
    expected_size = file_info["size_bytes"]
    actual_size = data_file_path.stat().st_size
    if expected_size != actual_size:
        _LOGGER.warning(
            "Wrong size (expected=%s, actual=%s) for %s",
            expected_size,
            actual_size,
            data_file_path,
        )
        return False
    # Hash only after the cheap size check has passed.
    expected_hash = file_info["md5_digest"]
    actual_hash = get_file_hash(data_file_path)
    if expected_hash != actual_hash:
        _LOGGER.warning(
            "Wrong hash (expected=%s, actual=%s) for %s",
            expected_hash,
            actual_hash,
            data_file_path,
        )
        return False
    return True


 def ensure_voice_exists(
    name: str,
    data_dirs: Iterable[Union[str, Path]],
    download_dir: Union[str, Path],
    voices_info: Dict[str, Any],
 ):
    """Make sure the files of a voice are available, downloading if needed.

    Each data directory is checked for the voice's files (by base name, size
    and MD5 digest). If one directory holds a valid copy of every file,
    nothing is downloaded. Otherwise every file that was missing or invalid
    in some checked directory is downloaded into ``download_dir``.

    Args:
        name: Voice name; must be a key of ``voices_info``.
        data_dirs: Directories to search for existing voice files.
        download_dir: Directory that receives downloaded files.
        voices_info: Voices table; each entry has a "files" mapping of
            relative file path to {"size_bytes", "md5_digest"}.

    Raises:
        VoiceNotFoundError: ``name`` is not in ``voices_info``.
        ValueError: the voice lists no files at all.
    """
    assert data_dirs, "No data dirs"
    if name not in voices_info:
        raise VoiceNotFoundError(name)
    voice_files = voices_info[name]["files"]
    if not voice_files:
        raise ValueError(f"Unable to find or download voice: {name}")

    # Files in _SKIP_FILES are never checked or downloaded.
    wanted = {
        file_path: file_info
        for file_path, file_info in voice_files.items()
        if Path(file_path).name not in _SKIP_FILES
    }

    files_to_download: Set[str] = set()
    for data_dir in data_dirs:
        data_dir = Path(data_dir)
        invalid_here = {
            file_path
            for file_path, file_info in wanted.items()
            if not _voice_file_is_valid(data_dir / Path(file_path).name, file_info)
        }
        if not invalid_here:
            # This directory has a complete, valid copy. Files missing from
            # other directories must not trigger a download.
            files_to_download.clear()
            break
        files_to_download |= invalid_here

    # Download missing files
    download_dir = Path(download_dir)
    for file_path in sorted(files_to_download):
        file_url = URL_FORMAT.format(file=file_path)
        download_file_path = download_dir / Path(file_path).name
        download_file_path.parent.mkdir(parents=True, exist_ok=True)
        _LOGGER.debug("Downloading %s to %s", file_url, download_file_path)
        with urlopen(file_url) as response, open(
            download_file_path, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)
        _LOGGER.info("Downloaded %s (%s)", download_file_path, file_url)
 def find_voice(name: str, data_dirs: Iterable[Union[str, Path]]) -> Tuple[Path, Path]:
    """Locate a voice's model and config files.

    Returns the ``(<name>.onnx, <name>.onnx.json)`` paths from the first data
    directory that contains both files. Raises ValueError when no directory
    has the pair.
    """
    model_name = f"{name}.onnx"
    config_name = f"{name}.onnx.json"
    for base_dir in map(Path, data_dirs):
        model_file = base_dir / model_name
        config_file = base_dir / config_name
        if not (model_file.exists() and config_file.exists()):
            continue
        return model_file, config_file
    raise ValueError(f"Missing files for voice {name}")
--- a/src/python_run/piper/file_hash.py
+++ b/src/python_run/piper/file_hash.py
@@ -0,0 +1,46 @@
 import argparse
 import hashlib
 import json
 import sys
 from pathlib import Path
 from typing import Union
 def get_file_hash(path: Union[str, Path], bytes_per_chunk: int = 8192) -> str:
    """Return the hexadecimal MD5 digest of a file, reading it chunk by chunk."""
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns b"".
        for block in iter(lambda: stream.read(bytes_per_chunk), b""):
            digest.update(block)
    return digest.hexdigest()
 # -----------------------------------------------------------------------------
 def main():
    """Print a JSON object mapping each given file to its MD5 digest.

    With ``--dir``, the keys are the file paths relative to that directory.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("file", nargs="+")
    arg_parser.add_argument("--dir", help="Parent directory")
    cli = arg_parser.parse_args()
    parent_dir = Path(cli.dir) if cli.dir else None
    digests = {}
    for file_arg in cli.file:
        file_path = Path(file_arg)
        digest = get_file_hash(file_path)
        # Hash first, then shorten the key if a parent directory was given.
        key_path = file_path if parent_dir is None else file_path.relative_to(parent_dir)
        digests[str(key_path)] = digest
    json.dump(digests, sys.stdout)
 if __name__ == "__main__":
    main()
--- a/src/python_run/piper/util.py
+++ b/src/python_run/piper/util.py
@@ -0,0 +1,12 @@
 """Utilities"""
 import numpy as np
 def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
 ) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The signal is scaled so that its largest absolute sample maps to
    ``max_wav_value``. The peak is floored at 0.01 so that near-silent audio
    is not amplified without bound. An empty input yields an empty int16
    array.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # np.max() raises ValueError on a zero-size array; nothing to scale.
        return audio.astype("int16")
    peak = max(0.01, np.max(np.abs(audio)))
    audio_norm = audio * (max_wav_value / peak)
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    return audio_norm.astype("int16")
--- a/src/python_run/piper/voice.py
+++ b/src/python_run/piper/voice.py
@@ -0,0 +1,177 @@
 import json
 import logging
 import wave
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Iterable, List, Optional, Union
 import numpy as np
 import onnxruntime
 from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
 from .config import PhonemeType, PiperConfig
 from .const import BOS, EOS, PAD
 from .util import audio_float_to_int16
 _LOGGER = logging.getLogger(__name__)
@dataclass
class PiperVoice:
    """A Piper voice: an ONNX inference session paired with its configuration."""

    session: onnxruntime.InferenceSession
    config: PiperConfig

    @staticmethod
    def load(
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ) -> "PiperVoice":
        """Load an ONNX model and config.

        When ``config_path`` is omitted the config is read from
        ``<model_path>.json``. ``use_cuda`` selects the CUDA execution
        provider instead of the CPU one.
        """
        if config_path is None:
            config_path = f"{model_path}.json"
        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)
        return PiperVoice(
            config=PiperConfig.from_dict(config_dict),
            session=onnxruntime.InferenceSession(
                str(model_path),
                sess_options=onnxruntime.SessionOptions(),
                providers=["CPUExecutionProvider"]
                if not use_cuda
                else ["CUDAExecutionProvider"],
            ),
        )

    def phonemize(self, text: str) -> List[List[str]]:
        """Text to phonemes grouped by sentence.

        Raises ValueError if the configured phoneme type is not handled.
        """
        if self.config.phoneme_type == PhonemeType.ESPEAK:
            if self.config.espeak_voice == "ar":
                # Arabic diacritization
                # https://github.com/mush42/libtashkeel/
                text = tashkeel_run(text)
            return phonemize_espeak(text, self.config.espeak_voice)
        if self.config.phoneme_type == PhonemeType.TEXT:
            return phonemize_codepoints(text)
        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")

    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
        """Phonemes to ids.

        The result is BOS, pad, then each known phoneme followed by pad, then
        EOS. Phonemes missing from the id map are logged and skipped.
        """
        id_map = self.config.phoneme_id_map
        ids: List[int] = list(id_map[BOS])
        # Pad follows BOS as well, so every symbol before EOS is followed by
        # a pad id.
        # NOTE(review): assumes voices were trained with this layout; confirm
        # against the phonemization used at training time.
        ids.extend(id_map[PAD])
        for phoneme in phonemes:
            if phoneme not in id_map:
                _LOGGER.warning("Missing phoneme from id map: %s", phoneme)
                continue
            ids.extend(id_map[phoneme])
            ids.extend(id_map[PAD])
        ids.extend(id_map[EOS])
        return ids

    def synthesize(
        self,
        text: str,
        wav_file: wave.Wave_write,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ):
        """Synthesize WAV audio from text.

        Sets the WAV parameters on ``wav_file`` (configured sample rate,
        16-bit, mono) and writes the audio of each sentence as frames.
        """
        wav_file.setframerate(self.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono
        for audio_bytes in self.synthesize_stream_raw(
            text,
            speaker_id=speaker_id,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            sentence_silence=sentence_silence,
        ):
            wav_file.writeframes(audio_bytes)

    def synthesize_stream_raw(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ) -> Iterable[bytes]:
        """Synthesize raw audio per sentence from text.

        Yields one chunk of 16-bit mono samples per sentence, each followed
        by ``sentence_silence`` seconds of zero samples.
        """
        sentence_phonemes = self.phonemize(text)
        # 16-bit mono: two bytes per sample. A negative duration is treated
        # as no silence, because bytes() rejects a negative length.
        num_silence_samples = max(0, int(sentence_silence * self.config.sample_rate))
        silence_bytes = bytes(num_silence_samples * 2)
        for phonemes in sentence_phonemes:
            phoneme_ids = self.phonemes_to_ids(phonemes)
            yield self.synthesize_ids_to_raw(
                phoneme_ids,
                speaker_id=speaker_id,
                length_scale=length_scale,
                noise_scale=noise_scale,
                noise_w=noise_w,
            ) + silence_bytes

    def synthesize_ids_to_raw(
        self,
        phoneme_ids: List[int],
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize raw audio from phoneme ids.

        Any scale left as ``None`` falls back to the voice config. Returns
        the int16 samples as bytes.
        """
        if length_scale is None:
            length_scale = self.config.length_scale
        if noise_scale is None:
            noise_scale = self.config.noise_scale
        if noise_w is None:
            noise_w = self.config.noise_w

        # A leading axis of size 1 is added in front of the id sequence.
        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )
        inputs = {
            "input": phoneme_ids_array,
            "input_lengths": phoneme_ids_lengths,
            "scales": scales,
        }

        if (self.config.num_speakers > 1) and (speaker_id is None):
            # Default speaker
            speaker_id = 0
        if speaker_id is not None:
            # Feed "sid" only when there is a speaker id, rather than
            # passing None as an input value.
            inputs["sid"] = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        audio = self.session.run(None, inputs)[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())
        return audio.tobytes()
--- a/src/python_run/piper/voices.json
+++ b/src/python_run/piper/voices.json
--- a/src/python_run/requirements.txt
+++ b/src/python_run/requirements.txt
@@ -1,2 +1,2 @@
-espeak-phonemizer>=1.1.0,<2
+piper-phonemize~=1.0.0
-onnxruntime~=1.11.0
+onnxruntime>=1.11.0,<2
--- a/src/python_run/setup.py
+++ b/src/python_run/setup.py
@@ -0,0 +1,47 @@
 #!/usr/bin/env python3
 from pathlib import Path
 import setuptools
 from setuptools import setup
 # Paths are resolved relative to this setup script.
 this_dir = Path(__file__).parent
 module_dir = this_dir / "piper"
 # Install requirements are read from requirements.txt when it is present.
 requirements_path = this_dir / "requirements.txt"
 requirements = (
    requirements_path.read_text(encoding="utf-8").splitlines()
    if requirements_path.is_file()
    else []
 )
 # Non-Python files shipped inside the package.
 data_files = [module_dir / "voices.json"]
 # -----------------------------------------------------------------------------
 setup(
    name="piper-tts",
    version="1.1.0",
    description="A fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4.",
    url="http://github.com/rhasspy/piper",
    author="Michael Hansen",
    author_email="mike@rhasspy.org",
    license="MIT",
    packages=setuptools.find_packages(),
    # package_data entries are given relative to the package directory.
    package_data={"piper": [str(p.relative_to(module_dir)) for p in data_files]},
    entry_points={
        "console_scripts": [
            "piper = piper.__main__:main",
        ]
    },
    install_requires=requirements,
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Topic :: Text Processing :: Linguistic",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    keywords="rhasspy piper tts",
 )
		`@@ -0,0 +1,2 @@`
							`include requirements.txt`
							`include piper/voices.json`
`@@ -1,2 +1,2 @@`
	`espeak-phonemizer>=1.1.0,<2`	`piper-phonemize~=1.0.0`
	`onnxruntime~=1.11.0`	`onnxruntime>=1.11.0,<2`