Updated inference notebook.

Mateo Cedillo
2023-08-08 15:42:40 -05:00
parent cf892d6e14
commit 91e31de1f2


@@ -5,7 +5,7 @@
     "colab": {
       "provenance": [],
       "gpuType": "T4",
-      "authorship_tag": "ABX9TyMcevzeVyewWF1ZHKzBu3CB",
+      "authorship_tag": "ABX9TyNju0yzRK8wgAS+WgyeTEAl",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -88,12 +88,13 @@
         "    playaudio(\"installing\")\n",
         "!git clone -q https://github.com/rmcpantoja/piper\n",
         "%cd /content/piper/src/python\n",
-        "!pip install -q -r requirements.txt\n",
+        "#!pip install -q -r requirements.txt\n",
+        "!pip install -q cython>=0.29.0 piper-phonemize==1.1.0 librosa>=0.9.2 numpy>=1.19.0 onnxruntime>=1.11.0 pytorch-lightning==1.7.0 torch==1.11.0\n",
         "!pip install -q torchtext==0.12.0 torchvision==0.12.0\n",
+        "#!pip install -q torchtext==0.14.1 torchvision==0.14.1\n",
         "# fixing recent compatibility issues:\n",
         "!pip install -q torchaudio==0.11.0 torchmetrics==0.11.4\n",
         "!bash build_monotonic_align.sh\n",
+        "!apt-get install -q espeak-ng\n",
         "import os\n",
         "if not os.path.exists(\"/content/piper/src/python/lng\"):\n",
         "    !cp -r \"/content/piper/notebooks/lng\" /content/piper/src/python/lng\n",
@@ -190,6 +191,8 @@
         "import logging\n",
         "import sys\n",
         "from pathlib import Path\n",
+        "from enum import Enum\n",
+        "from typing import Iterable, List, Optional, Union\n",
         "import torch\n",
         "from piper_train.vits.lightning import VitsModel\n",
         "from piper_train.vits.utils import audio_float_to_int16\n",
@@ -198,8 +201,7 @@
         "import glob\n",
         "import ipywidgets as widgets\n",
         "from IPython.display import display, Audio, Markdown, clear_output\n",
-        "from espeak_phonemizer import Phonemizer\n",
-        "from piper_train import phonemize\n",
+        "from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run\n",
         "\n",
         "_LOGGER = logging.getLogger(\"piper_train.infer_onnx\")\n",
         "\n",
@@ -382,12 +384,44 @@
         "    config = json.load(file)\n",
         "    return config\n",
         "\n",
+        "PAD = \"_\"  # padding (0)\n",
+        "BOS = \"^\"  # beginning of sentence\n",
+        "EOS = \"$\"  # end of sentence\n",
+        "\n",
+        "class PhonemeType(str, Enum):\n",
+        "    ESPEAK = \"espeak\"\n",
+        "    TEXT = \"text\"\n",
+        "\n",
+        "def phonemize(config, text: str) -> List[List[str]]:\n",
+        "    \"\"\"Text to phonemes grouped by sentence.\"\"\"\n",
+        "    if config[\"phoneme_type\"] == PhonemeType.ESPEAK:\n",
+        "        if config[\"espeak\"][\"voice\"] == \"ar\":\n",
+        "            # Arabic diacritization\n",
+        "            # https://github.com/mush42/libtashkeel/\n",
+        "            text = tashkeel_run(text)\n",
+        "        return phonemize_espeak(text, config[\"espeak\"][\"voice\"])\n",
+        "    if config[\"phoneme_type\"] == PhonemeType.TEXT:\n",
+        "        return phonemize_codepoints(text)\n",
+        "    raise ValueError(f\"Unexpected phoneme type: {config['phoneme_type']}\")\n",
+        "\n",
+        "def phonemes_to_ids(config, phonemes: List[str]) -> List[int]:\n",
+        "    \"\"\"Phonemes to ids.\"\"\"\n",
+        "    id_map = config[\"phoneme_id_map\"]\n",
+        "    ids: List[int] = list(id_map[BOS])\n",
+        "    for phoneme in phonemes:\n",
+        "        if phoneme not in id_map:\n",
+        "            print(f\"Missing phoneme from id map: {phoneme}\")\n",
+        "            continue\n",
+        "        ids.extend(id_map[phoneme])\n",
+        "        ids.extend(id_map[PAD])\n",
+        "    ids.extend(id_map[EOS])\n",
+        "    return ids\n",
+        "\n",
         "def inferencing(model, config, sid, line, length_scale = 1, noise_scale = 0.667, noise_scale_w = 0.8, auto_play=True):\n",
-        "    espeak_voice = config[\"espeak\"][\"voice\"]\n",
-        "    phonemizer = Phonemizer(default_voice=espeak_voice)\n",
-        "    phonemes = phonemize.phonemize(line, phonemizer)\n",
-        "    ids = phonemize.phonemes_to_ids(phonemes)\n",
-        "    phoneme_ids = ids\n",
+        "    audios = []\n",
+        "    text = phonemize(config, line)\n",
+        "    for phonemes in text:\n",
+        "        phoneme_ids = phonemes_to_ids(config, phonemes)\n",
         "        num_speakers = config[\"num_speakers\"]\n",
         "        if num_speakers == 1:\n",
         "            speaker_id = None # for now\n",
@@ -408,9 +442,11 @@
         "            sid=sid\n",
         "        ).detach().numpy()\n",
         "        audio = audio_float_to_int16(audio.squeeze())\n",
+        "        audios.append(audio)\n",
+        "    merged_audio = np.concatenate(audios)\n",
         "    sample_rate = config[\"audio\"][\"sample_rate\"]\n",
         "    display(Markdown(f\"{line}\"))\n",
-        "    display(Audio(audio, rate=sample_rate, autoplay=auto_play))\n",
+        "    display(Audio(merged_audio, rate=sample_rate, autoplay=auto_play))\n",
         "\n",
         "def denoise(\n",
         "    audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float\n",