From d0c545141da45f33b36a6020f59a2951afa1b724 Mon Sep 17 00:00:00 2001 From: Mateo Cedillo <54605382+rmcpantoja@users.noreply.github.com> Date: Tue, 8 Aug 2023 15:48:05 -0500 Subject: [PATCH] Updated inference notebook. --- notebooks/piper_inference_(ONNX).ipynb | 115 ++++++++++++++++--------- 1 file changed, 75 insertions(+), 40 deletions(-) diff --git a/notebooks/piper_inference_(ONNX).ipynb b/notebooks/piper_inference_(ONNX).ipynb index 5123ec9..9d1e5f9 100644 --- a/notebooks/piper_inference_(ONNX).ipynb +++ b/notebooks/piper_inference_(ONNX).ipynb @@ -5,7 +5,7 @@ "colab": { "provenance": [], "gpuType": "T4", - "authorship_tag": "ABX9TyPFgeWX60dXmmKm+pi5Wr2v", + "authorship_tag": "ABX9TyMAPvo6Syxu5wDRkSmySUxq", "include_colab_link": true }, "kernelspec": { @@ -14,7 +14,8 @@ }, "language_info": { "name": "python" - } + }, + "accelerator": "GPU" }, "cells": [ { @@ -74,7 +75,7 @@ "#@markdown #### Do you want to use the GPU for inference?\n", "\n", "#@markdown The GPU can be enabled in the edit/notebook settings menu, and this step must be done before connecting to a runtime. The GPU can lead to a higher response speed in inference, but you can use the CPU, for example, if your colab runtime to use GPU's has been ended.\n", - "use_gpu = False #@param {type:\"boolean\"}\n", + "use_gpu = True #@param {type:\"boolean\"}\n", "\n", "if enhanced_accessibility:\n", " from google.colab import output\n", @@ -88,10 +89,10 @@ " playaudio(\"installing\")\n", "!git clone -q https://github.com/rmcpantoja/piper\n", "%cd /content/piper/src/python\n", - "!pip install -q -r requirements.txt\n", + "#!pip install -q -r requirements.txt\n", + "!pip install -q cython>=0.29.0 piper-phonemize==1.1.0 librosa>=0.9.2 numpy>=1.19.0 onnxruntime>=1.11.0 pytorch-lightning==1.7.0 torch==1.11.0\n", "!pip install -q onnxruntime-gpu\n", "!bash build_monotonic_align.sh\n", - "!apt-get install -q espeak-ng\n", "import os\n", "if not os.path.exists(\"/content/piper/src/python/lng\"):\n", " !cp -r \"/content/piper/notebooks/lng\" /content/piper/src/python/lng\n", @@ -186,15 +187,15 @@ "import math\n", "import sys\n", "from pathlib import Path\n", - "\n", + "from enum import Enum\n", + "from typing import Iterable, List, Optional, Union\n", "import numpy as np\n", "import onnxruntime\n", "from piper_train.vits.utils import audio_float_to_int16\n", "import glob\n", "import ipywidgets as widgets\n", "from IPython.display import display, Audio, Markdown, clear_output\n", - "from espeak_phonemizer import Phonemizer\n", - "from piper_train import phonemize\n", + "from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run\n", "\n", "_LOGGER = logging.getLogger(\"piper_train.infer_onnx\")\n", "\n", @@ -381,40 +382,75 @@ " with open(f\"{model}.json\", \"r\") as file:\n", " config = json.load(file)\n", " return config\n", + "PAD = \"_\" # padding (0)\n", + "BOS = \"^\" # beginning of sentence\n", + "EOS = \"$\" # end of sentence\n", + "\n", + "class PhonemeType(str, Enum):\n", + " ESPEAK = \"espeak\"\n", + " TEXT = \"text\"\n", + "\n", + "def phonemize(config, text: str) -> List[List[str]]:\n", + " \"\"\"Text to phonemes grouped by sentence.\"\"\"\n", + " if config[\"phoneme_type\"] == PhonemeType.ESPEAK:\n", + " if config[\"espeak\"][\"voice\"] == \"ar\":\n", + " # Arabic diacritization\n", + " # https://github.com/mush42/libtashkeel/\n", + " text = tashkeel_run(text)\n", + " return phonemize_espeak(text, config[\"espeak\"][\"voice\"])\n", + " if config[\"phoneme_type\"] == PhonemeType.TEXT:\n", + " return phonemize_codepoints(text)\n", + " raise ValueError(f'Unexpected phoneme type: {config[\"phoneme_type\"]}')\n", + "\n", + "def phonemes_to_ids(config, phonemes: List[str]) -> List[int]:\n", + " \"\"\"Phonemes to ids.\"\"\"\n", + " id_map = config[\"phoneme_id_map\"]\n", + " ids: List[int] = list(id_map[BOS])\n", + " for phoneme in phonemes:\n", + " if phoneme not in id_map:\n", + " print(\"Missing phoneme from id map: %s\", phoneme)\n", + " continue\n", + " ids.extend(id_map[phoneme])\n", + " ids.extend(id_map[PAD])\n", + " ids.extend(id_map[EOS])\n", + " return ids\n", "\n", "def inferencing(model, config, sid, line, length_scale = 1, noise_scale = 0.667, noise_scale_w = 0.8, auto_play=True):\n", - " espeak_voice = config[\"espeak\"][\"voice\"]\n", - " phonemizer = Phonemizer(default_voice=espeak_voice)\n", - " phonemes = phonemize.phonemize(line, phonemizer)\n", - " ids = phonemize.phonemes_to_ids(phonemes)\n", - " phoneme_ids = ids\n", - " num_speakers = config[\"num_speakers\"]\n", - " if num_speakers == 1:\n", - " speaker_id = None # for now\n", - " else:\n", - " speaker_id = sid\n", - " text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)\n", - " text_lengths = np.array([text.shape[1]], dtype=np.int64)\n", - " scales = np.array(\n", - " [noise_scale, length_scale, noise_scale_w],\n", - " dtype=np.float32,\n", - " )\n", - " sid = None\n", - " if speaker_id is not None:\n", - " sid = np.array([speaker_id], dtype=np.int64)\n", - " audio = model.run(\n", - " None,\n", - " {\n", - " \"input\": text,\n", - " \"input_lengths\": text_lengths,\n", - " \"scales\": scales,\n", - " \"sid\": sid,\n", - " },\n", - " )[0].squeeze((0, 1))\n", - " audio = audio_float_to_int16(audio.squeeze())\n", + " audios = []\n", + " if config[\"phoneme_type\"] == \"PhonemeType.ESPEAK\":\n", + " config[\"phoneme_type\"] = \"espeak\"\n", + " text = phonemize(config, line)\n", + " for phonemes in text:\n", + " phoneme_ids = phonemes_to_ids(config, phonemes)\n", + " num_speakers = config[\"num_speakers\"]\n", + " if num_speakers == 1:\n", + " speaker_id = None # for now\n", + " else:\n", + " speaker_id = sid\n", + " text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)\n", + " text_lengths = np.array([text.shape[1]], dtype=np.int64)\n", + " scales = np.array(\n", + " [noise_scale, length_scale, noise_scale_w],\n", + " dtype=np.float32,\n", + " )\n", + " sid = None\n", + " if speaker_id is not None:\n", + " sid = np.array([speaker_id], dtype=np.int64)\n", + " audio = model.run(\n", + " None,\n", + " {\n", + " \"input\": text,\n", + " \"input_lengths\": text_lengths,\n", + " \"scales\": scales,\n", + " \"sid\": sid,\n", + " },\n", + " )[0].squeeze((0, 1))\n", + " audio = audio_float_to_int16(audio.squeeze())\n", + " audios.append(audio)\n", + " merged_audio = np.concatenate(audios)\n", " sample_rate = config[\"audio\"][\"sample_rate\"]\n", " display(Markdown(f\"{line}\"))\n", - " display(Audio(audio, rate=sample_rate, autoplay=auto_play))\n", + " display(Audio(merged_audio, rate=sample_rate, autoplay=auto_play))\n", "\n", "def denoise(\n", " audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float\n", @@ -512,8 +548,7 @@ "main()" ], "metadata": { - "id": "hcKk8M2ug8kM", - "cellView": "form" + "id": "hcKk8M2ug8kM" }, "execution_count": null, "outputs": []