From d0c545141da45f33b36a6020f59a2951afa1b724 Mon Sep 17 00:00:00 2001
From: Mateo Cedillo <54605382+rmcpantoja@users.noreply.github.com>
Date: Tue, 8 Aug 2023 15:48:05 -0500
Subject: [PATCH] Updated inference notebook.

---
 notebooks/piper_inference_(ONNX).ipynb | 115 ++++++++++++++++---------
 1 file changed, 75 insertions(+), 40 deletions(-)

diff --git a/notebooks/piper_inference_(ONNX).ipynb b/notebooks/piper_inference_(ONNX).ipynb
index 5123ec9..9d1e5f9 100644
--- a/notebooks/piper_inference_(ONNX).ipynb
+++ b/notebooks/piper_inference_(ONNX).ipynb
@@ -5,7 +5,7 @@
     "colab": {
       "provenance": [],
       "gpuType": "T4",
-      "authorship_tag": "ABX9TyPFgeWX60dXmmKm+pi5Wr2v",
+      "authorship_tag": "ABX9TyMAPvo6Syxu5wDRkSmySUxq",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -14,7 +14,8 @@
     },
     "language_info": {
       "name": "python"
-    }
+    },
+    "accelerator": "GPU"
   },
   "cells": [
     {
@@ -74,7 +75,7 @@
         "#@markdown #### Do you want to use the GPU for inference?\n",
         "\n",
         "#@markdown The GPU can be enabled in the edit/notebook settings menu, and this step must be done before connecting to a runtime. The GPU can lead to a higher response speed in inference, but you can use the CPU, for example, if your colab runtime to use GPU's has been ended.\n",
-        "use_gpu = False #@param {type:\"boolean\"}\n",
+        "use_gpu = True #@param {type:\"boolean\"}\n",
         "\n",
         "if enhanced_accessibility:\n",
         "    from google.colab import output\n",
@@ -88,10 +89,10 @@
         "    playaudio(\"installing\")\n",
         "!git clone -q https://github.com/rmcpantoja/piper\n",
         "%cd /content/piper/src/python\n",
-        "!pip install -q -r requirements.txt\n",
+        "#!pip install -q -r requirements.txt\n",
+        "!pip install -q 'cython>=0.29.0' 'piper-phonemize==1.1.0' 'librosa>=0.9.2' 'numpy>=1.19.0' 'onnxruntime>=1.11.0' 'pytorch-lightning==1.7.0' 'torch==1.11.0'\n",
         "!pip install -q onnxruntime-gpu\n",
         "!bash build_monotonic_align.sh\n",
-        "!apt-get install -q espeak-ng\n",
         "import os\n",
         "if not os.path.exists(\"/content/piper/src/python/lng\"):\n",
         "  !cp -r \"/content/piper/notebooks/lng\" /content/piper/src/python/lng\n",
@@ -186,15 +187,15 @@
         "import math\n",
         "import sys\n",
         "from pathlib import Path\n",
-        "\n",
+        "from enum import Enum\n",
+        "from typing import Iterable, List, Optional, Union\n",
         "import numpy as np\n",
         "import onnxruntime\n",
         "from piper_train.vits.utils import audio_float_to_int16\n",
         "import glob\n",
         "import ipywidgets as widgets\n",
         "from IPython.display import display, Audio, Markdown, clear_output\n",
-        "from espeak_phonemizer import Phonemizer\n",
-        "from piper_train import phonemize\n",
+        "from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run\n",
         "\n",
         "_LOGGER = logging.getLogger(\"piper_train.infer_onnx\")\n",
         "\n",
@@ -381,40 +382,75 @@
         "    with open(f\"{model}.json\", \"r\") as file:\n",
         "        config = json.load(file)\n",
         "    return config\n",
+        "PAD = \"_\"  # padding (0)\n",
+        "BOS = \"^\"  # beginning of sentence\n",
+        "EOS = \"$\"  # end of sentence\n",
+        "\n",
+        "class PhonemeType(str, Enum):  # str mixin: members compare equal to their plain string values\n",
+        "    ESPEAK = \"espeak\"\n",
+        "    TEXT = \"text\"\n",
+        "\n",
+        "def phonemize(config, text: str) -> List[List[str]]:\n",
+        "    \"\"\"Text to phonemes grouped by sentence; raises ValueError for an unrecognised config phoneme_type.\"\"\"\n",
+        "    if config[\"phoneme_type\"] == PhonemeType.ESPEAK:\n",
+        "        if config[\"espeak\"][\"voice\"] == \"ar\":\n",
+        "            # Arabic diacritization\n",
+        "            # https://github.com/mush42/libtashkeel/\n",
+        "            text = tashkeel_run(text)  # the diacritized text replaces the input before phonemization\n",
+        "        return phonemize_espeak(text, config[\"espeak\"][\"voice\"])\n",
+        "    if config[\"phoneme_type\"] == PhonemeType.TEXT:\n",
+        "        return phonemize_codepoints(text)\n",
+        "    raise ValueError(f'Unexpected phoneme type: {config[\"phoneme_type\"]}')\n",
+        "\n",
+        "def phonemes_to_ids(config, phonemes: List[str]) -> List[int]:\n",
+        "    \"\"\"Phonemes to ids: BOS ids, then each known phoneme's ids followed by PAD ids, then EOS ids.\"\"\"\n",
+        "    id_map = config[\"phoneme_id_map\"]\n",
+        "    ids: List[int] = list(id_map[BOS])\n",
+        "    for phoneme in phonemes:\n",
+        "        if phoneme not in id_map:\n",
+        "            print(f\"Missing phoneme from id map: {phoneme}\")  # unknown phonemes are skipped, not fatal\n",
+        "            continue\n",
+        "        ids.extend(id_map[phoneme])\n",
+        "        ids.extend(id_map[PAD])\n",
+        "    ids.extend(id_map[EOS])\n",
+        "    return ids\n",
         "\n",
         "def inferencing(model, config, sid, line, length_scale = 1, noise_scale = 0.667, noise_scale_w = 0.8, auto_play=True):\n",
-        "    espeak_voice = config[\"espeak\"][\"voice\"]\n",
-        "    phonemizer = Phonemizer(default_voice=espeak_voice)\n",
-        "    phonemes = phonemize.phonemize(line, phonemizer)\n",
-        "    ids = phonemize.phonemes_to_ids(phonemes)\n",
-        "    phoneme_ids = ids\n",
-        "    num_speakers = config[\"num_speakers\"]\n",
-        "    if num_speakers == 1:\n",
-        "        speaker_id = None # for now\n",
-        "    else:\n",
-        "        speaker_id = sid\n",
-        "    text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)\n",
-        "    text_lengths = np.array([text.shape[1]], dtype=np.int64)\n",
-        "    scales = np.array(\n",
-        "        [noise_scale, length_scale, noise_scale_w],\n",
-        "        dtype=np.float32,\n",
-        "    )\n",
-        "    sid = None\n",
-        "    if speaker_id is not None:\n",
-        "        sid = np.array([speaker_id], dtype=np.int64)\n",
-        "    audio = model.run(\n",
-        "        None,\n",
-        "        {\n",
-        "            \"input\": text,\n",
-        "            \"input_lengths\": text_lengths,\n",
-        "            \"scales\": scales,\n",
-        "            \"sid\": sid,\n",
-        "        },\n",
-        "    )[0].squeeze((0, 1))\n",
-        "    audio = audio_float_to_int16(audio.squeeze())\n",
+        "    audios = []\n",
+        "    if config[\"phoneme_type\"] == \"PhonemeType.ESPEAK\":  # NOTE(review): presumably a config that stored the enum's str() form - confirm\n",
+        "        config[\"phoneme_type\"] = \"espeak\"\n",
+        "    sentences = phonemize(config, line)  # one phoneme list per sentence\n",
+        "    for phonemes in sentences:\n",
+        "        phoneme_ids = phonemes_to_ids(config, phonemes)\n",
+        "        num_speakers = config[\"num_speakers\"]\n",
+        "        if num_speakers == 1:\n",
+        "            speaker_id = None # for now\n",
+        "        else:\n",
+        "            speaker_id = sid\n",
+        "        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)\n",
+        "        text_lengths = np.array([text.shape[1]], dtype=np.int64)\n",
+        "        scales = np.array(\n",
+        "            [noise_scale, length_scale, noise_scale_w],\n",
+        "            dtype=np.float32,\n",
+        "        )\n",
+        "        sid_array = None  # separate name: rebinding sid here would corrupt the speaker id for every later sentence\n",
+        "        if speaker_id is not None:\n",
+        "            sid_array = np.array([speaker_id], dtype=np.int64)\n",
+        "        audio = model.run(\n",
+        "            None,\n",
+        "            {\n",
+        "                \"input\": text,\n",
+        "                \"input_lengths\": text_lengths,\n",
+        "                \"scales\": scales,\n",
+        "                \"sid\": sid_array,\n",
+        "            },\n",
+        "        )[0].squeeze((0, 1))\n",
+        "        audio = audio_float_to_int16(audio.squeeze())\n",
+        "        audios.append(audio)\n",
+        "    merged_audio = np.concatenate(audios)\n",
         "    sample_rate = config[\"audio\"][\"sample_rate\"]\n",
         "    display(Markdown(f\"{line}\"))\n",
-        "    display(Audio(audio, rate=sample_rate, autoplay=auto_play))\n",
+        "    display(Audio(merged_audio, rate=sample_rate, autoplay=auto_play))\n",
         "\n",
         "def denoise(\n",
         "    audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float\n",
@@ -512,8 +548,7 @@
         "main()"
       ],
       "metadata": {
-        "id": "hcKk8M2ug8kM",
-        "cellView": "form"
+        "id": "hcKk8M2ug8kM"
       },
       "execution_count": null,
       "outputs": []