Updated training notebook.

This commit is contained in:
Mateo Cedillo
2023-07-03 18:17:51 -05:00
parent e274b144c2
commit ae5a42a474


@@ -1,22 +1,4 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"cells": [
{
"cell_type": "markdown",
@@ -30,6 +12,9 @@
},
{
"cell_type": "markdown",
"metadata": {
"id": "eK3nmYDB6C1a"
},
"source": [
"# <font color=\"pink\"> **[Piper](https://github.com/rhasspy/piper) training notebook.**\n",
"## ![Piper logo](https://contribute.rhasspy.org/img/logo.png)\n",
@@ -38,22 +23,25 @@
"\n",
"- Notebook made by [rmcpantoja](http://github.com/rmcpantoja)\n",
"- Collaborator: [Xx_Nessu_xX](https://fakeyou.com/profile/Xx_Nessu_xX)"
],
"metadata": {
"id": "eK3nmYDB6C1a"
}
]
},
{
"cell_type": "markdown",
"source": [
"# <font color=\"pink\">🔧 ***First steps.*** 🔧"
],
"metadata": {
"id": "AICh6p5OJybj"
}
},
"source": [
"# <font color=\"pink\">🔧 ***First steps.*** 🔧"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "qyxSMuzjfQrz"
},
"outputs": [],
"source": [
"#@markdown ## <font color=\"pink\"> **Google Colab Anti-Disconnect.** 🔌\n",
"#@markdown ---\n",
@@ -68,28 +56,22 @@
"setInterval(ClickConnect,60000)\n",
"'''\n",
"display(IPython.display.Javascript(js_code))"
],
"metadata": {
"cellView": "form",
"id": "qyxSMuzjfQrz"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "ygxzp-xHTC7T"
},
"outputs": [],
"source": [
"#@markdown ## <font color=\"pink\"> **Check GPU type.** 👁️\n",
"#@markdown ---\n",
"#@markdown #### A more capable GPU can lead to faster training. By default, you will have a <font color=\"orange\">**Tesla T4**</font>.\n",
"!nvidia-smi"
],
"metadata": {
"cellView": "form",
"id": "ygxzp-xHTC7T"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
@@ -107,6 +89,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "_XwmTVlcUgCh"
},
"outputs": [],
"source": [
"#@markdown # <font color=\"pink\"> **Install software.** 📦\n",
"\n",
@@ -125,25 +113,25 @@
"!gdown -q \"1EWEb7amo1rgFGpBFfRD4BKX3pkjVK1I-\" -O \"/content/piper/src/python/patch.zip\"\n",
"!unzip -o -q \"patch.zip\"\n",
"%cd /content"
],
"metadata": {
"cellView": "form",
"id": "_XwmTVlcUgCh"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "markdown",
"source": [
"# <font color=\"pink\"> 🤖 ***Training.*** 🤖"
],
"metadata": {
"id": "A3bMzEE0V5Ma"
}
},
"source": [
"# <font color=\"pink\"> 🤖 ***Training.*** 🤖"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "SvEGjf0aV8eg"
},
"outputs": [],
"source": [
"#@markdown # <font color=\"pink\"> **1. Extract dataset.** 📥\n",
"#@markdown ####Important: the audio files must be in <font color=\"orange\">**WAV format (16000 or 22050 Hz, 16-bit, mono) and, for convenience, numbered. Example:**\n",
@@ -160,19 +148,19 @@
"%cd /content/dataset\n",
"!mkdir /content/dataset/wavs\n",
"#@markdown ### Audio dataset path to unzip\n",
"zip_path = \"/content/drive/MyDrive/wavs.zip\" #@param {type:\"string\"}\n",
"zip_path = \"/content/drive/MyDrive/Wavs.zip\" #@param {type:\"string\"}\n",
"!unzip \"{zip_path}\" -d /content/dataset/wavs\n",
"#@markdown ---"
],
"metadata": {
"cellView": "form",
"id": "SvEGjf0aV8eg"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "E0W0OCvXXvue"
},
"outputs": [],
"source": [
"#@markdown # <font color=\"pink\"> **2. Upload the transcript file.** 📝\n",
"#@markdown ---\n",
@@ -201,22 +189,22 @@
"if listfn != \"metadata.csv\":\n",
" !mv \"$listfn\" metadata.csv\n",
"%cd .."
],
"metadata": {
"cellView": "form",
"id": "E0W0OCvXXvue"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "dOyx9Y6JYvRF"
},
"outputs": [],
"source": [
"#@markdown # <font color=\"pink\"> **3. Preprocess dataset.** 🔄\n",
"\n",
"import os\n",
"#@markdown ### First of all, select the language of your dataset.\n",
"language = \"English (U.S.)\" #@param [\"Català\", \"Dansk\", \"Deutsch\", \"Ελληνικά\", \"English (British)\", \"English (U.S.)\", \"Español\", \"Suomi\", \"Français\", \"ქართული\", \"Icelandic\", \"Italiano\", \"қазақша\", \"नेपाली\", \"Nederlands\", \"Norsk\", \"Polski\", \"Português (Brasil)\", \"Русский\", \"Svenska\", \"украї́нська\", \"Tiếng Việt\", \"简体中文\"]\n",
"language = \"English (U.S.)\" #@param [\"Català\", \"Dansk\", \"Deutsch\", \"Ελληνικά\", \"English (British)\", \"English (U.S.)\", \"Español\", \"Español (latinoamericano)\", \"Suomi\", \"Français\", \"ქართული\", \"Icelandic\", \"Italiano\", \"қазақша\", \"नेपाली\", \"Nederlands\", \"Norsk\", \"Polski\", \"Português (Brasil)\", \"Русский\", \"Svenska\", \"украї́нська\", \"Tiếng Việt\", \"简体中文\"]\n",
"#@markdown ---\n",
"# language definition:\n",
"languages = {\n",
@@ -227,6 +215,7 @@
" \"English (British)\": \"en\",\n",
" \"English (U.S.)\": \"en-us\",\n",
" \"Español\": \"es\",\n",
" \"Español (latinoamericano)\": \"es-419\",\n",
" \"Suomi\": \"fi\",\n",
" \"Français\": \"fr\",\n",
" \"Icelandic\": \"is\",\n",
@@ -282,16 +271,16 @@
" --dataset-format {dataset_format} \\\n",
" --sample-rate {sample_rate} \\\n",
" {force_sp}"
],
"metadata": {
"id": "dOyx9Y6JYvRF",
"cellView": "form"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "ickQlOCRjkBL"
},
"outputs": [],
"source": [
"#@markdown # <font color=\"pink\"> **4. Settings.** 🧰\n",
"import json\n",
@@ -301,16 +290,29 @@
"import os\n",
"#@markdown ### Select the action to train this dataset:\n",
"\n",
"#@markdown * The option to continue a training is self-explanatory. If you've previously trained a model on free Colab, your session time ran out, and you want to train it further, this is the option for you. Just use the same settings you used when you first trained this model.\n",
"#@markdown * The option to convert a single-speaker model to a multi-speaker model requires that you have preprocessed a dataset containing text and audio for every speaker you want in your model.\n",
"#@markdown * The finetune option continues training from a pretrained model on your dataset. This option is ideal if you want to train a very small dataset (more than five minutes recommended).\n",
"#@markdown * The train-from-scratch option learns everything, including the dictionary and speech characteristics, from scratch, so it may take longer to converge. At least 8 hours of audio covering a large collection of phonemes is recommended.\n",
"\n",
"action = \"finetune\" #@param [\"convert single-speaker to multi-speaker model\", \"finetune\", \"train from scratch\"]\n",
"action = \"finetune\" #@param [\"Continue training\", \"convert single-speaker to multi-speaker model\", \"finetune\", \"train from scratch\"]\n",
"#@markdown ---\n",
"if action == \"finetune\":\n",
" ft_command = '--resume_from_checkpoint \"/content/pretrained.ckpt\" '\n",
"if action == \"Continue training\":\n",
" if os.path.exists(f\"{output_dir}/lightning_logs/version_0/checkpoints/last.ckpt\"):\n",
" ft_command = f'--resume_from_checkpoint \"{output_dir}/lightning_logs/version_0/checkpoints/last.ckpt\" '\n",
" print(f\"Continuing {model_name}'s training at: {output_dir}/lightning_logs/version_0/checkpoints/last.ckpt\")\n",
" else:\n",
" raise Exception(\"Training cannot be continued because there is no checkpoint to resume from.\")\n",
"elif action == \"finetune\":\n",
" if os.path.exists(f\"{output_dir}/lightning_logs/version_0/checkpoints/last.ckpt\"):\n",
" raise Exception(\"You have already trained this model before; choosing finetune would overwrite that progress. Please select the option to continue training instead.\")\n",
" else:\n",
" ft_command = '--resume_from_checkpoint \"/content/pretrained.ckpt\" '\n",
"elif action == \"convert single-speaker to multi-speaker model\":\n",
" ft_command = '--resume_from_single_speaker_checkpoint \"/content/pretrained.ckpt\" '\n",
" if not single_speaker:\n",
" ft_command = '--resume_from_single_speaker_checkpoint \"/content/pretrained.ckpt\" '\n",
" else:\n",
" raise Exception(\"This dataset is not a multi-speaker dataset!\")\n",
"else:\n",
" ft_command = \"\"\n",
"if action == \"convert single-speaker to multi-speaker model\" or action == \"finetune\":\n",
@@ -348,7 +350,7 @@
"else:\n",
" print(\"Warning: this model will be trained from scratch. You need at least 8 hours of data for everything to work decently. Good luck!\")\n",
"#@markdown ### Choose batch size based on this dataset:\n",
"batch_size = 8 #@param {type:\"integer\"}\n",
"batch_size = 12 #@param {type:\"integer\"}\n",
"#@markdown ---\n",
"validation_split = 0.01\n",
"#@markdown ### Choose the quality for this model:\n",
@@ -363,21 +365,24 @@
"checkpoint_epochs = 3 #@param {type:\"integer\"}\n",
"#@markdown ---\n",
"#@markdown ### Step interval to generate model samples:\n",
"log_every_n_steps = 250 #@param {type:\"integer\"}\n",
"log_every_n_steps = 500 #@param {type:\"integer\"}\n",
"#@markdown ---\n",
"#@markdown ### Training epochs:\n",
"max_epochs = 10000 #@param {type:\"integer\"}\n",
"#@markdown ---"
],
"metadata": {
"id": "ickQlOCRjkBL",
"cellView": "form"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"colab": {
"background_save": true
},
"id": "X4zbSjXg2J3N"
},
"outputs": [],
"source": [
"#@markdown # <font color=\"pink\"> **5. Train.** 🏋️‍♂️\n",
"#@markdown Run this cell to train your final model! If possible, some audio samples will be saved during training in the output folder.\n",
@@ -397,24 +402,35 @@
"{ft_command}\\\n",
"--precision 32\n",
"''')"
],
"metadata": {
"id": "X4zbSjXg2J3N",
"cellView": "form"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6ISG085SYn85"
},
"source": [
"# Have you finished training and want to test the model?\n",
"\n",
"Export your model using the [model exporter notebook](https://colab.research.google.com/github/rmcpantoja/piper/blob/master/notebooks/piper_model_exporter.ipynb)!"
],
"metadata": {
"id": "6ISG085SYn85"
}
]
}
]
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": [],
"include_colab_link": true
},
"gpuClass": "standard",
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}