diff --git a/notebooks/piper_multilingual_training_notebook.ipynb b/notebooks/piper_multilingual_training_notebook.ipynb
new file mode 100644
index 0000000..cb96aac
--- /dev/null
+++ b/notebooks/piper_multilingual_training_notebook.ipynb
@@ -0,0 +1,355 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyPovMyxp8xorYRHeQp1RAP2",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/rmcpantoja/piper/blob/master/notebooks/piper_multilingual_training_notebook.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# [Piper](https://github.com/rhasspy/piper) training notebook\n",
+        "\n",
+        "Notebook made by [rmcpantoja](http://github.com/rmcpantoja)"
+      ],
+      "metadata": {
+        "id": "eK3nmYDB6C1a"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# First steps"
+      ],
+      "metadata": {
+        "id": "AICh6p5OJybj"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@markdown ## **Google Colab Anti-Disconnect.**\n",
+        "#@markdown ---\n",
+        "#@markdown #### Avoid automatic disconnection. Still, it will disconnect after **6 to 12 hours**.\n",
+        "\n",
+        "import IPython\n",
+        "js_code = '''\n",
+        "function ClickConnect(){\n",
+        "console.log(\"Working\");\n",
+        "document.querySelector(\"colab-toolbar-button#connect\").click()\n",
+        "}\n",
+        "setInterval(ClickConnect,60000)\n",
+        "'''\n",
+        "display(IPython.display.Javascript(js_code))"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "qyxSMuzjfQrz"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@markdown ## Check GPU type\n",
+        "#@markdown ---\n",
+        "#@markdown #### A higher capable GPU can lead to faster training speeds. By default, you will have a **Tesla T4**.\n",
+        "!nvidia-smi"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "ygxzp-xHTC7T"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "sUNjId07JfAK"
+      },
+      "outputs": [],
+      "source": [
+        "#@markdown ## mount Google Drive\n",
+        "from google.colab import drive\n",
+        "drive.mount('/content/drive', force_remount=True)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@markdown ## Install software\n",
+        "# clone:\n",
+        "!git clone https://github.com/rmcpantoja/piper\n",
+        "%cd piper/src/python\n",
+        "!pip install --upgrade pip\n",
+        "!pip install --upgrade wheel setuptools\n",
+        "!pip install -r requirements.txt\n",
+        "!pip install torchtext==0.12.0\n",
+        "!pip install torchvision==0.12.0\n",
+        "!bash build_monotonic_align.sh\n",
+        "!apt-get install -y espeak-ng\n",
+        "%cd /content"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "_XwmTVlcUgCh"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Training"
+      ],
+      "metadata": {
+        "id": "A3bMzEE0V5Ma"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@markdown ## 1. Extract dataset\n",
+        "%cd /content\n",
+        "!mkdir -p /content/dataset\n",
+        "%cd /content/dataset\n",
+        "!mkdir -p /content/dataset/wavs\n",
+        "#@markdown ### Audio dataset path to unzip\n",
+        "zip_path = \"/content/drive/MyDrive/Fakeyou/odal_castilian/wavs16k.zip\" #@param {type:\"string\"}\n",
+        "!unzip \"{zip_path}\" -d /content/dataset/wavs\n",
+        "#@markdown ---"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "SvEGjf0aV8eg"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@markdown ## 2. Upload the transcript file\n",
+        "%cd /content/dataset\n",
+        "from google.colab import files\n",
+        "!rm -f /content/dataset/metadata.csv\n",
+        "listfn, length = files.upload().popitem()\n",
+        "if listfn != \"metadata.csv\":\n",
+        "    !mv \"$listfn\" metadata.csv\n",
+        "%cd .."
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "E0W0OCvXXvue"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@markdown ## 3. Preprocess dataset\n",
+        "\n",
+        "import os\n",
+        "#@markdown ### First of all, select the language of your dataset.\n",
+        "language = \"English (U.S.)\" #@param [\"Català\", \"Dansk\", \"Deutsch\", \"Ελληνικά\", \"English (British)\", \"English (U.S.)\", \"Español\", \"Suomi\", \"Français\", \"Icelandic\", \"Italiano\", \"қазақша\", \"नेपाली\", \"Nederlands\", \"Norsk\", \"Polski\", \"Português (Brasil)\", \"Русский\", \"Svenska\", \"украї́нська\", \"Tiếng Việt\", \"简体中文\"]\n",
+        "#@markdown ---\n",
+        "# language definition:\n",
+        "# NOTE(review): per ISO 639, \"grc\" is Ancient Greek and \"yue\" is Cantonese; confirm these are\n",
+        "# the intended codes for \"Ελληνικά\" and \"简体中文\" (they may also key pretrained_models.json).\n",
+        "languages = {\n",
+        "    \"Català\": \"ca\",\n",
+        "    \"Dansk\": \"da\",\n",
+        "    \"Deutsch\": \"de\",\n",
+        "    \"Ελληνικά\": \"grc\",\n",
+        "    \"English (British)\": \"en\",\n",
+        "    \"English (U.S.)\": \"en-us\",\n",
+        "    \"Español\": \"es\",\n",
+        "    \"Suomi\": \"fi\",\n",
+        "    \"Français\": \"fr\",\n",
+        "    \"Icelandic\": \"is\",\n",
+        "    \"Italiano\": \"it\",\n",
+        "    \"қазақша\": \"kk\",\n",
+        "    \"नेपाली\": \"ne\",\n",
+        "    \"Nederlands\": \"nl\",\n",
+        "    \"Norsk\": \"nb\",\n",
+        "    \"Polski\": \"pl\",\n",
+        "    \"Português (Brasil)\": \"pt-br\",\n",
+        "    \"Русский\": \"ru\",\n",
+        "    \"Svenska\": \"sv\",\n",
+        "    \"украї́нська\": \"uk\",\n",
+        "    \"Tiếng Việt\": \"vi-vn-x-central\",\n",
+        "    \"简体中文\": \"yue\"\n",
+        "}\n",
+        "\n",
+        "def _get_language(code):\n",
+        "    \"\"\"Return the language code mapped to the display name selected in the form.\"\"\"\n",
+        "    return languages[code]\n",
+        "\n",
+        "final_language = _get_language(language)\n",
+        "# output:\n",
+        "#@markdown ### Choose the working folder. (recommended to save to Drive)\n",
+        "\n",
+        "#@markdown The working folder will be used in preprocessing, but also in training the model.\n",
+        "output_dir = \"/content/drive/MyDrive/colab/piper\" #@param {type:\"string\"}\n",
+        "os.makedirs(output_dir, exist_ok=True)\n",
+        "#@markdown ---\n",
+        "#@markdown ### Choose dataset format\n",
+        "dataset_format = \"ljspeech\" #@param [\"ljspeech\", \"mycroft\"]\n",
+        "#@markdown ---\n",
+        "#@markdown ### Select the sample rate of the dataset\n",
+        "sample_rate = \"16000\" #@param [\"16000\", \"22050\"]\n",
+        "#@markdown ---\n",
+        "%cd /content/piper/src/python\n",
+        "!python -m piper_train.preprocess \\\n",
+        "    --language {final_language} \\\n",
+        "    --input-dir /content/dataset \\\n",
+        "    --output-dir \"{output_dir}\" \\\n",
+        "    --dataset-format {dataset_format} \\\n",
+        "    --sample-rate {sample_rate}"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "dOyx9Y6JYvRF"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@markdown ## 4. Settings\n",
+        "import json\n",
+        "import ipywidgets as widgets\n",
+        "from IPython.display import display\n",
+        "\n",
+        "#@markdown ### Fine-tune this dataset?\n",
+        "finetune = True #@param {type:\"boolean\"}\n",
+        "#@markdown ---\n",
+        "if finetune:\n",
+        "    ft_command = '--resume_from_checkpoint \"/content/pretrained.ckpt\" '\n",
+        "    try:\n",
+        "        with open('/content/piper/notebooks/pretrained_models.json') as f:\n",
+        "            pretrained_models = json.load(f)\n",
+        "            if final_language in pretrained_models:\n",
+        "                models = pretrained_models[final_language]\n",
+        "                model_options = list(models.items())\n",
+        "                model_dropdown = widgets.Dropdown(description = \"Choose pretrained model\", options=model_options)\n",
+        "                download_button = widgets.Button(description=\"Download\")\n",
+        "                def download_model(btn):\n",
+        "                    \"\"\"Download the model selected in the dropdown to /content/pretrained.ckpt.\"\"\"\n",
+        "                    # Options are (label, value) pairs, so .value is the URL alone, not a tuple.\n",
+        "                    model_url = model_dropdown.value\n",
+        "                    file_id = model_url.split('/')[-2]\n",
+        "                    !gdown {file_id} -O \"/content/pretrained.ckpt\"\n",
+        "\n",
+        "                download_button.on_click(download_model)\n",
+        "                display(model_dropdown, download_button)\n",
+        "            else:\n",
+        "                raise Exception(f\"There are no pretrained models available for the language {final_language}\")\n",
+        "    except FileNotFoundError:\n",
+        "        raise Exception(\"The pretrained_models.json file was not found.\")\n",
+        "else:\n",
+        "    ft_command = \"\"\n",
+        "#@markdown ### Choose batch size based on this dataset\n",
+        "batch_size = 8 #@param {type:\"integer\"}\n",
+        "#@markdown ---\n",
+        "#@markdown ### Validation split\n",
+        "validation_split = 0.03 #@param {type:\"number\"}\n",
+        "#@markdown ---\n",
+        "#@markdown ### Choose the quality for this model:\n",
+        "\n",
+        "#@markdown * x-low - 16Khz audio, 5-7M params\n",
+        "#@markdown * low - 16Khz audio, 15-20M params\n",
+        "#@markdown * medium - 22.05Khz audio, 15-20M params\n",
+        "#@markdown * high - 22.05Khz audio, 28-32M params\n",
+        "quality = \"x-low\" #@param [\"high\", \"low\", \"medium\", \"x-low\"]\n",
+        "#@markdown ---\n",
+        "#@markdown ### Save a training checkpoint every how many epochs?\n",
+        "checkpoint_epochs = 25 #@param {type:\"integer\"}\n",
+        "#@markdown ---\n",
+        "#@markdown ### Step interval to generate model samples\n",
+        "log_every_n_steps = 1000 #@param {type:\"integer\"}\n",
+        "#@markdown ---\n",
+        "#@markdown ### training epochs\n",
+        "max_epochs = 5000 #@param {type:\"integer\"}\n",
+        "#@markdown ---"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "ickQlOCRjkBL"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@markdown ## 5. Run the tensorboard extension\n",
+        "\n",
+        "#@markdown The tensorboard is used to visualize the results of the model while it is being trained.\n",
+        "%load_ext tensorboard\n",
+        "%tensorboard --logdir \"{output_dir}\""
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "MpKDfhAHjHJ3"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@markdown ## 6. Train\n",
+        "# ft_command is set in step 4: the resume-from-checkpoint flag when fine-tuning, otherwise empty.\n",
+        "!python -m piper_train \\\n",
+        "    --dataset-dir \"{output_dir}\" \\\n",
+        "    --accelerator 'gpu' \\\n",
+        "    --devices 1 \\\n",
+        "    --batch-size {batch_size} \\\n",
+        "    --validation-split {validation_split} \\\n",
+        "    --num-test-examples 2 \\\n",
+        "    --quality {quality} \\\n",
+        "    --checkpoint-epochs {checkpoint_epochs} \\\n",
+        "    --check_val_every_n_epoch 25 \\\n",
+        "    --num_sanity_val_steps 1000 \\\n",
+        "    --log_every_n_steps {log_every_n_steps} \\\n",
+        "    --max_epochs {max_epochs} \\\n",
+        "    {ft_command}\\\n",
+        "    --precision 32"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "X4zbSjXg2J3N"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file