diff --git a/src/python/piper_train/check_phonemes.py b/src/python/piper_train/check_phonemes.py index a17d37c..82e0685 100644 --- a/src/python/piper_train/check_phonemes.py +++ b/src/python/piper_train/check_phonemes.py @@ -45,7 +45,7 @@ def main() -> None: "category": unicodedata.category(phoneme), } for phoneme, count in missing_phonemes.most_common() - } + }, }, sys.stdout, ) diff --git a/src/python/piper_train/export_torchscript.py b/src/python/piper_train/export_torchscript.py index 3555a20..312cc95 100644 --- a/src/python/piper_train/export_torchscript.py +++ b/src/python/piper_train/export_torchscript.py @@ -2,7 +2,6 @@ import argparse import logging from pathlib import Path -from typing import Optional import torch @@ -41,7 +40,6 @@ def main(): model_g = model.model_g num_symbols = model_g.n_vocab - num_speakers = model_g.n_speakers # Inference only model_g.eval() diff --git a/src/python/piper_train/phonemize.py b/src/python/piper_train/phonemize.py index ce68729..a716bbe 100644 --- a/src/python/piper_train/phonemize.py +++ b/src/python/piper_train/phonemize.py @@ -3,10 +3,20 @@ import json import sys import unicodedata from collections import Counter +from enum import Enum from typing import Dict, Iterable, List, Mapping, Optional from espeak_phonemizer import Phonemizer + +class PhonemeType(str, Enum): + ESPEAK = "espeak" + """Phonemes come from espeak-ng""" + + TEXT = "text" + """Phonemes come from text itself""" + + MAX_PHONEMES = 256 DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = { "_": [0], @@ -162,6 +172,57 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = { '"': [150], # Russian } +ALPHABETS = { + # Ukrainian + "uk": { + "_": [0], + "^": [1], + "$": [2], + " ": [3], + "!": [4], + "'": [5], + ",": [6], + "-": [7], + ".": [8], + ":": [9], + ";": [10], + "?": [11], + "а": [12], + "б": [13], + "в": [14], + "г": [15], + "ґ": [16], + "д": [17], + "е": [18], + "є": [19], + "ж": [20], + "з": [21], + "и": [22], + "і": [23], + "ї": [24], + "й": [25], + "к": [26], + "л": [27], + "м": [28], + "н": [29], + "о": [30], + "п": [31], + "р": [32], + "с": [33], + "т": [34], + "у": [35], + "ф": [36], + "х": [37], + "ц": [38], + "ч": [39], + "ш": [40], + "щ": [41], + "ь": [42], + "ю": [43], + "я": [44], + } +} + def phonemize(text: str, phonemizer: Phonemizer) -> List[str]: phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True) diff --git a/src/python/piper_train/preprocess.py b/src/python/piper_train/preprocess.py index 1eae600..7692ab8 100644 --- a/src/python/piper_train/preprocess.py +++ b/src/python/piper_train/preprocess.py @@ -6,8 +6,8 @@ import itertools import json import logging import os +import unicodedata from collections import Counter -from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass, field from multiprocessing import JoinableQueue, Process, Queue from pathlib import Path @@ -16,7 +16,14 @@ from typing import Dict, Iterable, List, Optional from espeak_phonemizer import Phonemizer from .norm_audio import cache_norm_audio, make_silence_detector -from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize, MAX_PHONEMES +from .phonemize import ( + ALPHABETS, + DEFAULT_PHONEME_ID_MAP, + MAX_PHONEMES, + PhonemeType, + phonemes_to_ids, + phonemize, +) _LOGGER = logging.getLogger("preprocess") @@ -49,6 +56,20 @@ def main() -> None: parser.add_argument( "--speaker-id", type=int, help="Add speaker id to single speaker dataset" ) + # + parser.add_argument( + "--phoneme-type", + choices=list(PhonemeType), + default=PhonemeType.ESPEAK, + help="Type of phonemes to use (default: espeak)", + ) + parser.add_argument( + "--text-casing", + choices=("ignore", "lower", "upper", "casefold"), + default="ignore", + help="Casing applied to utterance text", + ) + # parser.add_argument( "--skip-audio", action="store_true", help="Don't preprocess audio" ) @@ -89,7 +110,12 @@ def main() -> None: _LOGGER.debug("Counting number of speakers/utterances in the dataset") speaker_counts: Counter[str] = Counter() num_utterances = 0 - for utt in make_dataset(args.input_dir, args.single_speaker, args.speaker_id): + for utt in make_dataset( + args.input_dir, + args.single_speaker, + args.speaker_id, + args.skip_audio, + ): speaker = utt.speaker or "" speaker_counts[speaker] += 1 num_utterances += 1 @@ -121,8 +147,11 @@ def main() -> None: "voice": args.language, }, "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8}, + "phoneme_type": str(args.phoneme_type), "phoneme_map": {}, - "phoneme_id_map": DEFAULT_PHONEME_ID_MAP, + "phoneme_id_map": ALPHABETS[args.language] + if args.phoneme_type == PhonemeType.TEXT + else DEFAULT_PHONEME_ID_MAP, "num_symbols": MAX_PHONEMES, "num_speakers": len(speaker_counts), "speaker_id_map": speaker_ids, @@ -143,8 +172,13 @@ def main() -> None: queue_out: "Queue[Optional[Utterance]]" = Queue() # Start workers + if args.phoneme_type == PhonemeType.TEXT: + target = phonemize_batch_text + else: + target = phonemize_batch_espeak + processes = [ - Process(target=process_batch, args=(args, queue_in, queue_out)) + Process(target=target, args=(args, queue_in, queue_out)) for _ in range(args.max_workers) ] for proc in processes: @@ -155,7 +189,12 @@ def main() -> None: ) with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file: for utt_batch in batched( - make_dataset(args.input_dir, args.single_speaker, args.speaker_id), + make_dataset( + args.input_dir, + args.single_speaker, + args.speaker_id, + args.skip_audio, + ), batch_size, ): queue_in.put(utt_batch) @@ -200,8 +239,24 @@ def main() -> None: # ----------------------------------------------------------------------------- -def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue): +def get_text_casing(casing: str): + if casing == "lower": + return str.lower + + if casing == "upper": + return str.upper + + if casing == "casefold": + return str.casefold + + return lambda s: s + + +def phonemize_batch_espeak( + args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue +): try: + casing = get_text_casing(args.text_casing) silence_detector = make_silence_detector() phonemizer = Phonemizer(default_voice=args.language) @@ -213,7 +268,7 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: for utt in utt_batch: try: _LOGGER.debug(utt) - utt.phonemes = phonemize(utt.text, phonemizer) + utt.phonemes = phonemize(casing(utt.text), phonemizer) utt.phoneme_ids = phonemes_to_ids( utt.phonemes, missing_phonemes=utt.missing_phonemes, @@ -234,7 +289,49 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: queue_in.task_done() except Exception: - _LOGGER.exception("process_batch") + _LOGGER.exception("phonemize_batch_espeak") + + +def phonemize_batch_text( + args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue +): + try: + casing = get_text_casing(args.text_casing) + silence_detector = make_silence_detector() + alphabet = ALPHABETS[args.language] + + while True: + utt_batch = queue_in.get() + if utt_batch is None: + break + + for utt in utt_batch: + try: + _LOGGER.debug(utt) + utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text))) + utt.phoneme_ids = [] + for phoneme in utt.phonemes: + if phoneme in alphabet: + utt.phoneme_ids.extend(alphabet[phoneme]) + else: + utt.missing_phonemes[phoneme] += 1 + if not args.skip_audio: + utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio( + utt.audio_path, + args.cache_dir, + silence_detector, + args.sample_rate, + ) + queue_out.put(utt) + except TimeoutError: + _LOGGER.error("Skipping utterance due to timeout: %s", utt) + except Exception: + _LOGGER.exception("Failed to process utterance: %s", utt) + queue_out.put(None) + + queue_in.task_done() + except Exception: + _LOGGER.exception("phonemize_batch_text") # ----------------------------------------------------------------------------- @@ -261,7 +358,10 @@ class PathEncoder(json.JSONEncoder): def ljspeech_dataset( - dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None + dataset_dir: Path, + is_single_speaker: bool, + speaker_id: Optional[int] = None, + skip_audio: bool = False, ) -> Iterable[Utterance]: # filename|speaker|text # speaker is optional @@ -298,7 +398,7 @@ def ljspeech_dataset( # Try with .wav wav_path = wav_dir / f"{filename}.wav" - if not wav_path.exists(): + if (not skip_audio) and (not wav_path.exists()): _LOGGER.warning("Missing %s", filename) continue @@ -308,7 +408,10 @@ def ljspeech_dataset( def mycroft_dataset( - dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None + dataset_dir: Path, + is_single_speaker: bool, + speaker_id: Optional[int] = None, + skip_audio: bool = False, ) -> Iterable[Utterance]: speaker_id = 0 for metadata_path in dataset_dir.glob("**/*-metadata.txt"): diff --git a/src/python/piper_train/select_speaker.py b/src/python/piper_train/select_speaker.py index b611827..f92ce34 100644 --- a/src/python/piper_train/select_speaker.py +++ b/src/python/piper_train/select_speaker.py @@ -4,6 +4,7 @@ import csv import sys from collections import Counter, defaultdict + def main(): parser = argparse.ArgumentParser() parser.add_argument("--speaker-number", type=int)