diff --git a/src/python/piper_train/phonemize.py b/src/python/piper_train/phonemize.py deleted file mode 100644 index 46d646d..0000000 --- a/src/python/piper_train/phonemize.py +++ /dev/null @@ -1,372 +0,0 @@ -import argparse -import json -import sys -import unicodedata -from collections import Counter -from enum import Enum -from typing import Dict, Iterable, List, Mapping, Optional - -from espeak_phonemizer import Phonemizer - - -class PhonemeType(str, Enum): - ESPEAK = "espeak" - """Phonemes come from espeak-ng""" - - TEXT = "text" - """Phonemes come from text itself""" - - -MAX_PHONEMES = 256 -DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = { - "_": [0], - "^": [1], - "$": [2], - " ": [3], - "!": [4], - "'": [5], - "(": [6], - ")": [7], - ",": [8], - "-": [9], - ".": [10], - ":": [11], - ";": [12], - "?": [13], - "a": [14], - "b": [15], - "c": [16], - "d": [17], - "e": [18], - "f": [19], - "h": [20], - "i": [21], - "j": [22], - "k": [23], - "l": [24], - "m": [25], - "n": [26], - "o": [27], - "p": [28], - "q": [29], - "r": [30], - "s": [31], - "t": [32], - "u": [33], - "v": [34], - "w": [35], - "x": [36], - "y": [37], - "z": [38], - "æ": [39], - "ç": [40], - "ð": [41], - "ø": [42], - "ħ": [43], - "ŋ": [44], - "œ": [45], - "ǀ": [46], - "ǁ": [47], - "ǂ": [48], - "ǃ": [49], - "ɐ": [50], - "ɑ": [51], - "ɒ": [52], - "ɓ": [53], - "ɔ": [54], - "ɕ": [55], - "ɖ": [56], - "ɗ": [57], - "ɘ": [58], - "ə": [59], - "ɚ": [60], - "ɛ": [61], - "ɜ": [62], - "ɞ": [63], - "ɟ": [64], - "ɠ": [65], - "ɡ": [66], - "ɢ": [67], - "ɣ": [68], - "ɤ": [69], - "ɥ": [70], - "ɦ": [71], - "ɧ": [72], - "ɨ": [73], - "ɪ": [74], - "ɫ": [75], - "ɬ": [76], - "ɭ": [77], - "ɮ": [78], - "ɯ": [79], - "ɰ": [80], - "ɱ": [81], - "ɲ": [82], - "ɳ": [83], - "ɴ": [84], - "ɵ": [85], - "ɶ": [86], - "ɸ": [87], - "ɹ": [88], - "ɺ": [89], - "ɻ": [90], - "ɽ": [91], - "ɾ": [92], - "ʀ": [93], - "ʁ": [94], - "ʂ": [95], - "ʃ": [96], - "ʄ": [97], - "ʈ": [98], - "ʉ": [99], - "ʊ": [100], - "ʋ": [101], - "ʌ": [102], - "ʍ": [103], - "ʎ": [104], - "ʏ": [105], - "ʐ": [106], - "ʑ": [107], - "ʒ": [108], - "ʔ": [109], - "ʕ": [110], - "ʘ": [111], - "ʙ": [112], - "ʛ": [113], - "ʜ": [114], - "ʝ": [115], - "ʟ": [116], - "ʡ": [117], - "ʢ": [118], - "ʲ": [119], - "ˈ": [120], - "ˌ": [121], - "ː": [122], - "ˑ": [123], - "˞": [124], - "β": [125], - "θ": [126], - "χ": [127], - "ᵻ": [128], - "ⱱ": [129], - "0": [130], # tones - "1": [131], - "2": [132], - "3": [133], - "4": [134], - "5": [135], - "6": [136], - "7": [137], - "8": [138], - "9": [139], - "\u0327": [140], # combining cedilla - "\u0303": [141], # combining tilde - "\u032a": [142], # combining bridge below - "\u032f": [143], # combining inverted breve below - "\u0329": [144], # combining vertical line below - "ʰ": [145], - "ˤ": [146], - "ε": [147], - "↓": [148], - "#": [149], # Icelandic - '"': [150], # Russian - "↑": [151], - "\u033a": [152], # Basque - "\u033b": [153], -} - -PHONEME_MAPS = { - # Brazilian Portuguese - "pt-br": {"c": ["k"]} -} - -ALPHABETS = { - # Ukrainian - "uk": { - "_": [0], - "^": [1], - "$": [2], - " ": [3], - "!": [4], - "'": [5], - ",": [6], - "-": [7], - ".": [8], - ":": [9], - ";": [10], - "?": [11], - "а": [12], - "б": [13], - "в": [14], - "г": [15], - "ґ": [16], - "д": [17], - "е": [18], - "є": [19], - "ж": [20], - "з": [21], - "и": [22], - "і": [23], - "ї": [24], - "й": [25], - "к": [26], - "л": [27], - "м": [28], - "н": [29], - "о": [30], - "п": [31], - "р": [32], - "с": [33], - "т": [34], - "у": [35], - "ф": [36], - "х": [37], - "ц": [38], - "ч": [39], - "ш": [40], - "щ": [41], - "ь": [42], - "ю": [43], - "я": [44], - "\u0301": [45], # combining acute accent - "\u0306": [46], # combining breve - "\u0308": [47], # combining diaeresis - "—": [48], # em dash - } -} - - -def phonemize( - text: str, - phonemizer: Phonemizer, - phoneme_map: Optional[Dict[str, List[str]]] = None, -) -> List[str]: - phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True) - - # Phonemes are decomposed into unicode codepoints - unmapped_phonemes = list(unicodedata.normalize("NFD", phonemes_str)) - if not phoneme_map: - return unmapped_phonemes - - # Phonemes can be mapped to lists of other phonemes - mapped_phonemes = [] - for phoneme in unmapped_phonemes: - sub_phonemes = phoneme_map.get(phoneme) - if sub_phonemes: - mapped_phonemes.extend(sub_phonemes) - else: - mapped_phonemes.append(phoneme) - - return mapped_phonemes - - -def phonemes_to_ids( - phonemes: Iterable[str], - phoneme_id_map: Optional[Mapping[str, Iterable[int]]] = None, - missing_phonemes: "Optional[Counter[str]]" = None, - pad: Optional[str] = "_", - bos: Optional[str] = "^", - eos: Optional[str] = "$", -) -> List[int]: - if phoneme_id_map is None: - phoneme_id_map = DEFAULT_PHONEME_ID_MAP - - phoneme_ids: List[int] = [] - - if bos: - phoneme_ids.extend(phoneme_id_map[bos]) - - if pad: - phoneme_ids.extend(phoneme_id_map[pad]) - - for phoneme in phonemes: - mapped_phoneme_ids = phoneme_id_map.get(phoneme) - if mapped_phoneme_ids: - phoneme_ids.extend(mapped_phoneme_ids) - - if pad: - phoneme_ids.extend(phoneme_id_map[pad]) - elif missing_phonemes is not None: - # Make note of missing phonemes - missing_phonemes[phoneme] += 1 - - if eos: - phoneme_ids.extend(phoneme_id_map[eos]) - - return phoneme_ids - - -# ----------------------------------------------------------------------------- - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("language") - parser.add_argument( - "--phoneme-type", - choices=list(PhonemeType), - default=PhonemeType.ESPEAK, - help="Type of phonemes to use (default: espeak)", - ) - parser.add_argument( - "--text-casing", - choices=("ignore", "lower", "upper", "casefold"), - default="ignore", - help="Casing applied to utterance text", - ) - args = parser.parse_args() - - phonemizer: Optional[Phonemizer] = None - - if args.text_casing == "lower": - casing = str.lower - elif args.text_casing == "upper": - casing = str.upper - else: - # ignore - casing = lambda s: s - - if args.phoneme_type == PhonemeType.TEXT: - # Use text directly - phoneme_id_map = ALPHABETS[args.language] - else: - # Use eSpeak - phonemizer = Phonemizer(args.language) - phoneme_id_map = DEFAULT_PHONEME_ID_MAP - - phoneme_map = PHONEME_MAPS.get(args.language) - missing_phonemes: "Counter[str]" = Counter() - - for line in sys.stdin: - line = line.strip() - if not line: - continue - - if args.phoneme_type == PhonemeType.TEXT: - phonemes = list(unicodedata.normalize("NFD", casing(line))) - else: - assert phonemizer is not None - phonemes = phonemize(line, phonemizer, phoneme_map=phoneme_map) - - phoneme_ids = phonemes_to_ids( - phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes - ) - json.dump( - { - "text": line, - "phonemes": phonemes, - "phoneme_ids": phoneme_ids, - }, - sys.stdout, - ensure_ascii=False, - ) - print("") - - if missing_phonemes: - print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr) - for phoneme, count in missing_phonemes.most_common(): - print(phoneme, count, file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/src/python/piper_train/preprocess.py b/src/python/piper_train/preprocess.py index 584e2b0..b89da57 100644 --- a/src/python/piper_train/preprocess.py +++ b/src/python/piper_train/preprocess.py @@ -9,28 +9,37 @@ import os import unicodedata from collections import Counter from dataclasses import dataclass, field +from enum import Enum from multiprocessing import JoinableQueue, Process, Queue from pathlib import Path from typing import Dict, Iterable, List, Optional -from espeak_phonemizer import Phonemizer +from piper_phonemize import ( + phonemize_espeak, + phonemize_codepoints, + phoneme_ids_espeak, + phoneme_ids_codepoints, + get_codepoints_map, + get_espeak_map, + get_max_phonemes, + tashkeel_run, +) from .norm_audio import cache_norm_audio, make_silence_detector -from .phonemize import ( - ALPHABETS, - DEFAULT_PHONEME_ID_MAP, - MAX_PHONEMES, - PHONEME_MAPS, - PhonemeType, - phonemes_to_ids, - phonemize, -) _DIR = Path(__file__).parent _VERSION = (_DIR / "VERSION").read_text(encoding="utf-8").strip() _LOGGER = logging.getLogger("preprocess") +class PhonemeType(str, Enum): + ESPEAK = "espeak" + """Phonemes come from espeak-ng""" + + TEXT = "text" + """Phonemes come from text itself""" + + def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( @@ -150,10 +159,10 @@ def main() -> None: "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8}, "phoneme_type": args.phoneme_type.value, "phoneme_map": {}, - "phoneme_id_map": ALPHABETS[args.language] + "phoneme_id_map": get_codepoints_map()[args.language] if args.phoneme_type == PhonemeType.TEXT - else DEFAULT_PHONEME_ID_MAP, - "num_symbols": MAX_PHONEMES, + else get_espeak_map(), + "num_symbols": get_max_phonemes(), "num_speakers": len(speaker_counts), "speaker_id_map": speaker_ids, "piper_version": _VERSION, @@ -255,8 +264,6 @@ def phonemize_batch_espeak( try: casing = get_text_casing(args.text_casing) silence_detector = make_silence_detector() - phonemizer = Phonemizer(default_voice=args.language) - phoneme_map = PHONEME_MAPS.get(args.language) while True: utt_batch = queue_in.get() @@ -266,10 +273,15 @@ def phonemize_batch_espeak( for utt in utt_batch: try: _LOGGER.debug(utt) - utt.phonemes = phonemize( - casing(utt.text), phonemizer, phoneme_map=phoneme_map - ) - utt.phoneme_ids = phonemes_to_ids( + all_phonemes = phonemize_espeak(casing(utt.text), args.language) + + # Flatten + utt.phonemes = [ + phoneme + for sentence_phonemes in all_phonemes + for phoneme in sentence_phonemes + ] + utt.phoneme_ids = phoneme_ids_espeak( utt.phonemes, missing_phonemes=utt.missing_phonemes, ) @@ -298,7 +310,6 @@ def phonemize_batch_text( try: casing = get_text_casing(args.text_casing) silence_detector = make_silence_detector() - alphabet = ALPHABETS[args.language] while True: utt_batch = queue_in.get() @@ -308,10 +319,16 @@ def phonemize_batch_text( for utt in utt_batch: try: _LOGGER.debug(utt) - utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text))) - utt.phoneme_ids = phonemes_to_ids( + all_phonemes = phonemize_codepoints(casing(utt.text)) + # Flatten + utt.phonemes = [ + phoneme + for sentence_phonemes in all_phonemes + for phoneme in sentence_phonemes + ] + utt.phoneme_ids = phoneme_ids_codepoints( + args.language, utt.phonemes, - phoneme_id_map=alphabet, missing_phonemes=utt.missing_phonemes, ) if not args.skip_audio: diff --git a/src/python/requirements.txt b/src/python/requirements.txt index 9bca1cd..31b0763 100644 --- a/src/python/requirements.txt +++ b/src/python/requirements.txt @@ -1,7 +1,7 @@ cython>=0.29.0,<1 -espeak-phonemizer>=1.1.0,<2 +piper-phonemize~=1.0.0 librosa>=0.9.2,<1 numpy>=1.19.0 -onnxruntime~=1.11.0 +onnxruntime>=1.11.0 pytorch-lightning~=1.7.0 -torch~=1.11.0 +torch>=1.11.0,<2