"""Convert text into phonemes and phoneme ids.

Reads utterances from standard input (one per line) and writes one JSON
object per utterance to standard output with the keys "text", "phonemes"
and "phoneme_ids".
"""
import argparse
import json
import sys
import unicodedata
from collections import Counter
from enum import Enum
from typing import Callable, Dict, Iterable, List, Mapping, Optional

try:
    from espeak_phonemizer import Phonemizer
except ImportError:
    # Text-based phonemes never touch eSpeak, so the failure is deferred
    # until an eSpeak phonemizer is actually requested (see main()).
    Phonemizer = None  # type: ignore


class PhonemeType(str, Enum):
    ESPEAK = "espeak"
    """Phonemes come from espeak-ng"""

    TEXT = "text"
    """Phonemes come from text itself"""


MAX_PHONEMES = 256

# Maps each phoneme (a single unicode codepoint) to its list of ids.
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
    "_": [0], "^": [1], "$": [2], " ": [3], "!": [4], "'": [5],
    "(": [6], ")": [7], ",": [8], "-": [9], ".": [10], ":": [11],
    ";": [12], "?": [13],
    "a": [14], "b": [15], "c": [16], "d": [17], "e": [18], "f": [19],
    "h": [20], "i": [21], "j": [22], "k": [23], "l": [24], "m": [25],
    "n": [26], "o": [27], "p": [28], "q": [29], "r": [30], "s": [31],
    "t": [32], "u": [33], "v": [34], "w": [35], "x": [36], "y": [37],
    "z": [38],
    "æ": [39], "ç": [40], "ð": [41], "ø": [42], "ħ": [43], "ŋ": [44],
    "œ": [45], "ǀ": [46], "ǁ": [47], "ǂ": [48], "ǃ": [49], "ɐ": [50],
    "ɑ": [51], "ɒ": [52], "ɓ": [53], "ɔ": [54], "ɕ": [55], "ɖ": [56],
    "ɗ": [57], "ɘ": [58], "ə": [59], "ɚ": [60], "ɛ": [61], "ɜ": [62],
    "ɞ": [63], "ɟ": [64], "ɠ": [65], "ɡ": [66], "ɢ": [67], "ɣ": [68],
    "ɤ": [69], "ɥ": [70], "ɦ": [71], "ɧ": [72], "ɨ": [73], "ɪ": [74],
    "ɫ": [75], "ɬ": [76], "ɭ": [77], "ɮ": [78], "ɯ": [79], "ɰ": [80],
    "ɱ": [81], "ɲ": [82], "ɳ": [83], "ɴ": [84], "ɵ": [85], "ɶ": [86],
    "ɸ": [87], "ɹ": [88], "ɺ": [89], "ɻ": [90], "ɽ": [91], "ɾ": [92],
    "ʀ": [93], "ʁ": [94], "ʂ": [95], "ʃ": [96], "ʄ": [97], "ʈ": [98],
    "ʉ": [99], "ʊ": [100], "ʋ": [101], "ʌ": [102], "ʍ": [103], "ʎ": [104],
    "ʏ": [105], "ʐ": [106], "ʑ": [107], "ʒ": [108], "ʔ": [109], "ʕ": [110],
    "ʘ": [111], "ʙ": [112], "ʛ": [113], "ʜ": [114], "ʝ": [115], "ʟ": [116],
    "ʡ": [117], "ʢ": [118], "ʲ": [119], "ˈ": [120], "ˌ": [121], "ː": [122],
    "ˑ": [123], "˞": [124], "β": [125], "θ": [126], "χ": [127], "ᵻ": [128],
    "ⱱ": [129],
    "0": [130],  # tones
    "1": [131],
    "2": [132],
    "3": [133],
    "4": [134],
    "5": [135],
    "6": [136],
    "7": [137],
    "8": [138],
    "9": [139],
    "\u0327": [140],  # combining cedilla
    "\u0303": [141],  # combining tilde
    "\u032a": [142],  # combining bridge below
    "\u032f": [143],  # combining inverted breve below
    "\u0329": [144],  # combining vertical line below
    "ʰ": [145],
    "ˤ": [146],
    "ε": [147],
    "↓": [148],
    "#": [149],  # Icelandic
    '"': [150],  # Russian
    "↑": [151],
    "\u033a": [152],  # Basque
    "\u033b": [153],
}

# Per-language substitutions applied to eSpeak phonemes in phonemize().
PHONEME_MAPS = {
    # Brazilian Portuguese
    "pt-br": {"c": ["k"]}
}

# Per-language id maps used when phonemes come from the text itself.
ALPHABETS = {
    # Ukrainian
    "uk": {
        "_": [0], "^": [1], "$": [2], " ": [3], "!": [4], "'": [5],
        ",": [6], "-": [7], ".": [8], ":": [9], ";": [10], "?": [11],
        "а": [12], "б": [13], "в": [14], "г": [15], "ґ": [16], "д": [17],
        "е": [18], "є": [19], "ж": [20], "з": [21], "и": [22], "і": [23],
        "ї": [24], "й": [25], "к": [26], "л": [27], "м": [28], "н": [29],
        "о": [30], "п": [31], "р": [32], "с": [33], "т": [34], "у": [35],
        "ф": [36], "х": [37], "ц": [38], "ч": [39], "ш": [40], "щ": [41],
        "ь": [42], "ю": [43], "я": [44],
        "\u0301": [45],  # combining acute accent
        "\u0306": [46],  # combining breve
        "\u0308": [47],  # combining diaeresis
        "—": [48],  # em dash
    }
}


def phonemize(
    text: str,
    phonemizer: "Phonemizer",
    phoneme_map: Optional[Dict[str, List[str]]] = None,
) -> List[str]:
    """Return the phonemes of *text* as a list of single codepoints.

    Args:
        text: Utterance to phonemize.
        phonemizer: Object whose ``phonemize(text=..., keep_clause_breakers=True)``
            method returns the phoneme string.
        phoneme_map: Optional substitutions; a phoneme with a non-empty
            entry is replaced by that list of phonemes.

    Returns:
        The NFD-decomposed phonemes, with substitutions applied when
        *phoneme_map* is given.
    """
    phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)

    # Phonemes are decomposed into unicode codepoints
    unmapped_phonemes = list(unicodedata.normalize("NFD", phonemes_str))
    if not phoneme_map:
        return unmapped_phonemes

    # Phonemes can be mapped to lists of other phonemes
    mapped_phonemes: List[str] = []
    for phoneme in unmapped_phonemes:
        sub_phonemes = phoneme_map.get(phoneme)
        if sub_phonemes:
            mapped_phonemes.extend(sub_phonemes)
        else:
            # No (or empty) substitution: keep the phoneme as it is
            mapped_phonemes.append(phoneme)

    return mapped_phonemes


def phonemes_to_ids(
    phonemes: Iterable[str],
    phoneme_id_map: Optional[Mapping[str, Iterable[int]]] = None,
    missing_phonemes: "Optional[Counter[str]]" = None,
    pad: Optional[str] = "_",
    bos: Optional[str] = "^",
    eos: Optional[str] = "$",
) -> List[int]:
    """Convert phonemes into a flat list of ids.

    The ids of *bos* come first, then the ids of *pad*, then the ids of
    every known phoneme each followed by the ids of *pad*, and finally the
    ids of *eos*. A falsy *pad*, *bos* or *eos* disables that symbol.

    Args:
        phonemes: Phonemes to convert.
        phoneme_id_map: Phoneme to ids mapping; DEFAULT_PHONEME_ID_MAP
            when None.
        missing_phonemes: Optional counter incremented for every phoneme
            that has no (or empty) ids in the map. Such phonemes are
            skipped either way.
        pad: Symbol inserted after the start symbol and each phoneme.
        bos: Symbol marking the start of the utterance.
        eos: Symbol marking the end of the utterance.

    Returns:
        The list of phoneme ids.

    Raises:
        KeyError: If an enabled *pad*, *bos* or *eos* symbol is not in
            the map.
    """
    if phoneme_id_map is None:
        phoneme_id_map = DEFAULT_PHONEME_ID_MAP

    phoneme_ids: List[int] = []

    if bos:
        phoneme_ids.extend(phoneme_id_map[bos])

    if pad:
        phoneme_ids.extend(phoneme_id_map[pad])

    for phoneme in phonemes:
        mapped_phoneme_ids = phoneme_id_map.get(phoneme)
        if mapped_phoneme_ids:
            phoneme_ids.extend(mapped_phoneme_ids)

            if pad:
                phoneme_ids.extend(phoneme_id_map[pad])
        elif missing_phonemes is not None:
            # Make note of missing phonemes
            missing_phonemes[phoneme] += 1

    if eos:
        phoneme_ids.extend(phoneme_id_map[eos])

    return phoneme_ids


def get_text_casing(casing: str) -> Callable[[str], str]:
    """Return the function that applies the named casing to a string.

    Recognized names are "lower", "upper" and "casefold"; any other name
    (including "ignore") yields a function that returns its input unchanged.
    """
    if casing == "lower":
        return str.lower

    if casing == "upper":
        return str.upper

    if casing == "casefold":
        return str.casefold

    # ignore
    return lambda s: s


# -----------------------------------------------------------------------------


def main() -> None:
    """Phonemize lines from stdin and print one JSON object per line.

    Blank lines are skipped. Phonemes that have no ids in the selected map
    are counted and reported on stderr once all input has been processed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("language")
    parser.add_argument(
        "--phoneme-type",
        # Plain values, so the choices listed by --help are what users type
        choices=[phoneme_type.value for phoneme_type in PhonemeType],
        default=PhonemeType.ESPEAK.value,
        help="Type of phonemes to use (default: espeak)",
    )
    parser.add_argument(
        "--text-casing",
        choices=("ignore", "lower", "upper", "casefold"),
        default="ignore",
        help="Casing applied to utterance text",
    )
    args = parser.parse_args()

    phoneme_type = PhonemeType(args.phoneme_type)
    casing = get_text_casing(args.text_casing)
    phonemizer: "Optional[Phonemizer]" = None

    if phoneme_type == PhonemeType.TEXT:
        # Use text directly
        if args.language not in ALPHABETS:
            parser.error(
                "No text alphabet for language {!r} (available: {})".format(
                    args.language, ", ".join(sorted(ALPHABETS))
                )
            )

        phoneme_id_map = ALPHABETS[args.language]
    else:
        # Use eSpeak
        if Phonemizer is None:
            parser.error("espeak_phonemizer is required for espeak phonemes")

        phonemizer = Phonemizer(args.language)
        phoneme_id_map = DEFAULT_PHONEME_ID_MAP

    phoneme_map = PHONEME_MAPS.get(args.language)
    missing_phonemes: "Counter[str]" = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        if phoneme_type == PhonemeType.TEXT:
            # Casing is only applied when the text itself is the phonemes
            phonemes = list(unicodedata.normalize("NFD", casing(line)))
        else:
            assert phonemizer is not None
            phonemes = phonemize(line, phonemizer, phoneme_map=phoneme_map)

        phoneme_ids = phonemes_to_ids(
            phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes
        )
        print(
            json.dumps(
                {
                    "text": line,
                    "phonemes": phonemes,
                    "phoneme_ids": phoneme_ids,
                },
                ensure_ascii=False,
            )
        )

    if missing_phonemes:
        print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr)
        for phoneme, count in missing_phonemes.most_common():
            print(phoneme, count, file=sys.stderr)


if __name__ == "__main__":
    main()