diff --git a/src/python/piper_train/check_phonemes.py b/src/python/piper_train/check_phonemes.py
index cbf44c6..a17d37c 100644
--- a/src/python/piper_train/check_phonemes.py
+++ b/src/python/piper_train/check_phonemes.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-import csv
 import json
 import sys
 import unicodedata
@@ -9,6 +8,7 @@ from .phonemize import DEFAULT_PHONEME_ID_MAP
 
 
 def main() -> None:
+    used_phonemes: Counter[str] = Counter()
     missing_phonemes: Counter[str] = Counter()
 
     for line in sys.stdin:
@@ -18,23 +18,37 @@ def main() -> None:
 
         utt = json.loads(line)
         for phoneme in utt["phonemes"]:
+            used_phonemes[phoneme] += 1
+
             if phoneme not in DEFAULT_PHONEME_ID_MAP:
                 missing_phonemes[phoneme] += 1
 
     if missing_phonemes:
         print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)
-        writer = csv.writer(sys.stdout)
-        for phoneme, count in missing_phonemes.most_common():
-            hex_phoneme = hex(ord(phoneme))
-            writer.writerow(
-                (
-                    phoneme,
-                    unicodedata.category(phoneme),
-                    unicodedata.name(phoneme),
-                    f"\\u{hex_phoneme}",
-                    count,
-                )
-            )
+
+    json.dump(
+        {
+            "used": {
+                phoneme: {
+                    "count": count,
+                    "hex": f"\\u{hex(ord(phoneme))}",
+                    "name": unicodedata.name(phoneme, ""),
+                    "category": unicodedata.category(phoneme),
+                }
+                for phoneme, count in used_phonemes.most_common()
+            },
+            "missing": {
+                phoneme: {
+                    "count": count,
+                    "hex": f"\\u{hex(ord(phoneme))}",
+                    "name": unicodedata.name(phoneme, ""),
+                    "category": unicodedata.category(phoneme),
+                }
+                for phoneme, count in missing_phonemes.most_common()
+            }
+        },
+        sys.stdout,
+    )
 
 
 # -----------------------------------------------------------------------------
diff --git a/src/python/piper_train/phonemize.py b/src/python/piper_train/phonemize.py
index 6b99fb2..ce68729 100644
--- a/src/python/piper_train/phonemize.py
+++ b/src/python/piper_train/phonemize.py
@@ -158,6 +158,8 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
     "ˤ": [146],
     "ε": [147],
     "": [148],
+    "#": [149],  # Icelandic
+    '"': [150],  # Russian
 }
 
 