Add new phonemes

This commit is contained in:
Michael Hansen
2023-05-10 12:19:37 -05:00
parent c53493eeb2
commit 5ca42b0753
2 changed files with 29 additions and 13 deletions

View File

@@ -1,5 +1,4 @@
#!/usr/bin/env python3
import csv
import json
import sys
import unicodedata
@@ -9,6 +8,7 @@ from .phonemize import DEFAULT_PHONEME_ID_MAP
def main() -> None:
used_phonemes: Counter[str] = Counter()
missing_phonemes: Counter[str] = Counter()
for line in sys.stdin:
@@ -18,23 +18,37 @@ def main() -> None:
utt = json.loads(line)
for phoneme in utt["phonemes"]:
used_phonemes[phoneme] += 1
if phoneme not in DEFAULT_PHONEME_ID_MAP:
missing_phonemes[phoneme] += 1
if missing_phonemes:
print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)
writer = csv.writer(sys.stdout)
for phoneme, count in missing_phonemes.most_common():
hex_phoneme = hex(ord(phoneme))
writer.writerow(
(
phoneme,
unicodedata.category(phoneme),
unicodedata.name(phoneme),
f"\\u{hex_phoneme}",
count,
)
)
# Write a JSON report to stdout with per-phoneme statistics, split into
# every phoneme counted in used_phonemes ("used") and those counted in
# missing_phonemes ("missing"). Entries follow Counter.most_common() order,
# i.e. highest count first.
json.dump(
    {
        "used": {
            phoneme: {
                "count": count,
                # NOTE(review): hex() keeps its "0x" prefix, so this renders
                # like \u0x25b rather than a real \u025b escape; left as-is
                # in case downstream tooling depends on the current format.
                "hex": f"\\u{hex(ord(phoneme))}",
                # Unicode character name. The "" default avoids the
                # ValueError unicodedata.name() raises for unnamed code
                # points (e.g. control or private-use characters).
                "name": unicodedata.name(phoneme, ""),
                "category": unicodedata.category(phoneme),
            }
            for phoneme, count in used_phonemes.most_common()
        },
        "missing": {
            phoneme: {
                "count": count,
                # Same format caveat as the "used" entries: "0x" is kept.
                "hex": f"\\u{hex(ord(phoneme))}",
                # Character name, "" when the code point has none.
                "name": unicodedata.name(phoneme, ""),
                "category": unicodedata.category(phoneme),
            }
            for phoneme, count in missing_phonemes.most_common()
        },
    },
    sys.stdout,
)
# -----------------------------------------------------------------------------

View File

@@ -158,6 +158,8 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
"ˤ": [146],
"ε": [147],
"": [148],
"#": [149], # Icelandic
'"': [150], # Russian
}