mirror of
https://github.com/pstrueb/piper.git
synced 2026-04-16 13:25:30 +00:00
Add new phonemes
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import unicodedata
|
||||
@@ -9,6 +8,7 @@ from .phonemize import DEFAULT_PHONEME_ID_MAP
|
||||
|
||||
|
||||
def main() -> None:
|
||||
used_phonemes: Counter[str] = Counter()
|
||||
missing_phonemes: Counter[str] = Counter()
|
||||
|
||||
for line in sys.stdin:
|
||||
@@ -18,23 +18,37 @@ def main() -> None:
|
||||
|
||||
utt = json.loads(line)
|
||||
for phoneme in utt["phonemes"]:
|
||||
used_phonemes[phoneme] += 1
|
||||
|
||||
if phoneme not in DEFAULT_PHONEME_ID_MAP:
|
||||
missing_phonemes[phoneme] += 1
|
||||
|
||||
if missing_phonemes:
|
||||
print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)
|
||||
writer = csv.writer(sys.stdout)
|
||||
for phoneme, count in missing_phonemes.most_common():
|
||||
hex_phoneme = hex(ord(phoneme))
|
||||
writer.writerow(
|
||||
(
|
||||
phoneme,
|
||||
unicodedata.category(phoneme),
|
||||
unicodedata.name(phoneme),
|
||||
f"\\u{hex_phoneme}",
|
||||
count,
|
||||
)
|
||||
)
|
||||
|
||||
# Write a JSON report to stdout with two sections: "used" (every phoneme seen)
# and "missing" (phonemes not found in DEFAULT_PHONEME_ID_MAP). Within each
# section, entries follow Counter.most_common() order (highest count first).
json.dump(
    {
        section: {
            phoneme: {
                "count": count,
                "hex": f"\\u{hex(ord(phoneme))}",
                # Fix: this field used to repeat unicodedata.category(), so the
                # character name was never reported. The "" default keeps code
                # points without a Unicode name from raising ValueError.
                "name": unicodedata.name(phoneme, ""),
                "category": unicodedata.category(phoneme),
            }
            for phoneme, count in counts.most_common()
        }
        # Both sections share one comprehension so their fields stay in sync.
        for section, counts in (
            ("used", used_phonemes),
            ("missing", missing_phonemes),
        )
    },
    sys.stdout,
)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@@ -158,6 +158,8 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
||||
"ˤ": [146],
|
||||
"ε": [147],
|
||||
"": [148],
|
||||
"#": [149], # Icelandic
|
||||
'"': [150], # Russian
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user