Add uk tests sentences

This commit is contained in:
Michael Hansen
2023-05-17 11:12:39 -05:00
parent bee661db8d
commit 27b81a800d
4 changed files with 64 additions and 9 deletions

View File

@@ -300,18 +300,55 @@ def phonemes_to_ids(
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("language")
parser.add_argument(
"--phoneme-type",
choices=list(PhonemeType),
default=PhonemeType.ESPEAK,
help="Type of phonemes to use (default: espeak)",
)
parser.add_argument(
"--text-casing",
choices=("ignore", "lower", "upper", "casefold"),
default="ignore",
help="Casing applied to utterance text",
)
args = parser.parse_args()
phonemizer = Phonemizer(args.language)
phonemizer: Optional[Phonemizer] = None
if args.text_casing == "lower":
casing = str.lower
elif args.text_casing == "upper":
casing = str.upper
else:
# ignore
casing = lambda s: s
if args.phoneme_type == PhonemeType.TEXT:
# Use text directly
phoneme_id_map = ALPHABETS[args.language]
else:
# Use eSpeak
phonemizer = Phonemizer(args.language)
phoneme_id_map = DEFAULT_PHONEME_ID_MAP
phoneme_map = PHONEME_MAPS.get(args.language)
missing_phonemes: "Counter[str]" = Counter()
for line in sys.stdin:
line = line.strip()
if not line:
continue
phonemes = phonemize(line, phonemizer, phoneme_map=phoneme_map)
phoneme_ids = phonemes_to_ids(phonemes)
if args.phoneme_type == PhonemeType.TEXT:
phonemes = list(unicodedata.normalize("NFD", casing(line)))
else:
assert phonemizer is not None
phonemes = phonemize(line, phonemizer, phoneme_map=phoneme_map)
phoneme_ids = phonemes_to_ids(
phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes
)
json.dump(
{
"text": line,
@@ -323,6 +360,11 @@ def main() -> None:
)
print("")
if missing_phonemes:
print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr)
for phoneme, count in missing_phonemes.most_common():
print(phoneme, count, file=sys.stderr)
if __name__ == "__main__":
main()

View File

@@ -303,12 +303,11 @@ def phonemize_batch_text(
try:
_LOGGER.debug(utt)
utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
utt.phoneme_ids = []
for phoneme in utt.phonemes:
if phoneme in alphabet:
utt.phoneme_ids.extend(alphabet[phoneme])
else:
utt.missing_phonemes[phoneme] += 1
utt.phoneme_ids = phonemes_to_ids(
utt.phonemes,
phoneme_id_map=alphabet,
missing_phonemes=utt.missing_phonemes,
)
if not args.skip_audio:
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
utt.audio_path,