diff --git a/src/python/piper_train/check_phonemes.py b/src/python/piper_train/check_phonemes.py
new file mode 100644
index 0000000..cbf44c6
--- /dev/null
+++ b/src/python/piper_train/check_phonemes.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""Report phonemes in a JSONL dataset that are missing from the default id map."""
+import csv
+import json
+import sys
+import unicodedata
+from collections import Counter
+
+from .phonemize import DEFAULT_PHONEME_ID_MAP
+
+
+def main() -> None:
+    """Count phonemes read from stdin that DEFAULT_PHONEME_ID_MAP lacks.
+
+    Each non-blank stdin line is parsed as JSON and its "phonemes" entry is
+    iterated. A summary line goes to stderr and one CSV row per missing
+    phoneme (most common first) goes to stdout: phoneme, Unicode category,
+    Unicode name, escaped code point, count.
+
+    Each phoneme is assumed to be a single code point, as ord() requires.
+    """
+    missing_phonemes: Counter[str] = Counter()
+
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+
+        utt = json.loads(line)
+        for phoneme in utt["phonemes"]:
+            if phoneme not in DEFAULT_PHONEME_ID_MAP:
+                missing_phonemes[phoneme] += 1
+
+    if missing_phonemes:
+        print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)
+        writer = csv.writer(sys.stdout)
+        for phoneme, count in missing_phonemes.most_common():
+            writer.writerow(
+                (
+                    phoneme,
+                    unicodedata.category(phoneme),
+                    # Code points without a Unicode name would raise
+                    # ValueError unless a default is supplied.
+                    unicodedata.name(phoneme, ""),
+                    # hex() yields "0x327"; an escape needs zero-padded
+                    # digits with no "0x" prefix, e.g. \u0327.
+                    f"\\u{ord(phoneme):04x}",
+                    count,
+                )
+            )
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
diff --git a/src/python/piper_train/phonemize.py b/src/python/piper_train/phonemize.py
index 3acc2fb..81799a1 100644
--- a/src/python/piper_train/phonemize.py
+++ b/src/python/piper_train/phonemize.py
@@ -4,6 +4,7 @@ from typing import Dict, Iterable, List, Mapping, Optional
 
 from espeak_phonemizer import Phonemizer
 
+MAX_PHONEMES = 256
 DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
     "_": [0],
     "^": [1],
@@ -135,6 +136,25 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
     "χ": [127],
     "ᵻ": [128],
     "ⱱ": [129],
+    "0": [130],  # tones
+    "1": [131],
+    "2": [132],
+    "3": [133],
+    "4": [134],
+    "5": [135],
+    "6": [136],
+    "7": [137],
+    "8": [138],
+    "9": [139],
+    "\u0327": [140],  # combining cedilla
+    "\u0303": [141],  # combining tilde
+    "\u032a": [142],  # combining bridge below
+    "\u032f": [143],  # combining inverted breve below
+    "\u0329": [144],  # combining vertical line below
+    "ʰ": [145],
+    "ˤ": [146],
+    "ε": [147],
+    "": [148],
 }
 
 
diff --git a/src/python/piper_train/preprocess.py b/src/python/piper_train/preprocess.py
index 722b399..1eae600 100644
--- a/src/python/piper_train/preprocess.py
+++ b/src/python/piper_train/preprocess.py
@@ -8,7 +8,7 @@ import logging
 import os
 from collections import Counter
 from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from multiprocessing import JoinableQueue, Process, Queue
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional
@@ -16,7 +16,7 @@ from typing import Dict, Iterable, List, Optional
 from espeak_phonemizer import Phonemizer
 
 from .norm_audio import cache_norm_audio, make_silence_detector
-from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize
+from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize, MAX_PHONEMES
 
 _LOGGER = logging.getLogger("preprocess")
 
@@ -49,6 +49,9 @@ def main() -> None:
     parser.add_argument(
         "--speaker-id", type=int, help="Add speaker id to single speaker dataset"
     )
+    parser.add_argument(
+        "--skip-audio", action="store_true", help="Don't preprocess audio"
+    )
     parser.add_argument(
         "--debug", action="store_true", help="Print DEBUG messages to the console"
     )
@@ -120,9 +123,7 @@ def main() -> None:
                 "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
                 "phoneme_map": {},
                 "phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
-                "num_symbols": len(
-                    set(itertools.chain.from_iterable(DEFAULT_PHONEME_ID_MAP.values()))
-                ),
+                "num_symbols": MAX_PHONEMES,
                 "num_speakers": len(speaker_counts),
                 "speaker_id_map": speaker_ids,
             },
@@ -160,21 +161,33 @@ def main() -> None:
             queue_in.put(utt_batch)
 
         _LOGGER.debug("Waiting for jobs to finish")
+        missing_phonemes: Counter[str] = Counter()
         for _ in range(num_utterances):
             utt = queue_out.get()
             if utt is not None:
                 if utt.speaker is not None:
                     utt.speaker_id = speaker_ids[utt.speaker]
 
+                utt_dict = dataclasses.asdict(utt)
+                utt_dict.pop("missing_phonemes")
+
                 # JSONL
                 json.dump(
-                    dataclasses.asdict(utt),
+                    utt_dict,
                     dataset_file,
                     ensure_ascii=False,
                     cls=PathEncoder,
                 )
                 print("", file=dataset_file)
 
+                missing_phonemes.update(utt.missing_phonemes)
+
+        if missing_phonemes:
+            for phoneme, count in missing_phonemes.most_common():
+                _LOGGER.warning("Missing %s (%s)", phoneme, count)
+
+            _LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))
+
     # Signal workers to stop
     for proc in processes:
         queue_in.put(None)
@@ -201,13 +214,17 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
                 try:
                     _LOGGER.debug(utt)
                     utt.phonemes = phonemize(utt.text, phonemizer)
-                    utt.phoneme_ids = phonemes_to_ids(utt.phonemes)
-                    utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
-                        utt.audio_path,
-                        args.cache_dir,
-                        silence_detector,
-                        args.sample_rate,
+                    utt.phoneme_ids = phonemes_to_ids(
+                        utt.phonemes,
+                        missing_phonemes=utt.missing_phonemes,
                     )
+                    if not args.skip_audio:
+                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
+                            utt.audio_path,
+                            args.cache_dir,
+                            silence_detector,
+                            args.sample_rate,
+                        )
                     queue_out.put(utt)
                 except TimeoutError:
                     _LOGGER.error("Skipping utterance due to timeout: %s", utt)
@@ -233,6 +250,7 @@ class Utterance:
     phoneme_ids: Optional[List[int]] = None
     audio_norm_path: Optional[Path] = None
     audio_spec_path: Optional[Path] = None
+    missing_phonemes: Counter[str] = field(default_factory=Counter)
 
 
 class PathEncoder(json.JSONEncoder):