Extending phoneme set to 256

2026-06-02 18:07:03 +00:00 · 2023-05-07 10:58:37 -05:00
parent 7d9a59ab91
commit d62340b68e
3 changed files with 93 additions and 12 deletions
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+import csv
+import json
+import sys
+import unicodedata
+from collections import Counter
+
+from .phonemize import DEFAULT_PHONEME_ID_MAP
+
+
+def main() -> None:
+    """Report phonemes from stdin that have no entry in DEFAULT_PHONEME_ID_MAP.
+
+    Reads JSON Lines from stdin; every non-blank line must be an object with
+    a "phonemes" list of strings. Each phoneme that is not a key of
+    DEFAULT_PHONEME_ID_MAP is counted.
+
+    If anything is missing, a one-line summary is printed to stderr and one
+    CSV row per missing phoneme is written to stdout, most frequent first:
+    phoneme, Unicode category, Unicode name, Python escape, count.
+    Nothing is written when no phoneme is missing.
+
+    Raises:
+        json.JSONDecodeError: if a non-blank input line is not valid JSON.
+        KeyError: if an utterance has no "phonemes" key.
+    """
+    missing_phonemes: Counter[str] = Counter()
+
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+
+        utt = json.loads(line)
+        for phoneme in utt["phonemes"]:
+            if phoneme not in DEFAULT_PHONEME_ID_MAP:
+                missing_phonemes[phoneme] += 1
+
+    if not missing_phonemes:
+        return
+
+    print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)
+    writer = csv.writer(sys.stdout)
+    for phoneme, count in missing_phonemes.most_common():
+        # Describe each code point on its own: ord() and unicodedata only
+        # accept single characters, so a multi-character phoneme would
+        # otherwise abort the whole report with TypeError.
+        categories = " ".join(unicodedata.category(char) for char in phoneme)
+
+        # Unnamed code points (controls, private use) raise ValueError unless
+        # a default is given; report them with an empty name instead.
+        names = " + ".join(unicodedata.name(char, "") for char in phoneme)
+
+        # hex() keeps its "0x" prefix, which produced "\u0x327". Emit a real
+        # escape that can be pasted into the phoneme map as-is.
+        escapes = "".join(
+            f"\\u{ord(char):04x}" if ord(char) <= 0xFFFF else f"\\U{ord(char):08x}"
+            for char in phoneme
+        )
+
+        writer.writerow((phoneme, categories, names, escapes, count))
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
@@ -4,6 +4,7 @@ from typing import Dict, Iterable, List, Mapping, Optional

 from espeak_phonemizer import Phonemizer

+MAX_PHONEMES = 256
 DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
    "_": [0],
    "^": [1],
@@ -135,6 +136,25 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
    "χ": [127],
    "ᵻ": [128],
    "ⱱ": [129],
+    "0": [130],  # tones
+    "1": [131],
+    "2": [132],
+    "3": [133],
+    "4": [134],
+    "5": [135],
+    "6": [136],
+    "7": [137],
+    "8": [138],
+    "9": [139],
+    "\u0327": [140],  # combining cedilla
+    "\u0303": [141],  # combining tilde
+    "\u032a": [142],  # combining bridge below
+    "\u032f": [143],  # combining inverted breve below
+    "\u0329": [144],  # combining vertical line below
+    "ʰ": [145],
+    "ˤ": [146],
+    "ε": [147],
+    "": [148],
 }


@@ -8,7 +8,7 @@ import logging
 import os
 from collections import Counter
 from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from multiprocessing import JoinableQueue, Process, Queue
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional
@@ -16,7 +16,7 @@ from typing import Dict, Iterable, List, Optional
 from espeak_phonemizer import Phonemizer

 from .norm_audio import cache_norm_audio, make_silence_detector
-from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize
+from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize, MAX_PHONEMES

 _LOGGER = logging.getLogger("preprocess")

@@ -49,6 +49,9 @@ def main() -> None:
    parser.add_argument(
        "--speaker-id", type=int, help="Add speaker id to single speaker dataset"
    )
+    parser.add_argument(
+        "--skip-audio", action="store_true", help="Don't preprocess audio"
+    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
@@ -120,9 +123,7 @@ def main() -> None:
                "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
                "phoneme_map": {},
                "phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
-                "num_symbols": len(
-                    set(itertools.chain.from_iterable(DEFAULT_PHONEME_ID_MAP.values()))
-                ),
+                "num_symbols": MAX_PHONEMES,
                "num_speakers": len(speaker_counts),
                "speaker_id_map": speaker_ids,
            },
@@ -160,21 +161,33 @@ def main() -> None:
            queue_in.put(utt_batch)

        _LOGGER.debug("Waiting for jobs to finish")
+        missing_phonemes: Counter[str] = Counter()
        for _ in range(num_utterances):
            utt = queue_out.get()
            if utt is not None:
                if utt.speaker is not None:
                    utt.speaker_id = speaker_ids[utt.speaker]

+                utt_dict = dataclasses.asdict(utt)
+                utt_dict.pop("missing_phonemes")
+
                # JSONL
                json.dump(
-                    dataclasses.asdict(utt),
+                    utt_dict,
                    dataset_file,
                    ensure_ascii=False,
                    cls=PathEncoder,
                )
                print("", file=dataset_file)

+                missing_phonemes.update(utt.missing_phonemes)
+
+        if missing_phonemes:
+            for phoneme, count in missing_phonemes.most_common():
+                _LOGGER.warning("Missing %s (%s)", phoneme, count)
+
+            _LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))
+
    # Signal workers to stop
    for proc in processes:
        queue_in.put(None)
@@ -201,13 +214,17 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
                try:
                    _LOGGER.debug(utt)
                    utt.phonemes = phonemize(utt.text, phonemizer)
-                    utt.phoneme_ids = phonemes_to_ids(utt.phonemes)
-                    utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
-                        utt.audio_path,
-                        args.cache_dir,
-                        silence_detector,
-                        args.sample_rate,
+                    utt.phoneme_ids = phonemes_to_ids(
+                        utt.phonemes,
+                        missing_phonemes=utt.missing_phonemes,
                    )
+                    if not args.skip_audio:
+                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
+                            utt.audio_path,
+                            args.cache_dir,
+                            silence_detector,
+                            args.sample_rate,
+                        )
                    queue_out.put(utt)
                except TimeoutError:
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
@@ -233,6 +250,7 @@ class Utterance:
    phoneme_ids: Optional[List[int]] = None
    audio_norm_path: Optional[Path] = None
    audio_spec_path: Optional[Path] = None
+    missing_phonemes: Counter[str] = field(default_factory=Counter)


 class PathEncoder(json.JSONEncoder):