Add text phonemes to preprocess

2026-06-01 01:17:02 +00:00 · 2023-06-08 15:37:20 -05:00
parent a66ac02ca7
commit a7eab8ed03
5 changed files with 178 additions and 15 deletions
@@ -45,7 +45,7 @@ def main() -> None:
                    "category": unicodedata.category(phoneme),
                }
                for phoneme, count in missing_phonemes.most_common()
-            }
+            },
        },
        sys.stdout,
    )
@@ -2,7 +2,6 @@
 import argparse
 import logging
 from pathlib import Path
-from typing import Optional

 import torch

@@ -41,7 +40,6 @@ def main():
    model_g = model.model_g

    num_symbols = model_g.n_vocab
-    num_speakers = model_g.n_speakers

    # Inference only
    model_g.eval()
@@ -3,10 +3,20 @@ import json
 import sys
 import unicodedata
 from collections import Counter
+from enum import Enum
 from typing import Dict, Iterable, List, Mapping, Optional

 from espeak_phonemizer import Phonemizer

+
+class PhonemeType(str, Enum):
+    ESPEAK = "espeak"
+    """Phonemes come from espeak-ng"""
+
+    TEXT = "text"
+    """Phonemes come from text itself"""
+
+
 MAX_PHONEMES = 256
 DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
    "_": [0],
@@ -162,6 +172,57 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
    '"': [150],  # Russian
 }

+ALPHABETS = {
+    # Ukrainian
+    "uk": {
+        "_": [0],
+        "^": [1],
+        "$": [2],
+        " ": [3],
+        "!": [4],
+        "'": [5],
+        ",": [6],
+        "-": [7],
+        ".": [8],
+        ":": [9],
+        ";": [10],
+        "?": [11],
+        "а": [12],
+        "б": [13],
+        "в": [14],
+        "г": [15],
+        "ґ": [16],
+        "д": [17],
+        "е": [18],
+        "є": [19],
+        "ж": [20],
+        "з": [21],
+        "и": [22],
+        "і": [23],
+        "ї": [24],
+        "й": [25],
+        "к": [26],
+        "л": [27],
+        "м": [28],
+        "н": [29],
+        "о": [30],
+        "п": [31],
+        "р": [32],
+        "с": [33],
+        "т": [34],
+        "у": [35],
+        "ф": [36],
+        "х": [37],
+        "ц": [38],
+        "ч": [39],
+        "ш": [40],
+        "щ": [41],
+        "ь": [42],
+        "ю": [43],
+        "я": [44],
+    }
+}
+

 def phonemize(text: str, phonemizer: Phonemizer) -> List[str]:
    phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)
@@ -6,8 +6,8 @@ import itertools
 import json
 import logging
 import os
+import unicodedata
 from collections import Counter
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass, field
 from multiprocessing import JoinableQueue, Process, Queue
 from pathlib import Path
@@ -16,7 +16,14 @@ from typing import Dict, Iterable, List, Optional
 from espeak_phonemizer import Phonemizer

 from .norm_audio import cache_norm_audio, make_silence_detector
-from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize, MAX_PHONEMES
+from .phonemize import (
+    ALPHABETS,
+    DEFAULT_PHONEME_ID_MAP,
+    MAX_PHONEMES,
+    PhonemeType,
+    phonemes_to_ids,
+    phonemize,
+)

 _LOGGER = logging.getLogger("preprocess")

@@ -49,6 +56,20 @@ def main() -> None:
    parser.add_argument(
        "--speaker-id", type=int, help="Add speaker id to single speaker dataset"
    )
+    #
+    parser.add_argument(
+        "--phoneme-type",
+        choices=list(PhonemeType),
+        default=PhonemeType.ESPEAK,
+        help="Type of phonemes to use (default: espeak)",
+    )
+    parser.add_argument(
+        "--text-casing",
+        choices=("ignore", "lower", "upper", "casefold"),
+        default="ignore",
+        help="Casing applied to utterance text",
+    )
+    #
    parser.add_argument(
        "--skip-audio", action="store_true", help="Don't preprocess audio"
    )
@@ -89,7 +110,12 @@ def main() -> None:
    _LOGGER.debug("Counting number of speakers/utterances in the dataset")
    speaker_counts: Counter[str] = Counter()
    num_utterances = 0
-    for utt in make_dataset(args.input_dir, args.single_speaker, args.speaker_id):
+    for utt in make_dataset(
+        args.input_dir,
+        args.single_speaker,
+        args.speaker_id,
+        args.skip_audio,
+    ):
        speaker = utt.speaker or ""
        speaker_counts[speaker] += 1
        num_utterances += 1
@@ -121,8 +147,11 @@ def main() -> None:
                    "voice": args.language,
                },
                "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
+                "phoneme_type": str(args.phoneme_type),
                "phoneme_map": {},
-                "phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
+                "phoneme_id_map": ALPHABETS[args.language]
+                if args.phoneme_type == PhonemeType.TEXT
+                else DEFAULT_PHONEME_ID_MAP,
                "num_symbols": MAX_PHONEMES,
                "num_speakers": len(speaker_counts),
                "speaker_id_map": speaker_ids,
@@ -143,8 +172,13 @@ def main() -> None:
    queue_out: "Queue[Optional[Utterance]]" = Queue()

    # Start workers
+    if args.phoneme_type == PhonemeType.TEXT:
+        target = phonemize_batch_text
+    else:
+        target = phonemize_batch_espeak
+
    processes = [
-        Process(target=process_batch, args=(args, queue_in, queue_out))
+        Process(target=target, args=(args, queue_in, queue_out))
        for _ in range(args.max_workers)
    ]
    for proc in processes:
@@ -155,7 +189,12 @@ def main() -> None:
    )
    with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
        for utt_batch in batched(
-            make_dataset(args.input_dir, args.single_speaker, args.speaker_id),
+            make_dataset(
+                args.input_dir,
+                args.single_speaker,
+                args.speaker_id,
+                args.skip_audio,
+            ),
            batch_size,
        ):
            queue_in.put(utt_batch)
@@ -200,8 +239,24 @@ def main() -> None:
 # -----------------------------------------------------------------------------


-def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue):
+def get_text_casing(casing: str):
+    if casing == "lower":
+        return str.lower
+
+    if casing == "upper":
+        return str.upper
+
+    if casing == "casefold":
+        return str.casefold
+
+    return lambda s: s
+
+
+def phonemize_batch_espeak(
+    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
+):
    try:
+        casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()
        phonemizer = Phonemizer(default_voice=args.language)

@@ -213,7 +268,7 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
            for utt in utt_batch:
                try:
                    _LOGGER.debug(utt)
-                    utt.phonemes = phonemize(utt.text, phonemizer)
+                    utt.phonemes = phonemize(casing(utt.text), phonemizer)
                    utt.phoneme_ids = phonemes_to_ids(
                        utt.phonemes,
                        missing_phonemes=utt.missing_phonemes,
@@ -234,7 +289,49 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:

            queue_in.task_done()
    except Exception:
-        _LOGGER.exception("process_batch")
+        _LOGGER.exception("phonemize_batch_espeak")
+
+
+def phonemize_batch_text(
+    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
+):
+    try:
+        casing = get_text_casing(args.text_casing)
+        silence_detector = make_silence_detector()
+        alphabet = ALPHABETS[args.language]
+
+        while True:
+            utt_batch = queue_in.get()
+            if utt_batch is None:
+                break
+
+            for utt in utt_batch:
+                try:
+                    _LOGGER.debug(utt)
+                    utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
+                    utt.phoneme_ids = []
+                    for phoneme in utt.phonemes:
+                        if phoneme in alphabet:
+                            utt.phoneme_ids.extend(alphabet[phoneme])
+                        else:
+                            utt.missing_phonemes[phoneme] += 1
+                    if not args.skip_audio:
+                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
+                            utt.audio_path,
+                            args.cache_dir,
+                            silence_detector,
+                            args.sample_rate,
+                        )
+                    queue_out.put(utt)
+                except TimeoutError:
+                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
+                except Exception:
+                    _LOGGER.exception("Failed to process utterance: %s", utt)
+                    queue_out.put(None)
+
+            queue_in.task_done()
+    except Exception:
+        _LOGGER.exception("phonemize_batch_text")


 # -----------------------------------------------------------------------------
@@ -261,7 +358,10 @@ class PathEncoder(json.JSONEncoder):


 def ljspeech_dataset(
-    dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
+    dataset_dir: Path,
+    is_single_speaker: bool,
+    speaker_id: Optional[int] = None,
+    skip_audio: bool = False,
 ) -> Iterable[Utterance]:
    # filename|speaker|text
    # speaker is optional
@@ -298,7 +398,7 @@ def ljspeech_dataset(
                # Try with .wav
                wav_path = wav_dir / f"{filename}.wav"

-            if not wav_path.exists():
+            if (not skip_audio) and (not wav_path.exists()):
                _LOGGER.warning("Missing %s", filename)
                continue

@@ -308,7 +408,10 @@ def ljspeech_dataset(


 def mycroft_dataset(
-    dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
+    dataset_dir: Path,
+    is_single_speaker: bool,
+    speaker_id: Optional[int] = None,
+    skip_audio: bool = False,
 ) -> Iterable[Utterance]:
    speaker_id = 0
    for metadata_path in dataset_dir.glob("**/*-metadata.txt"):
@@ -4,6 +4,7 @@ import csv
 import sys
 from collections import Counter, defaultdict

+
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--speaker-number", type=int)