Merge branch 'rhasspy:master' into master

2026-06-10 13:42:26 +00:00 · 2023-06-10 09:15:39 -05:00
parent 1052b7e38c d94387374f
commit 49cb95f157
53 changed files with 1967 additions and 888 deletions
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+import json
+import sys
+import unicodedata
+from collections import Counter
+
+from .phonemize import DEFAULT_PHONEME_ID_MAP
+
+
+def main() -> None:
+    """Summarize phoneme usage for JSONL utterances read from stdin.
+
+    Each non-blank input line must be a JSON object with a "phonemes" list of
+    single-codepoint strings. A JSON report of the used phonemes, and of those
+    absent from DEFAULT_PHONEME_ID_MAP, is written to stdout.
+    """
+    used_phonemes: "Counter[str]" = Counter()
+    missing_phonemes: "Counter[str]" = Counter()
+
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+
+        for phoneme in json.loads(line)["phonemes"]:
+            used_phonemes[phoneme] += 1
+
+            if phoneme not in DEFAULT_PHONEME_ID_MAP:
+                missing_phonemes[phoneme] += 1
+
+    if missing_phonemes:
+        print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)
+
+    def describe(counter: "Counter[str]") -> dict:
+        """Build the per-phoneme report, most frequent first."""
+        return {
+            phoneme: {
+                "count": count,
+                # Zero-padded escape (e.g. \u0061); hex() would add a stray "0x"
+                "hex": f"\\u{ord(phoneme):04x}",
+                # Unicode character name; "" for codepoints that have none
+                "name": unicodedata.name(phoneme, ""),
+                "category": unicodedata.category(phoneme),
+            }
+            for phoneme, count in counter.most_common()
+        }
+
+    json.dump(
+        {"used": describe(used_phonemes), "missing": describe(missing_phonemes)},
+        sys.stdout,
+    )
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
@@ -2,7 +2,6 @@
 import argparse
 import logging
 from pathlib import Path
-from typing import Optional

 import torch

@@ -41,7 +40,6 @@ def main():
    model_g = model.model_g

    num_symbols = model_g.n_vocab
-    num_speakers = model_g.n_speakers

    # Inference only
    model_g.eval()
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import json
+import re
+import shutil
+import statistics
+import subprocess
+import sys
+import threading
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import asdict, dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+
+from .norm_audio import make_silence_detector, trim_silence
+
+_DIR = Path(__file__).parent
+
+# Punctuation removed before the speaking rate calculation (character class)
+_PUNCTUATION = re.compile(r"[.。,，?¿？؟!！;；:：—-]")
+
+
+class ExcludeReason(str, Enum):  # why an utterance is flagged for exclusion
+    MISSING = "file_missing"  # WAV file does not exist
+    EMPTY = "file_empty"  # WAV file has zero bytes
+    LOW = "rate_low"  # speaking rate below the lower bound
+    HIGH = "rate_high"  # speaking rate above the upper bound
+
+
+@dataclass
+class Utterance:  # one metadata row plus its measured speaking rate
+    id: str
+    text: str
+    duration_sec: float  # callers pass 0.0 for missing or empty audio
+    speaker: str
+    exclude_reason: Optional[ExcludeReason] = None
+    rate: float = 0.0  # characters left after _PUNCTUATION.sub, per second
+
+    def __post_init__(self):
+        if self.duration_sec > 0:
+            # Don't include punctuation in the speaking rate calculation since we
+            # remove silence.
+            text_nopunct = _PUNCTUATION.sub("", self.text)
+            self.rate = len(text_nopunct) / self.duration_sec
+
+
+def main():
+    """Drop utterances whose speaking rate is an outlier for their speaker.
+
+    Reads "|"-delimited metadata rows (id|text or id|speaker|text) from stdin,
+    measures each utterance's speaking rate, and writes the rows whose rate
+    lies within the per-speaker quartile fences to stdout. Missing and empty
+    audio files are always excluded. With --write-json, per-speaker statistics
+    and the excluded utterances are also written to the given path.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--write-json", help="Path to write information about excluded utterances"
+    )
+    parser.add_argument(
+        "--dataset-dir", default=Path.cwd(), help="Path to dataset directory"
+    )
+    parser.add_argument("--scale-lower", type=float, default=2.0)
+    parser.add_argument("--scale-upper", type=float, default=2.0)
+    args = parser.parse_args()
+
+    # Durations are measured by decoding with ffmpeg (see ProcessUtterance)
+    if not shutil.which("ffmpeg"):
+        raise RuntimeError("ffmpeg not found (is ffmpeg installed?)")
+
+    dataset_dir = Path(args.dataset_dir)
+    wav_dir = dataset_dir / "wav"
+    if not wav_dir.is_dir():
+        wav_dir = dataset_dir / "wavs"
+
+    text_and_audio = []
+    for row in csv.reader(sys.stdin, delimiter="|"):
+        if not row:
+            continue  # csv yields [] for blank lines
+
+        filename, text = row[0], row[-1]
+        speaker = row[1] if len(row) > 2 else "default"
+
+        # Try the name as-is and with .wav, relative to the dataset directory
+        # first and then to wav/ or wavs/. The last candidate is kept even if
+        # it does not exist, so the utterance is reported as missing.
+        candidates = (
+            dataset_dir / filename,
+            dataset_dir / f"{filename}.wav",
+            wav_dir / filename,
+            wav_dir / f"{filename}.wav",
+        )
+        wav_path = next((p for p in candidates if p.exists()), candidates[-1])
+        text_and_audio.append((filename, text, wav_path, speaker))
+
+    # speaker -> [utterance], in input order
+    utts_by_speaker = defaultdict(list)
+    process_utterance = ProcessUtterance()
+    with ThreadPoolExecutor() as executor:
+        for utt in executor.map(lambda item: process_utterance(*item), text_and_audio):
+            utts_by_speaker[utt.speaker].append(utt)
+
+    is_multispeaker = len(utts_by_speaker) > 1
+    writer = csv.writer(sys.stdout, delimiter="|")
+
+    speaker_details = {}
+    for speaker, utts in utts_by_speaker.items():
+        # Missing/empty files have no real rate (0.0); keep them out of the
+        # statistics and never write them to the output.
+        usable = [utt for utt in utts if utt.exclude_reason is None]
+        rates = [utt.rate for utt in usable]
+        if len(rates) >= 2:
+            # Exclude rates well outside the 25%/75% quantiles
+            rate_qs = statistics.quantiles(rates, n=4)
+            q1 = rate_qs[0]  # 25%
+            q3 = rate_qs[-1]  # 75%
+            iqr = q3 - q1
+            lower = q1 - (args.scale_lower * iqr)
+            upper = q3 + (args.scale_upper * iqr)
+        else:
+            # statistics.quantiles needs at least two data points; with fewer
+            # there is nothing to compare against, so nothing is an outlier.
+            rate_qs = []
+            lower = min(rates, default=0.0)
+            upper = max(rates, default=0.0)
+        speaker_details[speaker] = {
+            "min": min(rates, default=0.0),
+            "max": max(rates, default=0.0),
+            "quanties": rate_qs,  # (sic) key kept for existing consumers
+            "lower": lower,
+            "upper": upper,
+        }
+
+        for utt in usable:
+            if utt.rate < lower:
+                utt.exclude_reason = ExcludeReason.LOW
+            elif utt.rate > upper:
+                utt.exclude_reason = ExcludeReason.HIGH
+            elif is_multispeaker:
+                writer.writerow((utt.id, utt.speaker, utt.text))
+            else:
+                writer.writerow((utt.id, utt.text))
+
+    if args.write_json:
+        report = {}
+        for speaker, details in speaker_details.items():
+            utts = utts_by_speaker[speaker]
+            excluded = [asdict(utt) for utt in utts if utt.exclude_reason is not None]
+            report[speaker] = {
+                "details": details,
+                "num_utterances": len(utts),
+                "num_excluded": len(excluded),
+                "excluded": excluded,
+            }
+
+        with open(args.write_json, "w", encoding="utf-8") as json_file:
+            json.dump(report, json_file, indent=4, ensure_ascii=False)
+
+
+class ProcessUtterance:
+    """Callable that measures how long the speech in an utterance's audio is.
+
+    One instance is shared between worker threads; each thread lazily creates
+    its own silence detector in thread-local storage.
+    """
+
+    def __init__(self):
+        self.thread_data = threading.local()
+
+    def __call__(
+        self, utt_id: str, text: str, wav_path: Path, speaker: str
+    ) -> Utterance:
+        """Build an Utterance, flagging missing or zero-byte audio files."""
+        if not wav_path.exists():
+            return Utterance(
+                utt_id,
+                text,
+                0.0,
+                speaker,
+                exclude_reason=ExcludeReason.MISSING,
+            )
+
+        if wav_path.stat().st_size == 0:
+            return Utterance(
+                utt_id,
+                text,
+                0.0,
+                speaker,
+                exclude_reason=ExcludeReason.EMPTY,
+            )
+
+        return Utterance(utt_id, text, self.get_duration(wav_path), speaker)
+
+    def get_duration(self, audio_path: Path) -> float:
+        """Return the duration in seconds of the speech in an audio file.
+
+        The file is decoded with ffmpeg to 16 kHz mono 16-bit PCM, peak
+        normalized, and trimmed with trim_silence using this thread's detector.
+
+        Raises:
+            subprocess.CalledProcessError: if ffmpeg exits with an error.
+        """
+        if not hasattr(self.thread_data, "detector"):
+            self.thread_data.detector = make_silence_detector()
+
+        vad_sample_rate = 16000
+        audio_16khz_bytes = subprocess.check_output(
+            [
+                "ffmpeg",
+                "-i",
+                str(audio_path),
+                "-f",
+                "s16le",
+                "-acodec",
+                "pcm_s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(vad_sample_rate),
+                "pipe:",
+            ],
+            stderr=subprocess.DEVNULL,
+        )
+
+        audio_16khz = np.frombuffer(audio_16khz_bytes, dtype=np.int16).astype(
+            np.float32
+        )
+        if audio_16khz.size == 0:
+            # Nothing was decoded
+            return 0.0
+
+        # Peak normalize. The peak is the largest magnitude, which may belong
+        # to a negative sample; all-zero audio is left as is to avoid NaNs.
+        peak = np.max(np.abs(audio_16khz))
+        if peak > 0:
+            audio_16khz /= peak
+
+        # Get speaking duration
+        offset_sec, duration_sec = trim_silence(
+            audio_16khz,
+            self.thread_data.detector,
+            threshold=0.8,
+            samples_per_chunk=480,
+            sample_rate=vad_sample_rate,
+            keep_chunks_before=2,
+            keep_chunks_after=2,
+        )
+
+        if duration_sec is None:
+            # Speech goes to end of audio
+            duration_sec = (len(audio_16khz) / vad_sample_rate) - offset_sec
+
+        return duration_sec
+
+
+if __name__ == "__main__":
+    main()
@@ -1,9 +1,23 @@
+import argparse
+import json
+import sys
 import unicodedata
 from collections import Counter
+from enum import Enum
 from typing import Dict, Iterable, List, Mapping, Optional

 from espeak_phonemizer import Phonemizer

+
+class PhonemeType(str, Enum):  # str mixin: members compare equal to their values
+    ESPEAK = "espeak"
+    """Phonemes come from espeak-ng"""
+
+    TEXT = "text"
+    """Phonemes come from text itself"""
+
+
+MAX_PHONEMES = 256
 DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
    "_": [0],
    "^": [1],
@@ -135,14 +149,115 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
    "χ": [127],
    "ᵻ": [128],
    "ⱱ": [129],
+    "0": [130],  # tones
+    "1": [131],
+    "2": [132],
+    "3": [133],
+    "4": [134],
+    "5": [135],
+    "6": [136],
+    "7": [137],
+    "8": [138],
+    "9": [139],
+    "\u0327": [140],  # combining cedilla
+    "\u0303": [141],  # combining tilde
+    "\u032a": [142],  # combining bridge below
+    "\u032f": [143],  # combining inverted breve below
+    "\u0329": [144],  # combining vertical line below
+    "ʰ": [145],
+    "ˤ": [146],
+    "ε": [147],
+    "↓": [148],
+    "#": [149],  # Icelandic
+    '"': [150],  # Russian
+    "↑": [151],
+    "\u033a": [152],  # Basque
+    "\u033b": [153],
+}
+
+PHONEME_MAPS = {  # language -> {phoneme: replacement phonemes} for phonemize()
+    # Brazilian Portuguese
+    "pt-br": {"c": ["k"]}
+}
+
+ALPHABETS = {  # language -> {character: [id]}, used when phoneme type is "text"
+    # Ukrainian
+    "uk": {
+        "_": [0],
+        "^": [1],
+        "$": [2],
+        " ": [3],
+        "!": [4],
+        "'": [5],
+        ",": [6],
+        "-": [7],
+        ".": [8],
+        ":": [9],
+        ";": [10],
+        "?": [11],
+        "а": [12],
+        "б": [13],
+        "в": [14],
+        "г": [15],
+        "ґ": [16],
+        "д": [17],
+        "е": [18],
+        "є": [19],
+        "ж": [20],
+        "з": [21],
+        "и": [22],
+        "і": [23],
+        "ї": [24],
+        "й": [25],
+        "к": [26],
+        "л": [27],
+        "м": [28],
+        "н": [29],
+        "о": [30],
+        "п": [31],
+        "р": [32],
+        "с": [33],
+        "т": [34],
+        "у": [35],
+        "ф": [36],
+        "х": [37],
+        "ц": [38],
+        "ч": [39],
+        "ш": [40],
+        "щ": [41],
+        "ь": [42],
+        "ю": [43],
+        "я": [44],
+        "\u0301": [45],  # combining acute accent
+        "\u0306": [46],  # combining breve
+        "\u0308": [47],  # combining diaeresis
+        "—": [48],  # em dash
+    }
 }


-def phonemize(text: str, phonemizer: Phonemizer) -> List[str]:
+def phonemize(
+    text: str,
+    phonemizer: Phonemizer,
+    phoneme_map: Optional[Dict[str, List[str]]] = None,
+) -> List[str]:
+    """Phonemize text into NFD codepoints, applying an optional phoneme map."""
     phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)
 
     # Phonemes are decomposed into unicode codepoints
-    return list(unicodedata.normalize("NFD", phonemes_str))
+    codepoints = list(unicodedata.normalize("NFD", phonemes_str))
+    if not phoneme_map:
+        return codepoints
+
+    # Phonemes can be mapped to lists of other phonemes. Only a non-empty
+    # mapping replaces its phoneme; a phoneme with no mapping, or an empty
+    # one, passes through unchanged.
+    result: List[str] = []
+    for codepoint in codepoints:
+        substitutes = phoneme_map.get(codepoint)
+        result.extend(substitutes if substitutes else [codepoint])
+
+    return result


 def phonemes_to_ids(
@@ -179,3 +294,79 @@ def phonemes_to_ids(
        phoneme_ids.extend(phoneme_id_map[eos])

    return phoneme_ids
+
+
+# -----------------------------------------------------------------------------
+
+
+def main() -> None:
+    """Phonemize lines of text from stdin and print one JSON object per line.
+
+    Each object has the input "text", its "phonemes" and their "phoneme_ids".
+    Phonemes missing from the id map are counted and reported on stderr.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("language")
+    parser.add_argument(
+        "--phoneme-type",
+        choices=list(PhonemeType),
+        default=PhonemeType.ESPEAK,
+        help="Type of phonemes to use (default: espeak)",
+    )
+    parser.add_argument(
+        "--text-casing",
+        choices=("ignore", "lower", "upper", "casefold"),
+        default="ignore",
+        help="Casing applied to utterance text",
+    )
+    args = parser.parse_args()
+
+    # "casefold" is an accepted choice and must not be treated as "ignore"
+    casing = {
+        "lower": str.lower,
+        "upper": str.upper,
+        "casefold": str.casefold,
+    }.get(args.text_casing, lambda s: s)
+
+    phonemizer: Optional[Phonemizer] = None
+    if args.phoneme_type == PhonemeType.TEXT:
+        # Use text directly
+        if args.language not in ALPHABETS:
+            parser.error(f"No text alphabet for language: {args.language}")
+        phoneme_id_map = ALPHABETS[args.language]
+    else:
+        # Use eSpeak
+        phonemizer = Phonemizer(args.language)
+        phoneme_id_map = DEFAULT_PHONEME_ID_MAP
+
+    phoneme_map = PHONEME_MAPS.get(args.language)
+    missing_phonemes: "Counter[str]" = Counter()
+
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+
+        if phonemizer is None:
+            phonemes = list(unicodedata.normalize("NFD", casing(line)))
+        else:
+            phonemes = phonemize(casing(line), phonemizer, phoneme_map=phoneme_map)
+
+        phoneme_ids = phonemes_to_ids(
+            phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes
+        )
+        json.dump(
+            {"text": line, "phonemes": phonemes, "phoneme_ids": phoneme_ids},
+            sys.stdout,
+            ensure_ascii=False,
+        )
+        print("")
+
+    if missing_phonemes:
+        print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr)
+        for phoneme, count in missing_phonemes.most_common():
+            print(phoneme, count, file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
@@ -6,9 +6,9 @@ import itertools
 import json
 import logging
 import os
+import unicodedata
 from collections import Counter
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from multiprocessing import JoinableQueue, Process, Queue
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional
@@ -16,7 +16,15 @@ from typing import Dict, Iterable, List, Optional
 from espeak_phonemizer import Phonemizer

 from .norm_audio import cache_norm_audio, make_silence_detector
-from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize
+from .phonemize import (
+    ALPHABETS,
+    DEFAULT_PHONEME_ID_MAP,
+    MAX_PHONEMES,
+    PHONEME_MAPS,
+    PhonemeType,
+    phonemes_to_ids,
+    phonemize,
+)

 _LOGGER = logging.getLogger("preprocess")

@@ -49,6 +57,23 @@ def main() -> None:
    parser.add_argument(
        "--speaker-id", type=int, help="Add speaker id to single speaker dataset"
    )
+    #
+    parser.add_argument(
+        "--phoneme-type",
+        choices=list(PhonemeType),
+        default=PhonemeType.ESPEAK,
+        help="Type of phonemes to use (default: espeak)",
+    )
+    parser.add_argument(
+        "--text-casing",
+        choices=("ignore", "lower", "upper", "casefold"),
+        default="ignore",
+        help="Casing applied to utterance text",
+    )
+    #
+    parser.add_argument(
+        "--skip-audio", action="store_true", help="Don't preprocess audio"
+    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
@@ -84,9 +109,9 @@ def main() -> None:

    # Count speakers
    _LOGGER.debug("Counting number of speakers/utterances in the dataset")
-    speaker_counts: Counter[str] = Counter()
+    speaker_counts: "Counter[str]" = Counter()
    num_utterances = 0
-    for utt in make_dataset(args.input_dir, args.single_speaker, args.speaker_id):
+    for utt in make_dataset(args):
        speaker = utt.speaker or ""
        speaker_counts[speaker] += 1
        num_utterances += 1
@@ -118,11 +143,12 @@ def main() -> None:
                    "voice": args.language,
                },
                "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
+                "phoneme_type": str(args.phoneme_type),
                "phoneme_map": {},
-                "phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
-                "num_symbols": len(
-                    set(itertools.chain.from_iterable(DEFAULT_PHONEME_ID_MAP.values()))
-                ),
+                "phoneme_id_map": ALPHABETS[args.language]
+                if args.phoneme_type == PhonemeType.TEXT
+                else DEFAULT_PHONEME_ID_MAP,
+                "num_symbols": MAX_PHONEMES,
                "num_speakers": len(speaker_counts),
                "speaker_id_map": speaker_ids,
            },
@@ -142,8 +168,13 @@ def main() -> None:
    queue_out: "Queue[Optional[Utterance]]" = Queue()

    # Start workers
+    if args.phoneme_type == PhonemeType.TEXT:
+        target = phonemize_batch_text
+    else:
+        target = phonemize_batch_espeak
+
    processes = [
-        Process(target=process_batch, args=(args, queue_in, queue_out))
+        Process(target=target, args=(args, queue_in, queue_out))
        for _ in range(args.max_workers)
    ]
    for proc in processes:
@@ -154,27 +185,39 @@ def main() -> None:
    )
    with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
        for utt_batch in batched(
-            make_dataset(args.input_dir, args.single_speaker, args.speaker_id),
+            make_dataset(args),
            batch_size,
        ):
            queue_in.put(utt_batch)

        _LOGGER.debug("Waiting for jobs to finish")
+        missing_phonemes: "Counter[str]" = Counter()
        for _ in range(num_utterances):
            utt = queue_out.get()
            if utt is not None:
                if utt.speaker is not None:
                    utt.speaker_id = speaker_ids[utt.speaker]

+                utt_dict = dataclasses.asdict(utt)
+                utt_dict.pop("missing_phonemes")
+
                # JSONL
                json.dump(
-                    dataclasses.asdict(utt),
+                    utt_dict,
                    dataset_file,
                    ensure_ascii=False,
                    cls=PathEncoder,
                )
                print("", file=dataset_file)

+                missing_phonemes.update(utt.missing_phonemes)
+
+        if missing_phonemes:
+            for phoneme, count in missing_phonemes.most_common():
+                _LOGGER.warning("Missing %s (%s)", phoneme, count)
+
+            _LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))
+
    # Signal workers to stop
    for proc in processes:
        queue_in.put(None)
@@ -187,10 +230,27 @@ def main() -> None:
 # -----------------------------------------------------------------------------


-def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue):
+def get_text_casing(casing: str):
+    """Return the str -> str transform named by casing.
+
+    Recognizes "lower", "upper" and "casefold"; any other name yields the
+    identity function.
+    """
+    if casing in ("lower", "upper", "casefold"):
+        # The accepted names match the str method names exactly
+        return getattr(str, casing)
+
+    return lambda s: s
+
+
+def phonemize_batch_espeak(
+    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
+):
    try:
+        casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()
        phonemizer = Phonemizer(default_voice=args.language)
+        phoneme_map = PHONEME_MAPS.get(args.language)

        while True:
            utt_batch = queue_in.get()
@@ -200,14 +260,20 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
            for utt in utt_batch:
                try:
                    _LOGGER.debug(utt)
-                    utt.phonemes = phonemize(utt.text, phonemizer)
-                    utt.phoneme_ids = phonemes_to_ids(utt.phonemes)
-                    utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
-                        utt.audio_path,
-                        args.cache_dir,
-                        silence_detector,
-                        args.sample_rate,
+                    utt.phonemes = phonemize(
+                        casing(utt.text), phonemizer, phoneme_map=phoneme_map
                    )
+                    utt.phoneme_ids = phonemes_to_ids(
+                        utt.phonemes,
+                        missing_phonemes=utt.missing_phonemes,
+                    )
+                    if not args.skip_audio:
+                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
+                            utt.audio_path,
+                            args.cache_dir,
+                            silence_detector,
+                            args.sample_rate,
+                        )
                    queue_out.put(utt)
                except TimeoutError:
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
@@ -217,7 +283,48 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:

            queue_in.task_done()
    except Exception:
-        _LOGGER.exception("process_batch")
+        _LOGGER.exception("phonemize_batch_espeak")
+
+
+def phonemize_batch_text(
+    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
+):
+    """Worker: map utterance text to ids with the language's text alphabet."""
+    try:
+        casing = get_text_casing(args.text_casing)
+        # The silence detector is only needed when audio is preprocessed
+        silence_detector = None if args.skip_audio else make_silence_detector()
+        alphabet = ALPHABETS[args.language]
+
+        for utt_batch in iter(queue_in.get, None):
+            for utt in utt_batch:
+                try:
+                    _LOGGER.debug(utt)
+                    utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
+                    utt.phoneme_ids = phonemes_to_ids(
+                        utt.phonemes,
+                        phoneme_id_map=alphabet,
+                        missing_phonemes=utt.missing_phonemes,
+                    )
+                    if not args.skip_audio:
+                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
+                            utt.audio_path,
+                            args.cache_dir,
+                            silence_detector,
+                            args.sample_rate,
+                        )
+                    queue_out.put(utt)
+                except TimeoutError:
+                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
+                    # main() expects one result per utterance; silence would hang it
+                    queue_out.put(None)
+                except Exception:
+                    _LOGGER.exception("Failed to process utterance: %s", utt)
+                    queue_out.put(None)
+
+            queue_in.task_done()
+    except Exception:
+        _LOGGER.exception("phonemize_batch_text")


 # -----------------------------------------------------------------------------
@@ -233,6 +340,7 @@ class Utterance:
    phoneme_ids: Optional[List[int]] = None
    audio_norm_path: Optional[Path] = None
    audio_spec_path: Optional[Path] = None
+    missing_phonemes: "Counter[str]" = field(default_factory=Counter)


 class PathEncoder(json.JSONEncoder):
@@ -242,9 +350,12 @@ class PathEncoder(json.JSONEncoder):
        return super().default(o)


-def ljspeech_dataset(
-    dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
-) -> Iterable[Utterance]:
+def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
+    dataset_dir = args.input_dir
+    is_single_speaker = args.single_speaker
+    speaker_id = args.speaker_id
+    skip_audio = args.skip_audio
+
    # filename|speaker|text
    # speaker is optional
    metadata_path = dataset_dir / "metadata.csv"
@@ -257,7 +368,7 @@ def ljspeech_dataset(
    with open(metadata_path, "r", encoding="utf-8") as csv_file:
        reader = csv.reader(csv_file, delimiter="|")
        for row in reader:
-            assert len(row) >= 2, "Not enough colums"
+            assert len(row) >= 2, "Not enough columns"

            speaker: Optional[str] = None
            if is_single_speaker or (len(row) == 2):
@@ -280,18 +391,25 @@ def ljspeech_dataset(
                # Try with .wav
                wav_path = wav_dir / f"{filename}.wav"

-            if not wav_path.exists():
-                _LOGGER.warning("Missing %s", filename)
-                continue
+            if not skip_audio:
+                if not wav_path.exists():
+                    _LOGGER.warning("Missing %s", filename)
+                    continue
+
+                if wav_path.stat().st_size == 0:
+                    _LOGGER.warning("Empty file: %s", wav_path)
+                    continue

            yield Utterance(
                text=text, audio_path=wav_path, speaker=speaker, speaker_id=speaker_id
            )


-def mycroft_dataset(
-    dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
-) -> Iterable[Utterance]:
+def mycroft_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
+    dataset_dir = args.input_dir
+    is_single_speaker = args.single_speaker
+    skip_audio = args.skip_audio
+
    speaker_id = 0
    for metadata_path in dataset_dir.glob("**/*-metadata.txt"):
        speaker = metadata_path.parent.name if not is_single_speaker else None
@@ -301,15 +419,15 @@ def mycroft_dataset(
            for row in reader:
                filename, text = row[0], row[1]
                wav_path = metadata_path.parent / filename
-                yield Utterance(
-                    text=text,
-                    audio_path=wav_path,
-                    speaker=speaker,
-                    speaker_id=speaker_id if not is_single_speaker else None,
-                )
+                if skip_audio or (wav_path.exists() and (wav_path.stat().st_size > 0)):
+                    yield Utterance(
+                        text=text,
+                        audio_path=wav_path,
+                        speaker=speaker,
+                        speaker_id=speaker_id if not is_single_speaker else None,
+                    )
        speaker_id += 1

-
 # -----------------------------------------------------------------------------


@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import sys
+from collections import Counter, defaultdict
+
+
+def main():
+    """Write one speaker's rows (as id|text) from id|speaker|text metadata.
+
+    The speaker is chosen by name, or by 0-based rank from most to fewest
+    utterances (the chosen speaker is then printed to stderr).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--speaker-number", type=int)
+    parser.add_argument("--speaker-name")
+    args = parser.parse_args()
+
+    if (args.speaker_number is None) and (args.speaker_name is None):
+        parser.error("one of --speaker-number or --speaker-name is required")
+
+    rows = filter(None, csv.reader(sys.stdin, delimiter="|"))  # skip blank lines
+    writer = csv.writer(sys.stdout, delimiter="|")
+
+    if args.speaker_name is not None:
+        writer.writerows(
+            (row[0], row[-1]) for row in rows if row[1] == args.speaker_name
+        )
+        return
+
+    utterances = defaultdict(list)  # speaker -> [(audio, text)]
+    for row in rows:
+        utterances[row[1]].append((row[0], row[-1]))
+
+    ranked = Counter({s: len(u) for s, u in utterances.items()}).most_common()
+    if 0 <= args.speaker_number < len(ranked):
+        speaker_id, _count = ranked[args.speaker_number]
+        writer.writerows(utterances[speaker_id])
+        print(speaker_id, file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
@@ -8,7 +8,8 @@ docker run \
  --user "$(id -u):$(id -g)" \
  --ipc=host \
  -v "${HOME}:${HOME}" \
+  -v /media/cache:/media/cache:ro \
  -v /etc/hostname:/etc/hostname:ro \
  -v /etc/localtime:/etc/localtime:ro \
-  piper-train \
+  larynx2-train \
  "$@"