Add filter utterances

2026-04-21 07:14:49 +00:00 · 2023-05-11 20:48:58 -05:00
parent 10b136cdf8
commit 5a64768924
3 changed files with 267 additions and 110 deletions
--- a/src/python/piper_train/preprocess.py
+++ b/src/python/piper_train/preprocess.py
@@ -11,7 +11,7 @@ from collections import Counter
 from dataclasses import dataclass, field
 from multiprocessing import JoinableQueue, Process, Queue
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Tuple

 import librosa
 from espeak_phonemizer import Phonemizer
@@ -71,13 +71,6 @@ def main() -> None:
        help="Casing applied to utterance text",
    )
    #
-    parser.add_argument(
-        "--speaking-rate-min", type=float, help="Minimum speaking rate (chars/sec)"
-    )
-    parser.add_argument(
-        "--speaking-rate-max", type=float, help="Maximum speaking rate (chars/sec)"
-    )
-    #
    parser.add_argument(
        "--skip-audio", action="store_true", help="Don't preprocess audio"
    )
@@ -355,32 +348,6 @@ class PathEncoder(json.JSONEncoder):
        return super().default(o)


-def is_good_speaking_rate(
-    text: str,
-    wav_path: Path,
-    args: argparse.Namespace,
-) -> bool:
-    min_rate: Optional[float] = args.speaking_rate_min
-    max_rate: Optional[float] = args.speaking_rate_max
-
-    if (min_rate is None) and (max_rate is None):
-        return True
-
-    if len(text) == 0:
-        return False
-
-    duration = librosa.get_duration(path=wav_path)
-    rate = len(text) / duration
-
-    if (min_rate is not None) and (rate < min_rate):
-        return False
-
-    if (max_rate is not None) and (rate > max_rate):
-        return False
-
-    return True
-
-
 def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
    dataset_dir = args.input_dir
    is_single_speaker = args.single_speaker
@@ -431,10 +398,6 @@ def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
                    _LOGGER.warning("Empty file: %s", wav_path)
                    continue

-                if not is_good_speaking_rate(text, wav_path, args):
-                    _LOGGER.warning("Bad speaking rate: %s", wav_path)
-                    continue
-
            yield Utterance(
                text=text, audio_path=wav_path, speaker=speaker, speaker_id=speaker_id
            )