Add speaking rate check

2026-06-03 10:27:01 +00:00 · 2023-06-08 15:37:49 -05:00
parent 2ac55b57f5
commit 6577951061
1 changed files with 53 additions and 14 deletions
@@ -13,6 +13,7 @@ from multiprocessing import JoinableQueue, Process, Queue
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional

+import librosa
 from espeak_phonemizer import Phonemizer

 from .norm_audio import cache_norm_audio, make_silence_detector
@@ -70,6 +71,13 @@ def main() -> None:
        help="Casing applied to utterance text",
    )
    #
+    parser.add_argument(
+        "--speaking-rate-min", type=float, help="Minimum speaking rate (chars/sec)"
+    )
+    parser.add_argument(
+        "--speaking-rate-max", type=float, help="Maximum speaking rate (chars/sec)"
+    )
+    #
    parser.add_argument(
        "--skip-audio", action="store_true", help="Don't preprocess audio"
    )
@@ -357,6 +365,32 @@ class PathEncoder(json.JSONEncoder):
        return super().default(o)


+def is_good_speaking_rate(
+    text: str,
+    wav_path: Path,
+    args: argparse.Namespace,
+) -> bool:
+    """Return True if the utterance's speaking rate is within the configured bounds.
+
+    The rate is measured as characters of ``text`` per second of audio in
+    ``wav_path``. Bounds come from ``args.speaking_rate_min`` and
+    ``args.speaking_rate_max``; a bound that is ``None`` is not enforced, and
+    when both are ``None`` every utterance is accepted without reading the
+    audio.
+
+    Utterances with empty text or with audio of zero duration are rejected,
+    since no meaningful rate can be computed for them.
+    """
+    min_rate: Optional[float] = args.speaking_rate_min
+    max_rate: Optional[float] = args.speaking_rate_max
+
+    if (min_rate is None) and (max_rate is None):
+        # Check disabled: return before the audio file is ever opened.
+        return True
+
+    if not text:
+        return False
+
+    duration = librosa.get_duration(path=wav_path)
+    if duration <= 0:
+        # A header-only WAV has a non-zero file size but no samples; dividing
+        # by its duration would raise ZeroDivisionError.
+        return False
+
+    rate = len(text) / duration
+
+    if (min_rate is not None) and (rate < min_rate):
+        return False
+
+    if (max_rate is not None) and (rate > max_rate):
+        return False
+
+    return True
+
+
 def ljspeech_dataset(
    dataset_dir: Path,
    is_single_speaker: bool,
@@ -375,7 +409,7 @@ def ljspeech_dataset(
    with open(metadata_path, "r", encoding="utf-8") as csv_file:
        reader = csv.reader(csv_file, delimiter="|")
        for row in reader:
-            assert len(row) >= 2, "Not enough colums"
+            assert len(row) >= 2, "Not enough columns"

            speaker: Optional[str] = None
            if is_single_speaker or (len(row) == 2):
@@ -398,14 +432,18 @@ def ljspeech_dataset(
                # Try with .wav
                wav_path = wav_dir / f"{filename}.wav"

-            wav_exists = wav_exists.exists()
-            if (not skip_audio) and wav_exists and (wav_path.stat().st_size == 0):
-                _LOGGER.warning("Empty file: %s", wav_path)
-                continue
+            if not skip_audio:
+                if not wav_path.exists():
+                    _LOGGER.warning("Missing %s", filename)
+                    continue

-            if (not skip_audio) and (not wav_exists):
-                _LOGGER.warning("Missing %s", filename)
-                continue
+                if wav_path.stat().st_size == 0:
+                    _LOGGER.warning("Empty file: %s", wav_path)
+                    continue
+
+                if not is_good_speaking_rate(text, wav_path, args):
+                    _LOGGER.warning("Bad speaking rate: %s", wav_path)
+                    continue

            yield Utterance(
                text=text, audio_path=wav_path, speaker=speaker, speaker_id=speaker_id
@@ -427,12 +465,13 @@ def mycroft_dataset(
            for row in reader:
                filename, text = row[0], row[1]
                wav_path = metadata_path.parent / filename
-                yield Utterance(
-                    text=text,
-                    audio_path=wav_path,
-                    speaker=speaker,
-                    speaker_id=speaker_id if not is_single_speaker else None,
-                )
+                if skip_audio or (wav_path.exists() and (wav_path.stat().st_size > 0)):
+                    yield Utterance(
+                        text=text,
+                        audio_path=wav_path,
+                        speaker=speaker,
+                        speaker_id=speaker_id if not is_single_speaker else None,
+                    )
        speaker_id += 1