mirror of
https://github.com/pstrueb/piper.git
synced 2026-04-19 14:54:50 +00:00
Add filter utterances
This commit is contained in:
committed by
Michael Hansen
parent
10b136cdf8
commit
5a64768924
266
src/python/piper_train/filter_utterances.py
Normal file
266
src/python/piper_train/filter_utterances.py
Normal file
@@ -0,0 +1,266 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import statistics
|
||||
import shutil
|
||||
import subprocess
|
||||
import threading
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import asdict, dataclass
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .norm_audio import make_silence_detector, trim_silence, SileroVoiceActivityDetector
|
||||
|
||||
_DIR = Path(__file__).parent
|
||||
|
||||
# Removed from the speaking rate calculation
|
||||
_PUNCTUATION = re.compile(".。,,?¿?؟!!;;::-—")
|
||||
|
||||
|
||||
class ExcludeReason(str, Enum):
    """Why an utterance was excluded from the filtered output."""

    MISSING = "file_missing"  # audio file could not be found
    EMPTY = "file_empty"  # audio file exists but is zero bytes
    LOW = "rate_low"  # speaking rate below the per-speaker lower bound
    HIGH = "rate_high"  # speaking rate above the per-speaker upper bound
|
||||
|
||||
|
||||
@dataclass
class Utterance:
    """One dataset utterance plus its computed speaking rate (chars/sec)."""

    id: str
    text: str
    duration_sec: float
    speaker: str
    exclude_reason: Optional[ExcludeReason] = None
    rate: float = 0.0

    def __post_init__(self):
        # Rate is only defined for a positive duration.
        if self.duration_sec <= 0:
            return
        # Don't include punctuation in the speaking rate calculation,
        # since silence is removed from the audio.
        stripped = _PUNCTUATION.sub("", self.text)
        self.rate = len(stripped) / self.duration_sec
|
||||
|
||||
|
||||
def main():
    """Filter utterances of a piper dataset by speaking rate.

    Reads metadata rows (``id|[speaker|]text``) from stdin, measures each
    utterance's speech duration with ffmpeg + VAD, then writes the kept
    rows back to stdout.  Per speaker, utterances whose rate falls outside
    IQR-based bounds are dropped; ``--write-json`` records the exclusions.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--write-json", help="Path to write information about excluded utterances"
    )
    parser.add_argument(
        "--dataset-dir", default=Path.cwd(), help="Path to dataset directory"
    )
    # IQR multipliers for the lower/upper speaking-rate cutoffs
    parser.add_argument("--scale-lower", type=float, default=2.0)
    parser.add_argument("--scale-upper", type=float, default=2.0)
    args = parser.parse_args()

    # get_duration() shells out to ffmpeg, so fail fast if it's missing.
    # (The original checked for ffprobe, which is never invoked.)
    if not shutil.which("ffmpeg"):
        raise RuntimeError("ffmpeg not found (is ffmpeg installed?)")

    dataset_dir = Path(args.dataset_dir)
    wav_dir = dataset_dir / "wav"
    if not wav_dir.is_dir():
        wav_dir = dataset_dir / "wavs"

    reader = csv.reader(sys.stdin, delimiter="|")

    text_and_audio = []
    for row in reader:
        filename, text = row[0], row[-1]
        speaker = row[1] if len(row) > 2 else "default"

        # Try file name relative to metadata
        wav_path = dataset_dir / filename

        if not wav_path.exists():
            # Try with .wav
            wav_path = dataset_dir / f"{filename}.wav"

        if not wav_path.exists():
            # Try wav/ or wavs/
            wav_path = wav_dir / filename

        if not wav_path.exists():
            # Try with .wav
            wav_path = wav_dir / f"{filename}.wav"

        text_and_audio.append((filename, text, wav_path, speaker))

    # speaker -> [utterance]
    utts_by_speaker = defaultdict(list)
    process_utterance = ProcessUtterance()
    with ThreadPoolExecutor() as executor:
        for utt in executor.map(lambda ta: process_utterance(*ta), text_and_audio):
            utts_by_speaker[utt.speaker].append(utt)

    is_multispeaker = len(utts_by_speaker) > 1
    writer = csv.writer(sys.stdout, delimiter="|")

    def write_utt(utt: Utterance) -> None:
        # Single-speaker datasets omit the speaker column.
        if is_multispeaker:
            writer.writerow((utt.id, utt.text, utt.speaker))
        else:
            writer.writerow((utt.id, utt.text))

    speaker_details = {}
    for speaker, utts in utts_by_speaker.items():
        rates = [utt.rate for utt in utts]
        if len(rates) < 2:
            # statistics.quantiles needs at least two data points; with so
            # little data no outlier bound is meaningful, so keep everything.
            for utt in utts:
                write_utt(utt)
            continue

        # Exclude rates well outside the 25%/75% quantiles
        rate_qs = statistics.quantiles(rates, n=4)
        q1 = rate_qs[0]  # 25%
        q3 = rate_qs[-1]  # 75%
        iqr = q3 - q1
        lower = q1 - (args.scale_lower * iqr)
        upper = q3 + (args.scale_upper * iqr)
        speaker_details[speaker] = {
            "min": min(rates),
            "max": max(rates),
            "quantiles": rate_qs,  # key typo "quanties" fixed
            "lower": lower,
            "upper": upper,
        }

        for utt in utts:
            if utt.rate < lower:
                utt.exclude_reason = ExcludeReason.LOW
            elif utt.rate > upper:
                utt.exclude_reason = ExcludeReason.HIGH
            else:
                write_utt(utt)

    if args.write_json:
        speaker_excluded = {
            speaker: [
                asdict(utt)
                for utt in utts_by_speaker[speaker]
                if utt.exclude_reason is not None
            ]
            for speaker in speaker_details
        }

        with open(args.write_json, "w") as json_file:
            json.dump(
                {
                    speaker: {
                        "details": speaker_details[speaker],
                        "num_utterances": len(utts_by_speaker[speaker]),
                        "num_excluded": len(speaker_excluded[speaker]),
                        "excluded": speaker_excluded[speaker],
                    }
                    for speaker in speaker_details
                },
                json_file,
                indent=4,
            )
|
||||
|
||||
|
||||
class ProcessUtterance:
    """Callable mapping (utt_id, text, wav_path, speaker) -> Utterance.

    Intended for ThreadPoolExecutor.map: a silence detector is created
    lazily per worker thread via threading.local().
    """

    def __init__(self):
        self.thread_data = threading.local()

    def __call__(
        self, utt_id: str, text: str, wav_path: Path, speaker: str
    ) -> Utterance:
        # Missing or empty files get duration 0 and an exclude reason.
        if not wav_path.exists():
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.MISSING,
            )

        if wav_path.stat().st_size == 0:
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.EMPTY,
            )

        return Utterance(utt_id, text, self.get_duration(wav_path), speaker)

    def get_duration(self, audio_path: Path) -> float:
        """Return the speech duration of audio_path in seconds.

        Decodes to 16 kHz mono s16le PCM with ffmpeg, then trims
        leading/trailing silence with the Silero VAD so only actual
        speech is counted.
        """
        if not hasattr(self.thread_data, "detector"):
            # One detector per worker thread
            self.thread_data.detector = make_silence_detector()

        vad_sample_rate = 16000
        audio_16khz_bytes = subprocess.check_output(
            [
                "ffmpeg",
                "-i",
                str(audio_path),
                "-f",
                "s16le",
                "-acodec",
                "pcm_s16le",
                "-ac",
                "1",
                "-ar",
                str(vad_sample_rate),
                "pipe:",
            ],
            stderr=subprocess.DEVNULL,
        )

        # Normalize to roughly [-1, 1].
        audio_16khz = np.frombuffer(audio_16khz_bytes, dtype=np.int16).astype(
            np.float32
        )
        # Bug fix: the original used np.abs(np.max(x)), which is wrong when
        # the signal's peak sample is negative and divides by zero for
        # silent/empty audio. Use the true peak magnitude and guard zero.
        peak = np.max(np.abs(audio_16khz)) if len(audio_16khz) > 0 else 0.0
        if peak > 0:
            audio_16khz /= peak

        # Get speaking duration
        offset_sec, duration_sec = trim_silence(
            audio_16khz,
            self.thread_data.detector,
            threshold=0.5,
            samples_per_chunk=480,
            sample_rate=vad_sample_rate,
            keep_chunks_before=2,
            keep_chunks_after=2,
        )

        if duration_sec is None:
            # Speech extends to the end of the audio
            if len(audio_16khz) > 0:
                duration_sec = (len(audio_16khz) / vad_sample_rate) - offset_sec
            else:
                duration_sec = 0.0

        return duration_sec
|
||||
|
||||
|
||||
def make_silence_detector() -> SileroVoiceActivityDetector:
    """Create a Silero VAD using the model bundled under norm_audio/models.

    NOTE(review): this shadows the ``make_silence_detector`` imported from
    ``.norm_audio`` at the top of the file — confirm which one is intended
    and remove the other.
    """
    silence_model = _DIR / "norm_audio" / "models" / "silero_vad.onnx"
    return SileroVoiceActivityDetector(silence_model)
|
||||
|
||||
|
||||
# Allow running directly as a script (e.g. python -m piper_train.filter_utterances)
if __name__ == "__main__":
    main()
|
||||
@@ -1,72 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
|
||||
import librosa
|
||||
|
||||
|
||||
def main():
    """Report audio duration and rate for each utterance of a dataset.

    Reads metadata rows (``filename|...|text``) from stdin.  With --csv,
    writes ``filename|text|duration|rate`` rows to stdout; otherwise dumps
    JSON summary statistics.  Note: "rate" here is seconds per character
    (duration / len(text)), the inverse of a chars/sec speaking rate.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_dir", default=Path.cwd())
    parser.add_argument("--csv", action="store_true")
    args = parser.parse_args()

    dataset_dir = Path(args.dataset_dir)
    wav_dir = dataset_dir / "wav"
    if not wav_dir.is_dir():
        wav_dir = dataset_dir / "wavs"

    reader = csv.reader(sys.stdin, delimiter="|")
    writer = csv.writer(sys.stdout, delimiter="|")
    rates = []
    for row in reader:
        filename, text = row[0], row[-1]

        # Try file name relative to metadata
        wav_path = dataset_dir / filename

        if not wav_path.exists():
            # Try with .wav
            wav_path = dataset_dir / f"{filename}.wav"

        if not wav_path.exists():
            # Try wav/ or wavs/
            wav_path = wav_dir / filename

        if not wav_path.exists():
            # Try with .wav
            wav_path = wav_dir / f"{filename}.wav"

        if not wav_path.exists():
            print("Missing", wav_path, file=sys.stderr)
            continue

        if wav_path.stat().st_size == 0:
            print("Empty", wav_path, file=sys.stderr)
            continue

        if not text:
            # Guard the division below (original raised ZeroDivisionError)
            print("No text", wav_path, file=sys.stderr)
            continue

        duration = librosa.get_duration(path=wav_path)
        rate = duration / len(text)

        if args.csv:
            writer.writerow((filename, text, duration, rate))
        else:
            rates.append(rate)

    if not args.csv:
        if not rates:
            # statistics.mean/median raise StatisticsError on empty input
            json.dump({"rates": [], "mean": None, "median": None}, sys.stdout)
            return
        json.dump(
            {
                "rates": rates,
                "mean": statistics.mean(rates),
                "median": statistics.median(rates),
            },
            sys.stdout,
        )
|
||||
|
||||
|
||||
# Allow running directly as a script
if __name__ == "__main__":
    main()
|
||||
@@ -11,7 +11,7 @@ from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from multiprocessing import JoinableQueue, Process, Queue
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
import librosa
|
||||
from espeak_phonemizer import Phonemizer
|
||||
@@ -71,13 +71,6 @@ def main() -> None:
|
||||
help="Casing applied to utterance text",
|
||||
)
|
||||
#
|
||||
parser.add_argument(
|
||||
"--speaking-rate-min", type=float, help="Minimum speaking rate (chars/sec)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--speaking-rate-max", type=float, help="Maximum speaking rate (chars/sec)"
|
||||
)
|
||||
#
|
||||
parser.add_argument(
|
||||
"--skip-audio", action="store_true", help="Don't preprocess audio"
|
||||
)
|
||||
@@ -355,32 +348,6 @@ class PathEncoder(json.JSONEncoder):
|
||||
return super().default(o)
|
||||
|
||||
|
||||
def is_good_speaking_rate(
    text: str,
    wav_path: Path,
    args: argparse.Namespace,
) -> bool:
    """Return True if the utterance's speaking rate is acceptable.

    Rate is characters per second (len(text) / audio duration), checked
    against the optional --speaking-rate-min / --speaking-rate-max bounds.
    With neither bound set, everything passes without touching the audio.
    """
    min_rate: Optional[float] = args.speaking_rate_min
    max_rate: Optional[float] = args.speaking_rate_max

    if (min_rate is None) and (max_rate is None):
        return True

    if len(text) == 0:
        return False

    duration = librosa.get_duration(path=wav_path)
    if duration <= 0:
        # Zero-length audio: no rate can be computed
        # (original raised ZeroDivisionError here)
        return False

    rate = len(text) / duration

    if (min_rate is not None) and (rate < min_rate):
        return False

    if (max_rate is not None) and (rate > max_rate):
        return False

    return True
|
||||
|
||||
|
||||
def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
|
||||
dataset_dir = args.input_dir
|
||||
is_single_speaker = args.single_speaker
|
||||
@@ -431,10 +398,6 @@ def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
|
||||
_LOGGER.warning("Empty file: %s", wav_path)
|
||||
continue
|
||||
|
||||
if not is_good_speaking_rate(text, wav_path, args):
|
||||
_LOGGER.warning("Bad speaking rate: %s", wav_path)
|
||||
continue
|
||||
|
||||
yield Utterance(
|
||||
text=text, audio_path=wav_path, speaker=speaker, speaker_id=speaker_id
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user