Add filter utterances

This commit is contained in:
Michael Hansen
2023-05-11 20:48:58 -05:00
committed by Michael Hansen
parent 10b136cdf8
commit 5a64768924
3 changed files with 267 additions and 110 deletions

View File

@@ -0,0 +1,266 @@
#!/usr/bin/env python3
import argparse
import csv
import json
import re
import sys
import statistics
import shutil
import subprocess
import threading
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import asdict, dataclass
from enum import Enum
from pathlib import Path
from typing import Optional
import numpy as np
from .norm_audio import make_silence_detector, trim_silence, SileroVoiceActivityDetector
_DIR = Path(__file__).parent
# Removed from the speaking rate calculation
_PUNCTUATION = re.compile(".。,?¿?؟!;:-—")
class ExcludeReason(str, Enum):
    """Reason an utterance is left out of the filtered output.

    Members are also plain strings, so they serialize directly to JSON.
    """

    # The audio file does not exist
    MISSING = "file_missing"

    # The audio file exists but has a size of zero bytes
    EMPTY = "file_empty"

    # Speaking rate is below the lower bound for the speaker
    LOW = "rate_low"

    # Speaking rate is above the upper bound for the speaker
    HIGH = "rate_high"
@dataclass
class Utterance:
    """One dataset entry together with its measured speaking rate."""

    # Identifier taken from the first metadata column
    id: str
    # Text of the utterance
    text: str
    # Duration of the audio in seconds (0.0 when it could not be measured)
    duration_sec: float
    # Speaker name
    speaker: str
    # Set when the utterance should not be part of the output
    exclude_reason: Optional[ExcludeReason] = None
    # Characters per second; filled in by __post_init__ for positive durations
    rate: float = 0.0

    def __post_init__(self):
        """Compute ``rate`` from the text length and the duration."""
        if not (self.duration_sec > 0):
            # Nothing to divide by; keep whatever rate was passed in
            return

        # Punctuation is not counted in the speaking rate calculation since
        # silence is removed from the audio.
        spoken_text = _PUNCTUATION.sub("", self.text)
        self.rate = len(spoken_text) / self.duration_sec
def main():
    """Filter a "|"-delimited metadata CSV by per-speaker speaking rate.

    Rows are read from stdin. The first column is the utterance id/file name,
    the last column is the text and, when a row has more than two columns, the
    second column is the speaker. For every speaker the quartiles of the
    speaking rates are computed, and utterances whose rate lies more than
    ``--scale-lower`` / ``--scale-upper`` times the inter-quartile range
    outside the 25%/75% quartiles are excluded. Utterances whose audio file is
    missing or empty are always excluded and do not take part in the
    statistics.

    The remaining rows are written to stdout. With ``--write-json``, the
    per-speaker statistics and the excluded utterances are written to the given
    path.

    Raises:
        RuntimeError: if ``ffmpeg`` is not available on the PATH.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--write-json", help="Path to write information about excluded utterances"
    )
    parser.add_argument(
        "--dataset-dir", default=Path.cwd(), help="Path to dataset directory"
    )
    parser.add_argument("--scale-lower", type=float, default=2.0)
    parser.add_argument("--scale-upper", type=float, default=2.0)
    args = parser.parse_args()

    # Audio is decoded with ffmpeg, so that is the executable that must exist
    if not shutil.which("ffmpeg"):
        raise RuntimeError("ffmpeg not found (is ffmpeg installed?)")

    dataset_dir = Path(args.dataset_dir)
    wav_dir = dataset_dir / "wav"
    if not wav_dir.is_dir():
        wav_dir = dataset_dir / "wavs"

    reader = csv.reader(sys.stdin, delimiter="|")
    text_and_audio = []
    for row in reader:
        if not row:
            # Blank line in the metadata
            continue

        filename, text = row[0], row[-1]
        speaker = row[1] if len(row) > 2 else "default"

        # Try file name relative to metadata
        wav_path = dataset_dir / filename

        if not wav_path.exists():
            # Try with .wav
            wav_path = dataset_dir / f"{filename}.wav"

        if not wav_path.exists():
            # Try wav/ or wavs/
            wav_path = wav_dir / filename

        if not wav_path.exists():
            # Try with .wav
            wav_path = wav_dir / f"{filename}.wav"

        text_and_audio.append((filename, text, wav_path, speaker))

    # speaker -> [utterance]
    utts_by_speaker = defaultdict(list)
    process_utterance = ProcessUtterance()
    with ThreadPoolExecutor() as executor:
        for utt in executor.map(
            lambda item: process_utterance(*item), text_and_audio
        ):
            utts_by_speaker[utt.speaker].append(utt)

    is_multispeaker = len(utts_by_speaker) > 1
    writer = csv.writer(sys.stdout, delimiter="|")
    speaker_details = {}

    for speaker, utts in utts_by_speaker.items():
        # Utterances that already carry a reason (missing/empty file) have no
        # real rate. They must neither skew the quartiles nor be relabelled or
        # written out.
        measured = [utt for utt in utts if utt.exclude_reason is None]
        rates = [utt.rate for utt in measured]
        if not rates:
            speaker_details[speaker] = {}
            continue

        if len(rates) > 1:
            # Exclude rates well outside the 25%/75% quantiles
            rate_qs = statistics.quantiles(rates, n=4)
        else:
            # quantiles() needs at least two data points; a lone utterance is
            # its own quartile and is therefore kept.
            rate_qs = [rates[0]] * 3

        q1 = rate_qs[0]  # 25%
        q3 = rate_qs[-1]  # 75%
        iqr = q3 - q1
        lower = q1 - (args.scale_lower * iqr)
        upper = q3 + (args.scale_upper * iqr)
        speaker_details[speaker] = {
            "min": min(rates),
            "max": max(rates),
            # NOTE: key spelling is kept as-is so the JSON layout is unchanged
            "quanties": rate_qs,
            "lower": lower,
            "upper": upper,
        }

        for utt in measured:
            if utt.rate < lower:
                utt.exclude_reason = ExcludeReason.LOW
            elif utt.rate > upper:
                utt.exclude_reason = ExcludeReason.HIGH
            elif is_multispeaker:
                # Same column order as the input rows: speaker second, text last
                writer.writerow((utt.id, utt.speaker, utt.text))
            else:
                writer.writerow((utt.id, utt.text))

    if args.write_json:
        report = {}
        for speaker, utts in utts_by_speaker.items():
            excluded = [asdict(utt) for utt in utts if utt.exclude_reason is not None]
            report[speaker] = {
                "details": speaker_details[speaker],
                "num_utterances": len(utts),
                "num_excluded": len(excluded),
                "excluded": excluded,
            }

        with open(args.write_json, "w", encoding="utf-8") as json_file:
            json.dump(report, json_file, indent=4)
class ProcessUtterance:
    """Callable that turns one metadata entry into an ``Utterance``.

    A silence detector is created lazily per thread and kept in
    ``threading.local`` storage, so one instance can be called from several
    worker threads.
    """

    def __init__(self):
        # Per-thread storage for the silence detector
        self.thread_data = threading.local()

    def __call__(
        self, utt_id: str, text: str, wav_path: Path, speaker: str
    ) -> Utterance:
        """Measure one utterance.

        Returns an ``Utterance`` with a zero duration and an exclude reason
        when the audio file is missing or has a size of zero bytes; otherwise
        the duration comes from :meth:`get_duration`.
        """
        if not wav_path.exists():
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.MISSING,
            )

        if wav_path.stat().st_size == 0:
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.EMPTY,
            )

        return Utterance(utt_id, text, self.get_duration(wav_path), speaker)

    def get_duration(self, audio_path: Path) -> float:
        """Return the duration in seconds of the audio after silence trimming.

        The file is decoded by ffmpeg to 16 kHz mono 16-bit PCM, scaled by its
        peak amplitude and passed to ``trim_silence``. Audio that decodes to no
        samples has a duration of 0.0.

        Raises:
            subprocess.CalledProcessError: if ffmpeg exits with an error.
        """
        if not hasattr(self.thread_data, "detector"):
            self.thread_data.detector = make_silence_detector()

        vad_sample_rate = 16000
        audio_16khz_bytes = subprocess.check_output(
            [
                "ffmpeg",
                "-i",
                str(audio_path),
                "-f",
                "s16le",
                "-acodec",
                "pcm_s16le",
                "-ac",
                "1",
                "-ar",
                str(vad_sample_rate),
                "pipe:",
            ],
            stderr=subprocess.DEVNULL,
        )

        audio_16khz = np.frombuffer(audio_16khz_bytes, dtype=np.int16).astype(
            np.float32
        )
        if audio_16khz.size == 0:
            # Nothing was decoded, so there is nothing to measure
            return 0.0

        # Normalize by the largest magnitude (positive or negative). Digital
        # silence has a peak of zero and is left untouched instead of being
        # divided by zero.
        peak = np.max(np.abs(audio_16khz))
        if peak > 0:
            audio_16khz /= peak

        # Get speaking duration
        offset_sec, duration_sec = trim_silence(
            audio_16khz,
            self.thread_data.detector,
            threshold=0.5,
            samples_per_chunk=480,
            sample_rate=vad_sample_rate,
            keep_chunks_before=2,
            keep_chunks_after=2,
        )

        if duration_sec is None:
            # Speech goes to end of audio
            duration_sec = (len(audio_16khz) / vad_sample_rate) - offset_sec

        return duration_sec
def make_silence_detector() -> SileroVoiceActivityDetector:
    """Create a voice activity detector from the bundled ``silero_vad.onnx``.

    The model is looked up under ``norm_audio/models`` next to this module.
    """
    # NOTE(review): a function with this name is also imported from
    # .norm_audio at the top of the file; this definition shadows it.
    return SileroVoiceActivityDetector(
        _DIR.joinpath("norm_audio", "models", "silero_vad.onnx")
    )
# Run the filter only when this file is executed as a script
if __name__ == "__main__":
    main()

View File

@@ -1,72 +0,0 @@
#!/usr/bin/env python3
import argparse
import csv
import json
import sys
import statistics
from pathlib import Path
import librosa
def main():
    """Report the speaking rate of the utterances in a "|"-delimited CSV.

    Rows are read from stdin; the first column is the file name and the last
    column is the text. The rate of an utterance is its audio duration divided
    by the number of characters in its text. With ``--csv`` one row of
    ``filename|text|duration|rate`` is written to stdout per utterance;
    otherwise a JSON object with all rates, their mean and their median is
    written. Utterances with a missing file, an empty file or no text are
    reported on stderr and skipped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_dir", default=Path.cwd())
    parser.add_argument("--csv", action="store_true")
    args = parser.parse_args()

    dataset_dir = Path(args.dataset_dir)
    wav_dir = dataset_dir / "wav"
    if not wav_dir.is_dir():
        wav_dir = dataset_dir / "wavs"

    reader = csv.reader(sys.stdin, delimiter="|")
    writer = csv.writer(sys.stdout, delimiter="|")
    rates = []

    for row in reader:
        if not row:
            # Blank line in the metadata
            continue

        filename, text = row[0], row[-1]

        # Try file name relative to metadata
        wav_path = dataset_dir / filename

        if not wav_path.exists():
            # Try with .wav
            wav_path = dataset_dir / f"{filename}.wav"

        if not wav_path.exists():
            # Try wav/ or wavs/
            wav_path = wav_dir / filename

        if not wav_path.exists():
            # Try with .wav
            wav_path = wav_dir / f"{filename}.wav"

        if not wav_path.exists():
            print("Missing", wav_path, file=sys.stderr)
            continue

        if wav_path.stat().st_size == 0:
            print("Empty", wav_path, file=sys.stderr)
            continue

        if not text:
            # The rate divides by the text length
            print("No text", wav_path, file=sys.stderr)
            continue

        duration = librosa.get_duration(path=wav_path)
        rate = duration / len(text)

        if args.csv:
            writer.writerow((filename, text, duration, rate))
        else:
            rates.append(rate)

    if not args.csv:
        # mean()/median() raise on empty data; report null when nothing was
        # measured.
        json.dump(
            {
                "rates": rates,
                "mean": statistics.mean(rates) if rates else None,
                "median": statistics.median(rates) if rates else None,
            },
            sys.stdout,
        )
# Run the report only when this file is executed as a script
if __name__ == "__main__":
    main()

View File

@@ -11,7 +11,7 @@ from collections import Counter
from dataclasses import dataclass, field
from multiprocessing import JoinableQueue, Process, Queue
from pathlib import Path
from typing import Dict, Iterable, List, Optional
from typing import Dict, Iterable, List, Optional, Tuple
import librosa
from espeak_phonemizer import Phonemizer
@@ -71,13 +71,6 @@ def main() -> None:
help="Casing applied to utterance text",
)
#
parser.add_argument(
"--speaking-rate-min", type=float, help="Minimum speaking rate (chars/sec)"
)
parser.add_argument(
"--speaking-rate-max", type=float, help="Maximum speaking rate (chars/sec)"
)
#
parser.add_argument(
"--skip-audio", action="store_true", help="Don't preprocess audio"
)
@@ -355,32 +348,6 @@ class PathEncoder(json.JSONEncoder):
return super().default(o)
def is_good_speaking_rate(
    text: str,
    wav_path: Path,
    args: argparse.Namespace,
) -> bool:
    """Check that an utterance's speaking rate is inside the configured limits.

    The rate is the number of characters in ``text`` divided by the audio
    duration in seconds. The limits are read from ``args.speaking_rate_min``
    and ``args.speaking_rate_max``; a limit of ``None`` is not checked.

    Returns:
        True when no limit is configured or the rate satisfies every
        configured limit. False when the text is empty, the audio has no
        duration, or the rate is outside a limit.
    """
    min_rate: Optional[float] = args.speaking_rate_min
    max_rate: Optional[float] = args.speaking_rate_max

    if (min_rate is None) and (max_rate is None):
        # Nothing to check, so the audio does not need to be read at all
        return True

    if not text:
        return False

    duration = librosa.get_duration(path=wav_path)
    if duration <= 0:
        # A file can be non-empty and still contain no audio (e.g. only a
        # header); there is no rate to compute then.
        return False

    rate = len(text) / duration

    if (min_rate is not None) and (rate < min_rate):
        return False

    if (max_rate is not None) and (rate > max_rate):
        return False

    return True
def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
dataset_dir = args.input_dir
is_single_speaker = args.single_speaker
@@ -431,10 +398,6 @@ def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
_LOGGER.warning("Empty file: %s", wav_path)
continue
if not is_good_speaking_rate(text, wav_path, args):
_LOGGER.warning("Bad speaking rate: %s", wav_path)
continue
yield Utterance(
text=text, audio_path=wav_path, speaker=speaker, speaker_id=speaker_id
)