Merge branch 'rhasspy:master' into master

This commit is contained in:
Mateo Cedillo
2023-06-10 09:15:39 -05:00
committed by GitHub
53 changed files with 1967 additions and 888 deletions

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
import json
import sys
import unicodedata
from collections import Counter
from .phonemize import DEFAULT_PHONEME_ID_MAP
def main() -> None:
    """Report which phonemes a JSONL dataset uses and which are unknown.

    Reads one JSON object per non-blank line from stdin; each object must
    have a "phonemes" list.  Every phoneme is counted, and phonemes that are
    not keys of DEFAULT_PHONEME_ID_MAP are additionally counted as missing.

    A JSON report with "used" and "missing" sections (most common first) is
    written to stdout.  If anything is missing, the number of distinct
    missing phonemes is printed to stderr.
    """
    used_phonemes: "Counter[str]" = Counter()
    missing_phonemes: "Counter[str]" = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        utt = json.loads(line)
        for phoneme in utt["phonemes"]:
            used_phonemes[phoneme] += 1

            if phoneme not in DEFAULT_PHONEME_ID_MAP:
                missing_phonemes[phoneme] += 1

    if missing_phonemes:
        print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)

    def describe(counter: "Counter[str]") -> dict:
        """Build the per-phoneme report section for one counter."""
        return {
            phoneme: {
                "count": count,
                "hex": f"\\u{hex(ord(phoneme))}",
                # Use the Unicode character name here; "category" below
                # already carries the general category.  Characters without
                # a name (e.g. control characters) get an empty string.
                "name": unicodedata.name(phoneme, ""),
                "category": unicodedata.category(phoneme),
            }
            for phoneme, count in counter.most_common()
        }

    json.dump(
        {
            "used": describe(used_phonemes),
            "missing": describe(missing_phonemes),
        },
        sys.stdout,
    )
# -----------------------------------------------------------------------------
if __name__ == "__main__":
main()

View File

@@ -2,7 +2,6 @@
import argparse
import logging
from pathlib import Path
from typing import Optional
import torch
@@ -41,7 +40,6 @@ def main():
model_g = model.model_g
num_symbols = model_g.n_vocab
num_speakers = model_g.n_speakers
# Inference only
model_g.eval()

View File

@@ -0,0 +1,262 @@
#!/usr/bin/env python3
import argparse
import csv
import json
import re
import shutil
import statistics
import subprocess
import sys
import threading
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import asdict, dataclass
from enum import Enum
from pathlib import Path
from typing import Optional
import numpy as np
from .norm_audio import make_silence_detector, trim_silence
_DIR = Path(__file__).parent
# Punctuation removed from the text before the speaking rate is calculated.
# The characters must be inside a character class: as a bare pattern, "." is
# a wildcard and "?" a quantifier, so the expression would only match one
# exact sequence of characters and effectively never strip anything.
_PUNCTUATION = re.compile(r"[.。,?¿؟!;:\-—]")


class ExcludeReason(str, Enum):
    """Why an utterance was left out of the filtered output."""

    MISSING = "file_missing"  # audio file does not exist
    EMPTY = "file_empty"  # audio file has zero size
    LOW = "rate_low"  # speaking rate below the lower bound
    HIGH = "rate_high"  # speaking rate above the upper bound


@dataclass
class Utterance:
    """One metadata row together with its measured speaking rate.

    ``rate`` is the number of non-punctuation characters in ``text`` per
    second of ``duration_sec``.  It is computed on construction whenever the
    duration is positive and stays at its given value (default 0.0) otherwise.
    """

    id: str
    text: str
    duration_sec: float
    speaker: str
    exclude_reason: Optional[ExcludeReason] = None
    rate: float = 0.0

    def __post_init__(self):
        if self.duration_sec > 0:
            # Don't include punctuation in the speaking rate calculation
            # since silence is removed from the audio.
            text_nopunct = _PUNCTUATION.sub("", self.text)
            self.rate = len(text_nopunct) / self.duration_sec
def main():
    """Filter a pipe-delimited metadata CSV by per-speaker speaking rate.

    Rows are read from stdin; the first column is the audio file name, the
    last column is the text, and when there are more than two columns the
    second is the speaker (otherwise the speaker is "default").

    For every speaker the 25%/75% quantiles of the speaking rate are computed
    and utterances outside ``q1 - scale_lower * iqr`` .. ``q3 + scale_upper *
    iqr`` are excluded.  Remaining rows are written to stdout (with a speaker
    column only when more than one speaker is present).  With --write-json,
    per-speaker statistics and the excluded utterances are written to a file.

    Raises:
        RuntimeError: if ffmpeg is not available on PATH.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--write-json", help="Path to write information about excluded utterances"
    )
    parser.add_argument(
        "--dataset-dir", default=Path.cwd(), help="Path to dataset directory"
    )
    parser.add_argument("--scale-lower", type=float, default=2.0)
    parser.add_argument("--scale-upper", type=float, default=2.0)
    args = parser.parse_args()

    # ffmpeg (not ffprobe) is the executable used to decode the audio
    if not shutil.which("ffmpeg"):
        raise RuntimeError("ffmpeg not found (is it installed and on PATH?)")

    dataset_dir = Path(args.dataset_dir)
    wav_dir = dataset_dir / "wav"
    if not wav_dir.is_dir():
        wav_dir = dataset_dir / "wavs"

    reader = csv.reader(sys.stdin, delimiter="|")
    text_and_audio = []
    for row in reader:
        if not row:
            # The csv module yields an empty list for blank lines
            continue

        filename, text = row[0], row[-1]
        speaker = row[1] if len(row) > 2 else "default"

        # Locations tried in order: relative to the metadata, then inside
        # wav/ or wavs/, each with and without a .wav suffix.  When nothing
        # exists the last candidate is kept so the utterance is reported as
        # missing later on.
        candidates = (
            dataset_dir / filename,
            dataset_dir / f"{filename}.wav",
            wav_dir / filename,
            wav_dir / f"{filename}.wav",
        )
        wav_path = next(
            (path for path in candidates if path.exists()), candidates[-1]
        )

        text_and_audio.append((filename, text, wav_path, speaker))

    # speaker -> [utterance]
    utts_by_speaker = defaultdict(list)
    process_utterance = ProcessUtterance()
    with ThreadPoolExecutor() as executor:
        for utt in executor.map(lambda job: process_utterance(*job), text_and_audio):
            utts_by_speaker[utt.speaker].append(utt)

    is_multispeaker = len(utts_by_speaker) > 1
    writer = csv.writer(sys.stdout, delimiter="|")
    speaker_details = {}
    for speaker, utts in utts_by_speaker.items():
        # Utterances already excluded (missing/empty file) have a rate of 0.0;
        # they must neither skew the statistics nor reach the output.
        usable_utts = [utt for utt in utts if utt.exclude_reason is None]
        rates = [utt.rate for utt in usable_utts]

        rate_qs = []
        lower = None
        upper = None
        if len(rates) >= 2:
            # Exclude rates well outside the 25%/75% quantiles.
            # quantiles() needs at least two data points on older Pythons.
            rate_qs = statistics.quantiles(rates, n=4)
            q1 = rate_qs[0]  # 25%
            q3 = rate_qs[-1]  # 75%
            iqr = q3 - q1
            lower = q1 - (args.scale_lower * iqr)
            upper = q3 + (args.scale_upper * iqr)

        speaker_details[speaker] = {
            "min": min(rates, default=None),
            "max": max(rates, default=None),
            # NOTE: key spelling kept as-is for existing consumers of the JSON
            "quanties": rate_qs,
            "lower": lower,
            "upper": upper,
        }

        for utt in usable_utts:
            if (lower is not None) and (utt.rate < lower):
                utt.exclude_reason = ExcludeReason.LOW
            elif (upper is not None) and (utt.rate > upper):
                utt.exclude_reason = ExcludeReason.HIGH
            elif is_multispeaker:
                writer.writerow((utt.id, utt.speaker, utt.text))
            else:
                writer.writerow((utt.id, utt.text))

    if args.write_json:
        speaker_excluded = {
            speaker: [
                asdict(utt)
                for utt in utts_by_speaker[speaker]
                if utt.exclude_reason is not None
            ]
            for speaker in speaker_details
        }

        # Text is written unescaped (ensure_ascii=False), so the encoding
        # must not depend on the locale.
        with open(args.write_json, "w", encoding="utf-8") as json_file:
            json.dump(
                {
                    speaker: {
                        "details": speaker_details[speaker],
                        "num_utterances": len(utts_by_speaker[speaker]),
                        "num_excluded": len(speaker_excluded[speaker]),
                        "excluded": speaker_excluded[speaker],
                    }
                    for speaker in speaker_details
                },
                json_file,
                indent=4,
                ensure_ascii=False,
            )
class ProcessUtterance:
    """Callable that builds an Utterance, measuring its speech duration.

    The silence detector is created on first use in each thread and kept in
    thread-local storage, so one instance can be shared by a thread pool.
    """

    def __init__(self):
        self.thread_data = threading.local()

    def __call__(
        self, utt_id: str, text: str, wav_path: Path, speaker: str
    ) -> Utterance:
        """Return the Utterance for one metadata row.

        Missing or zero-size audio files yield an utterance with a duration
        of 0.0 and the matching exclude reason; otherwise the duration comes
        from get_duration().
        """
        if not wav_path.exists():
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.MISSING,
            )

        if wav_path.stat().st_size == 0:
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.EMPTY,
            )

        return Utterance(utt_id, text, self.get_duration(wav_path), speaker)

    def get_duration(self, audio_path: Path) -> float:
        """Return the duration in seconds of the non-silent part of the audio.

        The file is decoded with ffmpeg to 16 kHz mono 16-bit PCM, peak
        normalized, and passed to trim_silence().  Audio that decodes to no
        samples has a duration of 0.0.

        Raises:
            subprocess.CalledProcessError: if ffmpeg exits with an error.
        """
        if not hasattr(self.thread_data, "detector"):
            self.thread_data.detector = make_silence_detector()

        vad_sample_rate = 16000
        audio_16khz_bytes = subprocess.check_output(
            [
                "ffmpeg",
                "-i",
                str(audio_path),
                "-f",
                "s16le",
                "-acodec",
                "pcm_s16le",
                "-ac",
                "1",
                "-ar",
                str(vad_sample_rate),
                "pipe:",
            ],
            stderr=subprocess.DEVNULL,
        )

        audio_16khz = np.frombuffer(audio_16khz_bytes, dtype=np.int16).astype(
            np.float32
        )
        if audio_16khz.size == 0:
            # Nothing decoded; np.max() would raise on an empty array
            return 0.0

        # Normalize by the largest magnitude (the peak may be a negative
        # sample) and leave all-zero audio untouched to avoid dividing by 0.
        peak = np.max(np.abs(audio_16khz))
        if peak > 0:
            audio_16khz /= peak

        # Get speaking duration
        offset_sec, duration_sec = trim_silence(
            audio_16khz,
            self.thread_data.detector,
            threshold=0.8,
            samples_per_chunk=480,
            sample_rate=vad_sample_rate,
            keep_chunks_before=2,
            keep_chunks_after=2,
        )

        if duration_sec is None:
            # Speech goes to end of audio
            duration_sec = (len(audio_16khz) / vad_sample_rate) - offset_sec

        return float(duration_sec)
if __name__ == "__main__":
main()

View File

@@ -1,9 +1,23 @@
import argparse
import json
import sys
import unicodedata
from collections import Counter
from enum import Enum
from typing import Dict, Iterable, List, Mapping, Optional
from espeak_phonemizer import Phonemizer
class PhonemeType(str, Enum):
    """Source of the phonemes used for an utterance."""

    # Phonemes come from espeak-ng
    ESPEAK = "espeak"

    # Phonemes come from text itself
    TEXT = "text"
MAX_PHONEMES = 256
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
"_": [0],
"^": [1],
@@ -135,14 +149,115 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
"χ": [127],
"": [128],
"": [129],
"0": [130], # tones
"1": [131],
"2": [132],
"3": [133],
"4": [134],
"5": [135],
"6": [136],
"7": [137],
"8": [138],
"9": [139],
"\u0327": [140], # combining cedilla
"\u0303": [141], # combining tilde
"\u032a": [142], # combining bridge below
"\u032f": [143], # combining inverted breve below
"\u0329": [144], # combining vertical line below
"ʰ": [145],
"ˤ": [146],
"ε": [147],
"": [148],
"#": [149], # Icelandic
'"': [150], # Russian
"": [151],
"\u033a": [152], # Basque
"\u033b": [153],
}
# Per-language phoneme substitutions: language -> phoneme -> replacement
# phonemes.
PHONEME_MAPS: Dict[str, Dict[str, List[str]]] = {
    "pt-br": {"c": ["k"]},  # Brazilian Portuguese
}
# Character -> id maps used when phonemes are taken from the text itself,
# keyed by language.
ALPHABETS: Dict[str, Dict[str, List[int]]] = {
    # Ukrainian
    "uk": {
        "_": [0],
        "^": [1],
        "$": [2],
        " ": [3],
        "!": [4],
        "'": [5],
        ",": [6],
        "-": [7],
        ".": [8],
        ":": [9],
        ";": [10],
        "?": [11],
        "а": [12],
        "б": [13],
        "в": [14],
        "г": [15],
        "ґ": [16],
        "д": [17],
        "е": [18],
        "є": [19],
        "ж": [20],
        "з": [21],
        "и": [22],
        "і": [23],
        "ї": [24],
        "й": [25],
        "к": [26],
        "л": [27],
        "м": [28],
        "н": [29],
        "о": [30],
        "п": [31],
        "р": [32],
        "с": [33],
        "т": [34],
        "у": [35],
        "ф": [36],
        "х": [37],
        "ц": [38],
        "ч": [39],
        "ш": [40],
        "щ": [41],
        "ь": [42],
        "ю": [43],
        "я": [44],
        "\u0301": [45],  # combining acute accent
        "\u0306": [46],  # combining breve
        "\u0308": [47],  # combining diaeresis
        # Written as an escape so the key cannot be lost to an encoding
        # problem (an empty-string key would never match any character).
        "\u2014": [48],  # em dash
    }
}
def phonemize(
    text: str,
    phonemizer: "Phonemizer",
    phoneme_map: Optional[Dict[str, List[str]]] = None,
) -> List[str]:
    """Convert text into a list of single-codepoint phonemes.

    Args:
        text: Text to phonemize.
        phonemizer: Object whose ``phonemize(text=..., keep_clause_breakers=True)``
            method returns the phoneme string.
        phoneme_map: Optional substitutions; a phoneme with a non-empty entry
            is replaced by the listed phonemes, all others are kept.

    Returns:
        The phonemes after NFD decomposition and optional substitution.
    """
    phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)

    # Phonemes are decomposed into unicode codepoints
    unmapped_phonemes = list(unicodedata.normalize("NFD", phonemes_str))
    if not phoneme_map:
        return unmapped_phonemes

    # Phonemes can be mapped to lists of other phonemes
    mapped_phonemes: List[str] = []
    for phoneme in unmapped_phonemes:
        # A missing or empty entry leaves the phoneme unchanged
        mapped_phonemes.extend(phoneme_map.get(phoneme) or [phoneme])

    return mapped_phonemes
def phonemes_to_ids(
@@ -179,3 +294,79 @@ def phonemes_to_ids(
phoneme_ids.extend(phoneme_id_map[eos])
return phoneme_ids
# -----------------------------------------------------------------------------
def main() -> None:
    """Phonemize lines from stdin and print one JSON object per line.

    Each non-blank input line produces an object with the original "text",
    its "phonemes" and the corresponding "phoneme_ids".  With
    ``--phoneme-type text`` the (case-adjusted, NFD-normalized) characters of
    the line are used as phonemes and looked up in ALPHABETS[language];
    otherwise eSpeak is used with DEFAULT_PHONEME_ID_MAP.  Phonemes reported
    as missing by phonemes_to_ids are summarized on stderr at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("language")
    parser.add_argument(
        "--phoneme-type",
        choices=list(PhonemeType),
        default=PhonemeType.ESPEAK,
        help="Type of phonemes to use (default: espeak)",
    )
    parser.add_argument(
        "--text-casing",
        choices=("ignore", "lower", "upper", "casefold"),
        default="ignore",
        help="Casing applied to utterance text",
    )
    args = parser.parse_args()

    # Every accepted choice other than "ignore" has a transform here,
    # including "casefold".
    casing = {
        "lower": str.lower,
        "upper": str.upper,
        "casefold": str.casefold,
    }.get(args.text_casing)

    use_text = args.phoneme_type == PhonemeType.TEXT
    phonemizer: Optional[Phonemizer] = None
    phoneme_map = None

    if use_text:
        # Use text directly
        if args.language not in ALPHABETS:
            parser.error(f"No alphabet available for language: {args.language}")

        phoneme_id_map = ALPHABETS[args.language]
    else:
        # Use eSpeak
        phonemizer = Phonemizer(args.language)
        phoneme_id_map = DEFAULT_PHONEME_ID_MAP
        phoneme_map = PHONEME_MAPS.get(args.language)

    missing_phonemes: "Counter[str]" = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        # Casing is applied for both phoneme types
        cased_line = casing(line) if casing is not None else line

        if use_text:
            phonemes = list(unicodedata.normalize("NFD", cased_line))
        else:
            assert phonemizer is not None
            phonemes = phonemize(cased_line, phonemizer, phoneme_map=phoneme_map)

        phoneme_ids = phonemes_to_ids(
            phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes
        )
        json.dump(
            {
                "text": line,
                "phonemes": phonemes,
                "phoneme_ids": phoneme_ids,
            },
            sys.stdout,
            ensure_ascii=False,
        )
        print("")

    if missing_phonemes:
        print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr)
        for phoneme, count in missing_phonemes.most_common():
            print(phoneme, count, file=sys.stderr)
if __name__ == "__main__":
main()

View File

@@ -6,9 +6,9 @@ import itertools
import json
import logging
import os
import unicodedata
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from dataclasses import dataclass, field
from multiprocessing import JoinableQueue, Process, Queue
from pathlib import Path
from typing import Dict, Iterable, List, Optional
@@ -16,7 +16,15 @@ from typing import Dict, Iterable, List, Optional
from espeak_phonemizer import Phonemizer
from .norm_audio import cache_norm_audio, make_silence_detector
from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize
from .phonemize import (
ALPHABETS,
DEFAULT_PHONEME_ID_MAP,
MAX_PHONEMES,
PHONEME_MAPS,
PhonemeType,
phonemes_to_ids,
phonemize,
)
_LOGGER = logging.getLogger("preprocess")
@@ -49,6 +57,23 @@ def main() -> None:
parser.add_argument(
"--speaker-id", type=int, help="Add speaker id to single speaker dataset"
)
#
parser.add_argument(
"--phoneme-type",
choices=list(PhonemeType),
default=PhonemeType.ESPEAK,
help="Type of phonemes to use (default: espeak)",
)
parser.add_argument(
"--text-casing",
choices=("ignore", "lower", "upper", "casefold"),
default="ignore",
help="Casing applied to utterance text",
)
#
parser.add_argument(
"--skip-audio", action="store_true", help="Don't preprocess audio"
)
parser.add_argument(
"--debug", action="store_true", help="Print DEBUG messages to the console"
)
@@ -84,9 +109,9 @@ def main() -> None:
# Count speakers
_LOGGER.debug("Counting number of speakers/utterances in the dataset")
speaker_counts: Counter[str] = Counter()
speaker_counts: "Counter[str]" = Counter()
num_utterances = 0
for utt in make_dataset(args.input_dir, args.single_speaker, args.speaker_id):
for utt in make_dataset(args):
speaker = utt.speaker or ""
speaker_counts[speaker] += 1
num_utterances += 1
@@ -118,11 +143,12 @@ def main() -> None:
"voice": args.language,
},
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
"phoneme_type": str(args.phoneme_type),
"phoneme_map": {},
"phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
"num_symbols": len(
set(itertools.chain.from_iterable(DEFAULT_PHONEME_ID_MAP.values()))
),
"phoneme_id_map": ALPHABETS[args.language]
if args.phoneme_type == PhonemeType.TEXT
else DEFAULT_PHONEME_ID_MAP,
"num_symbols": MAX_PHONEMES,
"num_speakers": len(speaker_counts),
"speaker_id_map": speaker_ids,
},
@@ -142,8 +168,13 @@ def main() -> None:
queue_out: "Queue[Optional[Utterance]]" = Queue()
# Start workers
if args.phoneme_type == PhonemeType.TEXT:
target = phonemize_batch_text
else:
target = phonemize_batch_espeak
processes = [
Process(target=process_batch, args=(args, queue_in, queue_out))
Process(target=target, args=(args, queue_in, queue_out))
for _ in range(args.max_workers)
]
for proc in processes:
@@ -154,27 +185,39 @@ def main() -> None:
)
with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
for utt_batch in batched(
make_dataset(args.input_dir, args.single_speaker, args.speaker_id),
make_dataset(args),
batch_size,
):
queue_in.put(utt_batch)
_LOGGER.debug("Waiting for jobs to finish")
missing_phonemes: "Counter[str]" = Counter()
for _ in range(num_utterances):
utt = queue_out.get()
if utt is not None:
if utt.speaker is not None:
utt.speaker_id = speaker_ids[utt.speaker]
utt_dict = dataclasses.asdict(utt)
utt_dict.pop("missing_phonemes")
# JSONL
json.dump(
dataclasses.asdict(utt),
utt_dict,
dataset_file,
ensure_ascii=False,
cls=PathEncoder,
)
print("", file=dataset_file)
missing_phonemes.update(utt.missing_phonemes)
if missing_phonemes:
for phoneme, count in missing_phonemes.most_common():
_LOGGER.warning("Missing %s (%s)", phoneme, count)
_LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))
# Signal workers to stop
for proc in processes:
queue_in.put(None)
@@ -187,10 +230,27 @@ def main() -> None:
# -----------------------------------------------------------------------------
def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue):
def get_text_casing(casing: str):
    """Return the string transform for a --text-casing value.

    "lower", "upper" and "casefold" map to the matching ``str`` method; any
    other value yields a function that returns its argument unchanged.
    """
    transforms = {
        "lower": str.lower,
        "upper": str.upper,
        "casefold": str.casefold,
    }
    if casing in transforms:
        return transforms[casing]

    return lambda s: s
def phonemize_batch_espeak(
args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
try:
casing = get_text_casing(args.text_casing)
silence_detector = make_silence_detector()
phonemizer = Phonemizer(default_voice=args.language)
phoneme_map = PHONEME_MAPS.get(args.language)
while True:
utt_batch = queue_in.get()
@@ -200,14 +260,20 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
for utt in utt_batch:
try:
_LOGGER.debug(utt)
utt.phonemes = phonemize(utt.text, phonemizer)
utt.phoneme_ids = phonemes_to_ids(utt.phonemes)
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
utt.audio_path,
args.cache_dir,
silence_detector,
args.sample_rate,
utt.phonemes = phonemize(
casing(utt.text), phonemizer, phoneme_map=phoneme_map
)
utt.phoneme_ids = phonemes_to_ids(
utt.phonemes,
missing_phonemes=utt.missing_phonemes,
)
if not args.skip_audio:
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
utt.audio_path,
args.cache_dir,
silence_detector,
args.sample_rate,
)
queue_out.put(utt)
except TimeoutError:
_LOGGER.error("Skipping utterance due to timeout: %s", utt)
@@ -217,7 +283,48 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
queue_in.task_done()
except Exception:
_LOGGER.exception("process_batch")
_LOGGER.exception("phonemize_batch_espeak")
def phonemize_batch_text(
    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
    """Worker loop that uses the utterance text itself as phonemes.

    Batches of utterances are taken from ``queue_in`` until ``None`` is
    received.  For each utterance the case-adjusted, NFD-normalized
    characters become the phonemes, ids are looked up in
    ALPHABETS[args.language], and (unless ``args.skip_audio``) the audio is
    normalized and cached.  Exactly one item is put on ``queue_out`` per
    utterance: the utterance on success, ``None`` on failure.
    """
    try:
        casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()
        alphabet = ALPHABETS[args.language]

        while True:
            utt_batch = queue_in.get()
            if utt_batch is None:
                break

            for utt in utt_batch:
                try:
                    _LOGGER.debug(utt)
                    utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
                    utt.phoneme_ids = phonemes_to_ids(
                        utt.phonemes,
                        phoneme_id_map=alphabet,
                        missing_phonemes=utt.missing_phonemes,
                    )
                    if not args.skip_audio:
                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
                            utt.audio_path,
                            args.cache_dir,
                            silence_detector,
                            args.sample_rate,
                        )
                    queue_out.put(utt)
                except TimeoutError:
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
                    # The consumer reads one result per utterance; without a
                    # placeholder it would wait forever for this one.
                    queue_out.put(None)
                except Exception:
                    _LOGGER.exception("Failed to process utterance: %s", utt)
                    queue_out.put(None)

            queue_in.task_done()
    except Exception:
        _LOGGER.exception("phonemize_batch_text")
# -----------------------------------------------------------------------------
@@ -233,6 +340,7 @@ class Utterance:
phoneme_ids: Optional[List[int]] = None
audio_norm_path: Optional[Path] = None
audio_spec_path: Optional[Path] = None
missing_phonemes: "Counter[str]" = field(default_factory=Counter)
class PathEncoder(json.JSONEncoder):
@@ -242,9 +350,12 @@ class PathEncoder(json.JSONEncoder):
return super().default(o)
def ljspeech_dataset(
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
) -> Iterable[Utterance]:
def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
dataset_dir = args.input_dir
is_single_speaker = args.single_speaker
speaker_id = args.speaker_id
skip_audio = args.skip_audio
# filename|speaker|text
# speaker is optional
metadata_path = dataset_dir / "metadata.csv"
@@ -257,7 +368,7 @@ def ljspeech_dataset(
with open(metadata_path, "r", encoding="utf-8") as csv_file:
reader = csv.reader(csv_file, delimiter="|")
for row in reader:
assert len(row) >= 2, "Not enough colums"
assert len(row) >= 2, "Not enough columns"
speaker: Optional[str] = None
if is_single_speaker or (len(row) == 2):
@@ -280,18 +391,25 @@ def ljspeech_dataset(
# Try with .wav
wav_path = wav_dir / f"{filename}.wav"
if not wav_path.exists():
_LOGGER.warning("Missing %s", filename)
continue
if not skip_audio:
if not wav_path.exists():
_LOGGER.warning("Missing %s", filename)
continue
if wav_path.stat().st_size == 0:
_LOGGER.warning("Empty file: %s", wav_path)
continue
yield Utterance(
text=text, audio_path=wav_path, speaker=speaker, speaker_id=speaker_id
)
def mycroft_dataset(
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
) -> Iterable[Utterance]:
def mycroft_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
dataset_dir = args.input_dir
is_single_speaker = args.single_speaker
skip_audio = args.skip_audio
speaker_id = 0
for metadata_path in dataset_dir.glob("**/*-metadata.txt"):
speaker = metadata_path.parent.name if not is_single_speaker else None
@@ -301,15 +419,15 @@ def mycroft_dataset(
for row in reader:
filename, text = row[0], row[1]
wav_path = metadata_path.parent / filename
yield Utterance(
text=text,
audio_path=wav_path,
speaker=speaker,
speaker_id=speaker_id if not is_single_speaker else None,
)
if skip_audio or (wav_path.exists() and (wav_path.stat().st_size > 0)):
yield Utterance(
text=text,
audio_path=wav_path,
speaker=speaker,
speaker_id=speaker_id if not is_single_speaker else None,
)
speaker_id += 1
# -----------------------------------------------------------------------------

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
import argparse
import csv
import sys
from collections import Counter, defaultdict
def main():
    """Extract one speaker's rows from a multi-speaker metadata CSV.

    Pipe-delimited rows are read from stdin; the first column is the audio
    name, the second the speaker and the last the text.  Matching rows are
    written to stdout as ``audio|text``.

    With --speaker-name, rows whose speaker equals the name are streamed
    through.  Otherwise --speaker-number selects the speaker at that 0-based
    position when speakers are ordered by utterance count (most first); the
    chosen speaker is printed to stderr.  Nothing is written if the number
    does not correspond to a speaker.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--speaker-number", type=int)
    parser.add_argument("--speaker-name")
    args = parser.parse_args()

    # Explicit check instead of assert, which is stripped under "python -O"
    if (args.speaker_number is None) and (args.speaker_name is None):
        parser.error("one of --speaker-number or --speaker-name is required")

    reader = csv.reader(sys.stdin, delimiter="|")
    writer = csv.writer(sys.stdout, delimiter="|")

    if args.speaker_name is not None:
        for row in reader:
            if not row:
                # The csv module yields an empty list for blank lines
                continue

            audio, speaker_id, text = row[0], row[1], row[-1]
            if args.speaker_name == speaker_id:
                writer.writerow((audio, text))

        return

    utterances = defaultdict(list)
    counts = Counter()
    for row in reader:
        if not row:
            continue

        audio, speaker_id, text = row[0], row[1], row[-1]
        utterances[speaker_id].append((audio, text))
        counts[speaker_id] += 1

    ranked_speakers = [speaker_id for speaker_id, _count in counts.most_common()]
    if 0 <= args.speaker_number < len(ranked_speakers):
        speaker_id = ranked_speakers[args.speaker_number]
        writer.writerows(utterances[speaker_id])
        print(speaker_id, file=sys.stderr)
if __name__ == "__main__":
main()

View File

@@ -8,7 +8,8 @@ docker run \
--user "$(id -u):$(id -g)" \
--ipc=host \
-v "${HOME}:${HOME}" \
-v /media/cache:/media/cache:ro \
-v /etc/hostname:/etc/hostname:ro \
-v /etc/localtime:/etc/localtime:ro \
piper-train \
larynx2-train \
"$@"