Add text phonemes to preprocess

This commit is contained in:
Michael Hansen
2023-06-08 15:37:20 -05:00
parent a66ac02ca7
commit a7eab8ed03
5 changed files with 178 additions and 15 deletions

View File

@@ -45,7 +45,7 @@ def main() -> None:
"category": unicodedata.category(phoneme), "category": unicodedata.category(phoneme),
} }
for phoneme, count in missing_phonemes.most_common() for phoneme, count in missing_phonemes.most_common()
} },
}, },
sys.stdout, sys.stdout,
) )

View File

@@ -2,7 +2,6 @@
import argparse import argparse
import logging import logging
from pathlib import Path from pathlib import Path
from typing import Optional
import torch import torch
@@ -41,7 +40,6 @@ def main():
model_g = model.model_g model_g = model.model_g
num_symbols = model_g.n_vocab num_symbols = model_g.n_vocab
num_speakers = model_g.n_speakers
# Inference only # Inference only
model_g.eval() model_g.eval()

View File

@@ -3,10 +3,20 @@ import json
import sys import sys
import unicodedata import unicodedata
from collections import Counter from collections import Counter
from enum import Enum
from typing import Dict, Iterable, List, Mapping, Optional from typing import Dict, Iterable, List, Mapping, Optional
from espeak_phonemizer import Phonemizer from espeak_phonemizer import Phonemizer
class PhonemeType(str, Enum):
    """Where the phoneme symbols of a voice come from.

    Members are also plain strings, so ``PhonemeType.TEXT == "text"`` holds
    and a raw command-line string can be compared against a member directly.
    """

    # Phonemes come from espeak-ng
    ESPEAK = "espeak"

    # Phonemes come from text itself
    TEXT = "text"

    def __str__(self) -> str:
        """Return the bare value, e.g. ``"espeak"``.

        Without this override ``str()`` falls back to ``Enum.__str__`` and
        yields ``"PhonemeType.ESPEAK"``, which is not a value that can be
        parsed back and differs from what a raw string with the same value
        stringifies to.
        """
        return self.value
MAX_PHONEMES = 256 MAX_PHONEMES = 256
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = { DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
"_": [0], "_": [0],
@@ -162,6 +172,57 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
'"': [150], # Russian '"': [150], # Russian
} }
# Per-language symbol tables used when phonemes are taken from the text itself.
# Outer key: language code; inner mapping: single character -> list of ids.
ALPHABETS: Dict[str, Dict[str, List[int]]] = {
    # Ukrainian
    "uk": {
        "_": [0],
        "^": [1],
        "$": [2],
        " ": [3],
        "!": [4],
        "'": [5],
        ",": [6],
        "-": [7],
        ".": [8],
        ":": [9],
        ";": [10],
        "?": [11],
        "а": [12],
        "б": [13],
        "в": [14],
        "г": [15],
        "ґ": [16],
        "д": [17],
        "е": [18],
        "є": [19],
        "ж": [20],
        "з": [21],
        "и": [22],
        "і": [23],
        # NOTE: NFD-normalized text never contains this letter as one
        # character (it becomes "і" + U+0308); entry kept so ids stay stable.
        "ї": [24],
        # NOTE: same as above, NFD yields "и" + U+0306.
        "й": [25],
        "к": [26],
        "л": [27],
        "м": [28],
        "н": [29],
        "о": [30],
        "п": [31],
        "р": [32],
        "с": [33],
        "т": [34],
        "у": [35],
        "ф": [36],
        "х": [37],
        "ц": [38],
        "ч": [39],
        "ш": [40],
        "щ": [41],
        "ь": [42],
        "ю": [43],
        "я": [44],
        # Combining marks produced when the letters above are decomposed
        # (NFD) and then split into individual characters. Appended with new
        # ids so every pre-existing id keeps its meaning.
        "\u0306": [45],  # combining breve (from "й")
        "\u0308": [46],  # combining diaeresis (from "ї")
    }
}
def phonemize(text: str, phonemizer: Phonemizer) -> List[str]: def phonemize(text: str, phonemizer: Phonemizer) -> List[str]:
phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True) phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)

View File

@@ -6,8 +6,8 @@ import itertools
import json import json
import logging import logging
import os import os
import unicodedata
from collections import Counter from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field from dataclasses import dataclass, field
from multiprocessing import JoinableQueue, Process, Queue from multiprocessing import JoinableQueue, Process, Queue
from pathlib import Path from pathlib import Path
@@ -16,7 +16,14 @@ from typing import Dict, Iterable, List, Optional
from espeak_phonemizer import Phonemizer from espeak_phonemizer import Phonemizer
from .norm_audio import cache_norm_audio, make_silence_detector from .norm_audio import cache_norm_audio, make_silence_detector
from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize, MAX_PHONEMES from .phonemize import (
ALPHABETS,
DEFAULT_PHONEME_ID_MAP,
MAX_PHONEMES,
PhonemeType,
phonemes_to_ids,
phonemize,
)
_LOGGER = logging.getLogger("preprocess") _LOGGER = logging.getLogger("preprocess")
@@ -49,6 +56,20 @@ def main() -> None:
parser.add_argument( parser.add_argument(
"--speaker-id", type=int, help="Add speaker id to single speaker dataset" "--speaker-id", type=int, help="Add speaker id to single speaker dataset"
) )
#
parser.add_argument(
"--phoneme-type",
choices=list(PhonemeType),
default=PhonemeType.ESPEAK,
help="Type of phonemes to use (default: espeak)",
)
parser.add_argument(
"--text-casing",
choices=("ignore", "lower", "upper", "casefold"),
default="ignore",
help="Casing applied to utterance text",
)
#
parser.add_argument( parser.add_argument(
"--skip-audio", action="store_true", help="Don't preprocess audio" "--skip-audio", action="store_true", help="Don't preprocess audio"
) )
@@ -89,7 +110,12 @@ def main() -> None:
_LOGGER.debug("Counting number of speakers/utterances in the dataset") _LOGGER.debug("Counting number of speakers/utterances in the dataset")
speaker_counts: Counter[str] = Counter() speaker_counts: Counter[str] = Counter()
num_utterances = 0 num_utterances = 0
for utt in make_dataset(args.input_dir, args.single_speaker, args.speaker_id): for utt in make_dataset(
args.input_dir,
args.single_speaker,
args.speaker_id,
args.skip_audio,
):
speaker = utt.speaker or "" speaker = utt.speaker or ""
speaker_counts[speaker] += 1 speaker_counts[speaker] += 1
num_utterances += 1 num_utterances += 1
@@ -121,8 +147,11 @@ def main() -> None:
"voice": args.language, "voice": args.language,
}, },
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8}, "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
"phoneme_type": str(args.phoneme_type),
"phoneme_map": {}, "phoneme_map": {},
"phoneme_id_map": DEFAULT_PHONEME_ID_MAP, "phoneme_id_map": ALPHABETS[args.language]
if args.phoneme_type == PhonemeType.TEXT
else DEFAULT_PHONEME_ID_MAP,
"num_symbols": MAX_PHONEMES, "num_symbols": MAX_PHONEMES,
"num_speakers": len(speaker_counts), "num_speakers": len(speaker_counts),
"speaker_id_map": speaker_ids, "speaker_id_map": speaker_ids,
@@ -143,8 +172,13 @@ def main() -> None:
queue_out: "Queue[Optional[Utterance]]" = Queue() queue_out: "Queue[Optional[Utterance]]" = Queue()
# Start workers # Start workers
if args.phoneme_type == PhonemeType.TEXT:
target = phonemize_batch_text
else:
target = phonemize_batch_espeak
processes = [ processes = [
Process(target=process_batch, args=(args, queue_in, queue_out)) Process(target=target, args=(args, queue_in, queue_out))
for _ in range(args.max_workers) for _ in range(args.max_workers)
] ]
for proc in processes: for proc in processes:
@@ -155,7 +189,12 @@ def main() -> None:
) )
with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file: with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
for utt_batch in batched( for utt_batch in batched(
make_dataset(args.input_dir, args.single_speaker, args.speaker_id), make_dataset(
args.input_dir,
args.single_speaker,
args.speaker_id,
args.skip_audio,
),
batch_size, batch_size,
): ):
queue_in.put(utt_batch) queue_in.put(utt_batch)
@@ -200,8 +239,24 @@ def main() -> None:
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def get_text_casing(casing: str):
    """Return the text transform selected by a casing name.

    "lower", "upper" and "casefold" select the ``str`` method of the same
    name. Any other value (for example "ignore") selects a function that
    returns its argument unchanged.
    """
    transforms = {
        "lower": str.lower,
        "upper": str.upper,
        "casefold": str.casefold,
    }
    if casing in transforms:
        return transforms[casing]

    # Unrecognized name: leave the text as it is.
    return lambda s: s
def phonemize_batch_espeak(
args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
try: try:
casing = get_text_casing(args.text_casing)
silence_detector = make_silence_detector() silence_detector = make_silence_detector()
phonemizer = Phonemizer(default_voice=args.language) phonemizer = Phonemizer(default_voice=args.language)
@@ -213,7 +268,7 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
for utt in utt_batch: for utt in utt_batch:
try: try:
_LOGGER.debug(utt) _LOGGER.debug(utt)
utt.phonemes = phonemize(utt.text, phonemizer) utt.phonemes = phonemize(casing(utt.text), phonemizer)
utt.phoneme_ids = phonemes_to_ids( utt.phoneme_ids = phonemes_to_ids(
utt.phonemes, utt.phonemes,
missing_phonemes=utt.missing_phonemes, missing_phonemes=utt.missing_phonemes,
@@ -234,7 +289,49 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
queue_in.task_done() queue_in.task_done()
except Exception: except Exception:
_LOGGER.exception("process_batch") _LOGGER.exception("phonemize_batch_espeak")
def phonemize_batch_text(
    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
    """Worker loop that turns utterance text directly into phoneme ids.

    Batches are read from ``queue_in`` until a ``None`` batch arrives. For
    every utterance, the text is re-cased according to ``args.text_casing``,
    NFD-normalized and split into single characters; each character found in
    ``ALPHABETS[args.language]`` contributes its ids, every other character
    is counted in ``utt.missing_phonemes``. Unless ``args.skip_audio`` is set,
    ``cache_norm_audio`` is called and its two results are stored on the
    utterance. Finished utterances are put on ``queue_out``.

    Exceptions never propagate to the caller; they are logged instead.
    """
    try:
        apply_casing = get_text_casing(args.text_casing)
        detector = make_silence_detector()
        symbol_ids = ALPHABETS[args.language]

        # A None batch ends the loop.
        for utt_batch in iter(queue_in.get, None):
            for utt in utt_batch:
                try:
                    _LOGGER.debug(utt)
                    normalized = unicodedata.normalize("NFD", apply_casing(utt.text))
                    utt.phonemes = list(normalized)
                    utt.phoneme_ids = []
                    for phoneme in utt.phonemes:
                        ids = symbol_ids.get(phoneme)
                        if ids is None:
                            # Character is not part of the alphabet.
                            utt.missing_phonemes[phoneme] += 1
                        else:
                            utt.phoneme_ids.extend(ids)

                    if not args.skip_audio:
                        norm_path, spec_path = cache_norm_audio(
                            utt.audio_path,
                            args.cache_dir,
                            detector,
                            args.sample_rate,
                        )
                        utt.audio_norm_path = norm_path
                        utt.audio_spec_path = spec_path

                    queue_out.put(utt)
                except TimeoutError:
                    # NOTE(review): nothing is put on queue_out in this
                    # branch - confirm the consumer tolerates that.
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
                except Exception:
                    _LOGGER.exception("Failed to process utterance: %s", utt)
                    # Placeholder for the failed utterance; presumably keeps
                    # the consumer's result count in step - verify in main().
                    queue_out.put(None)

            # One task_done() per batch taken from the joinable queue.
            queue_in.task_done()
    except Exception:
        _LOGGER.exception("phonemize_batch_text")
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@@ -261,7 +358,10 @@ class PathEncoder(json.JSONEncoder):
def ljspeech_dataset( def ljspeech_dataset(
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None dataset_dir: Path,
is_single_speaker: bool,
speaker_id: Optional[int] = None,
skip_audio: bool = False,
) -> Iterable[Utterance]: ) -> Iterable[Utterance]:
# filename|speaker|text # filename|speaker|text
# speaker is optional # speaker is optional
@@ -298,7 +398,7 @@ def ljspeech_dataset(
# Try with .wav # Try with .wav
wav_path = wav_dir / f"{filename}.wav" wav_path = wav_dir / f"{filename}.wav"
if not wav_path.exists(): if (not skip_audio) and (not wav_path.exists()):
_LOGGER.warning("Missing %s", filename) _LOGGER.warning("Missing %s", filename)
continue continue
@@ -308,7 +408,10 @@ def ljspeech_dataset(
def mycroft_dataset( def mycroft_dataset(
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None dataset_dir: Path,
is_single_speaker: bool,
speaker_id: Optional[int] = None,
skip_audio: bool = False,
) -> Iterable[Utterance]: ) -> Iterable[Utterance]:
speaker_id = 0 speaker_id = 0
for metadata_path in dataset_dir.glob("**/*-metadata.txt"): for metadata_path in dataset_dir.glob("**/*-metadata.txt"):

View File

@@ -4,6 +4,7 @@ import csv
import sys import sys
from collections import Counter, defaultdict from collections import Counter, defaultdict
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--speaker-number", type=int) parser.add_argument("--speaker-number", type=int)