Add text phonemes to preprocess

This commit is contained in:
Michael Hansen
2023-06-08 15:37:20 -05:00
parent a66ac02ca7
commit a7eab8ed03
5 changed files with 178 additions and 15 deletions

View File

@@ -45,7 +45,7 @@ def main() -> None:
"category": unicodedata.category(phoneme),
}
for phoneme, count in missing_phonemes.most_common()
}
},
},
sys.stdout,
)

View File

@@ -2,7 +2,6 @@
import argparse
import logging
from pathlib import Path
from typing import Optional
import torch
@@ -41,7 +40,6 @@ def main():
model_g = model.model_g
num_symbols = model_g.n_vocab
num_speakers = model_g.n_speakers
# Inference only
model_g.eval()

View File

@@ -3,10 +3,20 @@ import json
import sys
import unicodedata
from collections import Counter
from enum import Enum
from typing import Dict, Iterable, List, Mapping, Optional
from espeak_phonemizer import Phonemizer
class PhonemeType(str, Enum):
    """Source of the phonemes used for a voice.

    Members are also ``str`` instances, so they compare equal to their plain
    string values (``PhonemeType.TEXT == "text"``).
    """

    # Phonemes come from espeak-ng
    ESPEAK = "espeak"

    # Phonemes come from text itself
    TEXT = "text"

    def __str__(self) -> str:
        """Return the plain value (e.g. ``"espeak"``).

        Without this override, ``str()`` of a ``(str, Enum)`` member yields
        ``"PhonemeType.ESPEAK"``, which is not what should be written to a
        config file or shown as a command-line choice.
        """
        return self.value
MAX_PHONEMES = 256
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
"_": [0],
@@ -162,6 +172,57 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
'"': [150], # Russian
}
# Per-language character -> phoneme id tables used when phonemes are taken
# directly from the text instead of from espeak-ng.
#
# Utterance text is NFD-normalized before lookup, so precomposed letters are
# split into a base letter plus combining marks (e.g. "ї" becomes "і" followed
# by U+0308). Each table must therefore list the combining marks its letters
# decompose into, otherwise those marks are always reported as missing.
ALPHABETS: Dict[str, Dict[str, List[int]]] = {
    # Ukrainian
    "uk": {
        "_": [0],
        "^": [1],
        "$": [2],
        " ": [3],
        "!": [4],
        "'": [5],
        ",": [6],
        "-": [7],
        ".": [8],
        ":": [9],
        ";": [10],
        "?": [11],
        "а": [12],
        "б": [13],
        "в": [14],
        "г": [15],
        "ґ": [16],
        "д": [17],
        "е": [18],
        "є": [19],
        "ж": [20],
        "з": [21],
        "и": [22],
        "і": [23],
        "ї": [24],
        "й": [25],
        "к": [26],
        "л": [27],
        "м": [28],
        "н": [29],
        "о": [30],
        "п": [31],
        "р": [32],
        "с": [33],
        "т": [34],
        "у": [35],
        "ф": [36],
        "х": [37],
        "ц": [38],
        "ч": [39],
        "ш": [40],
        "щ": [41],
        "ь": [42],
        "ю": [43],
        "я": [44],
        # Combining marks produced by NFD normalization
        "\u0301": [45],  # combining acute accent (stress mark)
        "\u0306": [46],  # combining breve ("й" -> "и" + U+0306)
        "\u0308": [47],  # combining diaeresis ("ї" -> "і" + U+0308)
    }
}
def phonemize(text: str, phonemizer: Phonemizer) -> List[str]:
phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)

View File

@@ -6,8 +6,8 @@ import itertools
import json
import logging
import os
import unicodedata
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from multiprocessing import JoinableQueue, Process, Queue
from pathlib import Path
@@ -16,7 +16,14 @@ from typing import Dict, Iterable, List, Optional
from espeak_phonemizer import Phonemizer
from .norm_audio import cache_norm_audio, make_silence_detector
from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize, MAX_PHONEMES
from .phonemize import (
ALPHABETS,
DEFAULT_PHONEME_ID_MAP,
MAX_PHONEMES,
PhonemeType,
phonemes_to_ids,
phonemize,
)
_LOGGER = logging.getLogger("preprocess")
@@ -49,6 +56,20 @@ def main() -> None:
parser.add_argument(
"--speaker-id", type=int, help="Add speaker id to single speaker dataset"
)
#
parser.add_argument(
"--phoneme-type",
choices=list(PhonemeType),
default=PhonemeType.ESPEAK,
help="Type of phonemes to use (default: espeak)",
)
parser.add_argument(
"--text-casing",
choices=("ignore", "lower", "upper", "casefold"),
default="ignore",
help="Casing applied to utterance text",
)
#
parser.add_argument(
"--skip-audio", action="store_true", help="Don't preprocess audio"
)
@@ -89,7 +110,12 @@ def main() -> None:
_LOGGER.debug("Counting number of speakers/utterances in the dataset")
speaker_counts: Counter[str] = Counter()
num_utterances = 0
for utt in make_dataset(args.input_dir, args.single_speaker, args.speaker_id):
for utt in make_dataset(
args.input_dir,
args.single_speaker,
args.speaker_id,
args.skip_audio,
):
speaker = utt.speaker or ""
speaker_counts[speaker] += 1
num_utterances += 1
@@ -121,8 +147,11 @@ def main() -> None:
"voice": args.language,
},
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
"phoneme_type": str(args.phoneme_type),
"phoneme_map": {},
"phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
"phoneme_id_map": ALPHABETS[args.language]
if args.phoneme_type == PhonemeType.TEXT
else DEFAULT_PHONEME_ID_MAP,
"num_symbols": MAX_PHONEMES,
"num_speakers": len(speaker_counts),
"speaker_id_map": speaker_ids,
@@ -143,8 +172,13 @@ def main() -> None:
queue_out: "Queue[Optional[Utterance]]" = Queue()
# Start workers
if args.phoneme_type == PhonemeType.TEXT:
target = phonemize_batch_text
else:
target = phonemize_batch_espeak
processes = [
Process(target=process_batch, args=(args, queue_in, queue_out))
Process(target=target, args=(args, queue_in, queue_out))
for _ in range(args.max_workers)
]
for proc in processes:
@@ -155,7 +189,12 @@ def main() -> None:
)
with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
for utt_batch in batched(
make_dataset(args.input_dir, args.single_speaker, args.speaker_id),
make_dataset(
args.input_dir,
args.single_speaker,
args.speaker_id,
args.skip_audio,
),
batch_size,
):
queue_in.put(utt_batch)
@@ -200,8 +239,24 @@ def main() -> None:
# -----------------------------------------------------------------------------
def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue):
def get_text_casing(casing: str):
    """Return the callable that applies the requested casing to a string.

    "lower", "upper" and "casefold" select the matching ``str`` method. Any
    other value (including "ignore") yields a function that returns its
    argument unchanged.
    """
    transforms = (
        ("lower", str.lower),
        ("upper", str.upper),
        ("casefold", str.casefold),
    )
    for mode, transform in transforms:
        if casing == mode:
            return transform

    # Unrecognized mode: leave the text as it is
    return lambda s: s
def phonemize_batch_espeak(
args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
try:
casing = get_text_casing(args.text_casing)
silence_detector = make_silence_detector()
phonemizer = Phonemizer(default_voice=args.language)
@@ -213,7 +268,7 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
for utt in utt_batch:
try:
_LOGGER.debug(utt)
utt.phonemes = phonemize(utt.text, phonemizer)
utt.phonemes = phonemize(casing(utt.text), phonemizer)
utt.phoneme_ids = phonemes_to_ids(
utt.phonemes,
missing_phonemes=utt.missing_phonemes,
@@ -234,7 +289,49 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
queue_in.task_done()
except Exception:
_LOGGER.exception("process_batch")
_LOGGER.exception("phonemize_batch_espeak")
# Worker entry point used when phonemes are taken from the text itself:
# reads utterance batches from queue_in until a None sentinel arrives, turns
# each utterance's text into per-character phonemes and ids, optionally
# normalizes its audio, and puts the utterance on queue_out.
# NOTE(review): indentation is not preserved in this view, so the nesting of
# the trailing queue_out.put(None) / queue_in.task_done() calls should be
# confirmed against the real file.
def phonemize_batch_text(
args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
try:
casing = get_text_casing(args.text_casing)
silence_detector = make_silence_detector()
# Character -> id table for the selected language; an unknown language
# raises KeyError, which the outermost handler below logs
alphabet = ALPHABETS[args.language]
while True:
utt_batch = queue_in.get()
# None is the signal to stop
if utt_batch is None:
break
for utt in utt_batch:
try:
_LOGGER.debug(utt)
# Every code point of the cased, NFD-normalized text is one phoneme.
# NFD splits precomposed letters into a base letter plus combining
# marks, so those marks are looked up in the alphabet individually.
utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
utt.phoneme_ids = []
for phoneme in utt.phonemes:
if phoneme in alphabet:
utt.phoneme_ids.extend(alphabet[phoneme])
else:
# Not in the alphabet: count it and emit no id for it
utt.missing_phonemes[phoneme] += 1
if not args.skip_audio:
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
utt.audio_path,
args.cache_dir,
silence_detector,
args.sample_rate,
)
queue_out.put(utt)
# NOTE(review): the timeout branch only logs; if the consumer expects one
# queue_out item per utterance, a None may be needed here too - confirm
except TimeoutError:
_LOGGER.error("Skipping utterance due to timeout: %s", utt)
except Exception:
_LOGGER.exception("Failed to process utterance: %s", utt)
# presumably a placeholder for the failed utterance - verify against consumer
queue_out.put(None)
queue_in.task_done()
# Last-resort handler: log the failure instead of letting the exception propagate
except Exception:
_LOGGER.exception("phonemize_batch_text")
# -----------------------------------------------------------------------------
@@ -261,7 +358,10 @@ class PathEncoder(json.JSONEncoder):
def ljspeech_dataset(
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
dataset_dir: Path,
is_single_speaker: bool,
speaker_id: Optional[int] = None,
skip_audio: bool = False,
) -> Iterable[Utterance]:
# filename|speaker|text
# speaker is optional
@@ -298,7 +398,7 @@ def ljspeech_dataset(
# Try with .wav
wav_path = wav_dir / f"{filename}.wav"
if not wav_path.exists():
if (not skip_audio) and (not wav_path.exists()):
_LOGGER.warning("Missing %s", filename)
continue
@@ -308,7 +408,10 @@ def ljspeech_dataset(
def mycroft_dataset(
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
dataset_dir: Path,
is_single_speaker: bool,
speaker_id: Optional[int] = None,
skip_audio: bool = False,
) -> Iterable[Utterance]:
speaker_id = 0
for metadata_path in dataset_dir.glob("**/*-metadata.txt"):

View File

@@ -4,6 +4,7 @@ import csv
import sys
from collections import Counter, defaultdict
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--speaker-number", type=int)