mirror of
https://github.com/pstrueb/piper.git
synced 2026-04-16 13:25:30 +00:00
Add text phonemes to preprocess
This commit is contained in:
@@ -45,7 +45,7 @@ def main() -> None:
|
||||
"category": unicodedata.category(phoneme),
|
||||
}
|
||||
for phoneme, count in missing_phonemes.most_common()
|
||||
}
|
||||
},
|
||||
},
|
||||
sys.stdout,
|
||||
)
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
@@ -41,7 +40,6 @@ def main():
|
||||
model_g = model.model_g
|
||||
|
||||
num_symbols = model_g.n_vocab
|
||||
num_speakers = model_g.n_speakers
|
||||
|
||||
# Inference only
|
||||
model_g.eval()
|
||||
|
||||
@@ -3,10 +3,20 @@ import json
|
||||
import sys
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from enum import Enum
|
||||
from typing import Dict, Iterable, List, Mapping, Optional
|
||||
|
||||
from espeak_phonemizer import Phonemizer
|
||||
|
||||
|
||||
class PhonemeType(str, Enum):
    """Where the phoneme symbols of an utterance come from.

    Members are also ``str`` instances, so they compare equal to their plain
    string values (``PhonemeType.TEXT == "text"``).
    """

    # Phonemes come from espeak-ng
    ESPEAK = "espeak"

    # Phonemes come from text itself
    TEXT = "text"

    def __str__(self) -> str:
        """Return the plain value ("espeak" / "text").

        The default ``Enum.__str__`` yields "PhonemeType.ESPEAK", which is not
        what callers expect when they stringify a member (for example before
        writing it into a JSON config or showing it as a CLI choice).
        """
        return self.value
|
||||
|
||||
|
||||
MAX_PHONEMES = 256
|
||||
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
||||
"_": [0],
|
||||
@@ -162,6 +172,57 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
||||
'"': [150], # Russian
|
||||
}
|
||||
|
||||
# Per-language symbol tables used when phonemes are taken directly from the
# text. Outer key: language code. Inner mapping: one symbol -> list of ids.
#
# Text is split into code points after Unicode NFD normalisation, so every
# key must be a single code point that can survive NFD. "й" and "ї" do not:
# NFD turns them into "и" + U+0306 and "і" + U+0308. The two combining marks
# are therefore listed as symbols of their own; without them those letters
# would always be reported as missing phonemes.
ALPHABETS: Dict[str, Dict[str, List[int]]] = {
    # Ukrainian
    "uk": {
        "_": [0],
        "^": [1],
        "$": [2],
        " ": [3],
        "!": [4],
        "'": [5],
        ",": [6],
        "-": [7],
        ".": [8],
        ":": [9],
        ";": [10],
        "?": [11],
        "а": [12],
        "б": [13],
        "в": [14],
        "г": [15],
        "ґ": [16],
        "д": [17],
        "е": [18],
        "є": [19],
        "ж": [20],
        "з": [21],
        "и": [22],
        "і": [23],
        "ї": [24],  # precomposed form; NFD input arrives as "і" + U+0308
        "й": [25],  # precomposed form; NFD input arrives as "и" + U+0306
        "к": [26],
        "л": [27],
        "м": [28],
        "н": [29],
        "о": [30],
        "п": [31],
        "р": [32],
        "с": [33],
        "т": [34],
        "у": [35],
        "ф": [36],
        "х": [37],
        "ц": [38],
        "ч": [39],
        "ш": [40],
        "щ": [41],
        "ь": [42],
        "ю": [43],
        "я": [44],
        "\u0306": [45],  # combining breve (second half of NFD "й")
        "\u0308": [46],  # combining diaeresis (second half of NFD "ї")
    }
}
|
||||
|
||||
|
||||
def phonemize(text: str, phonemizer: Phonemizer) -> List[str]:
|
||||
phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)
|
||||
|
||||
@@ -6,8 +6,8 @@ import itertools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass, field
|
||||
from multiprocessing import JoinableQueue, Process, Queue
|
||||
from pathlib import Path
|
||||
@@ -16,7 +16,14 @@ from typing import Dict, Iterable, List, Optional
|
||||
from espeak_phonemizer import Phonemizer
|
||||
|
||||
from .norm_audio import cache_norm_audio, make_silence_detector
|
||||
from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize, MAX_PHONEMES
|
||||
from .phonemize import (
|
||||
ALPHABETS,
|
||||
DEFAULT_PHONEME_ID_MAP,
|
||||
MAX_PHONEMES,
|
||||
PhonemeType,
|
||||
phonemes_to_ids,
|
||||
phonemize,
|
||||
)
|
||||
|
||||
_LOGGER = logging.getLogger("preprocess")
|
||||
|
||||
@@ -49,6 +56,20 @@ def main() -> None:
|
||||
parser.add_argument(
|
||||
"--speaker-id", type=int, help="Add speaker id to single speaker dataset"
|
||||
)
|
||||
#
|
||||
parser.add_argument(
|
||||
"--phoneme-type",
|
||||
choices=list(PhonemeType),
|
||||
default=PhonemeType.ESPEAK,
|
||||
help="Type of phonemes to use (default: espeak)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--text-casing",
|
||||
choices=("ignore", "lower", "upper", "casefold"),
|
||||
default="ignore",
|
||||
help="Casing applied to utterance text",
|
||||
)
|
||||
#
|
||||
parser.add_argument(
|
||||
"--skip-audio", action="store_true", help="Don't preprocess audio"
|
||||
)
|
||||
@@ -89,7 +110,12 @@ def main() -> None:
|
||||
_LOGGER.debug("Counting number of speakers/utterances in the dataset")
|
||||
speaker_counts: Counter[str] = Counter()
|
||||
num_utterances = 0
|
||||
for utt in make_dataset(args.input_dir, args.single_speaker, args.speaker_id):
|
||||
for utt in make_dataset(
|
||||
args.input_dir,
|
||||
args.single_speaker,
|
||||
args.speaker_id,
|
||||
args.skip_audio,
|
||||
):
|
||||
speaker = utt.speaker or ""
|
||||
speaker_counts[speaker] += 1
|
||||
num_utterances += 1
|
||||
@@ -121,8 +147,11 @@ def main() -> None:
|
||||
"voice": args.language,
|
||||
},
|
||||
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
|
||||
"phoneme_type": str(args.phoneme_type),
|
||||
"phoneme_map": {},
|
||||
"phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
|
||||
"phoneme_id_map": ALPHABETS[args.language]
|
||||
if args.phoneme_type == PhonemeType.TEXT
|
||||
else DEFAULT_PHONEME_ID_MAP,
|
||||
"num_symbols": MAX_PHONEMES,
|
||||
"num_speakers": len(speaker_counts),
|
||||
"speaker_id_map": speaker_ids,
|
||||
@@ -143,8 +172,13 @@ def main() -> None:
|
||||
queue_out: "Queue[Optional[Utterance]]" = Queue()
|
||||
|
||||
# Start workers
|
||||
if args.phoneme_type == PhonemeType.TEXT:
|
||||
target = phonemize_batch_text
|
||||
else:
|
||||
target = phonemize_batch_espeak
|
||||
|
||||
processes = [
|
||||
Process(target=process_batch, args=(args, queue_in, queue_out))
|
||||
Process(target=target, args=(args, queue_in, queue_out))
|
||||
for _ in range(args.max_workers)
|
||||
]
|
||||
for proc in processes:
|
||||
@@ -155,7 +189,12 @@ def main() -> None:
|
||||
)
|
||||
with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
|
||||
for utt_batch in batched(
|
||||
make_dataset(args.input_dir, args.single_speaker, args.speaker_id),
|
||||
make_dataset(
|
||||
args.input_dir,
|
||||
args.single_speaker,
|
||||
args.speaker_id,
|
||||
args.skip_audio,
|
||||
),
|
||||
batch_size,
|
||||
):
|
||||
queue_in.put(utt_batch)
|
||||
@@ -200,8 +239,24 @@ def main() -> None:
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue):
|
||||
def get_text_casing(casing: str):
    """Return the string transform selected by ``casing``.

    ``"lower"``, ``"upper"`` and ``"casefold"`` map to the matching ``str``
    method. Any other value (including ``"ignore"``) yields a function that
    returns its argument unchanged.
    """
    transforms = {
        "lower": str.lower,
        "upper": str.upper,
        "casefold": str.casefold,
    }

    # Unknown / "ignore": leave the text exactly as it is.
    return transforms.get(casing, lambda s: s)
|
||||
|
||||
|
||||
def phonemize_batch_espeak(
|
||||
args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
|
||||
):
|
||||
try:
|
||||
casing = get_text_casing(args.text_casing)
|
||||
silence_detector = make_silence_detector()
|
||||
phonemizer = Phonemizer(default_voice=args.language)
|
||||
|
||||
@@ -213,7 +268,7 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
|
||||
for utt in utt_batch:
|
||||
try:
|
||||
_LOGGER.debug(utt)
|
||||
utt.phonemes = phonemize(utt.text, phonemizer)
|
||||
utt.phonemes = phonemize(casing(utt.text), phonemizer)
|
||||
utt.phoneme_ids = phonemes_to_ids(
|
||||
utt.phonemes,
|
||||
missing_phonemes=utt.missing_phonemes,
|
||||
@@ -234,7 +289,49 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
|
||||
|
||||
queue_in.task_done()
|
||||
except Exception:
|
||||
_LOGGER.exception("process_batch")
|
||||
_LOGGER.exception("phonemize_batch_espeak")
|
||||
|
||||
|
||||
def phonemize_batch_text(
    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
    """Worker loop: turn utterance text directly into phoneme ids.

    Batches of utterances are read from ``queue_in`` until a ``None`` batch
    is received. For each utterance the text is cased according to
    ``args.text_casing``, NFD-normalised and split into code points; each
    code point found in ``ALPHABETS[args.language]`` contributes its ids,
    the others are counted in ``utt.missing_phonemes``. Unless
    ``args.skip_audio`` is set, audio is normalised/cached as well.

    Exactly one item is put on ``queue_out`` per utterance: the utterance
    itself on success, ``None`` on failure.
    """
    try:
        casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()
        alphabet = ALPHABETS[args.language]

        while True:
            utt_batch = queue_in.get()
            if utt_batch is None:
                # Stop signal for this worker.
                break

            for utt in utt_batch:
                try:
                    _LOGGER.debug(utt)
                    utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
                    utt.phoneme_ids = []
                    for phoneme in utt.phonemes:
                        if phoneme in alphabet:
                            utt.phoneme_ids.extend(alphabet[phoneme])
                        else:
                            utt.missing_phonemes[phoneme] += 1

                    if not args.skip_audio:
                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
                            utt.audio_path,
                            args.cache_dir,
                            silence_detector,
                            args.sample_rate,
                        )

                    queue_out.put(utt)
                except TimeoutError:
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
                    # Report the skip like any other failure. Previously
                    # nothing was posted here, so a consumer expecting one
                    # result per utterance would wait forever.
                    # NOTE(review): the consumer is outside this block;
                    # confirm it treats None as "utterance failed".
                    queue_out.put(None)
                except Exception:
                    _LOGGER.exception("Failed to process utterance: %s", utt)
                    queue_out.put(None)

            # One task_done() per batch taken from the queue.
            queue_in.task_done()
    except Exception:
        # Anything escaping the loop (setup failure, queue error) ends
        # this worker; it is logged rather than raised.
        _LOGGER.exception("phonemize_batch_text")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@@ -261,7 +358,10 @@ class PathEncoder(json.JSONEncoder):
|
||||
|
||||
|
||||
def ljspeech_dataset(
|
||||
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
|
||||
dataset_dir: Path,
|
||||
is_single_speaker: bool,
|
||||
speaker_id: Optional[int] = None,
|
||||
skip_audio: bool = False,
|
||||
) -> Iterable[Utterance]:
|
||||
# filename|speaker|text
|
||||
# speaker is optional
|
||||
@@ -298,7 +398,7 @@ def ljspeech_dataset(
|
||||
# Try with .wav
|
||||
wav_path = wav_dir / f"{filename}.wav"
|
||||
|
||||
if not wav_path.exists():
|
||||
if (not skip_audio) and (not wav_path.exists()):
|
||||
_LOGGER.warning("Missing %s", filename)
|
||||
continue
|
||||
|
||||
@@ -308,7 +408,10 @@ def ljspeech_dataset(
|
||||
|
||||
|
||||
def mycroft_dataset(
|
||||
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
|
||||
dataset_dir: Path,
|
||||
is_single_speaker: bool,
|
||||
speaker_id: Optional[int] = None,
|
||||
skip_audio: bool = False,
|
||||
) -> Iterable[Utterance]:
|
||||
speaker_id = 0
|
||||
for metadata_path in dataset_dir.glob("**/*-metadata.txt"):
|
||||
|
||||
@@ -4,6 +4,7 @@ import csv
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--speaker-number", type=int)
|
||||
|
||||
Reference in New Issue
Block a user