mirror of
https://github.com/pstrueb/piper.git
synced 2026-04-17 22:05:30 +00:00
Add text phonemes to preprocess
This commit is contained in:
@@ -45,7 +45,7 @@ def main() -> None:
|
|||||||
"category": unicodedata.category(phoneme),
|
"category": unicodedata.category(phoneme),
|
||||||
}
|
}
|
||||||
for phoneme, count in missing_phonemes.most_common()
|
for phoneme, count in missing_phonemes.most_common()
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
sys.stdout,
|
sys.stdout,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -41,7 +40,6 @@ def main():
|
|||||||
model_g = model.model_g
|
model_g = model.model_g
|
||||||
|
|
||||||
num_symbols = model_g.n_vocab
|
num_symbols = model_g.n_vocab
|
||||||
num_speakers = model_g.n_speakers
|
|
||||||
|
|
||||||
# Inference only
|
# Inference only
|
||||||
model_g.eval()
|
model_g.eval()
|
||||||
|
|||||||
@@ -3,10 +3,20 @@ import json
|
|||||||
import sys
|
import sys
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
from enum import Enum
|
||||||
from typing import Dict, Iterable, List, Mapping, Optional
|
from typing import Dict, Iterable, List, Mapping, Optional
|
||||||
|
|
||||||
from espeak_phonemizer import Phonemizer
|
from espeak_phonemizer import Phonemizer
|
||||||
|
|
||||||
|
|
||||||
|
class PhonemeType(str, Enum):
|
||||||
|
ESPEAK = "espeak"
|
||||||
|
"""Phonemes come from espeak-ng"""
|
||||||
|
|
||||||
|
TEXT = "text"
|
||||||
|
"""Phonemes come from text itself"""
|
||||||
|
|
||||||
|
|
||||||
MAX_PHONEMES = 256
|
MAX_PHONEMES = 256
|
||||||
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
||||||
"_": [0],
|
"_": [0],
|
||||||
@@ -162,6 +172,57 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
|||||||
'"': [150], # Russian
|
'"': [150], # Russian
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ALPHABETS = {
|
||||||
|
# Ukrainian
|
||||||
|
"uk": {
|
||||||
|
"_": [0],
|
||||||
|
"^": [1],
|
||||||
|
"$": [2],
|
||||||
|
" ": [3],
|
||||||
|
"!": [4],
|
||||||
|
"'": [5],
|
||||||
|
",": [6],
|
||||||
|
"-": [7],
|
||||||
|
".": [8],
|
||||||
|
":": [9],
|
||||||
|
";": [10],
|
||||||
|
"?": [11],
|
||||||
|
"а": [12],
|
||||||
|
"б": [13],
|
||||||
|
"в": [14],
|
||||||
|
"г": [15],
|
||||||
|
"ґ": [16],
|
||||||
|
"д": [17],
|
||||||
|
"е": [18],
|
||||||
|
"є": [19],
|
||||||
|
"ж": [20],
|
||||||
|
"з": [21],
|
||||||
|
"и": [22],
|
||||||
|
"і": [23],
|
||||||
|
"ї": [24],
|
||||||
|
"й": [25],
|
||||||
|
"к": [26],
|
||||||
|
"л": [27],
|
||||||
|
"м": [28],
|
||||||
|
"н": [29],
|
||||||
|
"о": [30],
|
||||||
|
"п": [31],
|
||||||
|
"р": [32],
|
||||||
|
"с": [33],
|
||||||
|
"т": [34],
|
||||||
|
"у": [35],
|
||||||
|
"ф": [36],
|
||||||
|
"х": [37],
|
||||||
|
"ц": [38],
|
||||||
|
"ч": [39],
|
||||||
|
"ш": [40],
|
||||||
|
"щ": [41],
|
||||||
|
"ь": [42],
|
||||||
|
"ю": [43],
|
||||||
|
"я": [44],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def phonemize(text: str, phonemizer: Phonemizer) -> List[str]:
|
def phonemize(text: str, phonemizer: Phonemizer) -> List[str]:
|
||||||
phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)
|
phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)
|
||||||
|
|||||||
@@ -6,8 +6,8 @@ import itertools
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import unicodedata
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from multiprocessing import JoinableQueue, Process, Queue
|
from multiprocessing import JoinableQueue, Process, Queue
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -16,7 +16,14 @@ from typing import Dict, Iterable, List, Optional
|
|||||||
from espeak_phonemizer import Phonemizer
|
from espeak_phonemizer import Phonemizer
|
||||||
|
|
||||||
from .norm_audio import cache_norm_audio, make_silence_detector
|
from .norm_audio import cache_norm_audio, make_silence_detector
|
||||||
from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize, MAX_PHONEMES
|
from .phonemize import (
|
||||||
|
ALPHABETS,
|
||||||
|
DEFAULT_PHONEME_ID_MAP,
|
||||||
|
MAX_PHONEMES,
|
||||||
|
PhonemeType,
|
||||||
|
phonemes_to_ids,
|
||||||
|
phonemize,
|
||||||
|
)
|
||||||
|
|
||||||
_LOGGER = logging.getLogger("preprocess")
|
_LOGGER = logging.getLogger("preprocess")
|
||||||
|
|
||||||
@@ -49,6 +56,20 @@ def main() -> None:
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--speaker-id", type=int, help="Add speaker id to single speaker dataset"
|
"--speaker-id", type=int, help="Add speaker id to single speaker dataset"
|
||||||
)
|
)
|
||||||
|
#
|
||||||
|
parser.add_argument(
|
||||||
|
"--phoneme-type",
|
||||||
|
choices=list(PhonemeType),
|
||||||
|
default=PhonemeType.ESPEAK,
|
||||||
|
help="Type of phonemes to use (default: espeak)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--text-casing",
|
||||||
|
choices=("ignore", "lower", "upper", "casefold"),
|
||||||
|
default="ignore",
|
||||||
|
help="Casing applied to utterance text",
|
||||||
|
)
|
||||||
|
#
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--skip-audio", action="store_true", help="Don't preprocess audio"
|
"--skip-audio", action="store_true", help="Don't preprocess audio"
|
||||||
)
|
)
|
||||||
@@ -89,7 +110,12 @@ def main() -> None:
|
|||||||
_LOGGER.debug("Counting number of speakers/utterances in the dataset")
|
_LOGGER.debug("Counting number of speakers/utterances in the dataset")
|
||||||
speaker_counts: Counter[str] = Counter()
|
speaker_counts: Counter[str] = Counter()
|
||||||
num_utterances = 0
|
num_utterances = 0
|
||||||
for utt in make_dataset(args.input_dir, args.single_speaker, args.speaker_id):
|
for utt in make_dataset(
|
||||||
|
args.input_dir,
|
||||||
|
args.single_speaker,
|
||||||
|
args.speaker_id,
|
||||||
|
args.skip_audio,
|
||||||
|
):
|
||||||
speaker = utt.speaker or ""
|
speaker = utt.speaker or ""
|
||||||
speaker_counts[speaker] += 1
|
speaker_counts[speaker] += 1
|
||||||
num_utterances += 1
|
num_utterances += 1
|
||||||
@@ -121,8 +147,11 @@ def main() -> None:
|
|||||||
"voice": args.language,
|
"voice": args.language,
|
||||||
},
|
},
|
||||||
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
|
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
|
||||||
|
"phoneme_type": str(args.phoneme_type),
|
||||||
"phoneme_map": {},
|
"phoneme_map": {},
|
||||||
"phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
|
"phoneme_id_map": ALPHABETS[args.language]
|
||||||
|
if args.phoneme_type == PhonemeType.TEXT
|
||||||
|
else DEFAULT_PHONEME_ID_MAP,
|
||||||
"num_symbols": MAX_PHONEMES,
|
"num_symbols": MAX_PHONEMES,
|
||||||
"num_speakers": len(speaker_counts),
|
"num_speakers": len(speaker_counts),
|
||||||
"speaker_id_map": speaker_ids,
|
"speaker_id_map": speaker_ids,
|
||||||
@@ -143,8 +172,13 @@ def main() -> None:
|
|||||||
queue_out: "Queue[Optional[Utterance]]" = Queue()
|
queue_out: "Queue[Optional[Utterance]]" = Queue()
|
||||||
|
|
||||||
# Start workers
|
# Start workers
|
||||||
|
if args.phoneme_type == PhonemeType.TEXT:
|
||||||
|
target = phonemize_batch_text
|
||||||
|
else:
|
||||||
|
target = phonemize_batch_espeak
|
||||||
|
|
||||||
processes = [
|
processes = [
|
||||||
Process(target=process_batch, args=(args, queue_in, queue_out))
|
Process(target=target, args=(args, queue_in, queue_out))
|
||||||
for _ in range(args.max_workers)
|
for _ in range(args.max_workers)
|
||||||
]
|
]
|
||||||
for proc in processes:
|
for proc in processes:
|
||||||
@@ -155,7 +189,12 @@ def main() -> None:
|
|||||||
)
|
)
|
||||||
with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
|
with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
|
||||||
for utt_batch in batched(
|
for utt_batch in batched(
|
||||||
make_dataset(args.input_dir, args.single_speaker, args.speaker_id),
|
make_dataset(
|
||||||
|
args.input_dir,
|
||||||
|
args.single_speaker,
|
||||||
|
args.speaker_id,
|
||||||
|
args.skip_audio,
|
||||||
|
),
|
||||||
batch_size,
|
batch_size,
|
||||||
):
|
):
|
||||||
queue_in.put(utt_batch)
|
queue_in.put(utt_batch)
|
||||||
@@ -200,8 +239,24 @@ def main() -> None:
|
|||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue):
|
def get_text_casing(casing: str):
|
||||||
|
if casing == "lower":
|
||||||
|
return str.lower
|
||||||
|
|
||||||
|
if casing == "upper":
|
||||||
|
return str.upper
|
||||||
|
|
||||||
|
if casing == "casefold":
|
||||||
|
return str.casefold
|
||||||
|
|
||||||
|
return lambda s: s
|
||||||
|
|
||||||
|
|
||||||
|
def phonemize_batch_espeak(
|
||||||
|
args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
|
casing = get_text_casing(args.text_casing)
|
||||||
silence_detector = make_silence_detector()
|
silence_detector = make_silence_detector()
|
||||||
phonemizer = Phonemizer(default_voice=args.language)
|
phonemizer = Phonemizer(default_voice=args.language)
|
||||||
|
|
||||||
@@ -213,7 +268,7 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
|
|||||||
for utt in utt_batch:
|
for utt in utt_batch:
|
||||||
try:
|
try:
|
||||||
_LOGGER.debug(utt)
|
_LOGGER.debug(utt)
|
||||||
utt.phonemes = phonemize(utt.text, phonemizer)
|
utt.phonemes = phonemize(casing(utt.text), phonemizer)
|
||||||
utt.phoneme_ids = phonemes_to_ids(
|
utt.phoneme_ids = phonemes_to_ids(
|
||||||
utt.phonemes,
|
utt.phonemes,
|
||||||
missing_phonemes=utt.missing_phonemes,
|
missing_phonemes=utt.missing_phonemes,
|
||||||
@@ -234,7 +289,49 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
|
|||||||
|
|
||||||
queue_in.task_done()
|
queue_in.task_done()
|
||||||
except Exception:
|
except Exception:
|
||||||
_LOGGER.exception("process_batch")
|
_LOGGER.exception("phonemize_batch_espeak")
|
||||||
|
|
||||||
|
|
||||||
|
def phonemize_batch_text(
|
||||||
|
args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
|
||||||
|
):
|
||||||
|
try:
|
||||||
|
casing = get_text_casing(args.text_casing)
|
||||||
|
silence_detector = make_silence_detector()
|
||||||
|
alphabet = ALPHABETS[args.language]
|
||||||
|
|
||||||
|
while True:
|
||||||
|
utt_batch = queue_in.get()
|
||||||
|
if utt_batch is None:
|
||||||
|
break
|
||||||
|
|
||||||
|
for utt in utt_batch:
|
||||||
|
try:
|
||||||
|
_LOGGER.debug(utt)
|
||||||
|
utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
|
||||||
|
utt.phoneme_ids = []
|
||||||
|
for phoneme in utt.phonemes:
|
||||||
|
if phoneme in alphabet:
|
||||||
|
utt.phoneme_ids.extend(alphabet[phoneme])
|
||||||
|
else:
|
||||||
|
utt.missing_phonemes[phoneme] += 1
|
||||||
|
if not args.skip_audio:
|
||||||
|
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
|
||||||
|
utt.audio_path,
|
||||||
|
args.cache_dir,
|
||||||
|
silence_detector,
|
||||||
|
args.sample_rate,
|
||||||
|
)
|
||||||
|
queue_out.put(utt)
|
||||||
|
except TimeoutError:
|
||||||
|
_LOGGER.error("Skipping utterance due to timeout: %s", utt)
|
||||||
|
except Exception:
|
||||||
|
_LOGGER.exception("Failed to process utterance: %s", utt)
|
||||||
|
queue_out.put(None)
|
||||||
|
|
||||||
|
queue_in.task_done()
|
||||||
|
except Exception:
|
||||||
|
_LOGGER.exception("phonemize_batch_text")
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@@ -261,7 +358,10 @@ class PathEncoder(json.JSONEncoder):
|
|||||||
|
|
||||||
|
|
||||||
def ljspeech_dataset(
|
def ljspeech_dataset(
|
||||||
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
|
dataset_dir: Path,
|
||||||
|
is_single_speaker: bool,
|
||||||
|
speaker_id: Optional[int] = None,
|
||||||
|
skip_audio: bool = False,
|
||||||
) -> Iterable[Utterance]:
|
) -> Iterable[Utterance]:
|
||||||
# filename|speaker|text
|
# filename|speaker|text
|
||||||
# speaker is optional
|
# speaker is optional
|
||||||
@@ -298,7 +398,7 @@ def ljspeech_dataset(
|
|||||||
# Try with .wav
|
# Try with .wav
|
||||||
wav_path = wav_dir / f"{filename}.wav"
|
wav_path = wav_dir / f"{filename}.wav"
|
||||||
|
|
||||||
if not wav_path.exists():
|
if (not skip_audio) and (not wav_path.exists()):
|
||||||
_LOGGER.warning("Missing %s", filename)
|
_LOGGER.warning("Missing %s", filename)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -308,7 +408,10 @@ def ljspeech_dataset(
|
|||||||
|
|
||||||
|
|
||||||
def mycroft_dataset(
|
def mycroft_dataset(
|
||||||
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
|
dataset_dir: Path,
|
||||||
|
is_single_speaker: bool,
|
||||||
|
speaker_id: Optional[int] = None,
|
||||||
|
skip_audio: bool = False,
|
||||||
) -> Iterable[Utterance]:
|
) -> Iterable[Utterance]:
|
||||||
speaker_id = 0
|
speaker_id = 0
|
||||||
for metadata_path in dataset_dir.glob("**/*-metadata.txt"):
|
for metadata_path in dataset_dir.glob("**/*-metadata.txt"):
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import csv
|
|||||||
import sys
|
import sys
|
||||||
from collections import Counter, defaultdict
|
from collections import Counter, defaultdict
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--speaker-number", type=int)
|
parser.add_argument("--speaker-number", type=int)
|
||||||
|
|||||||
Reference in New Issue
Block a user