mirror of
https://github.com/pstrueb/piper.git
synced 2026-04-21 07:14:49 +00:00
Use piper-phonemize instead
This commit is contained in:
@@ -1,372 +0,0 @@
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from enum import Enum
|
||||
from typing import Dict, Iterable, List, Mapping, Optional
|
||||
|
||||
from espeak_phonemizer import Phonemizer
|
||||
|
||||
|
||||
class PhonemeType(str, Enum):
    """Source of the phonemes for an utterance."""

    ESPEAK = "espeak"
    """Phonemes come from espeak-ng"""

    TEXT = "text"
    """Phonemes come from text itself"""
|
||||
|
||||
|
||||
# Upper bound on the number of phoneme ids in a model's symbol table.
MAX_PHONEMES = 256

# Default mapping from a single-codepoint phoneme (espeak-ng IPA output,
# NFD-decomposed) to its list of integer ids.  "_", "^", and "$" are the
# pad / beginning-of-sentence / end-of-sentence markers used as defaults
# by phonemes_to_ids().
# NOTE(review): ASCII "g" is absent while IPA "ɡ" (U+0261, id 66) is
# present — presumably espeak emits only the IPA glyph; confirm against
# espeak-ng output before treating this as a gap.
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
    "_": [0],  # pad
    "^": [1],  # beginning of sentence
    "$": [2],  # end of sentence
    " ": [3],
    "!": [4],
    "'": [5],
    "(": [6],
    ")": [7],
    ",": [8],
    "-": [9],
    ".": [10],
    ":": [11],
    ";": [12],
    "?": [13],
    "a": [14],
    "b": [15],
    "c": [16],
    "d": [17],
    "e": [18],
    "f": [19],
    "h": [20],
    "i": [21],
    "j": [22],
    "k": [23],
    "l": [24],
    "m": [25],
    "n": [26],
    "o": [27],
    "p": [28],
    "q": [29],
    "r": [30],
    "s": [31],
    "t": [32],
    "u": [33],
    "v": [34],
    "w": [35],
    "x": [36],
    "y": [37],
    "z": [38],
    "æ": [39],
    "ç": [40],
    "ð": [41],
    "ø": [42],
    "ħ": [43],
    "ŋ": [44],
    "œ": [45],
    "ǀ": [46],
    "ǁ": [47],
    "ǂ": [48],
    "ǃ": [49],
    "ɐ": [50],
    "ɑ": [51],
    "ɒ": [52],
    "ɓ": [53],
    "ɔ": [54],
    "ɕ": [55],
    "ɖ": [56],
    "ɗ": [57],
    "ɘ": [58],
    "ə": [59],
    "ɚ": [60],
    "ɛ": [61],
    "ɜ": [62],
    "ɞ": [63],
    "ɟ": [64],
    "ɠ": [65],
    "ɡ": [66],
    "ɢ": [67],
    "ɣ": [68],
    "ɤ": [69],
    "ɥ": [70],
    "ɦ": [71],
    "ɧ": [72],
    "ɨ": [73],
    "ɪ": [74],
    "ɫ": [75],
    "ɬ": [76],
    "ɭ": [77],
    "ɮ": [78],
    "ɯ": [79],
    "ɰ": [80],
    "ɱ": [81],
    "ɲ": [82],
    "ɳ": [83],
    "ɴ": [84],
    "ɵ": [85],
    "ɶ": [86],
    "ɸ": [87],
    "ɹ": [88],
    "ɺ": [89],
    "ɻ": [90],
    "ɽ": [91],
    "ɾ": [92],
    "ʀ": [93],
    "ʁ": [94],
    "ʂ": [95],
    "ʃ": [96],
    "ʄ": [97],
    "ʈ": [98],
    "ʉ": [99],
    "ʊ": [100],
    "ʋ": [101],
    "ʌ": [102],
    "ʍ": [103],
    "ʎ": [104],
    "ʏ": [105],
    "ʐ": [106],
    "ʑ": [107],
    "ʒ": [108],
    "ʔ": [109],
    "ʕ": [110],
    "ʘ": [111],
    "ʙ": [112],
    "ʛ": [113],
    "ʜ": [114],
    "ʝ": [115],
    "ʟ": [116],
    "ʡ": [117],
    "ʢ": [118],
    "ʲ": [119],
    "ˈ": [120],
    "ˌ": [121],
    "ː": [122],
    "ˑ": [123],
    "˞": [124],
    "β": [125],
    "θ": [126],
    "χ": [127],
    "ᵻ": [128],
    "ⱱ": [129],
    "0": [130],  # tones
    "1": [131],
    "2": [132],
    "3": [133],
    "4": [134],
    "5": [135],
    "6": [136],
    "7": [137],
    "8": [138],
    "9": [139],
    "\u0327": [140],  # combining cedilla
    "\u0303": [141],  # combining tilde
    "\u032a": [142],  # combining bridge below
    "\u032f": [143],  # combining inverted breve below
    "\u0329": [144],  # combining vertical line below
    "ʰ": [145],
    "ˤ": [146],
    "ε": [147],
    "↓": [148],
    "#": [149],  # Icelandic
    '"': [150],  # Russian
    "↑": [151],
    "\u033a": [152],  # Basque
    "\u033b": [153],
}
|
||||
|
||||
# Optional per-language phoneme substitutions applied by phonemize()
# before id lookup; each phoneme may expand to a list of phonemes.
PHONEME_MAPS = {
    # Brazilian Portuguese
    "pt-br": {"c": ["k"]}
}
|
||||
|
||||
# Per-language codepoint-to-id maps used with PhonemeType.TEXT, where the
# "phonemes" are the NFD-decomposed characters of the text itself.
ALPHABETS = {
    # Ukrainian
    "uk": {
        "_": [0],  # pad
        "^": [1],  # beginning of sentence
        "$": [2],  # end of sentence
        " ": [3],
        "!": [4],
        "'": [5],
        ",": [6],
        "-": [7],
        ".": [8],
        ":": [9],
        ";": [10],
        "?": [11],
        "а": [12],
        "б": [13],
        "в": [14],
        "г": [15],
        "ґ": [16],
        "д": [17],
        "е": [18],
        "є": [19],
        "ж": [20],
        "з": [21],
        "и": [22],
        "і": [23],
        "ї": [24],
        "й": [25],
        "к": [26],
        "л": [27],
        "м": [28],
        "н": [29],
        "о": [30],
        "п": [31],
        "р": [32],
        "с": [33],
        "т": [34],
        "у": [35],
        "ф": [36],
        "х": [37],
        "ц": [38],
        "ч": [39],
        "ш": [40],
        "щ": [41],
        "ь": [42],
        "ю": [43],
        "я": [44],
        "\u0301": [45],  # combining acute accent
        "\u0306": [46],  # combining breve
        "\u0308": [47],  # combining diaeresis
        "—": [48],  # em dash
    }
}
|
||||
|
||||
|
||||
def phonemize(
    text: str,
    phonemizer: Phonemizer,
    phoneme_map: Optional[Dict[str, List[str]]] = None,
) -> List[str]:
    """Convert text into a list of single-codepoint phonemes via espeak.

    The phonemizer output is NFD-normalized so accented phonemes split
    into a base codepoint plus combining marks.  If *phoneme_map* is
    given (and non-empty), each phoneme may be replaced by a list of
    substitute phonemes; unmapped phonemes pass through unchanged.
    """
    ipa = phonemizer.phonemize(text=text, keep_clause_breakers=True)

    # One phoneme per unicode codepoint (decomposed form).
    codepoints = list(unicodedata.normalize("NFD", ipa))
    if not phoneme_map:
        return codepoints

    # Substitute mapped phonemes; an absent (or empty) mapping keeps the
    # original phoneme.
    return [
        out_phoneme
        for in_phoneme in codepoints
        for out_phoneme in (phoneme_map.get(in_phoneme) or [in_phoneme])
    ]
|
||||
|
||||
|
||||
def phonemes_to_ids(
    phonemes: Iterable[str],
    phoneme_id_map: Optional[Mapping[str, Iterable[int]]] = None,
    missing_phonemes: "Optional[Counter[str]]" = None,
    pad: Optional[str] = "_",
    bos: Optional[str] = "^",
    eos: Optional[str] = "$",
) -> List[int]:
    """Translate phonemes to integer ids.

    Output is ``bos, pad, p1, pad, p2, pad, ..., eos`` (each marker
    omitted when its argument is falsy).  Phonemes absent from the map
    are skipped and, when *missing_phonemes* is supplied, tallied there.
    """
    id_map = DEFAULT_PHONEME_ID_MAP if phoneme_id_map is None else phoneme_id_map

    ids: List[int] = []

    # Sentence start marker followed by an initial pad.
    for marker in (bos, pad):
        if marker:
            ids.extend(id_map[marker])

    for phoneme in phonemes:
        phoneme_ids = id_map.get(phoneme)
        if phoneme_ids:
            ids.extend(phoneme_ids)
            # Interleave a pad after every mapped phoneme.
            if pad:
                ids.extend(id_map[pad])
        elif missing_phonemes is not None:
            # Make note of missing phonemes
            missing_phonemes[phoneme] += 1

    if eos:
        ids.extend(id_map[eos])

    return ids
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> None:
    """Read utterances from stdin; write one JSON phoneme record per line.

    Each non-empty input line is phonemized — via espeak-ng, or directly
    from the text's own codepoints for ``--phoneme-type text`` — then
    converted to phoneme ids.  Phonemes with no id mapping are tallied
    and reported on stderr after all input is consumed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("language")
    parser.add_argument(
        "--phoneme-type",
        choices=list(PhonemeType),
        default=PhonemeType.ESPEAK,
        help="Type of phonemes to use (default: espeak)",
    )
    parser.add_argument(
        "--text-casing",
        choices=("ignore", "lower", "upper", "casefold"),
        default="ignore",
        help="Casing applied to utterance text",
    )
    args = parser.parse_args()

    phonemizer: Optional[Phonemizer] = None

    # BUG FIX: "casefold" was accepted in choices but previously fell
    # through to the identity ("ignore") branch; dispatch every casing
    # choice explicitly.
    casing = {
        "lower": str.lower,
        "upper": str.upper,
        "casefold": str.casefold,
    }.get(args.text_casing, lambda s: s)

    if args.phoneme_type == PhonemeType.TEXT:
        # Use text directly: ids come from the language's alphabet.
        phoneme_id_map = ALPHABETS[args.language]
    else:
        # Use eSpeak
        phonemizer = Phonemizer(args.language)
        phoneme_id_map = DEFAULT_PHONEME_ID_MAP

    phoneme_map = PHONEME_MAPS.get(args.language)
    missing_phonemes: "Counter[str]" = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        if args.phoneme_type == PhonemeType.TEXT:
            # One phoneme per NFD-decomposed codepoint of the text itself.
            phonemes = list(unicodedata.normalize("NFD", casing(line)))
        else:
            assert phonemizer is not None
            phonemes = phonemize(line, phonemizer, phoneme_map=phoneme_map)

        phoneme_ids = phonemes_to_ids(
            phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes
        )
        json.dump(
            {
                "text": line,
                "phonemes": phonemes,
                "phoneme_ids": phoneme_ids,
            },
            sys.stdout,
            ensure_ascii=False,
        )
        print("")

    if missing_phonemes:
        print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr)
        for phoneme, count in missing_phonemes.most_common():
            print(phoneme, count, file=sys.stderr)


if __name__ == "__main__":
    main()
|
||||
@@ -9,28 +9,37 @@ import os
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from multiprocessing import JoinableQueue, Process, Queue
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional
|
||||
|
||||
from espeak_phonemizer import Phonemizer
|
||||
from piper_phonemize import (
|
||||
phonemize_espeak,
|
||||
phonemize_codepoints,
|
||||
phoneme_ids_espeak,
|
||||
phoneme_ids_codepoints,
|
||||
get_codepoints_map,
|
||||
get_espeak_map,
|
||||
get_max_phonemes,
|
||||
tashkeel_run,
|
||||
)
|
||||
|
||||
from .norm_audio import cache_norm_audio, make_silence_detector
|
||||
from .phonemize import (
|
||||
ALPHABETS,
|
||||
DEFAULT_PHONEME_ID_MAP,
|
||||
MAX_PHONEMES,
|
||||
PHONEME_MAPS,
|
||||
PhonemeType,
|
||||
phonemes_to_ids,
|
||||
phonemize,
|
||||
)
|
||||
|
||||
_DIR = Path(__file__).parent
|
||||
_VERSION = (_DIR / "VERSION").read_text(encoding="utf-8").strip()
|
||||
_LOGGER = logging.getLogger("preprocess")
|
||||
|
||||
|
||||
class PhonemeType(str, Enum):
|
||||
ESPEAK = "espeak"
|
||||
"""Phonemes come from espeak-ng"""
|
||||
|
||||
TEXT = "text"
|
||||
"""Phonemes come from text itself"""
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
@@ -150,10 +159,10 @@ def main() -> None:
|
||||
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
|
||||
"phoneme_type": args.phoneme_type.value,
|
||||
"phoneme_map": {},
|
||||
"phoneme_id_map": ALPHABETS[args.language]
|
||||
"phoneme_id_map": get_codepoints_map()[args.language]
|
||||
if args.phoneme_type == PhonemeType.TEXT
|
||||
else DEFAULT_PHONEME_ID_MAP,
|
||||
"num_symbols": MAX_PHONEMES,
|
||||
else get_espeak_map(),
|
||||
"num_symbols": get_max_phonemes(),
|
||||
"num_speakers": len(speaker_counts),
|
||||
"speaker_id_map": speaker_ids,
|
||||
"piper_version": _VERSION,
|
||||
@@ -255,8 +264,6 @@ def phonemize_batch_espeak(
|
||||
try:
|
||||
casing = get_text_casing(args.text_casing)
|
||||
silence_detector = make_silence_detector()
|
||||
phonemizer = Phonemizer(default_voice=args.language)
|
||||
phoneme_map = PHONEME_MAPS.get(args.language)
|
||||
|
||||
while True:
|
||||
utt_batch = queue_in.get()
|
||||
@@ -266,10 +273,15 @@ def phonemize_batch_espeak(
|
||||
for utt in utt_batch:
|
||||
try:
|
||||
_LOGGER.debug(utt)
|
||||
utt.phonemes = phonemize(
|
||||
casing(utt.text), phonemizer, phoneme_map=phoneme_map
|
||||
)
|
||||
utt.phoneme_ids = phonemes_to_ids(
|
||||
all_phonemes = phonemize_espeak(casing(utt.text), args.language)
|
||||
|
||||
# Flatten
|
||||
utt.phonemes = [
|
||||
phoneme
|
||||
for sentence_phonemes in all_phonemes
|
||||
for phoneme in sentence_phonemes
|
||||
]
|
||||
utt.phoneme_ids = phoneme_ids_espeak(
|
||||
utt.phonemes,
|
||||
missing_phonemes=utt.missing_phonemes,
|
||||
)
|
||||
@@ -298,7 +310,6 @@ def phonemize_batch_text(
|
||||
try:
|
||||
casing = get_text_casing(args.text_casing)
|
||||
silence_detector = make_silence_detector()
|
||||
alphabet = ALPHABETS[args.language]
|
||||
|
||||
while True:
|
||||
utt_batch = queue_in.get()
|
||||
@@ -308,10 +319,16 @@ def phonemize_batch_text(
|
||||
for utt in utt_batch:
|
||||
try:
|
||||
_LOGGER.debug(utt)
|
||||
utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
|
||||
utt.phoneme_ids = phonemes_to_ids(
|
||||
all_phonemes = phonemize_codepoints(casing(utt.text))
|
||||
# Flatten
|
||||
utt.phonemes = [
|
||||
phoneme
|
||||
for sentence_phonemes in all_phonemes
|
||||
for phoneme in sentence_phonemes
|
||||
]
|
||||
utt.phoneme_ids = phoneme_ids_codepoints(
|
||||
args.language,
|
||||
utt.phonemes,
|
||||
phoneme_id_map=alphabet,
|
||||
missing_phonemes=utt.missing_phonemes,
|
||||
)
|
||||
if not args.skip_audio:
|
||||
|
||||
Reference in New Issue
Block a user