mirror of
https://github.com/pstrueb/piper.git
synced 2026-04-18 06:15:30 +00:00
Extending phoneme set to 256
This commit is contained in:
43
src/python/piper_train/check_phonemes.py
Normal file
43
src/python/piper_train/check_phonemes.py
Normal file
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python3
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
|
||||
from .phonemize import DEFAULT_PHONEME_ID_MAP
|
||||
|
||||
|
||||
def main() -> None:
|
||||
missing_phonemes: Counter[str] = Counter()
|
||||
|
||||
for line in sys.stdin:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
utt = json.loads(line)
|
||||
for phoneme in utt["phonemes"]:
|
||||
if phoneme not in DEFAULT_PHONEME_ID_MAP:
|
||||
missing_phonemes[phoneme] += 1
|
||||
|
||||
if missing_phonemes:
|
||||
print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)
|
||||
writer = csv.writer(sys.stdout)
|
||||
for phoneme, count in missing_phonemes.most_common():
|
||||
hex_phoneme = hex(ord(phoneme))
|
||||
writer.writerow(
|
||||
(
|
||||
phoneme,
|
||||
unicodedata.category(phoneme),
|
||||
unicodedata.name(phoneme),
|
||||
f"\\u{hex_phoneme}",
|
||||
count,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -4,6 +4,7 @@ from typing import Dict, Iterable, List, Mapping, Optional
|
||||
|
||||
from espeak_phonemizer import Phonemizer
|
||||
|
||||
MAX_PHONEMES = 256
|
||||
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
||||
"_": [0],
|
||||
"^": [1],
|
||||
@@ -135,6 +136,25 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
||||
"χ": [127],
|
||||
"ᵻ": [128],
|
||||
"ⱱ": [129],
|
||||
"0": [130], # tones
|
||||
"1": [131],
|
||||
"2": [132],
|
||||
"3": [133],
|
||||
"4": [134],
|
||||
"5": [135],
|
||||
"6": [136],
|
||||
"7": [137],
|
||||
"8": [138],
|
||||
"9": [139],
|
||||
"\u0327": [140], # combining cedilla
|
||||
"\u0303": [141], # combining tilde
|
||||
"\u032a": [142], # combining bridge below
|
||||
"\u032f": [143], # combining inverted breve below
|
||||
"\u0329": [144], # combining vertical line below
|
||||
"ʰ": [145],
|
||||
"ˤ": [146],
|
||||
"ε": [147],
|
||||
"": [148],
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ import logging
|
||||
import os
|
||||
from collections import Counter
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from multiprocessing import JoinableQueue, Process, Queue
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional
|
||||
@@ -16,7 +16,7 @@ from typing import Dict, Iterable, List, Optional
|
||||
from espeak_phonemizer import Phonemizer
|
||||
|
||||
from .norm_audio import cache_norm_audio, make_silence_detector
|
||||
from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize
|
||||
from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize, MAX_PHONEMES
|
||||
|
||||
_LOGGER = logging.getLogger("preprocess")
|
||||
|
||||
@@ -49,6 +49,9 @@ def main() -> None:
|
||||
parser.add_argument(
|
||||
"--speaker-id", type=int, help="Add speaker id to single speaker dataset"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-audio", action="store_true", help="Don't preprocess audio"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--debug", action="store_true", help="Print DEBUG messages to the console"
|
||||
)
|
||||
@@ -120,9 +123,7 @@ def main() -> None:
|
||||
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
|
||||
"phoneme_map": {},
|
||||
"phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
|
||||
"num_symbols": len(
|
||||
set(itertools.chain.from_iterable(DEFAULT_PHONEME_ID_MAP.values()))
|
||||
),
|
||||
"num_symbols": MAX_PHONEMES,
|
||||
"num_speakers": len(speaker_counts),
|
||||
"speaker_id_map": speaker_ids,
|
||||
},
|
||||
@@ -160,21 +161,33 @@ def main() -> None:
|
||||
queue_in.put(utt_batch)
|
||||
|
||||
_LOGGER.debug("Waiting for jobs to finish")
|
||||
missing_phonemes: Counter[str] = Counter()
|
||||
for _ in range(num_utterances):
|
||||
utt = queue_out.get()
|
||||
if utt is not None:
|
||||
if utt.speaker is not None:
|
||||
utt.speaker_id = speaker_ids[utt.speaker]
|
||||
|
||||
utt_dict = dataclasses.asdict(utt)
|
||||
utt_dict.pop("missing_phonemes")
|
||||
|
||||
# JSONL
|
||||
json.dump(
|
||||
dataclasses.asdict(utt),
|
||||
utt_dict,
|
||||
dataset_file,
|
||||
ensure_ascii=False,
|
||||
cls=PathEncoder,
|
||||
)
|
||||
print("", file=dataset_file)
|
||||
|
||||
missing_phonemes.update(utt.missing_phonemes)
|
||||
|
||||
if missing_phonemes:
|
||||
for phoneme, count in missing_phonemes.most_common():
|
||||
_LOGGER.warning("Missing %s (%s)", phoneme, count)
|
||||
|
||||
_LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))
|
||||
|
||||
# Signal workers to stop
|
||||
for proc in processes:
|
||||
queue_in.put(None)
|
||||
@@ -201,13 +214,17 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
|
||||
try:
|
||||
_LOGGER.debug(utt)
|
||||
utt.phonemes = phonemize(utt.text, phonemizer)
|
||||
utt.phoneme_ids = phonemes_to_ids(utt.phonemes)
|
||||
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
|
||||
utt.audio_path,
|
||||
args.cache_dir,
|
||||
silence_detector,
|
||||
args.sample_rate,
|
||||
utt.phoneme_ids = phonemes_to_ids(
|
||||
utt.phonemes,
|
||||
missing_phonemes=utt.missing_phonemes,
|
||||
)
|
||||
if not args.skip_audio:
|
||||
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
|
||||
utt.audio_path,
|
||||
args.cache_dir,
|
||||
silence_detector,
|
||||
args.sample_rate,
|
||||
)
|
||||
queue_out.put(utt)
|
||||
except TimeoutError:
|
||||
_LOGGER.error("Skipping utterance due to timeout: %s", utt)
|
||||
@@ -233,6 +250,7 @@ class Utterance:
|
||||
phoneme_ids: Optional[List[int]] = None
|
||||
audio_norm_path: Optional[Path] = None
|
||||
audio_spec_path: Optional[Path] = None
|
||||
missing_phonemes: Counter[str] = field(default_factory=Counter)
|
||||
|
||||
|
||||
class PathEncoder(json.JSONEncoder):
|
||||
|
||||
Reference in New Issue
Block a user