Extending phoneme set to 256

This commit is contained in:
Michael Hansen
2023-05-07 10:58:37 -05:00
parent 7d9a59ab91
commit d62340b68e
3 changed files with 93 additions and 12 deletions

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
import csv
import json
import sys
import unicodedata
from collections import Counter
from .phonemize import DEFAULT_PHONEME_ID_MAP
def main() -> None:
    """Report phonemes from stdin that are absent from DEFAULT_PHONEME_ID_MAP.

    Reads JSON Lines from standard input (blank lines are skipped). Each
    object must have a "phonemes" entry that iterates over phoneme strings.
    Every phoneme that is not a key of DEFAULT_PHONEME_ID_MAP is counted.

    If anything is missing, a summary line is printed to standard error and
    one CSV row per missing phoneme is written to standard output, most
    frequent first, with the columns:

        phoneme, Unicode category, Unicode name, escape sequence, count

    Raises:
        json.JSONDecodeError: if a non-blank input line is not valid JSON.
        KeyError: if an utterance object has no "phonemes" entry.
    """
    missing_phonemes: Counter[str] = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        utt = json.loads(line)
        missing_phonemes.update(
            phoneme
            for phoneme in utt["phonemes"]
            if phoneme not in DEFAULT_PHONEME_ID_MAP
        )

    if not missing_phonemes:
        return

    print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)

    writer = csv.writer(sys.stdout)
    for phoneme, count in missing_phonemes.most_common():
        # Describe the phoneme code point by code point so that an entry made
        # of several code points (or an empty one) does not abort the report.
        categories = " ".join(unicodedata.category(char) for char in phoneme)

        # Private-use and control characters have no Unicode name; without a
        # default, unicodedata.name() raises ValueError for them.
        names = " ".join(unicodedata.name(char, "") for char in phoneme)

        # hex() already prepends "0x", which previously produced "\u0x327".
        # Emit a real Python escape instead: \uXXXX for the BMP and
        # \UXXXXXXXX for code points above it.
        escapes = "".join(
            f"\\u{ord(char):04x}" if ord(char) <= 0xFFFF else f"\\U{ord(char):08x}"
            for char in phoneme
        )

        writer.writerow((phoneme, categories, names, escapes, count))


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()

View File

@@ -4,6 +4,7 @@ from typing import Dict, Iterable, List, Mapping, Optional
from espeak_phonemizer import Phonemizer
MAX_PHONEMES = 256
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
"_": [0],
"^": [1],
@@ -135,6 +136,25 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
"χ": [127],
"": [128],
"": [129],
"0": [130], # tones
"1": [131],
"2": [132],
"3": [133],
"4": [134],
"5": [135],
"6": [136],
"7": [137],
"8": [138],
"9": [139],
"\u0327": [140], # combining cedilla
"\u0303": [141], # combining tilde
"\u032a": [142], # combining bridge below
"\u032f": [143], # combining inverted breve below
"\u0329": [144], # combining vertical line below
"ʰ": [145],
"ˤ": [146],
"ε": [147],
"": [148],
}

View File

@@ -8,7 +8,7 @@ import logging
import os
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from dataclasses import dataclass, field
from multiprocessing import JoinableQueue, Process, Queue
from pathlib import Path
from typing import Dict, Iterable, List, Optional
@@ -16,7 +16,7 @@ from typing import Dict, Iterable, List, Optional
from espeak_phonemizer import Phonemizer
from .norm_audio import cache_norm_audio, make_silence_detector
from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize
from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize, MAX_PHONEMES
_LOGGER = logging.getLogger("preprocess")
@@ -49,6 +49,9 @@ def main() -> None:
parser.add_argument(
"--speaker-id", type=int, help="Add speaker id to single speaker dataset"
)
parser.add_argument(
"--skip-audio", action="store_true", help="Don't preprocess audio"
)
parser.add_argument(
"--debug", action="store_true", help="Print DEBUG messages to the console"
)
@@ -120,9 +123,7 @@ def main() -> None:
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
"phoneme_map": {},
"phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
"num_symbols": len(
set(itertools.chain.from_iterable(DEFAULT_PHONEME_ID_MAP.values()))
),
"num_symbols": MAX_PHONEMES,
"num_speakers": len(speaker_counts),
"speaker_id_map": speaker_ids,
},
@@ -160,21 +161,33 @@ def main() -> None:
queue_in.put(utt_batch)
_LOGGER.debug("Waiting for jobs to finish")
missing_phonemes: Counter[str] = Counter()
for _ in range(num_utterances):
utt = queue_out.get()
if utt is not None:
if utt.speaker is not None:
utt.speaker_id = speaker_ids[utt.speaker]
utt_dict = dataclasses.asdict(utt)
utt_dict.pop("missing_phonemes")
# JSONL
json.dump(
dataclasses.asdict(utt),
utt_dict,
dataset_file,
ensure_ascii=False,
cls=PathEncoder,
)
print("", file=dataset_file)
missing_phonemes.update(utt.missing_phonemes)
if missing_phonemes:
for phoneme, count in missing_phonemes.most_common():
_LOGGER.warning("Missing %s (%s)", phoneme, count)
_LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))
# Signal workers to stop
for proc in processes:
queue_in.put(None)
@@ -201,13 +214,17 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
try:
_LOGGER.debug(utt)
utt.phonemes = phonemize(utt.text, phonemizer)
utt.phoneme_ids = phonemes_to_ids(utt.phonemes)
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
utt.audio_path,
args.cache_dir,
silence_detector,
args.sample_rate,
utt.phoneme_ids = phonemes_to_ids(
utt.phonemes,
missing_phonemes=utt.missing_phonemes,
)
if not args.skip_audio:
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
utt.audio_path,
args.cache_dir,
silence_detector,
args.sample_rate,
)
queue_out.put(utt)
except TimeoutError:
_LOGGER.error("Skipping utterance due to timeout: %s", utt)
@@ -233,6 +250,7 @@ class Utterance:
phoneme_ids: Optional[List[int]] = None
audio_norm_path: Optional[Path] = None
audio_spec_path: Optional[Path] = None
missing_phonemes: Counter[str] = field(default_factory=Counter)
class PathEncoder(json.JSONEncoder):