From d41572021480d430433d0131c4f9c2589af2c129 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Sun, 30 Jul 2023 10:15:40 -0500 Subject: [PATCH 1/9] Add ro test sentences --- etc/test_sentences/ro.txt | 4 ++++ etc/test_sentences/test_ro.jsonl | 4 ++++ 2 files changed, 8 insertions(+) create mode 100644 etc/test_sentences/ro.txt create mode 100644 etc/test_sentences/test_ro.jsonl diff --git a/etc/test_sentences/ro.txt b/etc/test_sentences/ro.txt new file mode 100644 index 0000000..5044be4 --- /dev/null +++ b/etc/test_sentences/ro.txt @@ -0,0 +1,4 @@ +Curcubeul este un fenomen optic și meteorologic atmosferic care se manifestă prin apariția pe cer a unui spectru de forma unui arc colorat atunci când lumina soarelui se refractă în picăturile de apă din atmosferă. +De cele mai multe ori curcubeul se observă după ploaie, când soarele este apropiat de orizont. +În condiții bune de lumină, în fața peretelui de ploaie, un curcubeu secundar este vizibil deasupra curcubeului principal. +Acesta este mai slab din cauza dublei reflexii a luminii în picăturile de apă și are o secvență de culori opusă. 
diff --git a/etc/test_sentences/test_ro.jsonl b/etc/test_sentences/test_ro.jsonl new file mode 100644 index 0000000..3d1d299 --- /dev/null +++ b/etc/test_sentences/test_ro.jsonl @@ -0,0 +1,4 @@ +{"phoneme_ids":[1,0,23,0,121,0,33,0,30,0,23,0,33,0,15,0,120,0,18,0,33,0,24,0,3,0,22,0,121,0,18,0,31,0,32,0,18,0,3,0,33,0,26,0,3,0,19,0,121,0,18,0,26,0,27,0,25,0,120,0,18,0,26,0,3,0,120,0,27,0,28,0,32,0,21,0,23,0,3,0,96,0,21,0,3,0,25,0,121,0,18,0,32,0,18,0,27,0,92,0,27,0,24,0,120,0,27,0,17,0,108,0,21,0,23,0,3,0,121,0,14,0,32,0,25,0,27,0,31,0,19,0,120,0,18,0,92,0,21,0,23,0,3,0,23,0,121,0,14,0,92,0,18,0,3,0,31,0,18,0,3,0,25,0,121,0,14,0,26,0,21,0,19,0,120,0,18,0,31,0,32,0,59,0,3,0,28,0,30,0,21,0,26,0,3,0,121,0,14,0,28,0,14,0,92,0,120,0,21,0,32,0,31,0,22,0,14,0,3,0,28,0,18,0,3,0,32,0,96,0,120,0,18,0,30,0,3,0,14,0,3,0,121,0,33,0,26,0,33,0,121,0,21,0,3,0,31,0,28,0,120,0,18,0,23,0,32,0,30,0,33,0,3,0,17,0,18,0,3,0,19,0,120,0,27,0,30,0,25,0,14,0,3,0,121,0,33,0,26,0,33,0,74,0,3,0,120,0,14,0,30,0,23,0,3,0,23,0,121,0,27,0,24,0,27,0,92,0,120,0,14,0,32,0,3,0,14,0,32,0,120,0,33,0,26,0,32,0,96,0,119,0,3,0,23,0,73,0,26,0,17,0,3,0,24,0,33,0,25,0,120,0,21,0,26,0,14,0,3,0,31,0,54,0,14,0,92,0,120,0,18,0,24,0,33,0,74,0,3,0,31,0,18,0,3,0,30,0,18,0,19,0,30,0,120,0,14,0,23,0,32,0,59,0,3,0,73,0,26,0,3,0,28,0,121,0,21,0,23,0,59,0,32,0,120,0,33,0,30,0,21,0,24,0,18,0,3,0,17,0,18,0,3,0,120,0,14,0,28,0,59,0,3,0,17,0,21,0,26,0,3,0,14,0,32,0,25,0,120,0,27,0,31,0,19,0,18,0,92,0,121,0,59,0,10,0,2],"phonemes":["k","ˌ","u","r","k","u","b","ˈ","e","u","l"," ","j","ˌ","e","s","t","e"," ","u","n"," ","f","ˌ","e","n","o","m","ˈ","e","n"," ","ˈ","o","p","t","i","k"," ","ʃ","i"," ","m","ˌ","e","t","e","o","ɾ","o","l","ˈ","o","d","ʒ","i","k"," ","ˌ","a","t","m","o","s","f","ˈ","e","ɾ","i","k"," ","k","ˌ","a","ɾ","e"," ","s","e"," ","m","ˌ","a","n","i","f","ˈ","e","s","t","ə"," ","p","r","i","n"," ","ˌ","a","p","a","ɾ","ˈ","i","t","s","j","a"," ","p","e"," ","t","ʃ","ˈ","e","r"," ","a"," ","ˌ","u","n","u","ˌ","i"," 
","s","p","ˈ","e","k","t","r","u"," ","d","e"," ","f","ˈ","o","r","m","a"," ","ˌ","u","n","u","ɪ"," ","ˈ","a","r","k"," ","k","ˌ","o","l","o","ɾ","ˈ","a","t"," ","a","t","ˈ","u","n","t","ʃ","ʲ"," ","k","ɨ","n","d"," ","l","u","m","ˈ","i","n","a"," ","s","ɔ","a","ɾ","ˈ","e","l","u","ɪ"," ","s","e"," ","r","e","f","r","ˈ","a","k","t","ə"," ","ɨ","n"," ","p","ˌ","i","k","ə","t","ˈ","u","r","i","l","e"," ","d","e"," ","ˈ","a","p","ə"," ","d","i","n"," ","a","t","m","ˈ","o","s","f","e","ɾ","ˌ","ə","."],"processed_text":"Curcubeul este un fenomen optic și meteorologic atmosferic care se manifestă prin apariția pe cer a unui spectru de forma unui arc colorat atunci când lumina soarelui se refractă în picăturile de apă din atmosferă.","text":"Curcubeul este un fenomen optic și meteorologic atmosferic care se manifestă prin apariția pe cer a unui spectru de forma unui arc colorat atunci când lumina soarelui se refractă în picăturile de apă din atmosferă."} +{"phoneme_ids":[1,0,17,0,18,0,3,0,32,0,96,0,18,0,24,0,18,0,3,0,25,0,14,0,74,0,3,0,25,0,120,0,33,0,24,0,32,0,18,0,3,0,121,0,27,0,92,0,119,0,119,0,3,0,23,0,121,0,33,0,30,0,23,0,33,0,15,0,120,0,18,0,33,0,24,0,3,0,31,0,18,0,3,0,27,0,15,0,31,0,120,0,18,0,30,0,34,0,59,0,3,0,17,0,120,0,33,0,28,0,59,0,3,0,28,0,24,0,120,0,54,0,14,0,22,0,18,0,8,0,3,0,23,0,73,0,26,0,17,0,3,0,31,0,54,0,14,0,92,0,120,0,18,0,24,0,18,0,3,0,22,0,121,0,18,0,31,0,32,0,18,0,3,0,121,0,14,0,28,0,30,0,27,0,28,0,22,0,120,0,14,0,32,0,3,0,17,0,18,0,3,0,121,0,27,0,92,0,21,0,38,0,120,0,27,0,26,0,32,0,10,0,2],"phonemes":["d","e"," ","t","ʃ","e","l","e"," ","m","a","ɪ"," ","m","ˈ","u","l","t","e"," ","ˌ","o","ɾ","ʲ","ʲ"," ","k","ˌ","u","r","k","u","b","ˈ","e","u","l"," ","s","e"," ","o","b","s","ˈ","e","r","v","ə"," ","d","ˈ","u","p","ə"," ","p","l","ˈ","ɔ","a","j","e",","," ","k","ɨ","n","d"," ","s","ɔ","a","ɾ","ˈ","e","l","e"," ","j","ˌ","e","s","t","e"," ","ˌ","a","p","r","o","p","j","ˈ","a","t"," ","d","e"," 
","ˌ","o","ɾ","i","z","ˈ","o","n","t","."],"processed_text":"De cele mai multe ori curcubeul se observă după ploaie, când soarele este apropiat de orizont.","text":"De cele mai multe ori curcubeul se observă după ploaie, când soarele este apropiat de orizont."} +{"phoneme_ids":[1,0,73,0,26,0,3,0,23,0,27,0,26,0,17,0,120,0,21,0,32,0,31,0,21,0,74,0,3,0,15,0,120,0,33,0,26,0,18,0,3,0,17,0,18,0,3,0,24,0,33,0,25,0,120,0,21,0,26,0,59,0,8,0,3,0,73,0,26,0,3,0,19,0,120,0,14,0,32,0,31,0,14,0,3,0,28,0,121,0,18,0,92,0,18,0,32,0,120,0,18,0,24,0,33,0,74,0,3,0,17,0,18,0,3,0,28,0,24,0,120,0,54,0,14,0,22,0,18,0,8,0,3,0,33,0,26,0,3,0,23,0,121,0,33,0,30,0,23,0,33,0,15,0,120,0,18,0,100,0,3,0,31,0,121,0,18,0,23,0,33,0,26,0,17,0,120,0,14,0,30,0,3,0,22,0,121,0,18,0,31,0,32,0,18,0,3,0,34,0,21,0,38,0,120,0,21,0,15,0,21,0,24,0,3,0,17,0,18,0,14,0,31,0,120,0,33,0,28,0,30,0,14,0,3,0,23,0,121,0,33,0,30,0,23,0,33,0,15,0,120,0,18,0,33,0,24,0,33,0,74,0,3,0,28,0,30,0,121,0,21,0,26,0,32,0,96,0,21,0,28,0,120,0,14,0,24,0,10,0,2],"phonemes":["ɨ","n"," ","k","o","n","d","ˈ","i","t","s","i","ɪ"," ","b","ˈ","u","n","e"," ","d","e"," ","l","u","m","ˈ","i","n","ə",","," ","ɨ","n"," ","f","ˈ","a","t","s","a"," ","p","ˌ","e","ɾ","e","t","ˈ","e","l","u","ɪ"," ","d","e"," ","p","l","ˈ","ɔ","a","j","e",","," ","u","n"," ","k","ˌ","u","r","k","u","b","ˈ","e","ʊ"," ","s","ˌ","e","k","u","n","d","ˈ","a","r"," ","j","ˌ","e","s","t","e"," ","v","i","z","ˈ","i","b","i","l"," ","d","e","a","s","ˈ","u","p","r","a"," ","k","ˌ","u","r","k","u","b","ˈ","e","u","l","u","ɪ"," ","p","r","ˌ","i","n","t","ʃ","i","p","ˈ","a","l","."],"processed_text":"În condiții bune de lumină, în fața peretelui de ploaie, un curcubeu secundar este vizibil deasupra curcubeului principal.","text":"În condiții bune de lumină, în fața peretelui de ploaie, un curcubeu secundar este vizibil deasupra curcubeului principal."} 
+{"phoneme_ids":[1,0,14,0,32,0,96,0,121,0,18,0,31,0,32,0,14,0,3,0,22,0,121,0,18,0,31,0,32,0,18,0,3,0,25,0,14,0,74,0,3,0,31,0,24,0,120,0,14,0,15,0,3,0,17,0,21,0,26,0,3,0,23,0,14,0,120,0,33,0,38,0,14,0,3,0,17,0,120,0,33,0,15,0,24,0,18,0,74,0,3,0,30,0,18,0,19,0,24,0,120,0,18,0,23,0,31,0,21,0,74,0,3,0,14,0,3,0,24,0,33,0,25,0,120,0,21,0,26,0,21,0,74,0,3,0,73,0,26,0,3,0,28,0,121,0,21,0,23,0,59,0,32,0,120,0,33,0,30,0,21,0,24,0,18,0,3,0,17,0,18,0,3,0,120,0,14,0,28,0,59,0,3,0,96,0,21,0,3,0,121,0,14,0,92,0,18,0,3,0,27,0,3,0,31,0,18,0,23,0,34,0,120,0,18,0,26,0,32,0,31,0,59,0,3,0,17,0,18,0,3,0,23,0,33,0,24,0,120,0,27,0,92,0,119,0,119,0,3,0,27,0,28,0,120,0,33,0,31,0,59,0,10,0,2],"phonemes":["a","t","ʃ","ˌ","e","s","t","a"," ","j","ˌ","e","s","t","e"," ","m","a","ɪ"," ","s","l","ˈ","a","b"," ","d","i","n"," ","k","a","ˈ","u","z","a"," ","d","ˈ","u","b","l","e","ɪ"," ","r","e","f","l","ˈ","e","k","s","i","ɪ"," ","a"," ","l","u","m","ˈ","i","n","i","ɪ"," ","ɨ","n"," ","p","ˌ","i","k","ə","t","ˈ","u","r","i","l","e"," ","d","e"," ","ˈ","a","p","ə"," ","ʃ","i"," ","ˌ","a","ɾ","e"," ","o"," ","s","e","k","v","ˈ","e","n","t","s","ə"," ","d","e"," ","k","u","l","ˈ","o","ɾ","ʲ","ʲ"," ","o","p","ˈ","u","s","ə","."],"processed_text":"Acesta este mai slab din cauza dublei reflexii a luminii în picăturile de apă și are o secvență de culori opusă.","text":"Acesta este mai slab din cauza dublei reflexii a luminii în picăturile de apă și are o secvență de culori opusă."} From e1ee337ee1d544e9205238185f492cffcfef3b4c Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Sun, 30 Jul 2023 10:33:32 -0500 Subject: [PATCH 2/9] Use piper-phonemize instead --- src/python/piper_train/phonemize.py | 372 --------------------------- src/python/piper_train/preprocess.py | 63 +++-- src/python/requirements.txt | 6 +- 3 files changed, 43 insertions(+), 398 deletions(-) delete mode 100644 src/python/piper_train/phonemize.py diff --git a/src/python/piper_train/phonemize.py b/src/python/piper_train/phonemize.py 
deleted file mode 100644 index 46d646d..0000000 --- a/src/python/piper_train/phonemize.py +++ /dev/null @@ -1,372 +0,0 @@ -import argparse -import json -import sys -import unicodedata -from collections import Counter -from enum import Enum -from typing import Dict, Iterable, List, Mapping, Optional - -from espeak_phonemizer import Phonemizer - - -class PhonemeType(str, Enum): - ESPEAK = "espeak" - """Phonemes come from espeak-ng""" - - TEXT = "text" - """Phonemes come from text itself""" - - -MAX_PHONEMES = 256 -DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = { - "_": [0], - "^": [1], - "$": [2], - " ": [3], - "!": [4], - "'": [5], - "(": [6], - ")": [7], - ",": [8], - "-": [9], - ".": [10], - ":": [11], - ";": [12], - "?": [13], - "a": [14], - "b": [15], - "c": [16], - "d": [17], - "e": [18], - "f": [19], - "h": [20], - "i": [21], - "j": [22], - "k": [23], - "l": [24], - "m": [25], - "n": [26], - "o": [27], - "p": [28], - "q": [29], - "r": [30], - "s": [31], - "t": [32], - "u": [33], - "v": [34], - "w": [35], - "x": [36], - "y": [37], - "z": [38], - "æ": [39], - "ç": [40], - "ð": [41], - "ø": [42], - "ħ": [43], - "ŋ": [44], - "œ": [45], - "ǀ": [46], - "ǁ": [47], - "ǂ": [48], - "ǃ": [49], - "ɐ": [50], - "ɑ": [51], - "ɒ": [52], - "ɓ": [53], - "ɔ": [54], - "ɕ": [55], - "ɖ": [56], - "ɗ": [57], - "ɘ": [58], - "ə": [59], - "ɚ": [60], - "ɛ": [61], - "ɜ": [62], - "ɞ": [63], - "ɟ": [64], - "ɠ": [65], - "ɡ": [66], - "ɢ": [67], - "ɣ": [68], - "ɤ": [69], - "ɥ": [70], - "ɦ": [71], - "ɧ": [72], - "ɨ": [73], - "ɪ": [74], - "ɫ": [75], - "ɬ": [76], - "ɭ": [77], - "ɮ": [78], - "ɯ": [79], - "ɰ": [80], - "ɱ": [81], - "ɲ": [82], - "ɳ": [83], - "ɴ": [84], - "ɵ": [85], - "ɶ": [86], - "ɸ": [87], - "ɹ": [88], - "ɺ": [89], - "ɻ": [90], - "ɽ": [91], - "ɾ": [92], - "ʀ": [93], - "ʁ": [94], - "ʂ": [95], - "ʃ": [96], - "ʄ": [97], - "ʈ": [98], - "ʉ": [99], - "ʊ": [100], - "ʋ": [101], - "ʌ": [102], - "ʍ": [103], - "ʎ": [104], - "ʏ": [105], - "ʐ": [106], - "ʑ": [107], - "ʒ": [108], - "ʔ": [109], - 
"ʕ": [110], - "ʘ": [111], - "ʙ": [112], - "ʛ": [113], - "ʜ": [114], - "ʝ": [115], - "ʟ": [116], - "ʡ": [117], - "ʢ": [118], - "ʲ": [119], - "ˈ": [120], - "ˌ": [121], - "ː": [122], - "ˑ": [123], - "˞": [124], - "β": [125], - "θ": [126], - "χ": [127], - "ᵻ": [128], - "ⱱ": [129], - "0": [130], # tones - "1": [131], - "2": [132], - "3": [133], - "4": [134], - "5": [135], - "6": [136], - "7": [137], - "8": [138], - "9": [139], - "\u0327": [140], # combining cedilla - "\u0303": [141], # combining tilde - "\u032a": [142], # combining bridge below - "\u032f": [143], # combining inverted breve below - "\u0329": [144], # combining vertical line below - "ʰ": [145], - "ˤ": [146], - "ε": [147], - "↓": [148], - "#": [149], # Icelandic - '"': [150], # Russian - "↑": [151], - "\u033a": [152], # Basque - "\u033b": [153], -} - -PHONEME_MAPS = { - # Brazilian Portuguese - "pt-br": {"c": ["k"]} -} - -ALPHABETS = { - # Ukrainian - "uk": { - "_": [0], - "^": [1], - "$": [2], - " ": [3], - "!": [4], - "'": [5], - ",": [6], - "-": [7], - ".": [8], - ":": [9], - ";": [10], - "?": [11], - "а": [12], - "б": [13], - "в": [14], - "г": [15], - "ґ": [16], - "д": [17], - "е": [18], - "є": [19], - "ж": [20], - "з": [21], - "и": [22], - "і": [23], - "ї": [24], - "й": [25], - "к": [26], - "л": [27], - "м": [28], - "н": [29], - "о": [30], - "п": [31], - "р": [32], - "с": [33], - "т": [34], - "у": [35], - "ф": [36], - "х": [37], - "ц": [38], - "ч": [39], - "ш": [40], - "щ": [41], - "ь": [42], - "ю": [43], - "я": [44], - "\u0301": [45], # combining acute accent - "\u0306": [46], # combining breve - "\u0308": [47], # combining diaeresis - "—": [48], # em dash - } -} - - -def phonemize( - text: str, - phonemizer: Phonemizer, - phoneme_map: Optional[Dict[str, List[str]]] = None, -) -> List[str]: - phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True) - - # Phonemes are decomposed into unicode codepoints - unmapped_phonemes = list(unicodedata.normalize("NFD", phonemes_str)) - if not 
phoneme_map: - return unmapped_phonemes - - # Phonemes can be mapped to lists of other phonemes - mapped_phonemes = [] - for phoneme in unmapped_phonemes: - sub_phonemes = phoneme_map.get(phoneme) - if sub_phonemes: - mapped_phonemes.extend(sub_phonemes) - else: - mapped_phonemes.append(phoneme) - - return mapped_phonemes - - -def phonemes_to_ids( - phonemes: Iterable[str], - phoneme_id_map: Optional[Mapping[str, Iterable[int]]] = None, - missing_phonemes: "Optional[Counter[str]]" = None, - pad: Optional[str] = "_", - bos: Optional[str] = "^", - eos: Optional[str] = "$", -) -> List[int]: - if phoneme_id_map is None: - phoneme_id_map = DEFAULT_PHONEME_ID_MAP - - phoneme_ids: List[int] = [] - - if bos: - phoneme_ids.extend(phoneme_id_map[bos]) - - if pad: - phoneme_ids.extend(phoneme_id_map[pad]) - - for phoneme in phonemes: - mapped_phoneme_ids = phoneme_id_map.get(phoneme) - if mapped_phoneme_ids: - phoneme_ids.extend(mapped_phoneme_ids) - - if pad: - phoneme_ids.extend(phoneme_id_map[pad]) - elif missing_phonemes is not None: - # Make note of missing phonemes - missing_phonemes[phoneme] += 1 - - if eos: - phoneme_ids.extend(phoneme_id_map[eos]) - - return phoneme_ids - - -# ----------------------------------------------------------------------------- - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("language") - parser.add_argument( - "--phoneme-type", - choices=list(PhonemeType), - default=PhonemeType.ESPEAK, - help="Type of phonemes to use (default: espeak)", - ) - parser.add_argument( - "--text-casing", - choices=("ignore", "lower", "upper", "casefold"), - default="ignore", - help="Casing applied to utterance text", - ) - args = parser.parse_args() - - phonemizer: Optional[Phonemizer] = None - - if args.text_casing == "lower": - casing = str.lower - elif args.text_casing == "upper": - casing = str.upper - else: - # ignore - casing = lambda s: s - - if args.phoneme_type == PhonemeType.TEXT: - # Use text directly - 
phoneme_id_map = ALPHABETS[args.language] - else: - # Use eSpeak - phonemizer = Phonemizer(args.language) - phoneme_id_map = DEFAULT_PHONEME_ID_MAP - - phoneme_map = PHONEME_MAPS.get(args.language) - missing_phonemes: "Counter[str]" = Counter() - - for line in sys.stdin: - line = line.strip() - if not line: - continue - - if args.phoneme_type == PhonemeType.TEXT: - phonemes = list(unicodedata.normalize("NFD", casing(line))) - else: - assert phonemizer is not None - phonemes = phonemize(line, phonemizer, phoneme_map=phoneme_map) - - phoneme_ids = phonemes_to_ids( - phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes - ) - json.dump( - { - "text": line, - "phonemes": phonemes, - "phoneme_ids": phoneme_ids, - }, - sys.stdout, - ensure_ascii=False, - ) - print("") - - if missing_phonemes: - print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr) - for phoneme, count in missing_phonemes.most_common(): - print(phoneme, count, file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/src/python/piper_train/preprocess.py b/src/python/piper_train/preprocess.py index 584e2b0..b89da57 100644 --- a/src/python/piper_train/preprocess.py +++ b/src/python/piper_train/preprocess.py @@ -9,28 +9,37 @@ import os import unicodedata from collections import Counter from dataclasses import dataclass, field +from enum import Enum from multiprocessing import JoinableQueue, Process, Queue from pathlib import Path from typing import Dict, Iterable, List, Optional -from espeak_phonemizer import Phonemizer +from piper_phonemize import ( + phonemize_espeak, + phonemize_codepoints, + phoneme_ids_espeak, + phoneme_ids_codepoints, + get_codepoints_map, + get_espeak_map, + get_max_phonemes, + tashkeel_run, +) from .norm_audio import cache_norm_audio, make_silence_detector -from .phonemize import ( - ALPHABETS, - DEFAULT_PHONEME_ID_MAP, - MAX_PHONEMES, - PHONEME_MAPS, - PhonemeType, - phonemes_to_ids, - phonemize, -) _DIR = Path(__file__).parent 
_VERSION = (_DIR / "VERSION").read_text(encoding="utf-8").strip() _LOGGER = logging.getLogger("preprocess") +class PhonemeType(str, Enum): + ESPEAK = "espeak" + """Phonemes come from espeak-ng""" + + TEXT = "text" + """Phonemes come from text itself""" + + def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( @@ -150,10 +159,10 @@ def main() -> None: "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8}, "phoneme_type": args.phoneme_type.value, "phoneme_map": {}, - "phoneme_id_map": ALPHABETS[args.language] + "phoneme_id_map": get_codepoints_map()[args.language] if args.phoneme_type == PhonemeType.TEXT - else DEFAULT_PHONEME_ID_MAP, - "num_symbols": MAX_PHONEMES, + else get_espeak_map(), + "num_symbols": get_max_phonemes(), "num_speakers": len(speaker_counts), "speaker_id_map": speaker_ids, "piper_version": _VERSION, @@ -255,8 +264,6 @@ def phonemize_batch_espeak( try: casing = get_text_casing(args.text_casing) silence_detector = make_silence_detector() - phonemizer = Phonemizer(default_voice=args.language) - phoneme_map = PHONEME_MAPS.get(args.language) while True: utt_batch = queue_in.get() @@ -266,10 +273,15 @@ def phonemize_batch_espeak( for utt in utt_batch: try: _LOGGER.debug(utt) - utt.phonemes = phonemize( - casing(utt.text), phonemizer, phoneme_map=phoneme_map - ) - utt.phoneme_ids = phonemes_to_ids( + all_phonemes = phonemize_espeak(casing(utt.text), args.language) + + # Flatten + utt.phonemes = [ + phoneme + for sentence_phonemes in all_phonemes + for phoneme in sentence_phonemes + ] + utt.phoneme_ids = phoneme_ids_espeak( utt.phonemes, missing_phonemes=utt.missing_phonemes, ) @@ -298,7 +310,6 @@ def phonemize_batch_text( try: casing = get_text_casing(args.text_casing) silence_detector = make_silence_detector() - alphabet = ALPHABETS[args.language] while True: utt_batch = queue_in.get() @@ -308,10 +319,16 @@ def phonemize_batch_text( for utt in utt_batch: try: _LOGGER.debug(utt) - utt.phonemes = 
list(unicodedata.normalize("NFD", casing(utt.text))) - utt.phoneme_ids = phonemes_to_ids( + all_phonemes = phonemize_codepoints(casing(utt.text)) + # Flatten + utt.phonemes = [ + phoneme + for sentence_phonemes in all_phonemes + for phoneme in sentence_phonemes + ] + utt.phoneme_ids = phoneme_ids_codepoints( + args.language, utt.phonemes, - phoneme_id_map=alphabet, missing_phonemes=utt.missing_phonemes, ) if not args.skip_audio: diff --git a/src/python/requirements.txt b/src/python/requirements.txt index 9bca1cd..31b0763 100644 --- a/src/python/requirements.txt +++ b/src/python/requirements.txt @@ -1,7 +1,7 @@ cython>=0.29.0,<1 -espeak-phonemizer>=1.1.0,<2 +piper-phonemize~=1.0.0 librosa>=0.9.2,<1 numpy>=1.19.0 -onnxruntime~=1.11.0 +onnxruntime>=1.11.0 pytorch-lightning~=1.7.0 -torch~=1.11.0 +torch>=1.11.0,<2 From dcb4c828cd9663277c3175ddc0e313f7b525f9f4 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Mon, 31 Jul 2023 14:33:58 -0500 Subject: [PATCH 3/9] sr and lb test sentences --- etc/test_sentences/lb.txt | 6 ++++++ etc/test_sentences/sr.txt | 8 ++++++++ etc/test_sentences/test_lb.jsonl | 6 ++++++ etc/test_sentences/test_sr.jsonl | 8 ++++++++ 4 files changed, 28 insertions(+) create mode 100644 etc/test_sentences/lb.txt create mode 100644 etc/test_sentences/sr.txt create mode 100644 etc/test_sentences/test_lb.jsonl create mode 100644 etc/test_sentences/test_sr.jsonl diff --git a/etc/test_sentences/lb.txt b/etc/test_sentences/lb.txt new file mode 100644 index 0000000..ace2963 --- /dev/null +++ b/etc/test_sentences/lb.txt @@ -0,0 +1,6 @@ +Et freet mech, Iech kennen ze léieren. +Schwätzt wannechgelift méi lues. +Vill Gléck fir däi Gebuertsdag. +Mäi Loftkësseboot ass voller Éilen. +Schwätz du Lëtzebuergesch? +E gudde Rutsch an d'neit Joer. 
diff --git a/etc/test_sentences/sr.txt b/etc/test_sentences/sr.txt new file mode 100644 index 0000000..c60df91 --- /dev/null +++ b/etc/test_sentences/sr.txt @@ -0,0 +1,8 @@ +Дуга је оптичка и метеоролошка појава који се појављује на небу, када се сунчеви зраци преламају кроз ситне водене капи, најчешће након кише. +Дуга се обично види на застору кишних капи када посматрач стоји окренут леђима Сунцу и гледа у смеру тога застора. +Зраци светлости се тада разлажу на своје основне компоненте, стварајући оптичку представу у виду траке различитих боја, што у ствари представља спектар светлости. +Унутрашња-примарна дуга настаје када се сунчев зрак једном преломи са полеђине капљице. +Плава светлост се прелама под већим углом него црвена светлост, али због рефлексије са полеђине капи, плава светлост излази под мањим углом од црвене. +Зато је плава боја са унутрашње стране, а црвена са спољашње стране примарне дуге. +Спољашња-секундарна дуга настаје када се сунчев зрак двоструко преломи са полеђине капљице. +Плава светлост се прелама под већим углом па је стога она са спољашње стране, а црвена са унутрашње стране секундарне дуге. 
diff --git a/etc/test_sentences/test_lb.jsonl b/etc/test_sentences/test_lb.jsonl new file mode 100644 index 0000000..c306157 --- /dev/null +++ b/etc/test_sentences/test_lb.jsonl @@ -0,0 +1,6 @@ +{"phoneme_ids":[1,0,120,0,18,0,122,0,32,0,3,0,19,0,93,0,120,0,18,0,122,0,32,0,3,0,25,0,120,0,59,0,55,0,8,0,3,0,120,0,21,0,59,0,55,0,3,0,23,0,120,0,39,0,26,0,59,0,26,0,3,0,155,0,120,0,59,0,3,0,24,0,62,0,74,0,120,0,59,0,93,0,59,0,26,0,10,0,2],"phonemes":["ˈ","e","ː","t"," ","f","ʀ","ˈ","e","ː","t"," ","m","ˈ","ə","ɕ",","," ","ˈ","i","ə","ɕ"," ","k","ˈ","æ","n","ə","n"," ","ʦ","ˈ","ə"," ","l","ɜ","ɪ","ˈ","ə","ʀ","ə","n","."],"processed_text":"Et freet mech, Iech kennen ze léieren.","text":"Et freet mech, Iech kennen ze léieren."} +{"phoneme_ids":[1,0,96,0,34,0,120,0,18,0,32,0,155,0,32,0,3,0,34,0,121,0,51,0,26,0,39,0,55,0,154,0,120,0,59,0,24,0,21,0,19,0,32,0,3,0,25,0,120,0,62,0,74,0,3,0,24,0,120,0,33,0,59,0,31,0,10,0,2],"phonemes":["ʃ","v","ˈ","e","t","ʦ","t"," ","v","ˌ","ɑ","n","æ","ɕ","g","ˈ","ə","l","i","f","t"," ","m","ˈ","ɜ","ɪ"," ","l","ˈ","u","ə","s","."],"processed_text":"Schwätzt wannechgelift méi lues.","text":"Schwätzt wannechgelift méi lues."} +{"phoneme_ids":[1,0,19,0,120,0,21,0,24,0,3,0,154,0,24,0,120,0,18,0,23,0,3,0,19,0,120,0,21,0,122,0,94,0,3,0,17,0,120,0,39,0,122,0,74,0,3,0,154,0,59,0,15,0,120,0,33,0,122,0,94,0,32,0,31,0,17,0,14,0,122,0,156,0,10,0,2],"phonemes":["f","ˈ","i","l"," ","g","l","ˈ","e","k"," ","f","ˈ","i","ː","ʁ"," ","d","ˈ","æ","ː","ɪ"," ","g","ə","b","ˈ","u","ː","ʁ","t","s","d","a","ː","X","."],"processed_text":"Vill Gléck fir däi Gebuertsdag.","text":"Vill Gléck fir däi Gebuertsdag."} +{"phoneme_ids":[1,0,25,0,120,0,39,0,122,0,74,0,3,0,24,0,121,0,27,0,19,0,32,0,23,0,59,0,31,0,120,0,18,0,122,0,15,0,27,0,122,0,32,0,3,0,120,0,51,0,31,0,3,0,34,0,120,0,27,0,24,0,18,0,122,0,93,0,3,0,120,0,62,0,74,0,24,0,59,0,26,0,10,0,2],"phonemes":["m","ˈ","æ","ː","ɪ"," ","l","ˌ","o","f","t","k","ə","s","ˈ","e","ː","b","o","ː","t"," ","ˈ","ɑ","s"," 
","v","ˈ","o","l","e","ː","ʀ"," ","ˈ","ɜ","ɪ","l","ə","n","."],"processed_text":"Mäi Loftkësseboot ass voller Éilen.","text":"Mäi Loftkësseboot ass voller Éilen."} +{"phoneme_ids":[1,0,96,0,34,0,120,0,18,0,32,0,155,0,3,0,17,0,120,0,33,0,122,0,3,0,24,0,121,0,59,0,155,0,59,0,15,0,120,0,33,0,122,0,94,0,22,0,59,0,96,0,13,0,2],"phonemes":["ʃ","v","ˈ","e","t","ʦ"," ","d","ˈ","u","ː"," ","l","ˌ","ə","ʦ","ə","b","ˈ","u","ː","ʁ","j","ə","ʃ","?"],"processed_text":"Schwätz du Lëtzebuergesch?","text":"Schwätz du Lëtzebuergesch?"} +{"phoneme_ids":[1,0,120,0,59,0,3,0,154,0,120,0,33,0,17,0,59,0,3,0,93,0,120,0,33,0,32,0,96,0,3,0,120,0,51,0,26,0,3,0,17,0,26,0,120,0,51,0,74,0,32,0,3,0,22,0,120,0,27,0,122,0,94,0,10,0,2],"phonemes":["ˈ","ə"," ","g","ˈ","u","d","ə"," ","ʀ","ˈ","u","t","ʃ"," ","ˈ","ɑ","n"," ","d","n","ˈ","ɑ","ɪ","t"," ","j","ˈ","o","ː","ʁ","."],"processed_text":"E gudde Rutsch an d'neit Joer.","text":"E gudde Rutsch an d'neit Joer."} diff --git a/etc/test_sentences/test_sr.jsonl b/etc/test_sentences/test_sr.jsonl new file mode 100644 index 0000000..e30e4c1 --- /dev/null +++ b/etc/test_sentences/test_sr.jsonl @@ -0,0 +1,8 @@ 
+{"phoneme_ids":[1,0,17,0,120,0,33,0,66,0,50,0,3,0,22,0,18,0,3,0,120,0,27,0,28,0,32,0,74,0,32,0,96,0,23,0,50,0,3,0,74,0,3,0,25,0,120,0,61,0,32,0,61,0,121,0,27,0,30,0,27,0,24,0,121,0,27,0,96,0,23,0,50,0,3,0,28,0,120,0,27,0,22,0,50,0,34,0,50,0,3,0,23,0,120,0,27,0,22,0,74,0,3,0,31,0,120,0,61,0,3,0,28,0,120,0,27,0,22,0,50,0,34,0,24,0,22,0,121,0,100,0,22,0,18,0,3,0,26,0,120,0,14,0,3,0,26,0,120,0,61,0,15,0,100,0,8,0,3,0,23,0,120,0,14,0,17,0,50,0,3,0,31,0,120,0,61,0,3,0,31,0,120,0,33,0,26,0,32,0,96,0,61,0,34,0,74,0,3,0,38,0,30,0,120,0,14,0,32,0,31,0,74,0,3,0,28,0,30,0,120,0,61,0,24,0,50,0,25,0,121,0,51,0,22,0,100,0,3,0,23,0,30,0,120,0,27,0,31,0,3,0,31,0,120,0,21,0,32,0,26,0,61,0,3,0,34,0,120,0,27,0,17,0,61,0,26,0,61,0,3,0,23,0,120,0,14,0,28,0,74,0,8,0,3,0,26,0,120,0,51,0,22,0,32,0,96,0,61,0,96,0,32,0,55,0,61,0,3,0,26,0,120,0,14,0,23,0,27,0,26,0,3,0,23,0,120,0,21,0,96,0,61,0,10,0,2],"phonemes":["d","ˈ","u","ɡ","ɐ"," ","j","e"," ","ˈ","o","p","t","ɪ","t","ʃ","k","ɐ"," ","ɪ"," ","m","ˈ","ɛ","t","ɛ","ˌ","o","r","o","l","ˌ","o","ʃ","k","ɐ"," ","p","ˈ","o","j","ɐ","v","ɐ"," ","k","ˈ","o","j","ɪ"," ","s","ˈ","ɛ"," ","p","ˈ","o","j","ɐ","v","l","j","ˌ","ʊ","j","e"," ","n","ˈ","a"," ","n","ˈ","ɛ","b","ʊ",","," ","k","ˈ","a","d","ɐ"," ","s","ˈ","ɛ"," ","s","ˈ","u","n","t","ʃ","ɛ","v","ɪ"," ","z","r","ˈ","a","t","s","ɪ"," ","p","r","ˈ","ɛ","l","ɐ","m","ˌ","ɑ","j","ʊ"," ","k","r","ˈ","o","s"," ","s","ˈ","i","t","n","ɛ"," ","v","ˈ","o","d","ɛ","n","ɛ"," ","k","ˈ","a","p","ɪ",","," ","n","ˈ","ɑ","j","t","ʃ","ɛ","ʃ","t","ɕ","ɛ"," ","n","ˈ","a","k","o","n"," ","k","ˈ","i","ʃ","ɛ","."],"processed_text":"Дуга је оптичка и метеоролошка појава који се појављује на небу, када се сунчеви зраци преламају кроз ситне водене капи, најчешће након кише.","text":"Дуга је оптичка и метеоролошка појава који се појављује на небу, када се сунчеви зраци преламају кроз ситне водене капи, најчешће након кише."} 
+{"phoneme_ids":[1,0,17,0,120,0,33,0,66,0,50,0,3,0,31,0,120,0,61,0,3,0,120,0,27,0,15,0,74,0,32,0,96,0,26,0,27,0,3,0,34,0,120,0,21,0,17,0,74,0,3,0,26,0,120,0,14,0,3,0,38,0,120,0,14,0,31,0,32,0,27,0,30,0,100,0,3,0,23,0,120,0,21,0,96,0,26,0,74,0,20,0,3,0,23,0,120,0,14,0,28,0,74,0,3,0,23,0,120,0,14,0,17,0,50,0,3,0,28,0,120,0,27,0,31,0,25,0,50,0,32,0,30,0,50,0,32,0,96,0,3,0,31,0,32,0,120,0,27,0,22,0,74,0,3,0,120,0,27,0,23,0,30,0,61,0,26,0,100,0,32,0,3,0,24,0,120,0,61,0,17,0,107,0,74,0,25,0,50,0,3,0,31,0,120,0,33,0,26,0,32,0,31,0,100,0,3,0,74,0,3,0,66,0,24,0,120,0,61,0,17,0,50,0,3,0,100,0,3,0,31,0,25,0,120,0,61,0,30,0,100,0,3,0,32,0,120,0,27,0,66,0,50,0,3,0,38,0,120,0,14,0,31,0,32,0,27,0,30,0,50,0,10,0,2],"phonemes":["d","ˈ","u","ɡ","ɐ"," ","s","ˈ","ɛ"," ","ˈ","o","b","ɪ","t","ʃ","n","o"," ","v","ˈ","i","d","ɪ"," ","n","ˈ","a"," ","z","ˈ","a","s","t","o","r","ʊ"," ","k","ˈ","i","ʃ","n","ɪ","h"," ","k","ˈ","a","p","ɪ"," ","k","ˈ","a","d","ɐ"," ","p","ˈ","o","s","m","ɐ","t","r","ɐ","t","ʃ"," ","s","t","ˈ","o","j","ɪ"," ","ˈ","o","k","r","ɛ","n","ʊ","t"," ","l","ˈ","ɛ","d","ʑ","ɪ","m","ɐ"," ","s","ˈ","u","n","t","s","ʊ"," ","ɪ"," ","ɡ","l","ˈ","ɛ","d","ɐ"," ","ʊ"," ","s","m","ˈ","ɛ","r","ʊ"," ","t","ˈ","o","ɡ","ɐ"," ","z","ˈ","a","s","t","o","r","ɐ","."],"processed_text":"Дуга се обично види на застору кишних капи када посматрач стоји окренут леђима Сунцу и гледа у смеру тога застора.","text":"Дуга се обично види на застору кишних капи када посматрач стоји окренут леђима Сунцу и гледа у смеру тога застора."} 
+{"phoneme_ids":[1,0,38,0,30,0,120,0,14,0,32,0,31,0,74,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,74,0,3,0,31,0,120,0,61,0,3,0,32,0,120,0,14,0,17,0,50,0,3,0,30,0,120,0,14,0,38,0,24,0,50,0,108,0,100,0,3,0,26,0,120,0,14,0,3,0,31,0,34,0,120,0,27,0,22,0,18,0,3,0,120,0,27,0,31,0,26,0,27,0,34,0,26,0,61,0,3,0,23,0,120,0,27,0,25,0,28,0,27,0,26,0,121,0,61,0,26,0,32,0,61,0,8,0,3,0,31,0,32,0,34,0,120,0,51,0,30,0,51,0,22,0,121,0,100,0,32,0,55,0,74,0,3,0,120,0,27,0,28,0,32,0,74,0,32,0,96,0,23,0,100,0,3,0,28,0,30,0,120,0,61,0,32,0,31,0,32,0,50,0,34,0,100,0,3,0,100,0,3,0,34,0,120,0,21,0,17,0,100,0,3,0,32,0,30,0,120,0,14,0,23,0,61,0,3,0,30,0,120,0,14,0,38,0,24,0,74,0,32,0,96,0,121,0,74,0,32,0,74,0,20,0,3,0,15,0,120,0,27,0,22,0,50,0,8,0,3,0,96,0,32,0,27,0,3,0,100,0,3,0,31,0,32,0,34,0,120,0,51,0,30,0,74,0,3,0,28,0,30,0,120,0,61,0,32,0,31,0,32,0,50,0,34,0,24,0,22,0,50,0,3,0,31,0,28,0,120,0,61,0,23,0,32,0,51,0,30,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,74,0,10,0,2],"phonemes":["z","r","ˈ","a","t","s","ɪ"," ","s","v","ˈ","ɛ","t","l","o","s","t","ɪ"," ","s","ˈ","ɛ"," ","t","ˈ","a","d","ɐ"," ","r","ˈ","a","z","l","ɐ","ʒ","ʊ"," ","n","ˈ","a"," ","s","v","ˈ","o","j","e"," ","ˈ","o","s","n","o","v","n","ɛ"," ","k","ˈ","o","m","p","o","n","ˌ","ɛ","n","t","ɛ",","," ","s","t","v","ˈ","ɑ","r","ɑ","j","ˌ","ʊ","t","ɕ","ɪ"," ","ˈ","o","p","t","ɪ","t","ʃ","k","ʊ"," ","p","r","ˈ","ɛ","t","s","t","ɐ","v","ʊ"," ","ʊ"," ","v","ˈ","i","d","ʊ"," ","t","r","ˈ","a","k","ɛ"," ","r","ˈ","a","z","l","ɪ","t","ʃ","ˌ","ɪ","t","ɪ","h"," ","b","ˈ","o","j","ɐ",","," ","ʃ","t","o"," ","ʊ"," ","s","t","v","ˈ","ɑ","r","ɪ"," ","p","r","ˈ","ɛ","t","s","t","ɐ","v","l","j","ɐ"," ","s","p","ˈ","ɛ","k","t","ɑ","r"," ","s","v","ˈ","ɛ","t","l","o","s","t","ɪ","."],"processed_text":"Зраци светлости се тада разлажу на своје основне компоненте, стварајући оптичку представу у виду траке различитих боја, што у ствари представља спектар светлости.","text":"Зраци светлости се тада разлажу на своје основне 
компоненте, стварајући оптичку представу у виду траке различитих боја, што у ствари представља спектар светлости."} +{"phoneme_ids":[1,0,120,0,33,0,26,0,100,0,32,0,30,0,121,0,50,0,96,0,82,0,50,0,28,0,30,0,120,0,21,0,25,0,51,0,30,0,26,0,50,0,3,0,17,0,120,0,33,0,66,0,50,0,3,0,26,0,120,0,14,0,31,0,32,0,51,0,22,0,18,0,3,0,23,0,120,0,14,0,17,0,50,0,3,0,31,0,120,0,61,0,3,0,31,0,120,0,33,0,26,0,32,0,96,0,61,0,34,0,3,0,38,0,30,0,120,0,14,0,23,0,3,0,22,0,120,0,18,0,17,0,26,0,27,0,25,0,3,0,28,0,30,0,120,0,61,0,24,0,27,0,25,0,74,0,3,0,31,0,120,0,14,0,3,0,28,0,120,0,27,0,24,0,61,0,17,0,107,0,121,0,74,0,26,0,61,0,3,0,23,0,120,0,14,0,28,0,104,0,74,0,32,0,31,0,61,0,10,0,2],"phonemes":["ˈ","u","n","ʊ","t","r","ˌ","ɐ","ʃ","ɲ","ɐ","p","r","ˈ","i","m","ɑ","r","n","ɐ"," ","d","ˈ","u","ɡ","ɐ"," ","n","ˈ","a","s","t","ɑ","j","e"," ","k","ˈ","a","d","ɐ"," ","s","ˈ","ɛ"," ","s","ˈ","u","n","t","ʃ","ɛ","v"," ","z","r","ˈ","a","k"," ","j","ˈ","e","d","n","o","m"," ","p","r","ˈ","ɛ","l","o","m","ɪ"," ","s","ˈ","a"," ","p","ˈ","o","l","ɛ","d","ʑ","ˌ","ɪ","n","ɛ"," ","k","ˈ","a","p","ʎ","ɪ","t","s","ɛ","."],"processed_text":"Унутрашња-примарна дуга настаје када се сунчев зрак једном преломи са полеђине капљице.","text":"Унутрашња-примарна дуга настаје када се сунчев зрак једном преломи са полеђине капљице."} 
+{"phoneme_ids":[1,0,28,0,24,0,120,0,14,0,34,0,50,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,3,0,31,0,120,0,61,0,3,0,28,0,30,0,120,0,61,0,24,0,50,0,25,0,50,0,3,0,28,0,120,0,27,0,17,0,3,0,34,0,120,0,61,0,32,0,55,0,74,0,25,0,3,0,120,0,33,0,66,0,24,0,27,0,25,0,3,0,26,0,120,0,61,0,66,0,27,0,3,0,32,0,31,0,30,0,34,0,120,0,61,0,26,0,50,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,8,0,3,0,120,0,14,0,24,0,74,0,3,0,38,0,15,0,120,0,27,0,66,0,3,0,30,0,120,0,61,0,19,0,24,0,61,0,23,0,31,0,121,0,74,0,22,0,18,0,3,0,31,0,120,0,14,0,3,0,28,0,120,0,27,0,24,0,61,0,17,0,107,0,121,0,74,0,26,0,61,0,3,0,23,0,120,0,14,0,28,0,74,0,8,0,3,0,28,0,24,0,120,0,14,0,34,0,50,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,3,0,120,0,21,0,38,0,24,0,50,0,38,0,74,0,3,0,28,0,120,0,27,0,17,0,3,0,25,0,120,0,14,0,82,0,74,0,25,0,3,0,120,0,33,0,66,0,24,0,27,0,25,0,3,0,120,0,27,0,32,0,3,0,32,0,31,0,30,0,34,0,120,0,61,0,26,0,61,0,10,0,2],"phonemes":["p","l","ˈ","a","v","ɐ"," ","s","v","ˈ","ɛ","t","l","o","s","t"," ","s","ˈ","ɛ"," ","p","r","ˈ","ɛ","l","ɐ","m","ɐ"," ","p","ˈ","o","d"," ","v","ˈ","ɛ","t","ɕ","ɪ","m"," ","ˈ","u","ɡ","l","o","m"," ","n","ˈ","ɛ","ɡ","o"," ","t","s","r","v","ˈ","ɛ","n","ɐ"," ","s","v","ˈ","ɛ","t","l","o","s","t",","," ","ˈ","a","l","ɪ"," ","z","b","ˈ","o","ɡ"," ","r","ˈ","ɛ","f","l","ɛ","k","s","ˌ","ɪ","j","e"," ","s","ˈ","a"," ","p","ˈ","o","l","ɛ","d","ʑ","ˌ","ɪ","n","ɛ"," ","k","ˈ","a","p","ɪ",","," ","p","l","ˈ","a","v","ɐ"," ","s","v","ˈ","ɛ","t","l","o","s","t"," ","ˈ","i","z","l","ɐ","z","ɪ"," ","p","ˈ","o","d"," ","m","ˈ","a","ɲ","ɪ","m"," ","ˈ","u","ɡ","l","o","m"," ","ˈ","o","t"," ","t","s","r","v","ˈ","ɛ","n","ɛ","."],"processed_text":"Плава светлост се прелама под већим углом него црвена светлост, али због рефлексије са полеђине капи, плава светлост излази под мањим углом од црвене.","text":"Плава светлост се прелама под већим углом него црвена светлост, али због рефлексије са полеђине капи, плава светлост излази под мањим углом од црвене."} 
+{"phoneme_ids":[1,0,38,0,120,0,14,0,32,0,27,0,3,0,22,0,18,0,3,0,28,0,24,0,120,0,14,0,34,0,50,0,3,0,15,0,120,0,27,0,22,0,50,0,3,0,31,0,120,0,14,0,3,0,120,0,33,0,26,0,100,0,32,0,30,0,121,0,50,0,96,0,82,0,18,0,3,0,31,0,32,0,30,0,120,0,14,0,26,0,61,0,8,0,3,0,50,0,3,0,32,0,31,0,30,0,34,0,120,0,61,0,26,0,50,0,3,0,31,0,120,0,14,0,3,0,31,0,28,0,120,0,27,0,104,0,50,0,96,0,82,0,18,0,3,0,31,0,32,0,30,0,120,0,14,0,26,0,61,0,3,0,28,0,30,0,120,0,21,0,25,0,51,0,30,0,26,0,61,0,3,0,17,0,120,0,33,0,66,0,61,0,10,0,2],"phonemes":["z","ˈ","a","t","o"," ","j","e"," ","p","l","ˈ","a","v","ɐ"," ","b","ˈ","o","j","ɐ"," ","s","ˈ","a"," ","ˈ","u","n","ʊ","t","r","ˌ","ɐ","ʃ","ɲ","e"," ","s","t","r","ˈ","a","n","ɛ",","," ","ɐ"," ","t","s","r","v","ˈ","ɛ","n","ɐ"," ","s","ˈ","a"," ","s","p","ˈ","o","ʎ","ɐ","ʃ","ɲ","e"," ","s","t","r","ˈ","a","n","ɛ"," ","p","r","ˈ","i","m","ɑ","r","n","ɛ"," ","d","ˈ","u","ɡ","ɛ","."],"processed_text":"Зато је плава боја са унутрашње стране, а црвена са спољашње стране примарне дуге.","text":"Зато је плава боја са унутрашње стране, а црвена са спољашње стране примарне дуге."} +{"phoneme_ids":[1,0,31,0,28,0,120,0,27,0,104,0,50,0,96,0,82,0,50,0,31,0,120,0,61,0,23,0,100,0,26,0,17,0,121,0,51,0,30,0,26,0,50,0,3,0,17,0,120,0,33,0,66,0,50,0,3,0,26,0,120,0,14,0,31,0,32,0,51,0,22,0,18,0,3,0,23,0,120,0,14,0,17,0,50,0,3,0,31,0,120,0,61,0,3,0,31,0,120,0,33,0,26,0,32,0,96,0,61,0,34,0,3,0,38,0,30,0,120,0,14,0,23,0,3,0,17,0,34,0,120,0,27,0,31,0,32,0,30,0,100,0,23,0,27,0,3,0,28,0,30,0,120,0,61,0,24,0,27,0,25,0,74,0,3,0,31,0,120,0,14,0,3,0,28,0,120,0,27,0,24,0,61,0,17,0,107,0,121,0,74,0,26,0,61,0,3,0,23,0,120,0,14,0,28,0,104,0,74,0,32,0,31,0,61,0,10,0,2],"phonemes":["s","p","ˈ","o","ʎ","ɐ","ʃ","ɲ","ɐ","s","ˈ","ɛ","k","ʊ","n","d","ˌ","ɑ","r","n","ɐ"," ","d","ˈ","u","ɡ","ɐ"," ","n","ˈ","a","s","t","ɑ","j","e"," ","k","ˈ","a","d","ɐ"," ","s","ˈ","ɛ"," ","s","ˈ","u","n","t","ʃ","ɛ","v"," ","z","r","ˈ","a","k"," ","d","v","ˈ","o","s","t","r","ʊ","k","o"," 
","p","r","ˈ","ɛ","l","o","m","ɪ"," ","s","ˈ","a"," ","p","ˈ","o","l","ɛ","d","ʑ","ˌ","ɪ","n","ɛ"," ","k","ˈ","a","p","ʎ","ɪ","t","s","ɛ","."],"processed_text":"Спољашња-секундарна дуга настаје када се сунчев зрак двоструко преломи са полеђине капљице.","text":"Спољашња-секундарна дуга настаје када се сунчев зрак двоструко преломи са полеђине капљице."} +{"phoneme_ids":[1,0,28,0,24,0,120,0,14,0,34,0,50,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,3,0,31,0,120,0,61,0,3,0,28,0,30,0,120,0,61,0,24,0,50,0,25,0,50,0,3,0,28,0,120,0,27,0,17,0,3,0,34,0,120,0,61,0,32,0,55,0,74,0,25,0,3,0,120,0,33,0,66,0,24,0,27,0,25,0,3,0,28,0,120,0,14,0,3,0,22,0,18,0,3,0,31,0,32,0,120,0,27,0,66,0,50,0,3,0,120,0,27,0,26,0,50,0,3,0,31,0,120,0,14,0,3,0,31,0,28,0,120,0,27,0,104,0,50,0,96,0,82,0,18,0,3,0,31,0,32,0,30,0,120,0,14,0,26,0,61,0,8,0,3,0,50,0,3,0,32,0,31,0,30,0,34,0,120,0,61,0,26,0,50,0,3,0,31,0,120,0,14,0,3,0,120,0,33,0,26,0,100,0,32,0,30,0,121,0,50,0,96,0,82,0,18,0,3,0,31,0,32,0,30,0,120,0,14,0,26,0,61,0,3,0,31,0,120,0,61,0,23,0,100,0,26,0,17,0,121,0,51,0,30,0,26,0,61,0,3,0,17,0,120,0,33,0,66,0,61,0,10,0,2],"phonemes":["p","l","ˈ","a","v","ɐ"," ","s","v","ˈ","ɛ","t","l","o","s","t"," ","s","ˈ","ɛ"," ","p","r","ˈ","ɛ","l","ɐ","m","ɐ"," ","p","ˈ","o","d"," ","v","ˈ","ɛ","t","ɕ","ɪ","m"," ","ˈ","u","ɡ","l","o","m"," ","p","ˈ","a"," ","j","e"," ","s","t","ˈ","o","ɡ","ɐ"," ","ˈ","o","n","ɐ"," ","s","ˈ","a"," ","s","p","ˈ","o","ʎ","ɐ","ʃ","ɲ","e"," ","s","t","r","ˈ","a","n","ɛ",","," ","ɐ"," ","t","s","r","v","ˈ","ɛ","n","ɐ"," ","s","ˈ","a"," ","ˈ","u","n","ʊ","t","r","ˌ","ɐ","ʃ","ɲ","e"," ","s","t","r","ˈ","a","n","ɛ"," ","s","ˈ","ɛ","k","ʊ","n","d","ˌ","ɑ","r","n","ɛ"," ","d","ˈ","u","ɡ","ɛ","."],"processed_text":"Плава светлост се прелама под већим углом па је стога она са спољашње стране, а црвена са унутрашње стране секундарне дуге.","text":"Плава светлост се прелама под већим углом па је стога она са спољашње стране, а црвена са унутрашње стране секундарне дуге."} From 
bd80cba1f3fc65eab223107f3de5612dbc94c9e5 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Mon, 31 Jul 2023 15:32:02 -0500 Subject: [PATCH 4/9] Add phoneme-silence --- src/cpp/main.cpp | 23 +++++++++++ src/cpp/piper.cpp | 97 +++++++++++++++++++++++++++++++++++++---------- src/cpp/piper.hpp | 15 ++++++++ 3 files changed, 114 insertions(+), 21 deletions(-) diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp index 1242b87..aad42af 100644 --- a/src/cpp/main.cpp +++ b/src/cpp/main.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -76,6 +77,9 @@ struct RunConfig { // "output_file": str, (optional) // } bool jsonInput = false; + + // Seconds of extra silence to insert after a single phoneme + optional> phonemeSilenceSeconds; }; void parseArgs(int argc, char *argv[], RunConfig &runConfig); @@ -185,6 +189,8 @@ int main(int argc, char *argv[]) { runConfig.sentenceSilenceSeconds.value(); } + voice.synthesisConfig.phonemeSilenceSeconds = runConfig.phonemeSilenceSeconds; + if (runConfig.outputType == OUTPUT_DIRECTORY) { runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value()); spdlog::info("Output directory: {}", runConfig.outputPath.value().string()); @@ -453,6 +459,23 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) { } else if (arg == "--sentence_silence" || arg == "--sentence-silence") { ensureArg(argc, argv, i); runConfig.sentenceSilenceSeconds = stof(argv[++i]); + } else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") { + ensureArg(argc, argv, i); + ensureArg(argc, argv, i + 1); + auto phonemeStr = std::string(argv[++i]); + if (!piper::isSingleCodepoint(phonemeStr)) { + std::cerr << "Phoneme '" << phonemeStr + << "' is not a single codepoint (--phoneme_silence)" + << std::endl; + exit(1); + } + + if (!runConfig.phonemeSilenceSeconds) { + runConfig.phonemeSilenceSeconds.emplace(); + } + + auto phoneme = piper::getCodepoint(phonemeStr); + (*runConfig.phonemeSilenceSeconds)[phoneme] = 
stof(argv[++i]); } else if (arg == "--espeak_data" || arg == "--espeak-data") { ensureArg(argc, argv, i); runConfig.eSpeakDataPath = filesystem::path(argv[++i]); diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp index d83dd3f..6da95cd 100644 --- a/src/cpp/piper.cpp +++ b/src/cpp/piper.cpp @@ -30,9 +30,7 @@ const float MAX_WAV_VALUE = 32767.0f; const std::string instanceName{"piper"}; -std::string getVersion() { - return VERSION; -} +std::string getVersion() { return VERSION; } // True if the string is a single UTF-8 codepoint bool isSingleCodepoint(std::string s) { @@ -458,30 +456,90 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text, sentencePhonemes.size(), phonemesStr); } - SynthesisResult sentenceResult; + std::vector>> phrasePhonemes; + std::vector phraseResults; + std::vector phraseSilenceSamples; // Use phoneme/id map from config PhonemeIdConfig idConfig; idConfig.phonemeIdMap = std::make_shared(voice.phonemizeConfig.phonemeIdMap); - // phonemes -> ids - phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes); - if (spdlog::should_log(spdlog::level::debug)) { - // DEBUG log for phoneme ids - std::stringstream phonemeIdsStr; - for (auto phonemeId : phonemeIds) { - phonemeIdsStr << phonemeId << ", "; - } + if (voice.synthesisConfig.phonemeSilenceSeconds) { + // Split into phrases + std::map &phonemeSilenceSeconds = + *voice.synthesisConfig.phonemeSilenceSeconds; - spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}", - sentencePhonemes.size(), phonemeIds.size(), - phonemeIdsStr.str()); + auto currentPhrasePhonemes = std::make_shared>(); + phrasePhonemes.push_back(currentPhrasePhonemes); + + for (auto sentencePhonemesIter = sentencePhonemes.begin(); + sentencePhonemesIter != sentencePhonemes.end(); + sentencePhonemesIter++) { + Phoneme ¤tPhoneme = *sentencePhonemesIter; + currentPhrasePhonemes->push_back(currentPhoneme); + + if (phonemeSilenceSeconds.count(currentPhoneme) > 0) { + // Split at phrase boundary + 
phraseSilenceSamples.push_back( + (std::size_t)(phonemeSilenceSeconds[currentPhoneme] * + voice.synthesisConfig.sampleRate * + voice.synthesisConfig.channels)); + + currentPhrasePhonemes = std::make_shared>(); + phrasePhonemes.push_back(currentPhrasePhonemes); + } + } + } else { + // Use all phonemes + phrasePhonemes.push_back( + std::make_shared>(sentencePhonemes)); } - // ids -> audio - synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer, - sentenceResult); + // Ensure results/samples are the same size + while (phraseResults.size() < phrasePhonemes.size()) { + phraseResults.emplace_back(); + } + + while (phraseSilenceSamples.size() < phrasePhonemes.size()) { + phraseSilenceSamples.push_back(0); + } + + // phonemes -> ids -> audio + for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) { + if (phrasePhonemes[phraseIdx]->size() <= 0) { + continue; + } + + // phonemes -> ids + phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds, + missingPhonemes); + if (spdlog::should_log(spdlog::level::debug)) { + // DEBUG log for phoneme ids + std::stringstream phonemeIdsStr; + for (auto phonemeId : phonemeIds) { + phonemeIdsStr << phonemeId << ", "; + } + + spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}", + phrasePhonemes[phraseIdx]->size(), phonemeIds.size(), + phonemeIdsStr.str()); + } + + // ids -> audio + synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer, + phraseResults[phraseIdx]); + + // Add end of phrase silence + for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) { + audioBuffer.push_back(0); + } + + result.audioSeconds += phraseResults[phraseIdx].audioSeconds; + result.inferSeconds += phraseResults[phraseIdx].inferSeconds; + + phonemeIds.clear(); + } // Add end of sentence silence if (sentenceSilenceSamples > 0) { @@ -496,9 +554,6 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text, audioBuffer.clear(); } - result.audioSeconds += 
sentenceResult.audioSeconds; - result.inferSeconds += sentenceResult.inferSeconds; - phonemeIds.clear(); } diff --git a/src/cpp/piper.hpp b/src/cpp/piper.hpp index 9e7c222..332a619 100644 --- a/src/cpp/piper.hpp +++ b/src/cpp/piper.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -49,14 +50,22 @@ struct PhonemizeConfig { }; struct SynthesisConfig { + // VITS inference settings float noiseScale = 0.667f; float lengthScale = 1.0f; float noiseW = 0.8f; + + // Audio settings int sampleRate = 22050; int sampleWidth = 2; // 16-bit int channels = 1; // mono + + // Speaker id from 0 to numSpeakers - 1 std::optional speakerId; + + // Extra silence float sentenceSilenceSeconds = 0.2f; + std::optional> phonemeSilenceSeconds; }; struct ModelConfig { @@ -89,6 +98,12 @@ struct Voice { ModelSession session; }; +// True if the string is a single UTF-8 codepoint +bool isSingleCodepoint(std::string s); + +// Get the first UTF-8 codepoint of a string +Phoneme getCodepoint(std::string s); + // Get version of Piper std::string getVersion(); From d95dab3bb3c6766b59b2740f5200a3804fd7f4b8 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Mon, 31 Jul 2023 15:53:44 -0500 Subject: [PATCH 5/9] Load phoneme_silence from voice config --- README.md | 15 +++++++++++++++ VERSION | 2 +- src/cpp/main.cpp | 16 +++++++++++++++- src/cpp/piper.cpp | 28 ++++++++++++++++++++++++++-- 4 files changed, 57 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f144049..9f88603 100644 --- a/README.md +++ b/README.md @@ -32,14 +32,18 @@ Our goal is to support Home Assistant and the [Year of Voice](https://www.home-a * Italian (it_IT) * Georgian (ka_GE) * Kazakh (kk_KZ) +* Luxembourgish (lb_LU) * Nepali (ne_NP) * Dutch (nl_BE, nl_NL) * Norwegian (no_NO) * Polish (pl_PL) * Portuguese (pt_BR) +* Romanian (ro_RO) * Russian (ru_RU) +* Serbian (sr_RS) * Swedish (sv_SE) * Swahili (sw_CD) +* Turkish (tr_TR) * Ukrainian (uk_UA) * Vietnamese (vi_VN) * Chinese (zh_CN) @@ 
-81,6 +85,17 @@ For multi-speaker models, use `--speaker ` to change speakers (default: See `piper --help` for more options. +### Streaming Audio + +Piper can stream raw audio to stdout as it's produced: + +``` sh +echo 'This sentence is spoken first. This sentence is synthesized while the first sentence is spoken.' | \ + ./piper --model en_US-lessac-medium.onnx --output-raw | \ + aplay -r 22050 -f S16_LE -t raw - +``` + +This is **raw** audio and not a WAV file, so make sure your audio player is set to play 16-bit mono PCM samples at the correct sample rate for the voice. ### JSON Input diff --git a/VERSION b/VERSION index 9084fa2..26aaba0 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.0 +1.2.0 diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp index aad42af..8972eac 100644 --- a/src/cpp/main.cpp +++ b/src/cpp/main.cpp @@ -189,7 +189,21 @@ int main(int argc, char *argv[]) { runConfig.sentenceSilenceSeconds.value(); } - voice.synthesisConfig.phonemeSilenceSeconds = runConfig.phonemeSilenceSeconds; + if (runConfig.phonemeSilenceSeconds) { + if (!voice.synthesisConfig.phonemeSilenceSeconds) { + // Overwrite + voice.synthesisConfig.phonemeSilenceSeconds = + runConfig.phonemeSilenceSeconds; + } else { + // Merge + for (const auto &[phoneme, silenceSeconds] : + *runConfig.phonemeSilenceSeconds) { + voice.synthesisConfig.phonemeSilenceSeconds->try_emplace( + phoneme, silenceSeconds); + } + } + + } // if phonemeSilenceSeconds if (runConfig.outputType == OUTPUT_DIRECTORY) { runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value()); diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp index 6da95cd..ef7eb49 100644 --- a/src/cpp/piper.cpp +++ b/src/cpp/piper.cpp @@ -140,7 +140,11 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) { // "inference": { // "noise_scale": 0.667, // "length_scale": 1, - // "noise_w": 0.8 + // "noise_w": 0.8, + // "phoneme_silence": { + // "": , + // ... 
+ // } // } // } @@ -166,7 +170,27 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) { if (inferenceValue.contains("noise_w")) { synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f); } - } + + if (inferenceValue.contains("phoneme_silence")) { + // phoneme -> seconds of silence to add after + synthesisConfig.phonemeSilenceSeconds.emplace(); + auto phonemeSilenceValue = inferenceValue["phoneme_silence"]; + for (auto &phonemeItem : phonemeSilenceValue.items()) { + std::string phonemeStr = phonemeItem.key(); + if (!isSingleCodepoint(phonemeStr)) { + spdlog::error("\"{}\" is not a single codepoint", phonemeStr); + throw std::runtime_error( + "Phonemes must be one codepoint (phoneme silence)"); + } + + auto phoneme = getCodepoint(phonemeStr); + (*synthesisConfig.phonemeSilenceSeconds)[phoneme] = + phonemeItem.value().get(); + } + + } // if phoneme_silence + + } // if inference } /* parseSynthesisConfig */ From 6e18236ae0e3657d824de085715f6c9e83484227 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Mon, 31 Jul 2023 16:27:02 -0500 Subject: [PATCH 6/9] Bump to piper-phonemize 1.1.0 --- src/python/requirements.txt | 2 +- src/python_run/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/requirements.txt b/src/python/requirements.txt index 31b0763..2722c13 100644 --- a/src/python/requirements.txt +++ b/src/python/requirements.txt @@ -1,5 +1,5 @@ cython>=0.29.0,<1 -piper-phonemize~=1.0.0 +piper-phonemize~=1.1.0 librosa>=0.9.2,<1 numpy>=1.19.0 onnxruntime>=1.11.0 diff --git a/src/python_run/requirements.txt b/src/python_run/requirements.txt index 84b6a31..a598720 100644 --- a/src/python_run/requirements.txt +++ b/src/python_run/requirements.txt @@ -1,2 +1,2 @@ -piper-phonemize~=1.0.0 +piper-phonemize~=1.1.0 onnxruntime>=1.11.0,<2 From 36fec21382b9687114b81a529d0d028b35905cd1 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Mon, 31 Jul 2023 19:47:08 -0500 Subject: [PATCH 7/9] Upgrade to 
piper-phonemize 1.1.0 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index caf5980..38890ff 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,7 +33,7 @@ RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSIO RUN mkdir -p "lib/Linux-$(uname -m)" # Use pre-compiled Piper phonemization library (includes onnxruntime) -ARG PIPER_PHONEMIZE_VERSION='1.0.0' +ARG PIPER_PHONEMIZE_VERSION='1.1.0' RUN mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \ curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v${PIPER_PHONEMIZE_VERSION}/libpiper_phonemize-${TARGETARCH}${TARGETVARIANT}.tar.gz" | \ tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - From e268564deb779af984ac8f632c98727447632124 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Mon, 31 Jul 2023 19:49:34 -0500 Subject: [PATCH 8/9] Update README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9f88603..bcaf7d0 100644 --- a/README.md +++ b/README.md @@ -60,9 +60,9 @@ The `MODEL_CARD` file for each voice contains important licensing information. 
P You can [run Piper with Python](#running-in-python) or download a binary release: -* [amd64](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_amd64.tar.gz) (64-bit desktop Linux) -* [arm64](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_arm64.tar.gz) (64-bit Raspberry Pi 4) -* [armv7](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_armv7.tar.gz) (32-bit Raspberry Pi 3/4) +* [amd64](https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_amd64.tar.gz) (64-bit desktop Linux) +* [arm64](https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_arm64.tar.gz) (64-bit Raspberry Pi 4) +* [armv7](https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_armv7.tar.gz) (32-bit Raspberry Pi 3/4) If you want to build from source, see the [Makefile](Makefile) and [C++ source](src/cpp). You must download and extract [piper-phonemize](https://github.com/rhasspy/piper-phonemize) to `lib/Linux-$(uname -m)/piper_phonemize` before building. From c7f8671beb5e8fe640aa10239dcb5247f91d09a4 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 1 Aug 2023 09:34:46 -0500 Subject: [PATCH 9/9] Add Thorsten training video --- TRAINING.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TRAINING.md b/TRAINING.md index 4872d93..a0269fc 100644 --- a/TRAINING.md +++ b/TRAINING.md @@ -1,5 +1,7 @@ # Training Guide +Check out a [video training guide by Thorsten Müller](https://www.youtube.com/watch?v=b_we_jma220) + Training a voice for Piper involves 3 main steps: 1. Preparing the dataset @@ -32,7 +34,7 @@ python3 -m venv .venv source .venv/bin/activate pip3 install --upgrade pip pip3 install --upgrade wheel setuptools -pip3 install -r requirements.txt +pip3 install -e . ``` Run the `build_monotonic_align.sh` script in the `src/python` directory to build the extension.