diff --git a/etc/test_sentences/test_es-419.jsonl b/etc/test_sentences/test_es-419.jsonl new file mode 100644 index 0000000..0c201b1 --- /dev/null +++ b/etc/test_sentences/test_es-419.jsonl @@ -0,0 +1,6 @@ +{"phoneme_ids":[1,0,120,0,33,0,26,0,3,0,121,0,14,0,92,0,23,0,27,0,120,0,21,0,92,0,21,0,31,0,3,0,27,0,3,0,120,0,14,0,92,0,23,0,27,0,3,0,120,0,21,0,92,0,21,0,31,0,3,0,120,0,18,0,31,0,3,0,120,0,33,0,26,0,3,0,19,0,18,0,26,0,120,0,27,0,25,0,18,0,26,0,27,0,3,0,120,0,27,0,28,0,122,0,32,0,21,0,23,0,27,0,3,0,21,0,3,0,25,0,121,0,18,0,32,0,18,0,121,0,27,0,92,0,27,0,24,0,120,0,27,0,36,0,21,0,23,0,27,0,3,0,23,0,18,0,3,0,23,0,27,0,26,0,31,0,120,0,21,0,31,0,32,0,18,0,3,0,18,0,26,0,3,0,24,0,14,0,3,0,121,0,14,0,28,0,14,0,92,0,21,0,31,0,22,0,120,0,27,0,26,0,3,0,18,0,26,0,3,0,18,0,24,0,3,0,31,0,22,0,120,0,18,0,24,0,27,0,3,0,41,0,18,0,3,0,120,0,33,0,26,0,3,0,120,0,14,0,92,0,23,0,27,0,3,0,41,0,18,0,3,0,24,0,120,0,33,0,31,0,3,0,25,0,121,0,33,0,24,0,32,0,21,0,23,0,27,0,24,0,120,0,27,0,92,0,8,0,121,0,27,0,92,0,21,0,36,0,21,0,26,0,120,0,14,0,41,0,27,0,3,0,28,0,27,0,92,0,3,0,24,0,14,0,3,0,41,0,121,0,18,0,31,0,23,0,27,0,25,0,28,0,121,0,27,0,31,0,21,0,31,0,22,0,120,0,27,0,26,0,3,0,17,0,18,0,3,0,24,0,14,0,3,0,24,0,120,0,33,0,31,0,3,0,31,0,27,0,24,0,120,0,14,0,92,0,3,0,18,0,26,0,3,0,18,0,24,0,3,0,18,0,31,0,28,0,120,0,18,0,23,0,32,0,92,0,27,0,3,0,125,0,21,0,31,0,120,0,21,0,125,0,24,0,18,0,8,0,24,0,14,0,3,0,23,0,35,0,120,0,14,0,24,0,3,0,31,0,18,0,3,0,28,0,92,0,27,0,41,0,120,0,33,0,31,0,18,0,3,0,28,0,27,0,92,0,3,0,30,0,121,0,18,0,19,0,92,0,14,0,23,0,31,0,22,0,120,0,27,0,26,0,8,0,23,0,35,0,121,0,14,0,26,0,17,0,27,0,3,0,24,0,27,0,31,0,3,0,30,0,120,0,14,0,115,0,27,0,31,0,3,0,41,0,18,0,24,0,3,0,31,0,120,0,27,0,24,0,3,0,121,0,14,0,32,0,92,0,14,0,125,0,22,0,120,0,18,0,31,0,14,0,25,0,3,0,28,0,18,0,23,0,120,0,18,0,82,0,14,0,31,0,3,0,68,0,120,0,27,0,32,0,14,0,31,0,3,0,41,0,18,0,3,0,120,0,14,0,68,0,35,0,14,0,3,0,23,0,121,0,27,0,26,0,32,0,18,0,26,0,120,0,21,0,41,0,14,0,31,0,3,0,18,0,26,0,3,0,24,0,14,0,3,0,14,0,32,0,25,0,120,0,27,0,31,0,19,0,18,0,92,0,14,0,3,0,32,0,18,0,30,0,120,0,18,0,31,0,32,0,92,0,18,0,10,0,2],"phonemes":["ˈ","u","n"," ","ˌ","a","ɾ","k","o","ˈ","i","ɾ","i","s"," ","o"," ","ˈ","a","ɾ","k","o"," ","ˈ","i","ɾ","i","s"," ","ˈ","e","s"," ","ˈ","u","n"," ","f","e","n","ˈ","o","m","e","n","o"," ","ˈ","o","p","ː","t","i","k","o"," ","i"," ","m","ˌ","e","t","e","ˌ","o","ɾ","o","l","ˈ","o","x","i","k","o"," ","k","e"," ","k","o","n","s","ˈ","i","s","t","e"," ","e","n"," ","l","a"," ","ˌ","a","p","a","ɾ","i","s","j","ˈ","o","n"," ","e","n"," ","e","l"," ","s","j","ˈ","e","l","o"," ","ð","e"," ","ˈ","u","n"," ","ˈ","a","ɾ","k","o"," ","ð","e"," ","l","ˈ","u","s"," ","m","ˌ","u","l","t","i","k","o","l","ˈ","o","ɾ",",","ˌ","o","ɾ","i","x","i","n","ˈ","a","ð","o"," ","p","o","ɾ"," ","l","a"," ","ð","ˌ","e","s","k","o","m","p","ˌ","o","s","i","s","j","ˈ","o","n"," ","d","e"," ","l","a"," ","l","ˈ","u","s"," ","s","o","l","ˈ","a","ɾ"," ","e","n"," ","e","l"," ","e","s","p","ˈ","e","k","t","ɾ","o"," ","β","i","s","ˈ","i","β","l","e",",","l","a"," ","k","w","ˈ","a","l"," ","s","e"," ","p","ɾ","o","ð","ˈ","u","s","e"," ","p","o","ɾ"," ","r","ˌ","e","f","ɾ","a","k","s","j","ˈ","o","n",",","k","w","ˌ","a","n","d","o"," ","l","o","s"," ","r","ˈ","a","ʝ","o","s"," ","ð","e","l"," ","s","ˈ","o","l"," ","ˌ","a","t","ɾ","a","β","j","ˈ","e","s","a","m"," ","p","e","k","ˈ","e","ɲ","a","s"," ","ɣ","ˈ","o","t","a","s"," ","ð","e"," ","ˈ","a","ɣ","w","a"," ","k","ˌ","o","n","t","e","n","ˈ","i","ð","a","s"," ","e","n"," ","l","a"," ","a","t","m","ˈ","o","s","f","e","ɾ","a"," ","t","e","r","ˈ","e","s","t","ɾ","e","."],"processed_text":"Un arcoíris​ o arco iris es un fenómeno óptico y meteorológico que consiste en la aparición en el cielo de un arco de luz multicolor, originado por la descomposición de la luz solar en el espectro visible, la cual se produce por refracción, cuando los rayos del sol atraviesan pequeñas gotas de agua contenidas en la atmósfera terrestre.","text":"Un arcoíris​ o arco iris es un fenómeno óptico y meteorológico que consiste en la aparición en el cielo de un arco de luz multicolor, originado por la descomposición de la luz solar en el espectro visible, la cual se produce por refracción, cuando los rayos del sol atraviesan pequeñas gotas de agua contenidas en la atmósfera terrestre."} +{"phoneme_ids":[1,0,120,0,18,0,31,0,3,0,120,0,33,0,26,0,3,0,120,0,14,0,92,0,23,0,27,0,3,0,23,0,27,0,25,0,28,0,35,0,120,0,18,0,31,0,32,0,27,0,3,0,41,0,18,0,3,0,120,0,14,0,92,0,23,0,27,0,31,0,3,0,23,0,27,0,26,0,31,0,120,0,18,0,26,0,32,0,92,0,21,0,23,0,27,0,31,0,3,0,41,0,18,0,3,0,23,0,27,0,24,0,120,0,27,0,92,0,18,0,31,0,8,0,31,0,21,0,26,0,3,0,31,0,121,0,27,0,24,0,33,0,31,0,22,0,120,0,27,0,26,0,3,0,17,0,18,0,3,0,23,0,121,0,27,0,26,0,32,0,21,0,26,0,35,0,21,0,41,0,120,0,14,0,41,0,3,0,121,0,18,0,26,0,32,0,92,0,18,0,3,0,120,0,18,0,22,0,22,0,27,0,31,0,8,0,23,0,27,0,26,0,3,0,18,0,24,0,3,0,30,0,120,0,27,0,36,0,27,0,3,0,121,0,14,0,31,0,22,0,14,0,3,0,24,0,14,0,3,0,28,0,120,0,14,0,92,0,32,0,18,0,3,0,121,0,18,0,23,0,31,0,32,0,18,0,92,0,22,0,120,0,27,0,92,0,3,0,21,0,3,0,18,0,24,0,3,0,125,0,22,0,27,0,24,0,120,0,18,0,32,0,14,0,3,0,121,0,14,0,31,0,22,0,14,0,3,0,18,0,24,0,3,0,121,0,21,0,26,0,32,0,18,0,92,0,22,0,120,0,27,0,92,0,10,0,2],"phonemes":["ˈ","e","s"," ","ˈ","u","n"," ","ˈ","a","ɾ","k","o"," ","k","o","m","p","w","ˈ","e","s","t","o"," ","ð","e"," ","ˈ","a","ɾ","k","o","s"," ","k","o","n","s","ˈ","e","n","t","ɾ","i","k","o","s"," ","ð","e"," ","k","o","l","ˈ","o","ɾ","e","s",",","s","i","n"," ","s","ˌ","o","l","u","s","j","ˈ","o","n"," ","d","e"," ","k","ˌ","o","n","t","i","n","w","i","ð","ˈ","a","ð"," ","ˌ","e","n","t","ɾ","e"," ","ˈ","e","j","j","o","s",",","k","o","n"," ","e","l"," ","r","ˈ","o","x","o"," ","ˌ","a","s","j","a"," ","l","a"," ","p","ˈ","a","ɾ","t","e"," ","ˌ","e","k","s","t","e","ɾ","j","ˈ","o","ɾ"," ","i"," ","e","l"," ","β","j","o","l","ˈ","e","t","a"," ","ˌ","a","s","j","a"," ","e","l"," ","ˌ","i","n","t","e","ɾ","j","ˈ","o","ɾ","."],"processed_text":"Es un arco compuesto de arcos concéntricos de colores, sin solución de continuidad entre ellos, con el rojo hacia la parte exterior y el violeta hacia el interior.","text":"Es un arco compuesto de arcos concéntricos de colores, sin solución de continuidad entre ellos, con el rojo hacia la parte exterior y el violeta hacia el interior."} +{"phoneme_ids":[1,0,14,0,3,0,121,0,14,0,24,0,32,0,21,0,32,0,120,0,33,0,17,0,3,0,31,0,121,0,33,0,19,0,21,0,31,0,22,0,120,0,61,0,26,0,32,0,18,0,8,0,28,0,27,0,92,0,3,0,18,0,36,0,120,0,18,0,25,0,28,0,24,0,27,0,3,0,23,0,35,0,121,0,14,0,26,0,17,0,27,0,3,0,31,0,18,0,3,0,125,0,22,0,120,0,14,0,36,0,14,0,3,0,18,0,26,0,3,0,14,0,125,0,22,0,120,0,27,0,26,0,8,0,18,0,24,0,3,0,121,0,14,0,92,0,23,0,27,0,120,0,21,0,92,0,21,0,31,0,3,0,31,0,18,0,3,0,28,0,35,0,120,0,18,0,41,0,18,0,3,0,121,0,27,0,125,0,31,0,18,0,92,0,125,0,120,0,14,0,92,0,3,0,18,0,26,0,3,0,19,0,120,0,27,0,92,0,25,0,14,0,3,0,41,0,18,0,3,0,31,0,120,0,21,0,92,0,23,0,33,0,24,0,27,0,3,0,23,0,27,0,25,0,28,0,24,0,120,0,18,0,32,0,27,0,10,0,2],"phonemes":["a"," ","ˌ","a","l","t","i","t","ˈ","u","d"," ","s","ˌ","u","f","i","s","j","ˈ","ɛ","n","t","e",",","p","o","ɾ"," ","e","x","ˈ","e","m","p","l","o"," ","k","w","ˌ","a","n","d","o"," ","s","e"," ","β","j","ˈ","a","x","a"," ","e","n"," ","a","β","j","ˈ","o","n",",","e","l"," ","ˌ","a","ɾ","k","o","ˈ","i","ɾ","i","s"," ","s","e"," ","p","w","ˈ","e","ð","e"," ","ˌ","o","β","s","e","ɾ","β","ˈ","a","ɾ"," ","e","n"," ","f","ˈ","o","ɾ","m","a"," ","ð","e"," ","s","ˈ","i","ɾ","k","u","l","o"," ","k","o","m","p","l","ˈ","e","t","o","."],"processed_text":"A altitud suficiente, por ejemplo cuando se viaja en avión, el arcoíris se puede observar en forma de círculo completo.","text":"A altitud suficiente, por ejemplo cuando se viaja en avión, el arcoíris se puede observar en forma de círculo completo."} +{"phoneme_ids":[1,0,15,0,121,0,18,0,44,0,36,0,14,0,25,0,120,0,21,0,25,0,3,0,28,0,21,0,41,0,22,0,120,0,27,0,3,0,120,0,33,0,26,0,14,0,3,0,125,0,18,0,125,0,120,0,21,0,41,0,14,0,3,0,41,0,18,0,3,0,23,0,120,0,21,0,35,0,21,0,3,0,21,0,3,0,19,0,92,0,120,0,18,0,31,0,14,0,12,0,26,0,27,0,120,0,18,0,8,0,31,0,21,0,25,0,3,0,15,0,121,0,18,0,92,0,68,0,33,0,120,0,61,0,26,0,31,0,14,0,8,0,24,0,14,0,3,0,25,0,120,0,14,0,31,0,3,0,121,0,18,0,23,0,31,0,23,0,21,0,31,0,120,0,21,0,32,0,14,0,3,0,32,0,96,0,14,0,25,0,28,0,120,0,14,0,82,0,14,0,3,0,41,0,18,0,24,0,3,0,25,0,18,0,26,0,120,0,33,0,10,0,2],"phonemes":["b","ˌ","e","ŋ","x","a","m","ˈ","i","m"," ","p","i","ð","j","ˈ","o"," ","ˈ","u","n","a"," ","β","e","β","ˈ","i","ð","a"," ","ð","e"," ","k","ˈ","i","w","i"," ","i"," ","f","ɾ","ˈ","e","s","a",";","n","o","ˈ","e",",","s","i","m"," ","b","ˌ","e","ɾ","ɣ","u","ˈ","ɛ","n","s","a",",","l","a"," ","m","ˈ","a","s"," ","ˌ","e","k","s","k","i","s","ˈ","i","t","a"," ","t","ʃ","a","m","p","ˈ","a","ɲ","a"," ","ð","e","l"," ","m","e","n","ˈ","u","."],"processed_text":"Benjamín pidió una bebida de kiwi y fresa; Noé, sin vergüenza, la más exquisita champaña del menú.","text":"Benjamín pidió una bebida de kiwi y fresa; Noé, sin vergüenza, la más exquisita champaña del menú."} +{"phoneme_ids":[1,0,36,0,27,0,31,0,120,0,18,0,3,0,23,0,27,0,25,0,28,0,92,0,120,0,27,0,3,0,120,0,33,0,26,0,14,0,3,0,125,0,22,0,120,0,18,0,36,0,14,0,3,0,31,0,14,0,25,0,28,0,120,0,27,0,82,0,14,0,3,0,18,0,25,0,3,0,28,0,18,0,92,0,120,0,33,0,10,0,2,1,0,121,0,18,0,23,0,31,0,23,0,33,0,31,0,120,0,14,0,26,0,17,0,27,0,31,0,18,0,8,0,31,0,27,0,19,0,120,0,21,0,14,0,3,0,32,0,21,0,92,0,120,0,27,0,3,0,31,0,33,0,3,0,35,0,120,0,21,0,31,0,23,0,21,0,3,0,14,0,24,0,3,0,41,0,121,0,18,0,31,0,14,0,68,0,120,0,33,0,18,0,3,0,41,0,18,0,3,0,24,0,14,0,3,0,125,0,14,0,26,0,23,0,120,0,18,0,32,0,14,0,10,0,2],"phonemes":["x","o","s","ˈ","e"," ","k","o","m","p","ɾ","ˈ","o"," ","ˈ","u","n","a"," ","β","j","ˈ","e","x","a"," ","s","a","m","p","ˈ","o","ɲ","a"," ","e","m"," ","p","e","ɾ","ˈ","u",".","ˌ","e","k","s","k","u","s","ˈ","a","n","d","o","s","e",",","s","o","f","ˈ","i","a"," ","t","i","ɾ","ˈ","o"," ","s","u"," ","w","ˈ","i","s","k","i"," ","a","l"," ","ð","ˌ","e","s","a","ɣ","ˈ","u","e"," ","ð","e"," ","l","a"," ","β","a","n","k","ˈ","e","t","a","."],"processed_text":"José compró una vieja zampoña en Perú. Excusándose, Sofía tiró su whisky al desagüe de la banqueta.","text":"José compró una vieja zampoña en Perú. Excusándose, Sofía tiró su whisky al desagüe de la banqueta."} +{"phoneme_ids":[1,0,18,0,24,0,3,0,125,0,18,0,24,0,120,0,27,0,31,0,3,0,25,0,33,0,92,0,31,0,22,0,120,0,18,0,24,0,14,0,68,0,27,0,3,0,21,0,26,0,17,0,120,0,33,0,3,0,23,0,27,0,25,0,120,0,21,0,14,0,3,0,19,0,18,0,24,0,120,0,21,0,31,0,3,0,23,0,14,0,92,0,41,0,120,0,21,0,22,0,22,0,27,0,3,0,21,0,3,0,23,0,120,0,21,0,35,0,21,0,10,0,2,1,0,24,0,14,0,3,0,31,0,121,0,21,0,68,0,33,0,120,0,18,0,82,0,14,0,3,0,32,0,27,0,23,0,120,0,14,0,125,0,14,0,3,0,18,0,24,0,3,0,31,0,121,0,14,0,23,0,31,0,27,0,19,0,120,0,27,0,26,0,3,0,17,0,18,0,32,0,92,0,120,0,14,0,31,0,3,0,41,0,18,0,24,0,3,0,28,0,14,0,24,0,120,0,61,0,26,0,23,0,18,0,3,0,41,0,18,0,3,0,28,0,120,0,14,0,36,0,14,0,10,0,2],"phonemes":["e","l"," ","β","e","l","ˈ","o","s"," ","m","u","ɾ","s","j","ˈ","e","l","a","ɣ","o"," ","i","n","d","ˈ","u"," ","k","o","m","ˈ","i","a"," ","f","e","l","ˈ","i","s"," ","k","a","ɾ","ð","ˈ","i","j","j","o"," ","i"," ","k","ˈ","i","w","i",".","l","a"," ","s","ˌ","i","ɣ","u","ˈ","e","ɲ","a"," ","t","o","k","ˈ","a","β","a"," ","e","l"," ","s","ˌ","a","k","s","o","f","ˈ","o","n"," ","d","e","t","ɾ","ˈ","a","s"," ","ð","e","l"," ","p","a","l","ˈ","ɛ","n","k","e"," ","ð","e"," ","p","ˈ","a","x","a","."],"processed_text":"El veloz murciélago hindú comía feliz cardillo y kiwi. La cigüeña tocaba el saxofón detrás del palenque de paja.","text":"El veloz murciélago hindú comía feliz cardillo y kiwi. La cigüeña tocaba el saxofón detrás del palenque de paja."} diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp index ede7bbb..2be0019 100644 --- a/src/cpp/piper.cpp +++ b/src/cpp/piper.cpp @@ -67,6 +67,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) { for (auto &fromPhonemeItem : phonemeIdMapValue.items()) { std::string fromPhoneme = fromPhonemeItem.key(); if (!isSingleCodepoint(fromPhoneme)) { + std::stringstream idsStr; + for (auto &toIdValue : fromPhonemeItem.value()) { + PhonemeId toId = toIdValue.get(); + idsStr << toId << ","; + } + + spdlog::error("\"{}\" is not a single codepoint (ids={})", fromPhoneme, + idsStr.str()); throw std::runtime_error( "Phonemes must be one codepoint (phoneme id map)"); } @@ -90,6 +98,7 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) { for (auto &fromPhonemeItem : phonemeMapValue.items()) { std::string fromPhoneme = fromPhonemeItem.key(); if (!isSingleCodepoint(fromPhoneme)) { + spdlog::error("\"{}\" is not a single codepoint", fromPhoneme); throw std::runtime_error( "Phonemes must be one codepoint (phoneme map)"); } @@ -424,19 +433,10 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text, SynthesisResult sentenceResult; + // Use phoneme/id map from config PhonemeIdConfig idConfig; - if (voice.phonemizeConfig.phonemeType == TextPhonemes) { - auto &language = voice.phonemizeConfig.eSpeak.voice; - spdlog::debug("Text phoneme language: {}", language); - if (DEFAULT_ALPHABET.count(language) < 1) { - throw std::runtime_error( - "Text phoneme language for voice is not supported"); - } - - // Use alphabet for language - idConfig.phonemeIdMap = - std::make_shared(DEFAULT_ALPHABET[language]); - } + idConfig.phonemeIdMap = + std::make_shared(voice.phonemizeConfig.phonemeIdMap); // phonemes -> ids phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes); diff --git a/src/python/piper_train/VERSION b/src/python/piper_train/VERSION index 6e8bf73..3eefcb9 100644 --- a/src/python/piper_train/VERSION +++ b/src/python/piper_train/VERSION @@ -1 +1 @@ -0.1.0 +1.0.0 diff --git a/src/python/piper_train/preprocess.py b/src/python/piper_train/preprocess.py index 045deda..205563e 100644 --- a/src/python/piper_train/preprocess.py +++ b/src/python/piper_train/preprocess.py @@ -26,6 +26,8 @@ from .phonemize import ( phonemize, ) +_DIR = Path(__file__).parent +_VERSION = (_DIR / "VERSION").read_text(encoding="utf-8").strip() _LOGGER = logging.getLogger("preprocess") @@ -151,6 +153,7 @@ def main() -> None: "num_symbols": MAX_PHONEMES, "num_speakers": len(speaker_counts), "speaker_id_map": speaker_ids, + "piper_version": _VERSION, }, config_file, ensure_ascii=False,