Add uk test sentences

This commit is contained in:
Michael Hansen
2023-05-17 11:12:39 -05:00
parent bee661db8d
commit 27b81a800d
4 changed files with 64 additions and 9 deletions
+7
View File
@@ -0,0 +1,7 @@
{"text": "Весе́лка, також ра́йдуга оптичне явище в атмосфері, що являє собою одну, дві чи декілька різнокольорових дуг ,або кіл, якщо дивитися з повітря, що спостерігаються на тлі хмари, якщо вона розташована проти Сонця.", "phonemes": ["в", "е", "с", "е", "́", "л", "к", "а", ",", " ", "т", "а", "к", "о", "ж", " ", "р", "а", "́", "и", "̆", "д", "у", "г", "а", " ", "о", "п", "т", "и", "ч", "н", "е", " ", "я", "в", "и", "щ", "е", " ", "в", " ", "а", "т", "м", "о", "с", "ф", "е", "р", "і", ",", " ", "щ", "о", " ", "я", "в", "л", "я", "є", " ", "с", "о", "б", "о", "ю", " ", "о", "д", "н", "у", ",", " ", "д", "в", "і", " ", "ч", "и", " ", "д", "е", "к", "і", "л", "ь", "к", "а", " ", "р", "і", "з", "н", "о", "к", "о", "л", "ь", "о", "р", "о", "в", "и", "х", " ", "д", "у", "г", " ", ",", "а", "б", "о", " ", "к", "і", "л", ",", " ", "я", "к", "щ", "о", " ", "д", "и", "в", "и", "т", "и", "с", "я", " ", "з", " ", "п", "о", "в", "і", "т", "р", "я", ",", " ", "щ", "о", " ", "с", "п", "о", "с", "т", "е", "р", "і", "г", "а", "ю", "т", "ь", "с", "я", " ", "н", "а", " ", "т", "л", "і", " ", "х", "м", "а", "р", "и", ",", " ", "я", "к", "щ", "о", " ", "в", "о", "н", "а", " ", "р", "о", "з", "т", "а", "ш", "о", "в", "а", "н", "а", " ", "п", "р", "о", "т", "и", " ", "с", "о", "н", "ц", "я", "."], "phoneme_ids": [1, 0, 14, 0, 18, 0, 33, 0, 18, 0, 45, 0, 27, 0, 26, 0, 12, 0, 6, 0, 3, 0, 34, 0, 12, 0, 26, 0, 30, 0, 20, 0, 3, 0, 32, 0, 12, 0, 45, 0, 22, 0, 46, 0, 17, 0, 35, 0, 15, 0, 12, 0, 3, 0, 30, 0, 31, 0, 34, 0, 22, 0, 39, 0, 29, 0, 18, 0, 3, 0, 44, 0, 14, 0, 22, 0, 41, 0, 18, 0, 3, 0, 14, 0, 3, 0, 12, 0, 34, 0, 28, 0, 30, 0, 33, 0, 36, 0, 18, 0, 32, 0, 23, 0, 6, 0, 3, 0, 41, 0, 30, 0, 3, 0, 44, 0, 14, 0, 27, 0, 44, 0, 19, 0, 3, 0, 33, 0, 30, 0, 13, 0, 30, 0, 43, 0, 3, 0, 30, 0, 17, 0, 29, 0, 35, 0, 6, 0, 3, 0, 17, 0, 14, 0, 23, 0, 3, 0, 39, 0, 22, 0, 3, 0, 17, 0, 18, 0, 26, 0, 23, 0, 27, 0, 42, 0, 26, 0, 12, 0, 3, 0, 32, 0, 23, 0, 21, 0, 29, 0, 30, 0, 26, 0, 30, 0, 27, 0, 42, 0, 30, 
0, 32, 0, 30, 0, 14, 0, 22, 0, 37, 0, 3, 0, 17, 0, 35, 0, 15, 0, 3, 0, 6, 0, 12, 0, 13, 0, 30, 0, 3, 0, 26, 0, 23, 0, 27, 0, 6, 0, 3, 0, 44, 0, 26, 0, 41, 0, 30, 0, 3, 0, 17, 0, 22, 0, 14, 0, 22, 0, 34, 0, 22, 0, 33, 0, 44, 0, 3, 0, 21, 0, 3, 0, 31, 0, 30, 0, 14, 0, 23, 0, 34, 0, 32, 0, 44, 0, 6, 0, 3, 0, 41, 0, 30, 0, 3, 0, 33, 0, 31, 0, 30, 0, 33, 0, 34, 0, 18, 0, 32, 0, 23, 0, 15, 0, 12, 0, 43, 0, 34, 0, 42, 0, 33, 0, 44, 0, 3, 0, 29, 0, 12, 0, 3, 0, 34, 0, 27, 0, 23, 0, 3, 0, 37, 0, 28, 0, 12, 0, 32, 0, 22, 0, 6, 0, 3, 0, 44, 0, 26, 0, 41, 0, 30, 0, 3, 0, 14, 0, 30, 0, 29, 0, 12, 0, 3, 0, 32, 0, 30, 0, 21, 0, 34, 0, 12, 0, 40, 0, 30, 0, 14, 0, 12, 0, 29, 0, 12, 0, 3, 0, 31, 0, 32, 0, 30, 0, 34, 0, 22, 0, 3, 0, 33, 0, 30, 0, 29, 0, 38, 0, 44, 0, 8, 0, 2]}
{"text": "Червоний колір ми бачимо з зовнішнього боку первинної веселки, а фіолетовий — із внутрішнього.", "phonemes": ["ч", "е", "р", "в", "о", "н", "и", "и", "̆", " ", "к", "о", "л", "і", "р", " ", "м", "и", " ", "б", "а", "ч", "и", "м", "о", " ", "з", " ", "з", "о", "в", "н", "і", "ш", "н", "ь", "о", "г", "о", " ", "б", "о", "к", "у", " ", "п", "е", "р", "в", "и", "н", "н", "о", "і", "̈", " ", "в", "е", "с", "е", "л", "к", "и", ",", " ", "а", " ", "ф", "і", "о", "л", "е", "т", "о", "в", "и", "и", "̆", " ", "—", " ", "і", "з", " ", "в", "н", "у", "т", "р", "і", "ш", "н", "ь", "о", "г", "о", "."], "phoneme_ids": [1, 0, 39, 0, 18, 0, 32, 0, 14, 0, 30, 0, 29, 0, 22, 0, 22, 0, 46, 0, 3, 0, 26, 0, 30, 0, 27, 0, 23, 0, 32, 0, 3, 0, 28, 0, 22, 0, 3, 0, 13, 0, 12, 0, 39, 0, 22, 0, 28, 0, 30, 0, 3, 0, 21, 0, 3, 0, 21, 0, 30, 0, 14, 0, 29, 0, 23, 0, 40, 0, 29, 0, 42, 0, 30, 0, 15, 0, 30, 0, 3, 0, 13, 0, 30, 0, 26, 0, 35, 0, 3, 0, 31, 0, 18, 0, 32, 0, 14, 0, 22, 0, 29, 0, 29, 0, 30, 0, 23, 0, 47, 0, 3, 0, 14, 0, 18, 0, 33, 0, 18, 0, 27, 0, 26, 0, 22, 0, 6, 0, 3, 0, 12, 0, 3, 0, 36, 0, 23, 0, 30, 0, 27, 0, 18, 0, 34, 0, 30, 0, 14, 0, 22, 0, 22, 0, 46, 0, 3, 0, 48, 0, 3, 0, 23, 0, 21, 0, 3, 0, 14, 0, 29, 0, 35, 0, 34, 0, 32, 0, 23, 0, 40, 0, 29, 0, 42, 0, 30, 0, 15, 0, 30, 0, 8, 0, 2]}
{"text": "Веселка пов'язана з заломленням і відбиттям ,деякою мірою і з дифракцією, сонячного світла у водяних краплях, зважених у повітрі.", "phonemes": ["в", "е", "с", "е", "л", "к", "а", " ", "п", "о", "в", "'", "я", "з", "а", "н", "а", " ", "з", " ", "з", "а", "л", "о", "м", "л", "е", "н", "н", "я", "м", " ", "і", " ", "в", "і", "д", "б", "и", "т", "т", "я", "м", " ", ",", "д", "е", "я", "к", "о", "ю", " ", "м", "і", "р", "о", "ю", " ", "і", " ", "з", " ", "д", "и", "ф", "р", "а", "к", "ц", "і", "є", "ю", ",", " ", "с", "о", "н", "я", "ч", "н", "о", "г", "о", " ", "с", "в", "і", "т", "л", "а", " ", "у", " ", "в", "о", "д", "я", "н", "и", "х", " ", "к", "р", "а", "п", "л", "я", "х", ",", " ", "з", "в", "а", "ж", "е", "н", "и", "х", " ", "у", " ", "п", "о", "в", "і", "т", "р", "і", "."], "phoneme_ids": [1, 0, 14, 0, 18, 0, 33, 0, 18, 0, 27, 0, 26, 0, 12, 0, 3, 0, 31, 0, 30, 0, 14, 0, 5, 0, 44, 0, 21, 0, 12, 0, 29, 0, 12, 0, 3, 0, 21, 0, 3, 0, 21, 0, 12, 0, 27, 0, 30, 0, 28, 0, 27, 0, 18, 0, 29, 0, 29, 0, 44, 0, 28, 0, 3, 0, 23, 0, 3, 0, 14, 0, 23, 0, 17, 0, 13, 0, 22, 0, 34, 0, 34, 0, 44, 0, 28, 0, 3, 0, 6, 0, 17, 0, 18, 0, 44, 0, 26, 0, 30, 0, 43, 0, 3, 0, 28, 0, 23, 0, 32, 0, 30, 0, 43, 0, 3, 0, 23, 0, 3, 0, 21, 0, 3, 0, 17, 0, 22, 0, 36, 0, 32, 0, 12, 0, 26, 0, 38, 0, 23, 0, 19, 0, 43, 0, 6, 0, 3, 0, 33, 0, 30, 0, 29, 0, 44, 0, 39, 0, 29, 0, 30, 0, 15, 0, 30, 0, 3, 0, 33, 0, 14, 0, 23, 0, 34, 0, 27, 0, 12, 0, 3, 0, 35, 0, 3, 0, 14, 0, 30, 0, 17, 0, 44, 0, 29, 0, 22, 0, 37, 0, 3, 0, 26, 0, 32, 0, 12, 0, 31, 0, 27, 0, 44, 0, 37, 0, 6, 0, 3, 0, 21, 0, 14, 0, 12, 0, 20, 0, 18, 0, 29, 0, 22, 0, 37, 0, 3, 0, 35, 0, 3, 0, 31, 0, 30, 0, 14, 0, 23, 0, 34, 0, 32, 0, 23, 0, 8, 0, 2]}
{"text": "Ці крапельки по-різному відхиляють світло різних кольорів, у результаті чого біле світло розкладається на спектр.", "phonemes": ["ц", "і", " ", "к", "р", "а", "п", "е", "л", "ь", "к", "и", " ", "п", "о", "-", "р", "і", "з", "н", "о", "м", "у", " ", "в", "і", "д", "х", "и", "л", "я", "ю", "т", "ь", " ", "с", "в", "і", "т", "л", "о", " ", "р", "і", "з", "н", "и", "х", " ", "к", "о", "л", "ь", "о", "р", "і", "в", ",", " ", "у", " ", "р", "е", "з", "у", "л", "ь", "т", "а", "т", "і", " ", "ч", "о", "г", "о", " ", "б", "і", "л", "е", " ", "с", "в", "і", "т", "л", "о", " ", "р", "о", "з", "к", "л", "а", "д", "а", "є", "т", "ь", "с", "я", " ", "н", "а", " ", "с", "п", "е", "к", "т", "р", "."], "phoneme_ids": [1, 0, 38, 0, 23, 0, 3, 0, 26, 0, 32, 0, 12, 0, 31, 0, 18, 0, 27, 0, 42, 0, 26, 0, 22, 0, 3, 0, 31, 0, 30, 0, 7, 0, 32, 0, 23, 0, 21, 0, 29, 0, 30, 0, 28, 0, 35, 0, 3, 0, 14, 0, 23, 0, 17, 0, 37, 0, 22, 0, 27, 0, 44, 0, 43, 0, 34, 0, 42, 0, 3, 0, 33, 0, 14, 0, 23, 0, 34, 0, 27, 0, 30, 0, 3, 0, 32, 0, 23, 0, 21, 0, 29, 0, 22, 0, 37, 0, 3, 0, 26, 0, 30, 0, 27, 0, 42, 0, 30, 0, 32, 0, 23, 0, 14, 0, 6, 0, 3, 0, 35, 0, 3, 0, 32, 0, 18, 0, 21, 0, 35, 0, 27, 0, 42, 0, 34, 0, 12, 0, 34, 0, 23, 0, 3, 0, 39, 0, 30, 0, 15, 0, 30, 0, 3, 0, 13, 0, 23, 0, 27, 0, 18, 0, 3, 0, 33, 0, 14, 0, 23, 0, 34, 0, 27, 0, 30, 0, 3, 0, 32, 0, 30, 0, 21, 0, 26, 0, 27, 0, 12, 0, 17, 0, 12, 0, 19, 0, 34, 0, 42, 0, 33, 0, 44, 0, 3, 0, 29, 0, 12, 0, 3, 0, 33, 0, 31, 0, 18, 0, 26, 0, 34, 0, 32, 0, 8, 0, 2]}
{"text": "Спостерігач, що стоїть спиною до джерела світла, бачить різнобарвне світіння, що виходить із простору по концентричному колу ,дузі.", "phonemes": ["с", "п", "о", "с", "т", "е", "р", "і", "г", "а", "ч", ",", " ", "щ", "о", " ", "с", "т", "о", "і", "̈", "т", "ь", " ", "с", "п", "и", "н", "о", "ю", " ", "д", "о", " ", "д", "ж", "е", "р", "е", "л", "а", " ", "с", "в", "і", "т", "л", "а", ",", " ", "б", "а", "ч", "и", "т", "ь", " ", "р", "і", "з", "н", "о", "б", "а", "р", "в", "н", "е", " ", "с", "в", "і", "т", "і", "н", "н", "я", ",", " ", "щ", "о", " ", "в", "и", "х", "о", "д", "и", "т", "ь", " ", "і", "з", " ", "п", "р", "о", "с", "т", "о", "р", "у", " ", "п", "о", " ", "к", "о", "н", "ц", "е", "н", "т", "р", "и", "ч", "н", "о", "м", "у", " ", "к", "о", "л", "у", " ", ",", "д", "у", "з", "і", "."], "phoneme_ids": [1, 0, 33, 0, 31, 0, 30, 0, 33, 0, 34, 0, 18, 0, 32, 0, 23, 0, 15, 0, 12, 0, 39, 0, 6, 0, 3, 0, 41, 0, 30, 0, 3, 0, 33, 0, 34, 0, 30, 0, 23, 0, 47, 0, 34, 0, 42, 0, 3, 0, 33, 0, 31, 0, 22, 0, 29, 0, 30, 0, 43, 0, 3, 0, 17, 0, 30, 0, 3, 0, 17, 0, 20, 0, 18, 0, 32, 0, 18, 0, 27, 0, 12, 0, 3, 0, 33, 0, 14, 0, 23, 0, 34, 0, 27, 0, 12, 0, 6, 0, 3, 0, 13, 0, 12, 0, 39, 0, 22, 0, 34, 0, 42, 0, 3, 0, 32, 0, 23, 0, 21, 0, 29, 0, 30, 0, 13, 0, 12, 0, 32, 0, 14, 0, 29, 0, 18, 0, 3, 0, 33, 0, 14, 0, 23, 0, 34, 0, 23, 0, 29, 0, 29, 0, 44, 0, 6, 0, 3, 0, 41, 0, 30, 0, 3, 0, 14, 0, 22, 0, 37, 0, 30, 0, 17, 0, 22, 0, 34, 0, 42, 0, 3, 0, 23, 0, 21, 0, 3, 0, 31, 0, 32, 0, 30, 0, 33, 0, 34, 0, 30, 0, 32, 0, 35, 0, 3, 0, 31, 0, 30, 0, 3, 0, 26, 0, 30, 0, 29, 0, 38, 0, 18, 0, 29, 0, 34, 0, 32, 0, 22, 0, 39, 0, 29, 0, 30, 0, 28, 0, 35, 0, 3, 0, 26, 0, 30, 0, 27, 0, 35, 0, 3, 0, 6, 0, 17, 0, 35, 0, 21, 0, 23, 0, 8, 0, 2]}
{"text": "Чуєш їх, доцю, га? Кумедна ж ти, прощайся без ґольфів!", "phonemes": ["ч", "у", "є", "ш", " ", "і", "̈", "х", ",", " ", "д", "о", "ц", "ю", ",", " ", "г", "а", "?", " ", "к", "у", "м", "е", "д", "н", "а", " ", "ж", " ", "т", "и", ",", " ", "п", "р", "о", "щ", "а", "и", "̆", "с", "я", " ", "б", "е", "з", " ", "ґ", "о", "л", "ь", "ф", "і", "в", "!"], "phoneme_ids": [1, 0, 39, 0, 35, 0, 19, 0, 40, 0, 3, 0, 23, 0, 47, 0, 37, 0, 6, 0, 3, 0, 17, 0, 30, 0, 38, 0, 43, 0, 6, 0, 3, 0, 15, 0, 12, 0, 11, 0, 3, 0, 26, 0, 35, 0, 28, 0, 18, 0, 17, 0, 29, 0, 12, 0, 3, 0, 20, 0, 3, 0, 34, 0, 22, 0, 6, 0, 3, 0, 31, 0, 32, 0, 30, 0, 41, 0, 12, 0, 22, 0, 46, 0, 33, 0, 44, 0, 3, 0, 13, 0, 18, 0, 21, 0, 3, 0, 16, 0, 30, 0, 27, 0, 42, 0, 36, 0, 23, 0, 14, 0, 4, 0, 2]}
{"text": "Жебракують філософи при ґанку церкви в Гадячі, ще й шатро їхнє п’яне знаємо.", "phonemes": ["ж", "е", "б", "р", "а", "к", "у", "ю", "т", "ь", " ", "ф", "і", "л", "о", "с", "о", "ф", "и", " ", "п", "р", "и", " ", "ґ", "а", "н", "к", "у", " ", "ц", "е", "р", "к", "в", "и", " ", "в", " ", "г", "а", "д", "я", "ч", "і", ",", " ", "щ", "е", " ", "и", "̆", " ", "ш", "а", "т", "р", "о", " ", "і", "̈", "х", "н", "є", " ", "п", "’", "я", "н", "е", " ", "з", "н", "а", "є", "м", "о", "."], "phoneme_ids": [1, 0, 20, 0, 18, 0, 13, 0, 32, 0, 12, 0, 26, 0, 35, 0, 43, 0, 34, 0, 42, 0, 3, 0, 36, 0, 23, 0, 27, 0, 30, 0, 33, 0, 30, 0, 36, 0, 22, 0, 3, 0, 31, 0, 32, 0, 22, 0, 3, 0, 16, 0, 12, 0, 29, 0, 26, 0, 35, 0, 3, 0, 38, 0, 18, 0, 32, 0, 26, 0, 14, 0, 22, 0, 3, 0, 14, 0, 3, 0, 15, 0, 12, 0, 17, 0, 44, 0, 39, 0, 23, 0, 6, 0, 3, 0, 41, 0, 18, 0, 3, 0, 22, 0, 46, 0, 3, 0, 40, 0, 12, 0, 34, 0, 32, 0, 30, 0, 3, 0, 23, 0, 47, 0, 37, 0, 29, 0, 19, 0, 3, 0, 31, 0, 44, 0, 29, 0, 18, 0, 3, 0, 21, 0, 29, 0, 12, 0, 19, 0, 28, 0, 30, 0, 8, 0, 2]}
+7
View File
@@ -0,0 +1,7 @@
Весе́лка, також ра́йдуга оптичне явище в атмосфері, що являє собою одну, дві чи декілька різнокольорових дуг ,або кіл, якщо дивитися з повітря, що спостерігаються на тлі хмари, якщо вона розташована проти Сонця.
Червоний колір ми бачимо з зовнішнього боку первинної веселки, а фіолетовий — із внутрішнього.
Веселка пов'язана з заломленням і відбиттям ,деякою мірою і з дифракцією, сонячного світла у водяних краплях, зважених у повітрі.
Ці крапельки по-різному відхиляють світло різних кольорів, у результаті чого біле світло розкладається на спектр.
Спостерігач, що стоїть спиною до джерела світла, бачить різнобарвне світіння, що виходить із простору по концентричному колу ,дузі.
Чуєш їх, доцю, га? Кумедна ж ти, прощайся без ґольфів!
Жебракують філософи при ґанку церкви в Гадячі, ще й шатро їхнє п’яне знаємо.
+45 -3
View File
@@ -300,18 +300,55 @@ def phonemes_to_ids(
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("language")
parser.add_argument(
"--phoneme-type",
choices=list(PhonemeType),
default=PhonemeType.ESPEAK,
help="Type of phonemes to use (default: espeak)",
)
parser.add_argument(
"--text-casing",
choices=("ignore", "lower", "upper", "casefold"),
default="ignore",
help="Casing applied to utterance text",
)
args = parser.parse_args()
phonemizer = Phonemizer(args.language)
phonemizer: Optional[Phonemizer] = None
if args.text_casing == "lower":
casing = str.lower
elif args.text_casing == "upper":
casing = str.upper
else:
# ignore
casing = lambda s: s
if args.phoneme_type == PhonemeType.TEXT:
# Use text directly
phoneme_id_map = ALPHABETS[args.language]
else:
# Use eSpeak
phonemizer = Phonemizer(args.language)
phoneme_id_map = DEFAULT_PHONEME_ID_MAP
phoneme_map = PHONEME_MAPS.get(args.language)
missing_phonemes: "Counter[str]" = Counter()
for line in sys.stdin:
line = line.strip()
if not line:
continue
phonemes = phonemize(line, phonemizer, phoneme_map=phoneme_map)
phoneme_ids = phonemes_to_ids(phonemes)
if args.phoneme_type == PhonemeType.TEXT:
phonemes = list(unicodedata.normalize("NFD", casing(line)))
else:
assert phonemizer is not None
phonemes = phonemize(line, phonemizer, phoneme_map=phoneme_map)
phoneme_ids = phonemes_to_ids(
phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes
)
json.dump(
{
"text": line,
@@ -323,6 +360,11 @@ def main() -> None:
)
print("")
if missing_phonemes:
print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr)
for phoneme, count in missing_phonemes.most_common():
print(phoneme, count, file=sys.stderr)
if __name__ == "__main__":
main()
+5 -6
View File
@@ -303,12 +303,11 @@ def phonemize_batch_text(
try:
_LOGGER.debug(utt)
utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
utt.phoneme_ids = []
for phoneme in utt.phonemes:
if phoneme in alphabet:
utt.phoneme_ids.extend(alphabet[phoneme])
else:
utt.missing_phonemes[phoneme] += 1
utt.phoneme_ids = phonemes_to_ids(
utt.phonemes,
phoneme_id_map=alphabet,
missing_phonemes=utt.missing_phonemes,
)
if not args.skip_audio:
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
utt.audio_path,