mirror of
https://github.com/pstrueb/piper.git
synced 2026-06-02 01:47:02 +00:00
Merge branch 'master' into master
This commit is contained in:
+1
-1
@@ -33,7 +33,7 @@ RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSIO
|
||||
RUN mkdir -p "lib/Linux-$(uname -m)"
|
||||
|
||||
# Use pre-compiled Piper phonemization library (includes onnxruntime)
|
||||
ARG PIPER_PHONEMIZE_VERSION='1.0.0'
|
||||
ARG PIPER_PHONEMIZE_VERSION='1.1.0'
|
||||
RUN mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
|
||||
curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v${PIPER_PHONEMIZE_VERSION}/libpiper_phonemize-${TARGETARCH}${TARGETVARIANT}.tar.gz" | \
|
||||
tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf -
|
||||
|
||||
@@ -32,14 +32,18 @@ Our goal is to support Home Assistant and the [Year of Voice](https://www.home-a
|
||||
* Italian (it_IT)
|
||||
* Georgian (ka_GE)
|
||||
* Kazakh (kk_KZ)
|
||||
* Luxembourgish (lb_LU)
|
||||
* Nepali (ne_NP)
|
||||
* Dutch (nl_BE, nl_NL)
|
||||
* Norwegian (no_NO)
|
||||
* Polish (pl_PL)
|
||||
* Portuguese (pt_BR)
|
||||
* Romanian (ro_RO)
|
||||
* Russian (ru_RU)
|
||||
* Serbian (sr_RS)
|
||||
* Swedish (sv_SE)
|
||||
* Swahili (sw_CD)
|
||||
* Turkish (tr_TR)
|
||||
* Ukrainian (uk_UA)
|
||||
* Vietnamese (vi_VN)
|
||||
* Chinese (zh_CN)
|
||||
@@ -56,9 +60,9 @@ The `MODEL_CARD` file for each voice contains important licensing information. P
|
||||
|
||||
You can [run Piper with Python](#running-in-python) or download a binary release:
|
||||
|
||||
* [amd64](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_amd64.tar.gz) (64-bit desktop Linux)
|
||||
* [arm64](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_arm64.tar.gz) (64-bit Raspberry Pi 4)
|
||||
* [armv7](https://github.com/rhasspy/piper/releases/download/v1.1.0/piper_armv7.tar.gz) (32-bit Raspberry Pi 3/4)
|
||||
* [amd64](https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_amd64.tar.gz) (64-bit desktop Linux)
|
||||
* [arm64](https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_arm64.tar.gz) (64-bit Raspberry Pi 4)
|
||||
* [armv7](https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_armv7.tar.gz) (32-bit Raspberry Pi 3/4)
|
||||
|
||||
If you want to build from source, see the [Makefile](Makefile) and [C++ source](src/cpp).
|
||||
You must download and extract [piper-phonemize](https://github.com/rhasspy/piper-phonemize) to `lib/Linux-$(uname -m)/piper_phonemize` before building.
|
||||
@@ -81,6 +85,17 @@ For multi-speaker models, use `--speaker <number>` to change speakers (default:
|
||||
|
||||
See `piper --help` for more options.
|
||||
|
||||
### Streaming Audio
|
||||
|
||||
Piper can stream raw audio to stdout as it's produced:
|
||||
|
||||
``` sh
|
||||
echo 'This sentence is spoken first. This sentence is synthesized while the first sentence is spoken.' | \
|
||||
./piper --model en_US-lessac-medium.onnx --output-raw | \
|
||||
aplay -r 22050 -f S16_LE -t raw -
|
||||
```
|
||||
|
||||
This is **raw** audio and not a WAV file, so make sure your audio player is set to play 16-bit mono PCM samples at the correct sample rate for the voice.
|
||||
|
||||
### JSON Input
|
||||
|
||||
|
||||
+3
-1
@@ -1,5 +1,7 @@
|
||||
# Training Guide
|
||||
|
||||
Check out a [video training guide by Thorsten Müller](https://www.youtube.com/watch?v=b_we_jma220)
|
||||
|
||||
Training a voice for Piper involves 3 main steps:
|
||||
|
||||
1. Preparing the dataset
|
||||
@@ -32,7 +34,7 @@ python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip3 install --upgrade pip
|
||||
pip3 install --upgrade wheel setuptools
|
||||
pip3 install -r requirements.txt
|
||||
pip3 install -e .
|
||||
```
|
||||
|
||||
Run the `build_monotonic_align.sh` script in the `src/python` directory to build the extension.
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
Et freet mech, Iech kennen ze léieren.
|
||||
Schwätzt wannechgelift méi lues.
|
||||
Vill Gléck fir däi Gebuertsdag.
|
||||
Mäi Loftkësseboot ass voller Éilen.
|
||||
Schwätz du Lëtzebuergesch?
|
||||
E gudde Rutsch an d'neit Joer.
|
||||
@@ -0,0 +1,4 @@
|
||||
Curcubeul este un fenomen optic și meteorologic atmosferic care se manifestă prin apariția pe cer a unui spectru de forma unui arc colorat atunci când lumina soarelui se refractă în picăturile de apă din atmosferă.
|
||||
De cele mai multe ori curcubeul se observă după ploaie, când soarele este apropiat de orizont.
|
||||
În condiții bune de lumină, în fața peretelui de ploaie, un curcubeu secundar este vizibil deasupra curcubeului principal.
|
||||
Acesta este mai slab din cauza dublei reflexii a luminii în picăturile de apă și are o secvență de culori opusă.
|
||||
@@ -0,0 +1,8 @@
|
||||
Дуга је оптичка и метеоролошка појава који се појављује на небу, када се сунчеви зраци преламају кроз ситне водене капи, најчешће након кише.
|
||||
Дуга се обично види на застору кишних капи када посматрач стоји окренут леђима Сунцу и гледа у смеру тога застора.
|
||||
Зраци светлости се тада разлажу на своје основне компоненте, стварајући оптичку представу у виду траке различитих боја, што у ствари представља спектар светлости.
|
||||
Унутрашња-примарна дуга настаје када се сунчев зрак једном преломи са полеђине капљице.
|
||||
Плава светлост се прелама под већим углом него црвена светлост, али због рефлексије са полеђине капи, плава светлост излази под мањим углом од црвене.
|
||||
Зато је плава боја са унутрашње стране, а црвена са спољашње стране примарне дуге.
|
||||
Спољашња-секундарна дуга настаје када се сунчев зрак двоструко преломи са полеђине капљице.
|
||||
Плава светлост се прелама под већим углом па је стога она са спољашње стране, а црвена са унутрашње стране секундарне дуге.
|
||||
@@ -0,0 +1,6 @@
|
||||
{"phoneme_ids":[1,0,120,0,18,0,122,0,32,0,3,0,19,0,93,0,120,0,18,0,122,0,32,0,3,0,25,0,120,0,59,0,55,0,8,0,3,0,120,0,21,0,59,0,55,0,3,0,23,0,120,0,39,0,26,0,59,0,26,0,3,0,155,0,120,0,59,0,3,0,24,0,62,0,74,0,120,0,59,0,93,0,59,0,26,0,10,0,2],"phonemes":["ˈ","e","ː","t"," ","f","ʀ","ˈ","e","ː","t"," ","m","ˈ","ə","ɕ",","," ","ˈ","i","ə","ɕ"," ","k","ˈ","æ","n","ə","n"," ","ʦ","ˈ","ə"," ","l","ɜ","ɪ","ˈ","ə","ʀ","ə","n","."],"processed_text":"Et freet mech, Iech kennen ze léieren.","text":"Et freet mech, Iech kennen ze léieren."}
|
||||
{"phoneme_ids":[1,0,96,0,34,0,120,0,18,0,32,0,155,0,32,0,3,0,34,0,121,0,51,0,26,0,39,0,55,0,154,0,120,0,59,0,24,0,21,0,19,0,32,0,3,0,25,0,120,0,62,0,74,0,3,0,24,0,120,0,33,0,59,0,31,0,10,0,2],"phonemes":["ʃ","v","ˈ","e","t","ʦ","t"," ","v","ˌ","ɑ","n","æ","ɕ","g","ˈ","ə","l","i","f","t"," ","m","ˈ","ɜ","ɪ"," ","l","ˈ","u","ə","s","."],"processed_text":"Schwätzt wannechgelift méi lues.","text":"Schwätzt wannechgelift méi lues."}
|
||||
{"phoneme_ids":[1,0,19,0,120,0,21,0,24,0,3,0,154,0,24,0,120,0,18,0,23,0,3,0,19,0,120,0,21,0,122,0,94,0,3,0,17,0,120,0,39,0,122,0,74,0,3,0,154,0,59,0,15,0,120,0,33,0,122,0,94,0,32,0,31,0,17,0,14,0,122,0,156,0,10,0,2],"phonemes":["f","ˈ","i","l"," ","g","l","ˈ","e","k"," ","f","ˈ","i","ː","ʁ"," ","d","ˈ","æ","ː","ɪ"," ","g","ə","b","ˈ","u","ː","ʁ","t","s","d","a","ː","X","."],"processed_text":"Vill Gléck fir däi Gebuertsdag.","text":"Vill Gléck fir däi Gebuertsdag."}
|
||||
{"phoneme_ids":[1,0,25,0,120,0,39,0,122,0,74,0,3,0,24,0,121,0,27,0,19,0,32,0,23,0,59,0,31,0,120,0,18,0,122,0,15,0,27,0,122,0,32,0,3,0,120,0,51,0,31,0,3,0,34,0,120,0,27,0,24,0,18,0,122,0,93,0,3,0,120,0,62,0,74,0,24,0,59,0,26,0,10,0,2],"phonemes":["m","ˈ","æ","ː","ɪ"," ","l","ˌ","o","f","t","k","ə","s","ˈ","e","ː","b","o","ː","t"," ","ˈ","ɑ","s"," ","v","ˈ","o","l","e","ː","ʀ"," ","ˈ","ɜ","ɪ","l","ə","n","."],"processed_text":"Mäi Loftkësseboot ass voller Éilen.","text":"Mäi Loftkësseboot ass voller Éilen."}
|
||||
{"phoneme_ids":[1,0,96,0,34,0,120,0,18,0,32,0,155,0,3,0,17,0,120,0,33,0,122,0,3,0,24,0,121,0,59,0,155,0,59,0,15,0,120,0,33,0,122,0,94,0,22,0,59,0,96,0,13,0,2],"phonemes":["ʃ","v","ˈ","e","t","ʦ"," ","d","ˈ","u","ː"," ","l","ˌ","ə","ʦ","ə","b","ˈ","u","ː","ʁ","j","ə","ʃ","?"],"processed_text":"Schwätz du Lëtzebuergesch?","text":"Schwätz du Lëtzebuergesch?"}
|
||||
{"phoneme_ids":[1,0,120,0,59,0,3,0,154,0,120,0,33,0,17,0,59,0,3,0,93,0,120,0,33,0,32,0,96,0,3,0,120,0,51,0,26,0,3,0,17,0,26,0,120,0,51,0,74,0,32,0,3,0,22,0,120,0,27,0,122,0,94,0,10,0,2],"phonemes":["ˈ","ə"," ","g","ˈ","u","d","ə"," ","ʀ","ˈ","u","t","ʃ"," ","ˈ","ɑ","n"," ","d","n","ˈ","ɑ","ɪ","t"," ","j","ˈ","o","ː","ʁ","."],"processed_text":"E gudde Rutsch an d'neit Joer.","text":"E gudde Rutsch an d'neit Joer."}
|
||||
@@ -0,0 +1,4 @@
|
||||
{"phoneme_ids":[1,0,23,0,121,0,33,0,30,0,23,0,33,0,15,0,120,0,18,0,33,0,24,0,3,0,22,0,121,0,18,0,31,0,32,0,18,0,3,0,33,0,26,0,3,0,19,0,121,0,18,0,26,0,27,0,25,0,120,0,18,0,26,0,3,0,120,0,27,0,28,0,32,0,21,0,23,0,3,0,96,0,21,0,3,0,25,0,121,0,18,0,32,0,18,0,27,0,92,0,27,0,24,0,120,0,27,0,17,0,108,0,21,0,23,0,3,0,121,0,14,0,32,0,25,0,27,0,31,0,19,0,120,0,18,0,92,0,21,0,23,0,3,0,23,0,121,0,14,0,92,0,18,0,3,0,31,0,18,0,3,0,25,0,121,0,14,0,26,0,21,0,19,0,120,0,18,0,31,0,32,0,59,0,3,0,28,0,30,0,21,0,26,0,3,0,121,0,14,0,28,0,14,0,92,0,120,0,21,0,32,0,31,0,22,0,14,0,3,0,28,0,18,0,3,0,32,0,96,0,120,0,18,0,30,0,3,0,14,0,3,0,121,0,33,0,26,0,33,0,121,0,21,0,3,0,31,0,28,0,120,0,18,0,23,0,32,0,30,0,33,0,3,0,17,0,18,0,3,0,19,0,120,0,27,0,30,0,25,0,14,0,3,0,121,0,33,0,26,0,33,0,74,0,3,0,120,0,14,0,30,0,23,0,3,0,23,0,121,0,27,0,24,0,27,0,92,0,120,0,14,0,32,0,3,0,14,0,32,0,120,0,33,0,26,0,32,0,96,0,119,0,3,0,23,0,73,0,26,0,17,0,3,0,24,0,33,0,25,0,120,0,21,0,26,0,14,0,3,0,31,0,54,0,14,0,92,0,120,0,18,0,24,0,33,0,74,0,3,0,31,0,18,0,3,0,30,0,18,0,19,0,30,0,120,0,14,0,23,0,32,0,59,0,3,0,73,0,26,0,3,0,28,0,121,0,21,0,23,0,59,0,32,0,120,0,33,0,30,0,21,0,24,0,18,0,3,0,17,0,18,0,3,0,120,0,14,0,28,0,59,0,3,0,17,0,21,0,26,0,3,0,14,0,32,0,25,0,120,0,27,0,31,0,19,0,18,0,92,0,121,0,59,0,10,0,2],"phonemes":["k","ˌ","u","r","k","u","b","ˈ","e","u","l"," ","j","ˌ","e","s","t","e"," ","u","n"," ","f","ˌ","e","n","o","m","ˈ","e","n"," ","ˈ","o","p","t","i","k"," ","ʃ","i"," ","m","ˌ","e","t","e","o","ɾ","o","l","ˈ","o","d","ʒ","i","k"," ","ˌ","a","t","m","o","s","f","ˈ","e","ɾ","i","k"," ","k","ˌ","a","ɾ","e"," ","s","e"," ","m","ˌ","a","n","i","f","ˈ","e","s","t","ə"," ","p","r","i","n"," ","ˌ","a","p","a","ɾ","ˈ","i","t","s","j","a"," ","p","e"," ","t","ʃ","ˈ","e","r"," ","a"," ","ˌ","u","n","u","ˌ","i"," ","s","p","ˈ","e","k","t","r","u"," ","d","e"," ","f","ˈ","o","r","m","a"," ","ˌ","u","n","u","ɪ"," ","ˈ","a","r","k"," ","k","ˌ","o","l","o","ɾ","ˈ","a","t"," ","a","t","ˈ","u","n","t","ʃ","ʲ"," 
","k","ɨ","n","d"," ","l","u","m","ˈ","i","n","a"," ","s","ɔ","a","ɾ","ˈ","e","l","u","ɪ"," ","s","e"," ","r","e","f","r","ˈ","a","k","t","ə"," ","ɨ","n"," ","p","ˌ","i","k","ə","t","ˈ","u","r","i","l","e"," ","d","e"," ","ˈ","a","p","ə"," ","d","i","n"," ","a","t","m","ˈ","o","s","f","e","ɾ","ˌ","ə","."],"processed_text":"Curcubeul este un fenomen optic și meteorologic atmosferic care se manifestă prin apariția pe cer a unui spectru de forma unui arc colorat atunci când lumina soarelui se refractă în picăturile de apă din atmosferă.","text":"Curcubeul este un fenomen optic și meteorologic atmosferic care se manifestă prin apariția pe cer a unui spectru de forma unui arc colorat atunci când lumina soarelui se refractă în picăturile de apă din atmosferă."}
|
||||
{"phoneme_ids":[1,0,17,0,18,0,3,0,32,0,96,0,18,0,24,0,18,0,3,0,25,0,14,0,74,0,3,0,25,0,120,0,33,0,24,0,32,0,18,0,3,0,121,0,27,0,92,0,119,0,119,0,3,0,23,0,121,0,33,0,30,0,23,0,33,0,15,0,120,0,18,0,33,0,24,0,3,0,31,0,18,0,3,0,27,0,15,0,31,0,120,0,18,0,30,0,34,0,59,0,3,0,17,0,120,0,33,0,28,0,59,0,3,0,28,0,24,0,120,0,54,0,14,0,22,0,18,0,8,0,3,0,23,0,73,0,26,0,17,0,3,0,31,0,54,0,14,0,92,0,120,0,18,0,24,0,18,0,3,0,22,0,121,0,18,0,31,0,32,0,18,0,3,0,121,0,14,0,28,0,30,0,27,0,28,0,22,0,120,0,14,0,32,0,3,0,17,0,18,0,3,0,121,0,27,0,92,0,21,0,38,0,120,0,27,0,26,0,32,0,10,0,2],"phonemes":["d","e"," ","t","ʃ","e","l","e"," ","m","a","ɪ"," ","m","ˈ","u","l","t","e"," ","ˌ","o","ɾ","ʲ","ʲ"," ","k","ˌ","u","r","k","u","b","ˈ","e","u","l"," ","s","e"," ","o","b","s","ˈ","e","r","v","ə"," ","d","ˈ","u","p","ə"," ","p","l","ˈ","ɔ","a","j","e",","," ","k","ɨ","n","d"," ","s","ɔ","a","ɾ","ˈ","e","l","e"," ","j","ˌ","e","s","t","e"," ","ˌ","a","p","r","o","p","j","ˈ","a","t"," ","d","e"," ","ˌ","o","ɾ","i","z","ˈ","o","n","t","."],"processed_text":"De cele mai multe ori curcubeul se observă după ploaie, când soarele este apropiat de orizont.","text":"De cele mai multe ori curcubeul se observă după ploaie, când soarele este apropiat de orizont."}
|
||||
{"phoneme_ids":[1,0,73,0,26,0,3,0,23,0,27,0,26,0,17,0,120,0,21,0,32,0,31,0,21,0,74,0,3,0,15,0,120,0,33,0,26,0,18,0,3,0,17,0,18,0,3,0,24,0,33,0,25,0,120,0,21,0,26,0,59,0,8,0,3,0,73,0,26,0,3,0,19,0,120,0,14,0,32,0,31,0,14,0,3,0,28,0,121,0,18,0,92,0,18,0,32,0,120,0,18,0,24,0,33,0,74,0,3,0,17,0,18,0,3,0,28,0,24,0,120,0,54,0,14,0,22,0,18,0,8,0,3,0,33,0,26,0,3,0,23,0,121,0,33,0,30,0,23,0,33,0,15,0,120,0,18,0,100,0,3,0,31,0,121,0,18,0,23,0,33,0,26,0,17,0,120,0,14,0,30,0,3,0,22,0,121,0,18,0,31,0,32,0,18,0,3,0,34,0,21,0,38,0,120,0,21,0,15,0,21,0,24,0,3,0,17,0,18,0,14,0,31,0,120,0,33,0,28,0,30,0,14,0,3,0,23,0,121,0,33,0,30,0,23,0,33,0,15,0,120,0,18,0,33,0,24,0,33,0,74,0,3,0,28,0,30,0,121,0,21,0,26,0,32,0,96,0,21,0,28,0,120,0,14,0,24,0,10,0,2],"phonemes":["ɨ","n"," ","k","o","n","d","ˈ","i","t","s","i","ɪ"," ","b","ˈ","u","n","e"," ","d","e"," ","l","u","m","ˈ","i","n","ə",","," ","ɨ","n"," ","f","ˈ","a","t","s","a"," ","p","ˌ","e","ɾ","e","t","ˈ","e","l","u","ɪ"," ","d","e"," ","p","l","ˈ","ɔ","a","j","e",","," ","u","n"," ","k","ˌ","u","r","k","u","b","ˈ","e","ʊ"," ","s","ˌ","e","k","u","n","d","ˈ","a","r"," ","j","ˌ","e","s","t","e"," ","v","i","z","ˈ","i","b","i","l"," ","d","e","a","s","ˈ","u","p","r","a"," ","k","ˌ","u","r","k","u","b","ˈ","e","u","l","u","ɪ"," ","p","r","ˌ","i","n","t","ʃ","i","p","ˈ","a","l","."],"processed_text":"În condiții bune de lumină, în fața peretelui de ploaie, un curcubeu secundar este vizibil deasupra curcubeului principal.","text":"În condiții bune de lumină, în fața peretelui de ploaie, un curcubeu secundar este vizibil deasupra curcubeului principal."}
|
||||
{"phoneme_ids":[1,0,14,0,32,0,96,0,121,0,18,0,31,0,32,0,14,0,3,0,22,0,121,0,18,0,31,0,32,0,18,0,3,0,25,0,14,0,74,0,3,0,31,0,24,0,120,0,14,0,15,0,3,0,17,0,21,0,26,0,3,0,23,0,14,0,120,0,33,0,38,0,14,0,3,0,17,0,120,0,33,0,15,0,24,0,18,0,74,0,3,0,30,0,18,0,19,0,24,0,120,0,18,0,23,0,31,0,21,0,74,0,3,0,14,0,3,0,24,0,33,0,25,0,120,0,21,0,26,0,21,0,74,0,3,0,73,0,26,0,3,0,28,0,121,0,21,0,23,0,59,0,32,0,120,0,33,0,30,0,21,0,24,0,18,0,3,0,17,0,18,0,3,0,120,0,14,0,28,0,59,0,3,0,96,0,21,0,3,0,121,0,14,0,92,0,18,0,3,0,27,0,3,0,31,0,18,0,23,0,34,0,120,0,18,0,26,0,32,0,31,0,59,0,3,0,17,0,18,0,3,0,23,0,33,0,24,0,120,0,27,0,92,0,119,0,119,0,3,0,27,0,28,0,120,0,33,0,31,0,59,0,10,0,2],"phonemes":["a","t","ʃ","ˌ","e","s","t","a"," ","j","ˌ","e","s","t","e"," ","m","a","ɪ"," ","s","l","ˈ","a","b"," ","d","i","n"," ","k","a","ˈ","u","z","a"," ","d","ˈ","u","b","l","e","ɪ"," ","r","e","f","l","ˈ","e","k","s","i","ɪ"," ","a"," ","l","u","m","ˈ","i","n","i","ɪ"," ","ɨ","n"," ","p","ˌ","i","k","ə","t","ˈ","u","r","i","l","e"," ","d","e"," ","ˈ","a","p","ə"," ","ʃ","i"," ","ˌ","a","ɾ","e"," ","o"," ","s","e","k","v","ˈ","e","n","t","s","ə"," ","d","e"," ","k","u","l","ˈ","o","ɾ","ʲ","ʲ"," ","o","p","ˈ","u","s","ə","."],"processed_text":"Acesta este mai slab din cauza dublei reflexii a luminii în picăturile de apă și are o secvență de culori opusă.","text":"Acesta este mai slab din cauza dublei reflexii a luminii în picăturile de apă și are o secvență de culori opusă."}
|
||||
@@ -0,0 +1,8 @@
|
||||
{"phoneme_ids":[1,0,17,0,120,0,33,0,66,0,50,0,3,0,22,0,18,0,3,0,120,0,27,0,28,0,32,0,74,0,32,0,96,0,23,0,50,0,3,0,74,0,3,0,25,0,120,0,61,0,32,0,61,0,121,0,27,0,30,0,27,0,24,0,121,0,27,0,96,0,23,0,50,0,3,0,28,0,120,0,27,0,22,0,50,0,34,0,50,0,3,0,23,0,120,0,27,0,22,0,74,0,3,0,31,0,120,0,61,0,3,0,28,0,120,0,27,0,22,0,50,0,34,0,24,0,22,0,121,0,100,0,22,0,18,0,3,0,26,0,120,0,14,0,3,0,26,0,120,0,61,0,15,0,100,0,8,0,3,0,23,0,120,0,14,0,17,0,50,0,3,0,31,0,120,0,61,0,3,0,31,0,120,0,33,0,26,0,32,0,96,0,61,0,34,0,74,0,3,0,38,0,30,0,120,0,14,0,32,0,31,0,74,0,3,0,28,0,30,0,120,0,61,0,24,0,50,0,25,0,121,0,51,0,22,0,100,0,3,0,23,0,30,0,120,0,27,0,31,0,3,0,31,0,120,0,21,0,32,0,26,0,61,0,3,0,34,0,120,0,27,0,17,0,61,0,26,0,61,0,3,0,23,0,120,0,14,0,28,0,74,0,8,0,3,0,26,0,120,0,51,0,22,0,32,0,96,0,61,0,96,0,32,0,55,0,61,0,3,0,26,0,120,0,14,0,23,0,27,0,26,0,3,0,23,0,120,0,21,0,96,0,61,0,10,0,2],"phonemes":["d","ˈ","u","ɡ","ɐ"," ","j","e"," ","ˈ","o","p","t","ɪ","t","ʃ","k","ɐ"," ","ɪ"," ","m","ˈ","ɛ","t","ɛ","ˌ","o","r","o","l","ˌ","o","ʃ","k","ɐ"," ","p","ˈ","o","j","ɐ","v","ɐ"," ","k","ˈ","o","j","ɪ"," ","s","ˈ","ɛ"," ","p","ˈ","o","j","ɐ","v","l","j","ˌ","ʊ","j","e"," ","n","ˈ","a"," ","n","ˈ","ɛ","b","ʊ",","," ","k","ˈ","a","d","ɐ"," ","s","ˈ","ɛ"," ","s","ˈ","u","n","t","ʃ","ɛ","v","ɪ"," ","z","r","ˈ","a","t","s","ɪ"," ","p","r","ˈ","ɛ","l","ɐ","m","ˌ","ɑ","j","ʊ"," ","k","r","ˈ","o","s"," ","s","ˈ","i","t","n","ɛ"," ","v","ˈ","o","d","ɛ","n","ɛ"," ","k","ˈ","a","p","ɪ",","," ","n","ˈ","ɑ","j","t","ʃ","ɛ","ʃ","t","ɕ","ɛ"," ","n","ˈ","a","k","o","n"," ","k","ˈ","i","ʃ","ɛ","."],"processed_text":"Дуга је оптичка и метеоролошка појава који се појављује на небу, када се сунчеви зраци преламају кроз ситне водене капи, најчешће након кише.","text":"Дуга је оптичка и метеоролошка појава који се појављује на небу, када се сунчеви зраци преламају кроз ситне водене капи, најчешће након кише."}
|
||||
{"phoneme_ids":[1,0,17,0,120,0,33,0,66,0,50,0,3,0,31,0,120,0,61,0,3,0,120,0,27,0,15,0,74,0,32,0,96,0,26,0,27,0,3,0,34,0,120,0,21,0,17,0,74,0,3,0,26,0,120,0,14,0,3,0,38,0,120,0,14,0,31,0,32,0,27,0,30,0,100,0,3,0,23,0,120,0,21,0,96,0,26,0,74,0,20,0,3,0,23,0,120,0,14,0,28,0,74,0,3,0,23,0,120,0,14,0,17,0,50,0,3,0,28,0,120,0,27,0,31,0,25,0,50,0,32,0,30,0,50,0,32,0,96,0,3,0,31,0,32,0,120,0,27,0,22,0,74,0,3,0,120,0,27,0,23,0,30,0,61,0,26,0,100,0,32,0,3,0,24,0,120,0,61,0,17,0,107,0,74,0,25,0,50,0,3,0,31,0,120,0,33,0,26,0,32,0,31,0,100,0,3,0,74,0,3,0,66,0,24,0,120,0,61,0,17,0,50,0,3,0,100,0,3,0,31,0,25,0,120,0,61,0,30,0,100,0,3,0,32,0,120,0,27,0,66,0,50,0,3,0,38,0,120,0,14,0,31,0,32,0,27,0,30,0,50,0,10,0,2],"phonemes":["d","ˈ","u","ɡ","ɐ"," ","s","ˈ","ɛ"," ","ˈ","o","b","ɪ","t","ʃ","n","o"," ","v","ˈ","i","d","ɪ"," ","n","ˈ","a"," ","z","ˈ","a","s","t","o","r","ʊ"," ","k","ˈ","i","ʃ","n","ɪ","h"," ","k","ˈ","a","p","ɪ"," ","k","ˈ","a","d","ɐ"," ","p","ˈ","o","s","m","ɐ","t","r","ɐ","t","ʃ"," ","s","t","ˈ","o","j","ɪ"," ","ˈ","o","k","r","ɛ","n","ʊ","t"," ","l","ˈ","ɛ","d","ʑ","ɪ","m","ɐ"," ","s","ˈ","u","n","t","s","ʊ"," ","ɪ"," ","ɡ","l","ˈ","ɛ","d","ɐ"," ","ʊ"," ","s","m","ˈ","ɛ","r","ʊ"," ","t","ˈ","o","ɡ","ɐ"," ","z","ˈ","a","s","t","o","r","ɐ","."],"processed_text":"Дуга се обично види на застору кишних капи када посматрач стоји окренут леђима Сунцу и гледа у смеру тога застора.","text":"Дуга се обично види на застору кишних капи када посматрач стоји окренут леђима Сунцу и гледа у смеру тога застора."}
|
||||
{"phoneme_ids":[1,0,38,0,30,0,120,0,14,0,32,0,31,0,74,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,74,0,3,0,31,0,120,0,61,0,3,0,32,0,120,0,14,0,17,0,50,0,3,0,30,0,120,0,14,0,38,0,24,0,50,0,108,0,100,0,3,0,26,0,120,0,14,0,3,0,31,0,34,0,120,0,27,0,22,0,18,0,3,0,120,0,27,0,31,0,26,0,27,0,34,0,26,0,61,0,3,0,23,0,120,0,27,0,25,0,28,0,27,0,26,0,121,0,61,0,26,0,32,0,61,0,8,0,3,0,31,0,32,0,34,0,120,0,51,0,30,0,51,0,22,0,121,0,100,0,32,0,55,0,74,0,3,0,120,0,27,0,28,0,32,0,74,0,32,0,96,0,23,0,100,0,3,0,28,0,30,0,120,0,61,0,32,0,31,0,32,0,50,0,34,0,100,0,3,0,100,0,3,0,34,0,120,0,21,0,17,0,100,0,3,0,32,0,30,0,120,0,14,0,23,0,61,0,3,0,30,0,120,0,14,0,38,0,24,0,74,0,32,0,96,0,121,0,74,0,32,0,74,0,20,0,3,0,15,0,120,0,27,0,22,0,50,0,8,0,3,0,96,0,32,0,27,0,3,0,100,0,3,0,31,0,32,0,34,0,120,0,51,0,30,0,74,0,3,0,28,0,30,0,120,0,61,0,32,0,31,0,32,0,50,0,34,0,24,0,22,0,50,0,3,0,31,0,28,0,120,0,61,0,23,0,32,0,51,0,30,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,74,0,10,0,2],"phonemes":["z","r","ˈ","a","t","s","ɪ"," ","s","v","ˈ","ɛ","t","l","o","s","t","ɪ"," ","s","ˈ","ɛ"," ","t","ˈ","a","d","ɐ"," ","r","ˈ","a","z","l","ɐ","ʒ","ʊ"," ","n","ˈ","a"," ","s","v","ˈ","o","j","e"," ","ˈ","o","s","n","o","v","n","ɛ"," ","k","ˈ","o","m","p","o","n","ˌ","ɛ","n","t","ɛ",","," ","s","t","v","ˈ","ɑ","r","ɑ","j","ˌ","ʊ","t","ɕ","ɪ"," ","ˈ","o","p","t","ɪ","t","ʃ","k","ʊ"," ","p","r","ˈ","ɛ","t","s","t","ɐ","v","ʊ"," ","ʊ"," ","v","ˈ","i","d","ʊ"," ","t","r","ˈ","a","k","ɛ"," ","r","ˈ","a","z","l","ɪ","t","ʃ","ˌ","ɪ","t","ɪ","h"," ","b","ˈ","o","j","ɐ",","," ","ʃ","t","o"," ","ʊ"," ","s","t","v","ˈ","ɑ","r","ɪ"," ","p","r","ˈ","ɛ","t","s","t","ɐ","v","l","j","ɐ"," ","s","p","ˈ","ɛ","k","t","ɑ","r"," ","s","v","ˈ","ɛ","t","l","o","s","t","ɪ","."],"processed_text":"Зраци светлости се тада разлажу на своје основне компоненте, стварајући оптичку представу у виду траке различитих боја, што у ствари представља спектар светлости.","text":"Зраци светлости се тада разлажу на своје основне 
компоненте, стварајући оптичку представу у виду траке различитих боја, што у ствари представља спектар светлости."}
|
||||
{"phoneme_ids":[1,0,120,0,33,0,26,0,100,0,32,0,30,0,121,0,50,0,96,0,82,0,50,0,28,0,30,0,120,0,21,0,25,0,51,0,30,0,26,0,50,0,3,0,17,0,120,0,33,0,66,0,50,0,3,0,26,0,120,0,14,0,31,0,32,0,51,0,22,0,18,0,3,0,23,0,120,0,14,0,17,0,50,0,3,0,31,0,120,0,61,0,3,0,31,0,120,0,33,0,26,0,32,0,96,0,61,0,34,0,3,0,38,0,30,0,120,0,14,0,23,0,3,0,22,0,120,0,18,0,17,0,26,0,27,0,25,0,3,0,28,0,30,0,120,0,61,0,24,0,27,0,25,0,74,0,3,0,31,0,120,0,14,0,3,0,28,0,120,0,27,0,24,0,61,0,17,0,107,0,121,0,74,0,26,0,61,0,3,0,23,0,120,0,14,0,28,0,104,0,74,0,32,0,31,0,61,0,10,0,2],"phonemes":["ˈ","u","n","ʊ","t","r","ˌ","ɐ","ʃ","ɲ","ɐ","p","r","ˈ","i","m","ɑ","r","n","ɐ"," ","d","ˈ","u","ɡ","ɐ"," ","n","ˈ","a","s","t","ɑ","j","e"," ","k","ˈ","a","d","ɐ"," ","s","ˈ","ɛ"," ","s","ˈ","u","n","t","ʃ","ɛ","v"," ","z","r","ˈ","a","k"," ","j","ˈ","e","d","n","o","m"," ","p","r","ˈ","ɛ","l","o","m","ɪ"," ","s","ˈ","a"," ","p","ˈ","o","l","ɛ","d","ʑ","ˌ","ɪ","n","ɛ"," ","k","ˈ","a","p","ʎ","ɪ","t","s","ɛ","."],"processed_text":"Унутрашња-примарна дуга настаје када се сунчев зрак једном преломи са полеђине капљице.","text":"Унутрашња-примарна дуга настаје када се сунчев зрак једном преломи са полеђине капљице."}
|
||||
{"phoneme_ids":[1,0,28,0,24,0,120,0,14,0,34,0,50,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,3,0,31,0,120,0,61,0,3,0,28,0,30,0,120,0,61,0,24,0,50,0,25,0,50,0,3,0,28,0,120,0,27,0,17,0,3,0,34,0,120,0,61,0,32,0,55,0,74,0,25,0,3,0,120,0,33,0,66,0,24,0,27,0,25,0,3,0,26,0,120,0,61,0,66,0,27,0,3,0,32,0,31,0,30,0,34,0,120,0,61,0,26,0,50,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,8,0,3,0,120,0,14,0,24,0,74,0,3,0,38,0,15,0,120,0,27,0,66,0,3,0,30,0,120,0,61,0,19,0,24,0,61,0,23,0,31,0,121,0,74,0,22,0,18,0,3,0,31,0,120,0,14,0,3,0,28,0,120,0,27,0,24,0,61,0,17,0,107,0,121,0,74,0,26,0,61,0,3,0,23,0,120,0,14,0,28,0,74,0,8,0,3,0,28,0,24,0,120,0,14,0,34,0,50,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,3,0,120,0,21,0,38,0,24,0,50,0,38,0,74,0,3,0,28,0,120,0,27,0,17,0,3,0,25,0,120,0,14,0,82,0,74,0,25,0,3,0,120,0,33,0,66,0,24,0,27,0,25,0,3,0,120,0,27,0,32,0,3,0,32,0,31,0,30,0,34,0,120,0,61,0,26,0,61,0,10,0,2],"phonemes":["p","l","ˈ","a","v","ɐ"," ","s","v","ˈ","ɛ","t","l","o","s","t"," ","s","ˈ","ɛ"," ","p","r","ˈ","ɛ","l","ɐ","m","ɐ"," ","p","ˈ","o","d"," ","v","ˈ","ɛ","t","ɕ","ɪ","m"," ","ˈ","u","ɡ","l","o","m"," ","n","ˈ","ɛ","ɡ","o"," ","t","s","r","v","ˈ","ɛ","n","ɐ"," ","s","v","ˈ","ɛ","t","l","o","s","t",","," ","ˈ","a","l","ɪ"," ","z","b","ˈ","o","ɡ"," ","r","ˈ","ɛ","f","l","ɛ","k","s","ˌ","ɪ","j","e"," ","s","ˈ","a"," ","p","ˈ","o","l","ɛ","d","ʑ","ˌ","ɪ","n","ɛ"," ","k","ˈ","a","p","ɪ",","," ","p","l","ˈ","a","v","ɐ"," ","s","v","ˈ","ɛ","t","l","o","s","t"," ","ˈ","i","z","l","ɐ","z","ɪ"," ","p","ˈ","o","d"," ","m","ˈ","a","ɲ","ɪ","m"," ","ˈ","u","ɡ","l","o","m"," ","ˈ","o","t"," ","t","s","r","v","ˈ","ɛ","n","ɛ","."],"processed_text":"Плава светлост се прелама под већим углом него црвена светлост, али због рефлексије са полеђине капи, плава светлост излази под мањим углом од црвене.","text":"Плава светлост се прелама под већим углом него црвена светлост, али због рефлексије са полеђине капи, плава светлост излази под мањим углом од црвене."}
|
||||
{"phoneme_ids":[1,0,38,0,120,0,14,0,32,0,27,0,3,0,22,0,18,0,3,0,28,0,24,0,120,0,14,0,34,0,50,0,3,0,15,0,120,0,27,0,22,0,50,0,3,0,31,0,120,0,14,0,3,0,120,0,33,0,26,0,100,0,32,0,30,0,121,0,50,0,96,0,82,0,18,0,3,0,31,0,32,0,30,0,120,0,14,0,26,0,61,0,8,0,3,0,50,0,3,0,32,0,31,0,30,0,34,0,120,0,61,0,26,0,50,0,3,0,31,0,120,0,14,0,3,0,31,0,28,0,120,0,27,0,104,0,50,0,96,0,82,0,18,0,3,0,31,0,32,0,30,0,120,0,14,0,26,0,61,0,3,0,28,0,30,0,120,0,21,0,25,0,51,0,30,0,26,0,61,0,3,0,17,0,120,0,33,0,66,0,61,0,10,0,2],"phonemes":["z","ˈ","a","t","o"," ","j","e"," ","p","l","ˈ","a","v","ɐ"," ","b","ˈ","o","j","ɐ"," ","s","ˈ","a"," ","ˈ","u","n","ʊ","t","r","ˌ","ɐ","ʃ","ɲ","e"," ","s","t","r","ˈ","a","n","ɛ",","," ","ɐ"," ","t","s","r","v","ˈ","ɛ","n","ɐ"," ","s","ˈ","a"," ","s","p","ˈ","o","ʎ","ɐ","ʃ","ɲ","e"," ","s","t","r","ˈ","a","n","ɛ"," ","p","r","ˈ","i","m","ɑ","r","n","ɛ"," ","d","ˈ","u","ɡ","ɛ","."],"processed_text":"Зато је плава боја са унутрашње стране, а црвена са спољашње стране примарне дуге.","text":"Зато је плава боја са унутрашње стране, а црвена са спољашње стране примарне дуге."}
|
||||
{"phoneme_ids":[1,0,31,0,28,0,120,0,27,0,104,0,50,0,96,0,82,0,50,0,31,0,120,0,61,0,23,0,100,0,26,0,17,0,121,0,51,0,30,0,26,0,50,0,3,0,17,0,120,0,33,0,66,0,50,0,3,0,26,0,120,0,14,0,31,0,32,0,51,0,22,0,18,0,3,0,23,0,120,0,14,0,17,0,50,0,3,0,31,0,120,0,61,0,3,0,31,0,120,0,33,0,26,0,32,0,96,0,61,0,34,0,3,0,38,0,30,0,120,0,14,0,23,0,3,0,17,0,34,0,120,0,27,0,31,0,32,0,30,0,100,0,23,0,27,0,3,0,28,0,30,0,120,0,61,0,24,0,27,0,25,0,74,0,3,0,31,0,120,0,14,0,3,0,28,0,120,0,27,0,24,0,61,0,17,0,107,0,121,0,74,0,26,0,61,0,3,0,23,0,120,0,14,0,28,0,104,0,74,0,32,0,31,0,61,0,10,0,2],"phonemes":["s","p","ˈ","o","ʎ","ɐ","ʃ","ɲ","ɐ","s","ˈ","ɛ","k","ʊ","n","d","ˌ","ɑ","r","n","ɐ"," ","d","ˈ","u","ɡ","ɐ"," ","n","ˈ","a","s","t","ɑ","j","e"," ","k","ˈ","a","d","ɐ"," ","s","ˈ","ɛ"," ","s","ˈ","u","n","t","ʃ","ɛ","v"," ","z","r","ˈ","a","k"," ","d","v","ˈ","o","s","t","r","ʊ","k","o"," ","p","r","ˈ","ɛ","l","o","m","ɪ"," ","s","ˈ","a"," ","p","ˈ","o","l","ɛ","d","ʑ","ˌ","ɪ","n","ɛ"," ","k","ˈ","a","p","ʎ","ɪ","t","s","ɛ","."],"processed_text":"Спољашња-секундарна дуга настаје када се сунчев зрак двоструко преломи са полеђине капљице.","text":"Спољашња-секундарна дуга настаје када се сунчев зрак двоструко преломи са полеђине капљице."}
|
||||
{"phoneme_ids":[1,0,28,0,24,0,120,0,14,0,34,0,50,0,3,0,31,0,34,0,120,0,61,0,32,0,24,0,27,0,31,0,32,0,3,0,31,0,120,0,61,0,3,0,28,0,30,0,120,0,61,0,24,0,50,0,25,0,50,0,3,0,28,0,120,0,27,0,17,0,3,0,34,0,120,0,61,0,32,0,55,0,74,0,25,0,3,0,120,0,33,0,66,0,24,0,27,0,25,0,3,0,28,0,120,0,14,0,3,0,22,0,18,0,3,0,31,0,32,0,120,0,27,0,66,0,50,0,3,0,120,0,27,0,26,0,50,0,3,0,31,0,120,0,14,0,3,0,31,0,28,0,120,0,27,0,104,0,50,0,96,0,82,0,18,0,3,0,31,0,32,0,30,0,120,0,14,0,26,0,61,0,8,0,3,0,50,0,3,0,32,0,31,0,30,0,34,0,120,0,61,0,26,0,50,0,3,0,31,0,120,0,14,0,3,0,120,0,33,0,26,0,100,0,32,0,30,0,121,0,50,0,96,0,82,0,18,0,3,0,31,0,32,0,30,0,120,0,14,0,26,0,61,0,3,0,31,0,120,0,61,0,23,0,100,0,26,0,17,0,121,0,51,0,30,0,26,0,61,0,3,0,17,0,120,0,33,0,66,0,61,0,10,0,2],"phonemes":["p","l","ˈ","a","v","ɐ"," ","s","v","ˈ","ɛ","t","l","o","s","t"," ","s","ˈ","ɛ"," ","p","r","ˈ","ɛ","l","ɐ","m","ɐ"," ","p","ˈ","o","d"," ","v","ˈ","ɛ","t","ɕ","ɪ","m"," ","ˈ","u","ɡ","l","o","m"," ","p","ˈ","a"," ","j","e"," ","s","t","ˈ","o","ɡ","ɐ"," ","ˈ","o","n","ɐ"," ","s","ˈ","a"," ","s","p","ˈ","o","ʎ","ɐ","ʃ","ɲ","e"," ","s","t","r","ˈ","a","n","ɛ",","," ","ɐ"," ","t","s","r","v","ˈ","ɛ","n","ɐ"," ","s","ˈ","a"," ","ˈ","u","n","ʊ","t","r","ˌ","ɐ","ʃ","ɲ","e"," ","s","t","r","ˈ","a","n","ɛ"," ","s","ˈ","ɛ","k","ʊ","n","d","ˌ","ɑ","r","n","ɛ"," ","d","ˈ","u","ɡ","ɛ","."],"processed_text":"Плава светлост се прелама под већим углом па је стога она са спољашње стране, а црвена са унутрашње стране секундарне дуге.","text":"Плава светлост се прелама под већим углом па је стога она са спољашње стране, а црвена са унутрашње стране секундарне дуге."}
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
@@ -76,6 +77,9 @@ struct RunConfig {
|
||||
// "output_file": str, (optional)
|
||||
// }
|
||||
bool jsonInput = false;
|
||||
|
||||
// Seconds of extra silence to insert after a single phoneme
|
||||
optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
|
||||
};
|
||||
|
||||
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
|
||||
@@ -185,6 +189,22 @@ int main(int argc, char *argv[]) {
|
||||
runConfig.sentenceSilenceSeconds.value();
|
||||
}
|
||||
|
||||
if (runConfig.phonemeSilenceSeconds) {
|
||||
if (!voice.synthesisConfig.phonemeSilenceSeconds) {
|
||||
// Overwrite
|
||||
voice.synthesisConfig.phonemeSilenceSeconds =
|
||||
runConfig.phonemeSilenceSeconds;
|
||||
} else {
|
||||
// Merge
|
||||
for (const auto &[phoneme, silenceSeconds] :
|
||||
*runConfig.phonemeSilenceSeconds) {
|
||||
voice.synthesisConfig.phonemeSilenceSeconds->try_emplace(
|
||||
phoneme, silenceSeconds);
|
||||
}
|
||||
}
|
||||
|
||||
} // if phonemeSilenceSeconds
|
||||
|
||||
if (runConfig.outputType == OUTPUT_DIRECTORY) {
|
||||
runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
|
||||
spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
|
||||
@@ -453,6 +473,23 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
|
||||
} else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.sentenceSilenceSeconds = stof(argv[++i]);
|
||||
} else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") {
|
||||
ensureArg(argc, argv, i);
|
||||
ensureArg(argc, argv, i + 1);
|
||||
auto phonemeStr = std::string(argv[++i]);
|
||||
if (!piper::isSingleCodepoint(phonemeStr)) {
|
||||
std::cerr << "Phoneme '" << phonemeStr
|
||||
<< "' is not a single codepoint (--phoneme_silence)"
|
||||
<< std::endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (!runConfig.phonemeSilenceSeconds) {
|
||||
runConfig.phonemeSilenceSeconds.emplace();
|
||||
}
|
||||
|
||||
auto phoneme = piper::getCodepoint(phonemeStr);
|
||||
(*runConfig.phonemeSilenceSeconds)[phoneme] = stof(argv[++i]);
|
||||
} else if (arg == "--espeak_data" || arg == "--espeak-data") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
|
||||
|
||||
+102
-23
@@ -30,9 +30,7 @@ const float MAX_WAV_VALUE = 32767.0f;
|
||||
|
||||
const std::string instanceName{"piper"};
|
||||
|
||||
std::string getVersion() {
|
||||
return VERSION;
|
||||
}
|
||||
// Returns the value of VERSION (defined outside this view) as a std::string.
std::string getVersion() { return VERSION; }
|
||||
|
||||
// True if the string is a single UTF-8 codepoint
|
||||
bool isSingleCodepoint(std::string s) {
|
||||
@@ -142,7 +140,11 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||
// "inference": {
|
||||
// "noise_scale": 0.667,
|
||||
// "length_scale": 1,
|
||||
// "noise_w": 0.8
|
||||
// "noise_w": 0.8,
|
||||
// "phoneme_silence": {
|
||||
// "<phoneme>": <seconds of silence>,
|
||||
// ...
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
@@ -168,7 +170,27 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||
if (inferenceValue.contains("noise_w")) {
|
||||
synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f);
|
||||
}
|
||||
}
|
||||
|
||||
if (inferenceValue.contains("phoneme_silence")) {
|
||||
// phoneme -> seconds of silence to add after
|
||||
synthesisConfig.phonemeSilenceSeconds.emplace();
|
||||
auto phonemeSilenceValue = inferenceValue["phoneme_silence"];
|
||||
for (auto &phonemeItem : phonemeSilenceValue.items()) {
|
||||
std::string phonemeStr = phonemeItem.key();
|
||||
if (!isSingleCodepoint(phonemeStr)) {
|
||||
spdlog::error("\"{}\" is not a single codepoint", phonemeStr);
|
||||
throw std::runtime_error(
|
||||
"Phonemes must be one codepoint (phoneme silence)");
|
||||
}
|
||||
|
||||
auto phoneme = getCodepoint(phonemeStr);
|
||||
(*synthesisConfig.phonemeSilenceSeconds)[phoneme] =
|
||||
phonemeItem.value().get<float>();
|
||||
}
|
||||
|
||||
} // if phoneme_silence
|
||||
|
||||
} // if inference
|
||||
|
||||
} /* parseSynthesisConfig */
|
||||
|
||||
@@ -458,30 +480,90 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
sentencePhonemes.size(), phonemesStr);
|
||||
}
|
||||
|
||||
SynthesisResult sentenceResult;
|
||||
std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
|
||||
std::vector<SynthesisResult> phraseResults;
|
||||
std::vector<size_t> phraseSilenceSamples;
|
||||
|
||||
// Use phoneme/id map from config
|
||||
PhonemeIdConfig idConfig;
|
||||
idConfig.phonemeIdMap =
|
||||
std::make_shared<PhonemeIdMap>(voice.phonemizeConfig.phonemeIdMap);
|
||||
|
||||
// phonemes -> ids
|
||||
phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
|
||||
if (spdlog::should_log(spdlog::level::debug)) {
|
||||
// DEBUG log for phoneme ids
|
||||
std::stringstream phonemeIdsStr;
|
||||
for (auto phonemeId : phonemeIds) {
|
||||
phonemeIdsStr << phonemeId << ", ";
|
||||
}
|
||||
if (voice.synthesisConfig.phonemeSilenceSeconds) {
|
||||
// Split into phrases
|
||||
std::map<Phoneme, float> &phonemeSilenceSeconds =
|
||||
*voice.synthesisConfig.phonemeSilenceSeconds;
|
||||
|
||||
spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
|
||||
sentencePhonemes.size(), phonemeIds.size(),
|
||||
phonemeIdsStr.str());
|
||||
auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
|
||||
phrasePhonemes.push_back(currentPhrasePhonemes);
|
||||
|
||||
for (auto sentencePhonemesIter = sentencePhonemes.begin();
|
||||
sentencePhonemesIter != sentencePhonemes.end();
|
||||
sentencePhonemesIter++) {
|
||||
Phoneme &currentPhoneme = *sentencePhonemesIter;
|
||||
currentPhrasePhonemes->push_back(currentPhoneme);
|
||||
|
||||
if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
|
||||
// Split at phrase boundary
|
||||
phraseSilenceSamples.push_back(
|
||||
(std::size_t)(phonemeSilenceSeconds[currentPhoneme] *
|
||||
voice.synthesisConfig.sampleRate *
|
||||
voice.synthesisConfig.channels));
|
||||
|
||||
currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
|
||||
phrasePhonemes.push_back(currentPhrasePhonemes);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Use all phonemes
|
||||
phrasePhonemes.push_back(
|
||||
std::make_shared<std::vector<Phoneme>>(sentencePhonemes));
|
||||
}
|
||||
|
||||
// ids -> audio
|
||||
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
|
||||
sentenceResult);
|
||||
// Ensure results/samples are the same size
|
||||
while (phraseResults.size() < phrasePhonemes.size()) {
|
||||
phraseResults.emplace_back();
|
||||
}
|
||||
|
||||
while (phraseSilenceSamples.size() < phrasePhonemes.size()) {
|
||||
phraseSilenceSamples.push_back(0);
|
||||
}
|
||||
|
||||
// phonemes -> ids -> audio
|
||||
for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) {
|
||||
if (phrasePhonemes[phraseIdx]->size() <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// phonemes -> ids
|
||||
phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds,
|
||||
missingPhonemes);
|
||||
if (spdlog::should_log(spdlog::level::debug)) {
|
||||
// DEBUG log for phoneme ids
|
||||
std::stringstream phonemeIdsStr;
|
||||
for (auto phonemeId : phonemeIds) {
|
||||
phonemeIdsStr << phonemeId << ", ";
|
||||
}
|
||||
|
||||
spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
|
||||
phrasePhonemes[phraseIdx]->size(), phonemeIds.size(),
|
||||
phonemeIdsStr.str());
|
||||
}
|
||||
|
||||
// ids -> audio
|
||||
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
|
||||
phraseResults[phraseIdx]);
|
||||
|
||||
// Add end of phrase silence
|
||||
for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) {
|
||||
audioBuffer.push_back(0);
|
||||
}
|
||||
|
||||
result.audioSeconds += phraseResults[phraseIdx].audioSeconds;
|
||||
result.inferSeconds += phraseResults[phraseIdx].inferSeconds;
|
||||
|
||||
phonemeIds.clear();
|
||||
}
|
||||
|
||||
// Add end of sentence silence
|
||||
if (sentenceSilenceSamples > 0) {
|
||||
@@ -496,9 +578,6 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
audioBuffer.clear();
|
||||
}
|
||||
|
||||
result.audioSeconds += sentenceResult.audioSeconds;
|
||||
result.inferSeconds += sentenceResult.inferSeconds;
|
||||
|
||||
phonemeIds.clear();
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -49,14 +50,22 @@ struct PhonemizeConfig {
|
||||
};
|
||||
|
||||
struct SynthesisConfig {
|
||||
// VITS inference settings
|
||||
float noiseScale = 0.667f;
|
||||
float lengthScale = 1.0f;
|
||||
float noiseW = 0.8f;
|
||||
|
||||
// Audio settings
|
||||
int sampleRate = 22050;
|
||||
int sampleWidth = 2; // 16-bit
|
||||
int channels = 1; // mono
|
||||
|
||||
// Speaker id from 0 to numSpeakers - 1
|
||||
std::optional<SpeakerId> speakerId;
|
||||
|
||||
// Extra silence
|
||||
float sentenceSilenceSeconds = 0.2f;
|
||||
std::optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
|
||||
};
|
||||
|
||||
struct ModelConfig {
|
||||
@@ -89,6 +98,12 @@ struct Voice {
|
||||
ModelSession session;
|
||||
};
|
||||
|
||||
// True if the string is a single UTF-8 codepoint
|
||||
bool isSingleCodepoint(std::string s);
|
||||
|
||||
// Get the first UTF-8 codepoint of a string
|
||||
Phoneme getCodepoint(std::string s);
|
||||
|
||||
// Get version of Piper
|
||||
std::string getVersion();
|
||||
|
||||
|
||||
@@ -1,372 +0,0 @@
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from enum import Enum
|
||||
from typing import Dict, Iterable, List, Mapping, Optional
|
||||
|
||||
from espeak_phonemizer import Phonemizer
|
||||
|
||||
|
||||
class PhonemeType(str, Enum):
    """Where the phonemes of an utterance come from."""

    # Phonemes come from espeak-ng
    ESPEAK = "espeak"

    # Phonemes come from text itself
    TEXT = "text"
|
||||
|
||||
|
||||
MAX_PHONEMES = 256
|
||||
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
||||
"_": [0],
|
||||
"^": [1],
|
||||
"$": [2],
|
||||
" ": [3],
|
||||
"!": [4],
|
||||
"'": [5],
|
||||
"(": [6],
|
||||
")": [7],
|
||||
",": [8],
|
||||
"-": [9],
|
||||
".": [10],
|
||||
":": [11],
|
||||
";": [12],
|
||||
"?": [13],
|
||||
"a": [14],
|
||||
"b": [15],
|
||||
"c": [16],
|
||||
"d": [17],
|
||||
"e": [18],
|
||||
"f": [19],
|
||||
"h": [20],
|
||||
"i": [21],
|
||||
"j": [22],
|
||||
"k": [23],
|
||||
"l": [24],
|
||||
"m": [25],
|
||||
"n": [26],
|
||||
"o": [27],
|
||||
"p": [28],
|
||||
"q": [29],
|
||||
"r": [30],
|
||||
"s": [31],
|
||||
"t": [32],
|
||||
"u": [33],
|
||||
"v": [34],
|
||||
"w": [35],
|
||||
"x": [36],
|
||||
"y": [37],
|
||||
"z": [38],
|
||||
"æ": [39],
|
||||
"ç": [40],
|
||||
"ð": [41],
|
||||
"ø": [42],
|
||||
"ħ": [43],
|
||||
"ŋ": [44],
|
||||
"œ": [45],
|
||||
"ǀ": [46],
|
||||
"ǁ": [47],
|
||||
"ǂ": [48],
|
||||
"ǃ": [49],
|
||||
"ɐ": [50],
|
||||
"ɑ": [51],
|
||||
"ɒ": [52],
|
||||
"ɓ": [53],
|
||||
"ɔ": [54],
|
||||
"ɕ": [55],
|
||||
"ɖ": [56],
|
||||
"ɗ": [57],
|
||||
"ɘ": [58],
|
||||
"ə": [59],
|
||||
"ɚ": [60],
|
||||
"ɛ": [61],
|
||||
"ɜ": [62],
|
||||
"ɞ": [63],
|
||||
"ɟ": [64],
|
||||
"ɠ": [65],
|
||||
"ɡ": [66],
|
||||
"ɢ": [67],
|
||||
"ɣ": [68],
|
||||
"ɤ": [69],
|
||||
"ɥ": [70],
|
||||
"ɦ": [71],
|
||||
"ɧ": [72],
|
||||
"ɨ": [73],
|
||||
"ɪ": [74],
|
||||
"ɫ": [75],
|
||||
"ɬ": [76],
|
||||
"ɭ": [77],
|
||||
"ɮ": [78],
|
||||
"ɯ": [79],
|
||||
"ɰ": [80],
|
||||
"ɱ": [81],
|
||||
"ɲ": [82],
|
||||
"ɳ": [83],
|
||||
"ɴ": [84],
|
||||
"ɵ": [85],
|
||||
"ɶ": [86],
|
||||
"ɸ": [87],
|
||||
"ɹ": [88],
|
||||
"ɺ": [89],
|
||||
"ɻ": [90],
|
||||
"ɽ": [91],
|
||||
"ɾ": [92],
|
||||
"ʀ": [93],
|
||||
"ʁ": [94],
|
||||
"ʂ": [95],
|
||||
"ʃ": [96],
|
||||
"ʄ": [97],
|
||||
"ʈ": [98],
|
||||
"ʉ": [99],
|
||||
"ʊ": [100],
|
||||
"ʋ": [101],
|
||||
"ʌ": [102],
|
||||
"ʍ": [103],
|
||||
"ʎ": [104],
|
||||
"ʏ": [105],
|
||||
"ʐ": [106],
|
||||
"ʑ": [107],
|
||||
"ʒ": [108],
|
||||
"ʔ": [109],
|
||||
"ʕ": [110],
|
||||
"ʘ": [111],
|
||||
"ʙ": [112],
|
||||
"ʛ": [113],
|
||||
"ʜ": [114],
|
||||
"ʝ": [115],
|
||||
"ʟ": [116],
|
||||
"ʡ": [117],
|
||||
"ʢ": [118],
|
||||
"ʲ": [119],
|
||||
"ˈ": [120],
|
||||
"ˌ": [121],
|
||||
"ː": [122],
|
||||
"ˑ": [123],
|
||||
"˞": [124],
|
||||
"β": [125],
|
||||
"θ": [126],
|
||||
"χ": [127],
|
||||
"ᵻ": [128],
|
||||
"ⱱ": [129],
|
||||
"0": [130], # tones
|
||||
"1": [131],
|
||||
"2": [132],
|
||||
"3": [133],
|
||||
"4": [134],
|
||||
"5": [135],
|
||||
"6": [136],
|
||||
"7": [137],
|
||||
"8": [138],
|
||||
"9": [139],
|
||||
"\u0327": [140], # combining cedilla
|
||||
"\u0303": [141], # combining tilde
|
||||
"\u032a": [142], # combining bridge below
|
||||
"\u032f": [143], # combining inverted breve below
|
||||
"\u0329": [144], # combining vertical line below
|
||||
"ʰ": [145],
|
||||
"ˤ": [146],
|
||||
"ε": [147],
|
||||
"↓": [148],
|
||||
"#": [149], # Icelandic
|
||||
'"': [150], # Russian
|
||||
"↑": [151],
|
||||
"\u033a": [152], # Basque
|
||||
"\u033b": [153],
|
||||
}
|
||||
|
||||
PHONEME_MAPS = {
|
||||
# Brazilian Portuguese
|
||||
"pt-br": {"c": ["k"]}
|
||||
}
|
||||
|
||||
ALPHABETS = {
|
||||
# Ukrainian
|
||||
"uk": {
|
||||
"_": [0],
|
||||
"^": [1],
|
||||
"$": [2],
|
||||
" ": [3],
|
||||
"!": [4],
|
||||
"'": [5],
|
||||
",": [6],
|
||||
"-": [7],
|
||||
".": [8],
|
||||
":": [9],
|
||||
";": [10],
|
||||
"?": [11],
|
||||
"а": [12],
|
||||
"б": [13],
|
||||
"в": [14],
|
||||
"г": [15],
|
||||
"ґ": [16],
|
||||
"д": [17],
|
||||
"е": [18],
|
||||
"є": [19],
|
||||
"ж": [20],
|
||||
"з": [21],
|
||||
"и": [22],
|
||||
"і": [23],
|
||||
"ї": [24],
|
||||
"й": [25],
|
||||
"к": [26],
|
||||
"л": [27],
|
||||
"м": [28],
|
||||
"н": [29],
|
||||
"о": [30],
|
||||
"п": [31],
|
||||
"р": [32],
|
||||
"с": [33],
|
||||
"т": [34],
|
||||
"у": [35],
|
||||
"ф": [36],
|
||||
"х": [37],
|
||||
"ц": [38],
|
||||
"ч": [39],
|
||||
"ш": [40],
|
||||
"щ": [41],
|
||||
"ь": [42],
|
||||
"ю": [43],
|
||||
"я": [44],
|
||||
"\u0301": [45], # combining acute accent
|
||||
"\u0306": [46], # combining breve
|
||||
"\u0308": [47], # combining diaeresis
|
||||
"—": [48], # em dash
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def phonemize(
    text: str,
    phonemizer: Phonemizer,
    phoneme_map: Optional[Dict[str, List[str]]] = None,
) -> List[str]:
    """Turn text into a flat list of single-codepoint phonemes.

    The text is passed to ``phonemizer.phonemize`` (keeping clause breakers),
    and the resulting string is NFD-normalized and split into codepoints.

    Args:
        text: Text to phonemize.
        phonemizer: Object providing ``phonemize(text=..., keep_clause_breakers=...)``.
        phoneme_map: Optional substitutions; a codepoint with a non-empty
            entry is replaced by that entry's phonemes.

    Returns:
        The phoneme list, with substitutions applied when a map is given.
    """
    raw = phonemizer.phonemize(text=text, keep_clause_breakers=True)

    # NFD splits composed characters so each list item is one codepoint
    codepoints = list(unicodedata.normalize("NFD", raw))
    if not phoneme_map:
        return codepoints

    # A missing (or empty) entry leaves the codepoint unchanged
    return [
        replacement
        for codepoint in codepoints
        for replacement in (phoneme_map.get(codepoint) or (codepoint,))
    ]
|
||||
|
||||
|
||||
def phonemes_to_ids(
    phonemes: Iterable[str],
    phoneme_id_map: Optional[Mapping[str, Iterable[int]]] = None,
    missing_phonemes: "Optional[Counter[str]]" = None,
    pad: Optional[str] = "_",
    bos: Optional[str] = "^",
    eos: Optional[str] = "$",
) -> List[int]:
    """Convert phonemes to a flat list of ids with bos/pad/eos markers.

    The output layout is ``bos, pad, (phoneme ids, pad)*, eos``; each marker
    is omitted when its argument is falsy.

    Args:
        phonemes: Phonemes to convert.
        phoneme_id_map: Phoneme -> ids mapping; ``DEFAULT_PHONEME_ID_MAP``
            is used when None.
        missing_phonemes: Optional counter, incremented for every phoneme
            that has no (non-empty) entry in the map. Such phonemes are
            dropped from the output.
        pad: Phoneme whose ids are inserted after bos and after every
            mapped phoneme.
        bos: Phoneme whose ids start the sequence.
        eos: Phoneme whose ids end the sequence.

    Returns:
        The list of phoneme ids.

    Raises:
        KeyError: If ``bos``, ``pad`` or ``eos`` is set but not in the map.
    """
    if phoneme_id_map is None:
        phoneme_id_map = DEFAULT_PHONEME_ID_MAP

    phoneme_ids: List[int] = []

    if bos:
        phoneme_ids.extend(phoneme_id_map[bos])

    # Resolve the pad ids once instead of once per phoneme; the lookup stays
    # after bos so a missing marker raises in the same order as before.
    pad_ids: List[int] = list(phoneme_id_map[pad]) if pad else []
    phoneme_ids.extend(pad_ids)

    for phoneme in phonemes:
        mapped_phoneme_ids = phoneme_id_map.get(phoneme)
        if mapped_phoneme_ids:
            phoneme_ids.extend(mapped_phoneme_ids)
            phoneme_ids.extend(pad_ids)
        elif missing_phonemes is not None:
            # Make note of missing phonemes
            missing_phonemes[phoneme] += 1

    if eos:
        phoneme_ids.extend(phoneme_id_map[eos])

    return phoneme_ids
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> None:
    """Phonemize stdin line by line and print one JSON object per line.

    Each non-empty input line produces a JSON object with the keys
    ``text``, ``phonemes`` and ``phoneme_ids`` on stdout. Phonemes that
    have no id are counted and summarized on stderr at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("language")
    parser.add_argument(
        "--phoneme-type",
        choices=list(PhonemeType),
        default=PhonemeType.ESPEAK,
        help="Type of phonemes to use (default: espeak)",
    )
    parser.add_argument(
        "--text-casing",
        choices=("ignore", "lower", "upper", "casefold"),
        default="ignore",
        help="Casing applied to utterance text",
    )
    args = parser.parse_args()

    # Every advertised casing gets a transform ("casefold" used to fall
    # through to the identity); "ignore" keeps the text untouched.
    casings = {
        "lower": str.lower,
        "upper": str.upper,
        "casefold": str.casefold,
    }
    casing = casings.get(args.text_casing, lambda s: s)

    phonemizer: Optional[Phonemizer] = None
    use_text = args.phoneme_type == PhonemeType.TEXT

    if use_text:
        # Use text directly
        if args.language not in ALPHABETS:
            parser.error(
                f"No text alphabet for language '{args.language}' "
                f"(available: {', '.join(sorted(ALPHABETS))})"
            )

        phoneme_id_map = ALPHABETS[args.language]
    else:
        # Use eSpeak
        phonemizer = Phonemizer(args.language)
        phoneme_id_map = DEFAULT_PHONEME_ID_MAP

    phoneme_map = PHONEME_MAPS.get(args.language)
    missing_phonemes: "Counter[str]" = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        if use_text:
            # Text "phonemes" are the NFD codepoints of the cased line
            phonemes = list(unicodedata.normalize("NFD", casing(line)))
        else:
            assert phonemizer is not None
            phonemes = phonemize(line, phonemizer, phoneme_map=phoneme_map)

        phoneme_ids = phonemes_to_ids(
            phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes
        )
        json.dump(
            {
                "text": line,
                "phonemes": phonemes,
                "phoneme_ids": phoneme_ids,
            },
            sys.stdout,
            ensure_ascii=False,
        )
        print("")

    if missing_phonemes:
        print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr)
        for phoneme, count in missing_phonemes.most_common():
            print(phoneme, count, file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -9,28 +9,37 @@ import os
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from multiprocessing import JoinableQueue, Process, Queue
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional
|
||||
|
||||
from espeak_phonemizer import Phonemizer
|
||||
from piper_phonemize import (
|
||||
phonemize_espeak,
|
||||
phonemize_codepoints,
|
||||
phoneme_ids_espeak,
|
||||
phoneme_ids_codepoints,
|
||||
get_codepoints_map,
|
||||
get_espeak_map,
|
||||
get_max_phonemes,
|
||||
tashkeel_run,
|
||||
)
|
||||
|
||||
from .norm_audio import cache_norm_audio, make_silence_detector
|
||||
from .phonemize import (
|
||||
ALPHABETS,
|
||||
DEFAULT_PHONEME_ID_MAP,
|
||||
MAX_PHONEMES,
|
||||
PHONEME_MAPS,
|
||||
PhonemeType,
|
||||
phonemes_to_ids,
|
||||
phonemize,
|
||||
)
|
||||
|
||||
_DIR = Path(__file__).parent
|
||||
_VERSION = (_DIR / "VERSION").read_text(encoding="utf-8").strip()
|
||||
_LOGGER = logging.getLogger("preprocess")
|
||||
|
||||
|
||||
class PhonemeType(str, Enum):
    """Where the phonemes of an utterance come from."""

    # Phonemes come from espeak-ng
    ESPEAK = "espeak"

    # Phonemes come from text itself
    TEXT = "text"
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
@@ -150,10 +159,10 @@ def main() -> None:
|
||||
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
|
||||
"phoneme_type": args.phoneme_type.value,
|
||||
"phoneme_map": {},
|
||||
"phoneme_id_map": ALPHABETS[args.language]
|
||||
"phoneme_id_map": get_codepoints_map()[args.language]
|
||||
if args.phoneme_type == PhonemeType.TEXT
|
||||
else DEFAULT_PHONEME_ID_MAP,
|
||||
"num_symbols": MAX_PHONEMES,
|
||||
else get_espeak_map(),
|
||||
"num_symbols": get_max_phonemes(),
|
||||
"num_speakers": len(speaker_counts),
|
||||
"speaker_id_map": speaker_ids,
|
||||
"piper_version": _VERSION,
|
||||
@@ -255,8 +264,6 @@ def phonemize_batch_espeak(
|
||||
try:
|
||||
casing = get_text_casing(args.text_casing)
|
||||
silence_detector = make_silence_detector()
|
||||
phonemizer = Phonemizer(default_voice=args.language)
|
||||
phoneme_map = PHONEME_MAPS.get(args.language)
|
||||
|
||||
while True:
|
||||
utt_batch = queue_in.get()
|
||||
@@ -266,10 +273,15 @@ def phonemize_batch_espeak(
|
||||
for utt in utt_batch:
|
||||
try:
|
||||
_LOGGER.debug(utt)
|
||||
utt.phonemes = phonemize(
|
||||
casing(utt.text), phonemizer, phoneme_map=phoneme_map
|
||||
)
|
||||
utt.phoneme_ids = phonemes_to_ids(
|
||||
all_phonemes = phonemize_espeak(casing(utt.text), args.language)
|
||||
|
||||
# Flatten
|
||||
utt.phonemes = [
|
||||
phoneme
|
||||
for sentence_phonemes in all_phonemes
|
||||
for phoneme in sentence_phonemes
|
||||
]
|
||||
utt.phoneme_ids = phoneme_ids_espeak(
|
||||
utt.phonemes,
|
||||
missing_phonemes=utt.missing_phonemes,
|
||||
)
|
||||
@@ -298,7 +310,6 @@ def phonemize_batch_text(
|
||||
try:
|
||||
casing = get_text_casing(args.text_casing)
|
||||
silence_detector = make_silence_detector()
|
||||
alphabet = ALPHABETS[args.language]
|
||||
|
||||
while True:
|
||||
utt_batch = queue_in.get()
|
||||
@@ -308,10 +319,16 @@ def phonemize_batch_text(
|
||||
for utt in utt_batch:
|
||||
try:
|
||||
_LOGGER.debug(utt)
|
||||
utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
|
||||
utt.phoneme_ids = phonemes_to_ids(
|
||||
all_phonemes = phonemize_codepoints(casing(utt.text))
|
||||
# Flatten
|
||||
utt.phonemes = [
|
||||
phoneme
|
||||
for sentence_phonemes in all_phonemes
|
||||
for phoneme in sentence_phonemes
|
||||
]
|
||||
utt.phoneme_ids = phoneme_ids_codepoints(
|
||||
args.language,
|
||||
utt.phonemes,
|
||||
phoneme_id_map=alphabet,
|
||||
missing_phonemes=utt.missing_phonemes,
|
||||
)
|
||||
if not args.skip_audio:
|
||||
|
||||
@@ -4,4 +4,4 @@ librosa>=0.9.2,<1
|
||||
numpy>=1.19.0
|
||||
onnxruntime>=1.11.0
|
||||
pytorch-lightning~=1.7.0
|
||||
torch>=1.11.0,<2
|
||||
torch>=1.11.0,<2
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
piper-phonemize~=1.0.0
|
||||
piper-phonemize~=1.1.0
|
||||
onnxruntime>=1.11.0,<2
|
||||
|
||||
Reference in New Issue
Block a user