mirror of
https://github.com/pstrueb/piper.git
synced 2026-04-16 13:25:30 +00:00
Load phoneme_silence from voice config
This commit is contained in:
15
README.md
15
README.md
@@ -32,14 +32,18 @@ Our goal is to support Home Assistant and the [Year of Voice](https://www.home-a
|
||||
* Italian (it_IT)
|
||||
* Georgian (ka_GE)
|
||||
* Kazakh (kk_KZ)
|
||||
* Luxembourgish (lb_LU)
|
||||
* Nepali (ne_NP)
|
||||
* Dutch (nl_BE, nl_NL)
|
||||
* Norwegian (no_NO)
|
||||
* Polish (pl_PL)
|
||||
* Portuguese (pt_BR)
|
||||
* Romanian (ro_RO)
|
||||
* Russian (ru_RU)
|
||||
* Serbian (sr_RS)
|
||||
* Swedish (sv_SE)
|
||||
* Swahili (sw_CD)
|
||||
* Turkish (tr_TR)
|
||||
* Ukrainian (uk_UA)
|
||||
* Vietnamese (vi_VN)
|
||||
* Chinese (zh_CN)
|
||||
@@ -81,6 +85,17 @@ For multi-speaker models, use `--speaker <number>` to change speakers (default:
|
||||
|
||||
See `piper --help` for more options.
|
||||
|
||||
### Streaming Audio
|
||||
|
||||
Piper can stream raw audio to stdout as it's produced:
|
||||
|
||||
``` sh
|
||||
echo 'This sentence is spoken first. This sentence is synthesized while the first sentence is spoken.' | \
|
||||
./piper --model en_US-lessac-medium.onnx --output-raw | \
|
||||
aplay -r 22050 -f S16_LE -t raw -
|
||||
```
|
||||
|
||||
This is **raw** audio and not a WAV file, so make sure your audio player is set to play 16-bit mono PCM samples at the correct sample rate for the voice.
|
||||
|
||||
### JSON Input
|
||||
|
||||
|
||||
@@ -189,7 +189,21 @@ int main(int argc, char *argv[]) {
|
||||
runConfig.sentenceSilenceSeconds.value();
|
||||
}
|
||||
|
||||
voice.synthesisConfig.phonemeSilenceSeconds = runConfig.phonemeSilenceSeconds;
|
||||
if (runConfig.phonemeSilenceSeconds) {
|
||||
if (!voice.synthesisConfig.phonemeSilenceSeconds) {
|
||||
// Overwrite
|
||||
voice.synthesisConfig.phonemeSilenceSeconds =
|
||||
runConfig.phonemeSilenceSeconds;
|
||||
} else {
|
||||
// Merge
|
||||
for (const auto &[phoneme, silenceSeconds] :
|
||||
*runConfig.phonemeSilenceSeconds) {
|
||||
voice.synthesisConfig.phonemeSilenceSeconds->try_emplace(
|
||||
phoneme, silenceSeconds);
|
||||
}
|
||||
}
|
||||
|
||||
} // if phonemeSilenceSeconds
|
||||
|
||||
if (runConfig.outputType == OUTPUT_DIRECTORY) {
|
||||
runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
|
||||
|
||||
@@ -140,7 +140,11 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||
// "inference": {
|
||||
// "noise_scale": 0.667,
|
||||
// "length_scale": 1,
|
||||
// "noise_w": 0.8
|
||||
// "noise_w": 0.8,
|
||||
// "phoneme_silence": {
|
||||
// "<phoneme>": <seconds of silence>,
|
||||
// ...
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
@@ -166,7 +170,27 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||
if (inferenceValue.contains("noise_w")) {
|
||||
synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f);
|
||||
}
|
||||
}
|
||||
|
||||
if (inferenceValue.contains("phoneme_silence")) {
|
||||
// phoneme -> seconds of silence to add after
|
||||
synthesisConfig.phonemeSilenceSeconds.emplace();
|
||||
auto phonemeSilenceValue = inferenceValue["phoneme_silence"];
|
||||
for (auto &phonemeItem : phonemeSilenceValue.items()) {
|
||||
std::string phonemeStr = phonemeItem.key();
|
||||
if (!isSingleCodepoint(phonemeStr)) {
|
||||
spdlog::error("\"{}\" is not a single codepoint", phonemeStr);
|
||||
throw std::runtime_error(
|
||||
"Phonemes must be one codepoint (phoneme silence)");
|
||||
}
|
||||
|
||||
auto phoneme = getCodepoint(phonemeStr);
|
||||
(*synthesisConfig.phonemeSilenceSeconds)[phoneme] =
|
||||
phonemeItem.value().get<float>();
|
||||
}
|
||||
|
||||
} // if phoneme_silence
|
||||
|
||||
} // if inference
|
||||
|
||||
} /* parseSynthesisConfig */
|
||||
|
||||
|
||||
Reference in New Issue
Block a user