mirror of
https://github.com/pstrueb/piper.git
synced 2026-04-16 13:25:30 +00:00
Load phoneme_silence from voice config
This commit is contained in:
15
README.md
15
README.md
@@ -32,14 +32,18 @@ Our goal is to support Home Assistant and the [Year of Voice](https://www.home-a
|
||||
* Italian (it_IT)
|
||||
* Georgian (ka_GE)
|
||||
* Kazakh (kk_KZ)
|
||||
* Luxembourgish (lb_LU)
|
||||
* Nepali (ne_NP)
|
||||
* Dutch (nl_BE, nl_NL)
|
||||
* Norwegian (no_NO)
|
||||
* Polish (pl_PL)
|
||||
* Portuguese (pt_BR)
|
||||
* Romanian (ro_RO)
|
||||
* Russian (ru_RU)
|
||||
* Serbian (sr_RS)
|
||||
* Swedish (sv_SE)
|
||||
* Swahili (sw_CD)
|
||||
* Turkish (tr_TR)
|
||||
* Ukrainian (uk_UA)
|
||||
* Vietnamese (vi_VN)
|
||||
* Chinese (zh_CN)
|
||||
@@ -81,6 +85,17 @@ For multi-speaker models, use `--speaker <number>` to change speakers (default:
|
||||
|
||||
See `piper --help` for more options.
|
||||
|
||||
### Streaming Audio
|
||||
|
||||
Piper can stream raw audio to stdout as it's produced:
|
||||
|
||||
``` sh
|
||||
echo 'This sentence is spoken first. This sentence is synthesized while the first sentence is spoken.' | \
|
||||
./piper --model en_US-lessac-medium.onnx --output-raw | \
|
||||
aplay -r 22050 -f S16_LE -t raw -
|
||||
```
|
||||
|
||||
This is **raw** audio and not a WAV file, so make sure your audio player is set to play 16-bit mono PCM samples at the correct sample rate for the voice.
|
||||
|
||||
### JSON Input
|
||||
|
||||
|
||||
@@ -189,7 +189,21 @@ int main(int argc, char *argv[]) {
|
||||
runConfig.sentenceSilenceSeconds.value();
|
||||
}
|
||||
|
||||
voice.synthesisConfig.phonemeSilenceSeconds = runConfig.phonemeSilenceSeconds;
|
||||
if (runConfig.phonemeSilenceSeconds) {
|
||||
if (!voice.synthesisConfig.phonemeSilenceSeconds) {
|
||||
// Overwrite
|
||||
voice.synthesisConfig.phonemeSilenceSeconds =
|
||||
runConfig.phonemeSilenceSeconds;
|
||||
} else {
|
||||
// Merge
|
||||
for (const auto &[phoneme, silenceSeconds] :
|
||||
*runConfig.phonemeSilenceSeconds) {
|
||||
voice.synthesisConfig.phonemeSilenceSeconds->try_emplace(
|
||||
phoneme, silenceSeconds);
|
||||
}
|
||||
}
|
||||
|
||||
} // if phonemeSilenceSeconds
|
||||
|
||||
if (runConfig.outputType == OUTPUT_DIRECTORY) {
|
||||
runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
|
||||
|
||||
@@ -140,7 +140,11 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||
// "inference": {
|
||||
// "noise_scale": 0.667,
|
||||
// "length_scale": 1,
|
||||
// "noise_w": 0.8
|
||||
// "noise_w": 0.8,
|
||||
// "phoneme_silence": {
|
||||
// "<phoneme>": <seconds of silence>,
|
||||
// ...
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
@@ -166,7 +170,27 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||
if (inferenceValue.contains("noise_w")) {
|
||||
synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f);
|
||||
}
|
||||
}
|
||||
|
||||
if (inferenceValue.contains("phoneme_silence")) {
|
||||
// phoneme -> seconds of silence to add after
|
||||
synthesisConfig.phonemeSilenceSeconds.emplace();
|
||||
auto phonemeSilenceValue = inferenceValue["phoneme_silence"];
|
||||
for (auto &phonemeItem : phonemeSilenceValue.items()) {
|
||||
std::string phonemeStr = phonemeItem.key();
|
||||
if (!isSingleCodepoint(phonemeStr)) {
|
||||
spdlog::error("\"{}\" is not a single codepoint", phonemeStr);
|
||||
throw std::runtime_error(
|
||||
"Phonemes must be one codepoint (phoneme silence)");
|
||||
}
|
||||
|
||||
auto phoneme = getCodepoint(phonemeStr);
|
||||
(*synthesisConfig.phonemeSilenceSeconds)[phoneme] =
|
||||
phonemeItem.value().get<float>();
|
||||
}
|
||||
|
||||
} // if phoneme_silence
|
||||
|
||||
} // if inference
|
||||
|
||||
} /* parseSynthesisConfig */
|
||||
|
||||
|
||||
Reference in New Issue
Block a user