Mirror of https://github.com/pstrueb/piper.git

Commit: Add phoneme-silence
@@ -4,6 +4,7 @@
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <mutex>
#include <sstream>
#include <stdexcept>
@@ -76,6 +77,9 @@ struct RunConfig {
  //     "output_file": str, (optional)
  // }
  bool jsonInput = false;

  // Seconds of extra silence to insert after a single phoneme
  optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
};

void parseArgs(int argc, char *argv[], RunConfig &runConfig);
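
The new RunConfig field is an optional map from a phoneme (a single codepoint) to extra seconds of silence. The following is a minimal, self-contained sketch of that data shape only, not code from the commit; the phonemes and durations are invented, and piper::Phoneme is assumed to be a char32_t codepoint, as the getCodepoint usage later in the diff suggests:

#include <map>
#include <optional>

int main() {
  // Stand-in for piper::Phoneme; assumed to be a single Unicode codepoint.
  using Phoneme = char32_t;

  // Mirrors RunConfig::phonemeSilenceSeconds: empty until --phoneme_silence is seen.
  std::optional<std::map<Phoneme, float>> phonemeSilenceSeconds;

  phonemeSilenceSeconds.emplace();          // create the map on first use
  (*phonemeSilenceSeconds)[U'.'] = 0.5f;    // 0.5 s pause after a '.' phoneme (example value)
  (*phonemeSilenceSeconds)[U','] = 0.2f;    // 0.2 s pause after a ',' phoneme (example value)
  return 0;
}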
@@ -185,6 +189,8 @@ int main(int argc, char *argv[]) {
        runConfig.sentenceSilenceSeconds.value();
  }

  voice.synthesisConfig.phonemeSilenceSeconds = runConfig.phonemeSilenceSeconds;

  if (runConfig.outputType == OUTPUT_DIRECTORY) {
    runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
    spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
@@ -453,6 +459,23 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
    } else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
      ensureArg(argc, argv, i);
      runConfig.sentenceSilenceSeconds = stof(argv[++i]);
    } else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") {
      ensureArg(argc, argv, i);
      ensureArg(argc, argv, i + 1);
      auto phonemeStr = std::string(argv[++i]);
      if (!piper::isSingleCodepoint(phonemeStr)) {
        std::cerr << "Phoneme '" << phonemeStr
                  << "' is not a single codepoint (--phoneme_silence)"
                  << std::endl;
        exit(1);
      }

      if (!runConfig.phonemeSilenceSeconds) {
        runConfig.phonemeSilenceSeconds.emplace();
      }

      auto phoneme = piper::getCodepoint(phonemeStr);
      (*runConfig.phonemeSilenceSeconds)[phoneme] = stof(argv[++i]);
    } else if (arg == "--espeak_data" || arg == "--espeak-data") {
      ensureArg(argc, argv, i);
      runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
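
From the parsing above, --phoneme_silence (or --phoneme-silence) consumes two arguments, a single-codepoint phoneme and a duration in seconds, and the option can be repeated to add more entries to the map. A phoneme string that is not exactly one UTF-8 codepoint is rejected with an error. A hypothetical invocation could look like the following; the model, input, and output names are placeholders and the phonemes and durations are example values only:

piper --model voice.onnx --output_file out.wav \
      --phoneme_silence '.' 0.5 \
      --phoneme_silence ',' 0.2 < input.txt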
@@ -30,9 +30,7 @@ const float MAX_WAV_VALUE = 32767.0f;

const std::string instanceName{"piper"};

std::string getVersion() {
  return VERSION;
}
std::string getVersion() { return VERSION; }

// True if the string is a single UTF-8 codepoint
bool isSingleCodepoint(std::string s) {
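
The bodies of isSingleCodepoint and getCodepoint are not included in this diff, only the opening line of the former. For orientation, here is a standalone sketch of how such helpers are commonly written, decoding just the leading UTF-8 sequence. This is an assumption about the approach, not piper's actual implementation:

#include <cstdio>
#include <string>

// Length in bytes of a UTF-8 sequence, from its first byte (0 if invalid).
static std::size_t utf8Length(unsigned char firstByte) {
  if (firstByte < 0x80) return 1;             // 0xxxxxxx
  if ((firstByte & 0xE0) == 0xC0) return 2;   // 110xxxxx
  if ((firstByte & 0xF0) == 0xE0) return 3;   // 1110xxxx
  if ((firstByte & 0xF8) == 0xF0) return 4;   // 11110xxx
  return 0;                                   // continuation byte or invalid
}

// Sketch of an isSingleCodepoint-style check: the whole string must be
// exactly one UTF-8 sequence.
static bool isSingleCodepointSketch(const std::string &s) {
  return !s.empty() && utf8Length((unsigned char)s[0]) == s.size();
}

// Sketch of a getCodepoint-style decoder: decode only the first sequence.
static char32_t getCodepointSketch(const std::string &s) {
  std::size_t len = utf8Length((unsigned char)s[0]);
  char32_t cp = 0;
  switch (len) {
  case 1: return (char32_t)(unsigned char)s[0];
  case 2: cp = s[0] & 0x1F; break;
  case 3: cp = s[0] & 0x0F; break;
  case 4: cp = s[0] & 0x07; break;
  default: return 0;
  }
  for (std::size_t i = 1; i < len && i < s.size(); i++) {
    cp = (cp << 6) | (char32_t)(s[i] & 0x3F);
  }
  return cp;
}

int main() {
  std::printf("'a'  single=%d cp=U+%04X\n", isSingleCodepointSketch("a"),
              (unsigned)getCodepointSketch("a"));       // single=1 cp=U+0061
  std::printf("'\u0259' single=%d cp=U+%04X\n", isSingleCodepointSketch("\u0259"),
              (unsigned)getCodepointSketch("\u0259"));  // single=1 cp=U+0259
  std::printf("'ab' single=%d\n", isSingleCodepointSketch("ab"));  // single=0
}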
@@ -458,30 +456,90 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
                    sentencePhonemes.size(), phonemesStr);
    }

    SynthesisResult sentenceResult;
    std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
    std::vector<SynthesisResult> phraseResults;
    std::vector<size_t> phraseSilenceSamples;

    // Use phoneme/id map from config
    PhonemeIdConfig idConfig;
    idConfig.phonemeIdMap =
        std::make_shared<PhonemeIdMap>(voice.phonemizeConfig.phonemeIdMap);

    // phonemes -> ids
    phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
    if (spdlog::should_log(spdlog::level::debug)) {
      // DEBUG log for phoneme ids
      std::stringstream phonemeIdsStr;
      for (auto phonemeId : phonemeIds) {
        phonemeIdsStr << phonemeId << ", ";
      }
    if (voice.synthesisConfig.phonemeSilenceSeconds) {
      // Split into phrases
      std::map<Phoneme, float> &phonemeSilenceSeconds =
          *voice.synthesisConfig.phonemeSilenceSeconds;

      spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
                    sentencePhonemes.size(), phonemeIds.size(),
                    phonemeIdsStr.str());
      auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
      phrasePhonemes.push_back(currentPhrasePhonemes);

      for (auto sentencePhonemesIter = sentencePhonemes.begin();
           sentencePhonemesIter != sentencePhonemes.end();
           sentencePhonemesIter++) {
        Phoneme &currentPhoneme = *sentencePhonemesIter;
        currentPhrasePhonemes->push_back(currentPhoneme);

        if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
          // Split at phrase boundary
          phraseSilenceSamples.push_back(
              (std::size_t)(phonemeSilenceSeconds[currentPhoneme] *
                            voice.synthesisConfig.sampleRate *
                            voice.synthesisConfig.channels));

          currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
          phrasePhonemes.push_back(currentPhrasePhonemes);
        }
      }
    } else {
      // Use all phonemes
      phrasePhonemes.push_back(
          std::make_shared<std::vector<Phoneme>>(sentencePhonemes));
    }

    // ids -> audio
    synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
               sentenceResult);
    // Ensure results/samples are the same size
    while (phraseResults.size() < phrasePhonemes.size()) {
      phraseResults.emplace_back();
    }

    while (phraseSilenceSamples.size() < phrasePhonemes.size()) {
      phraseSilenceSamples.push_back(0);
    }

    // phonemes -> ids -> audio
    for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) {
      if (phrasePhonemes[phraseIdx]->size() <= 0) {
        continue;
      }

      // phonemes -> ids
      phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds,
                      missingPhonemes);
      if (spdlog::should_log(spdlog::level::debug)) {
        // DEBUG log for phoneme ids
        std::stringstream phonemeIdsStr;
        for (auto phonemeId : phonemeIds) {
          phonemeIdsStr << phonemeId << ", ";
        }

        spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
                      phrasePhonemes[phraseIdx]->size(), phonemeIds.size(),
                      phonemeIdsStr.str());
      }

      // ids -> audio
      synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
                 phraseResults[phraseIdx]);

      // Add end of phrase silence
      for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) {
        audioBuffer.push_back(0);
      }

      result.audioSeconds += phraseResults[phraseIdx].audioSeconds;
      result.inferSeconds += phraseResults[phraseIdx].inferSeconds;

      phonemeIds.clear();
    }

    // Add end of sentence silence
    if (sentenceSilenceSamples > 0) {
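
The number of silence samples appended after a phrase is the configured seconds multiplied by sampleRate and channels, so with the defaults shown later in SynthesisConfig (22050 Hz, mono) a 0.5 s pause becomes 11025 zero samples pushed onto audioBuffer. Below is a standalone sketch of just the splitting loop above, not piper's API; the phonemes, durations, and defaults are illustrative values:

#include <cstddef>
#include <map>
#include <memory>
#include <vector>

int main() {
  using Phoneme = char32_t;                       // assumed codepoint type
  const int sampleRate = 22050, channels = 1;     // SynthesisConfig defaults

  // Example sentence phonemes and a silence map with one entry for '.'.
  std::vector<Phoneme> sentencePhonemes = {U'h', U'\u0259', U'l', U'o', U'.',
                                           U'w', U'\u025A', U'l', U'd', U'.'};
  std::map<Phoneme, float> phonemeSilenceSeconds = {{U'.', 0.5f}};

  std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
  std::vector<std::size_t> phraseSilenceSamples;

  auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
  phrasePhonemes.push_back(currentPhrasePhonemes);

  for (Phoneme currentPhoneme : sentencePhonemes) {
    currentPhrasePhonemes->push_back(currentPhoneme);

    if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
      // 0.5 s * 22050 Hz * 1 channel = 11025 samples of silence after this phrase
      phraseSilenceSamples.push_back((std::size_t)(
          phonemeSilenceSeconds[currentPhoneme] * sampleRate * channels));
      currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
      phrasePhonemes.push_back(currentPhrasePhonemes);
    }
  }

  // Result: 3 phrases (two ending in '.', plus a trailing empty one that the
  // synthesis loop skips) and phraseSilenceSamples = {11025, 11025}.
  return 0;
}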
@@ -496,9 +554,6 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
      audioBuffer.clear();
    }

    result.audioSeconds += sentenceResult.audioSeconds;
    result.inferSeconds += sentenceResult.inferSeconds;

    phonemeIds.clear();
  }
@@ -3,6 +3,7 @@

#include <fstream>
#include <functional>
#include <map>
#include <optional>
#include <string>
#include <vector>
@@ -49,14 +50,22 @@ struct PhonemizeConfig {
};

struct SynthesisConfig {
  // VITS inference settings
  float noiseScale = 0.667f;
  float lengthScale = 1.0f;
  float noiseW = 0.8f;

  // Audio settings
  int sampleRate = 22050;
  int sampleWidth = 2; // 16-bit
  int channels = 1;    // mono

  // Speaker id from 0 to numSpeakers - 1
  std::optional<SpeakerId> speakerId;

  // Extra silence
  float sentenceSilenceSeconds = 0.2f;
  std::optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
};

struct ModelConfig {
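
Given the audio settings above, every second of extra silence becomes sampleRate * channels zero samples, each sampleWidth bytes wide. A quick worked check of those defaults follows; this is only an illustration (the 0.5 s per-phoneme pause is an invented example value):

#include <cstdio>

int main() {
  const int sampleRate = 22050;   // Hz, default
  const int sampleWidth = 2;      // bytes per sample (16-bit), default
  const int channels = 1;         // mono, default

  const float sentenceSilenceSeconds = 0.2f;  // default
  const float phonemeSilenceSeconds = 0.5f;   // example value for one phoneme

  const int sentenceSamples = (int)(sentenceSilenceSeconds * sampleRate * channels);
  const int phonemeSamples = (int)(phonemeSilenceSeconds * sampleRate * channels);

  // sentence pause: 4410 samples (8820 bytes); phoneme pause: 11025 samples (22050 bytes)
  std::printf("sentence pause: %d samples (%d bytes)\n",
              sentenceSamples, sentenceSamples * sampleWidth);
  std::printf("phoneme pause:  %d samples (%d bytes)\n",
              phonemeSamples, phonemeSamples * sampleWidth);
}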
@@ -89,6 +98,12 @@ struct Voice {
  ModelSession session;
};

// True if the string is a single UTF-8 codepoint
bool isSingleCodepoint(std::string s);

// Get the first UTF-8 codepoint of a string
Phoneme getCodepoint(std::string s);

// Get version of Piper
std::string getVersion();