First working version with libpiper_phonemize

This commit is contained in:
Michael Hansen
2023-06-08 15:40:37 -05:00
parent 7d27863b48
commit 810fad44cf
11 changed files with 505 additions and 778 deletions

View File

@@ -1,24 +1,78 @@
#ifndef PIPER_H_
#define PIPER_H_
#include <filesystem>
#include <iostream>
#include <functional>
#include <fstream>
#include <optional>
#include <string>
#include <vector>
#include "json.hpp"
#include <espeak-ng/speak_lib.h>
#include <onnxruntime_cxx_api.h>
#include <phoneme_ids.hpp>
#include <phonemize.hpp>
#include "config.hpp"
#include "model.hpp"
#include "phonemize.hpp"
#include "synthesize.hpp"
#include "wavfile.hpp"
#include "json.hpp"
using json = nlohmann::json;
namespace piper {
typedef int64_t SpeakerId;
// Settings for the espeak-ng phonemizer backend.
struct eSpeakConfig {
// espeak-ng voice name used to select phoneme rules
std::string voice = "en-us";
};
// Global library configuration passed to initialize()/terminate().
struct PiperConfig {
// Path to the espeak-ng-data directory (required when useESpeak is true)
std::string eSpeakDataPath;
// Whether to initialize espeak-ng for phonemization
bool useESpeak = true;
};
enum PhonemeType { eSpeakPhonemes, TextPhonemes };
// Controls text -> phoneme -> phoneme-id conversion for one voice.
struct PhonemizeConfig {
PhonemeType phonemeType = eSpeakPhonemes;
// Optional remapping applied before id lookup (one phoneme -> many)
std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;
// Maps each phoneme to the id sequence the model expects
std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;
PhonemeId idPad = 0; // padding (optionally interspersed)
PhonemeId idBos = 1; // beginning of sentence
PhonemeId idEos = 2; // end of sentence
// Insert idPad between every pair of phoneme ids when true
bool interspersePad = true;
// Present only when phonemeType == eSpeakPhonemes
std::optional<eSpeakConfig> eSpeak;
};
// Per-voice audio synthesis parameters (VITS inference knobs + WAV format).
struct SynthesisConfig {
float noiseScale = 0.667f; // generator noise scale
float lengthScale = 1.0f; // phoneme duration scale (>1 = slower speech)
float noiseW = 0.8f; // duration-predictor noise width
int sampleRate = 22050;
int sampleWidth = 2; // 16-bit
int channels = 1; // mono
// Set for multi-speaker models; empty for single-speaker
std::optional<SpeakerId> speakerId;
// Silence appended after each synthesized sentence
float sentenceSilenceSeconds = 0.2f;
};
// Model metadata parsed from the voice's JSON config.
struct ModelConfig {
  // Number of speakers the model supports. Default-initialized to 1
  // (single speaker) so the value is never read uninitialized if config
  // parsing does not set it.
  int numSpeakers = 1;
};
// Holds all ONNX Runtime state for one loaded voice model.
// Member order matters: members are destroyed in reverse declaration
// order, so env is declared FIRST to guarantee it is destroyed LAST —
// an Ort::Session must not outlive the Ort::Env it was created under.
struct ModelSession {
  Ort::Env env;
  Ort::SessionOptions options;
  Ort::AllocatorWithDefaultOptions allocator;
  // Constructed empty; loadModel() creates the real session.
  Ort::Session onnx;

  ModelSession() : onnx(nullptr){};
};
// Timing statistics accumulated over one textToAudio() call.
// Fields are zero-initialized: textToAudio accumulates with +=, which
// would read indeterminate values on a default-constructed result
// otherwise.
struct SynthesisResult {
  double inferSeconds = 0.0;    // total time spent in model inference
  double audioSeconds = 0.0;    // total duration of generated audio
  double realTimeFactor = 0.0;  // inferSeconds / audioSeconds (0 if no audio)
};
struct Voice {
json configRoot;
PhonemizeConfig phonemizeConfig;
@@ -27,122 +81,25 @@ struct Voice {
ModelSession session;
};
// NOTE(review): diff artifact — this span interleaves the REMOVED inline
// initialize(cwd) definition with the ADDED initialize(PiperConfig&)
// declaration (next two lines below); it is not valid C++ as rendered
// and must be untangled against the real pre/post-commit files.
void initialize(std::filesystem::path cwd) {
string dataPath;
// Must be called before using textTo* functions
void initialize(PiperConfig &config);
// Old behavior: look for espeak-ng-data next to the working directory
auto cwdDataPath = std::filesystem::absolute(cwd.append("espeak-ng-data"));
if (std::filesystem::is_directory(cwdDataPath)) {
dataPath = cwdDataPath.string();
}
cerr << "dataPath: " << dataPath << endl;
// Set up espeak-ng for calling espeak_TextToPhonemes
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
/*buflength*/ 0,
/*path*/ dataPath.c_str(),
/*options*/ 0);
// espeak_Initialize returns the sample rate on success, negative on error
if (result < 0) {
throw runtime_error("Failed to initialize eSpeak-ng");
}
}
// NOTE(review): diff artifact — removed terminate() definition shown
// together with the added terminate(PiperConfig&) declaration below.
void terminate() {
// Clean up espeak-ng
espeak_Terminate();
}
// Clean up
void terminate(PiperConfig &config);
// Load Onnx model and JSON config file
// NOTE(review): diff artifact — the removed inline loadVoice definition
// and the added config-aware declaration (at the end of this span) are
// interleaved; not valid C++ as rendered.
void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
optional<SpeakerId> &speakerId) {
// Parse the JSON voice config into its three sub-configs
ifstream modelConfigFile(modelConfigPath.c_str());
voice.configRoot = json::parse(modelConfigFile);
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
parseModelConfig(voice.configRoot, voice.modelConfig);
if (voice.modelConfig.numSpeakers > 1) {
// Multispeaker model
if (speakerId) {
voice.synthesisConfig.speakerId = speakerId;
} else {
// Default speaker
voice.synthesisConfig.speakerId = 0;
}
}
loadModel(modelPath, voice.session);
} /* loadVoice */
// Declaration added by this commit: takes the global PiperConfig
void loadVoice(PiperConfig &config, std::string modelPath,
std::string modelConfigPath, Voice &voice,
std::optional<SpeakerId> &speakerId);
// Phonemize text and synthesize audio
// NOTE(review): diff artifact — removed inline definition interleaved
// with the added PiperConfig-taking declaration at the end of this span.
void textToAudio(Voice &voice, string text, vector<int16_t> &audioBuffer,
SynthesisResult &result,
const function<void()> &audioCallback) {
// Precompute how many zero samples make up the inter-sentence pause
size_t sentenceSilenceSamples = 0;
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
sentenceSilenceSamples = (size_t)(
voice.synthesisConfig.sentenceSilenceSeconds *
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
}
// Phonemes for each sentence
vector<vector<Phoneme>> phonemes;
phonemize(text, voice.phonemizeConfig, phonemes);
vector<PhonemeId> phonemeIds;
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
++phonemesIter) {
vector<Phoneme> &sentencePhonemes = *phonemesIter;
SynthesisResult sentenceResult;
phonemes2ids(sentencePhonemes, voice.phonemizeConfig, phonemeIds);
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
sentenceResult);
// Add end of sentence silence
if (sentenceSilenceSamples > 0) {
for (size_t i = 0; i < sentenceSilenceSamples; i++) {
audioBuffer.push_back(0);
}
}
if (audioCallback) {
// Call back must copy audio since it is cleared afterwards.
audioCallback();
audioBuffer.clear();
}
// NOTE(review): += assumes caller's result fields start at zero —
// SynthesisResult members should be default-initialized; confirm.
result.audioSeconds += sentenceResult.audioSeconds;
result.inferSeconds += sentenceResult.inferSeconds;
phonemeIds.clear();
}
// Guard against divide-by-zero when no audio was produced
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
} /* textToAudio */
// Declaration added by this commit: takes the global PiperConfig
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
const std::function<void()> &audioCallback);
// Phonemize text and synthesize audio to WAV file
// NOTE(review): diff artifact — removed inline definition interleaved
// with the added PiperConfig-taking declaration at the end of this span.
void textToWavFile(Voice &voice, string text, ostream &audioFile,
SynthesisResult &result) {
vector<int16_t> audioBuffer;
// NULL callback: accumulate all audio in the buffer, then write once
textToAudio(voice, text, audioBuffer, result, NULL);
// Write WAV
auto synthesisConfig = voice.synthesisConfig;
writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
synthesisConfig.channels, (int32_t)audioBuffer.size(),
audioFile);
audioFile.write((const char *)audioBuffer.data(),
sizeof(int16_t) * audioBuffer.size());
} /* textToWavFile */
// Declaration added by this commit: takes the global PiperConfig
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
std::ostream &audioFile, SynthesisResult &result);
} // namespace piper