Split into sentences and output audio as available

2026-04-22 15:54:49 +00:00 · 2023-04-12 15:56:06 -05:00
parent f8386b1984
commit e1d34f14fb
6 changed files with 255 additions and 51 deletions
--- a/src/cpp/synthesize.hpp
+++ b/src/cpp/synthesize.hpp
@@ -26,22 +26,21 @@ struct SynthesisResult {
 };

 // Phoneme ids to WAV audio
-void synthesize(SynthesisConfig &synthesisConfig, ModelSession &session,
-                vector<int16_t> &audioBuffer, SynthesisResult &result) {
+void synthesize(vector<PhonemeId> &phonemeIds, SynthesisConfig &synthesisConfig,
+                ModelSession &session, vector<int16_t> &audioBuffer,
+                SynthesisResult &result) {
  auto memoryInfo = Ort::MemoryInfo::CreateCpu(
      OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);

  // Allocate
-  vector<int64_t> phonemeIdLengths{(int64_t)synthesisConfig.phonemeIds.size()};
+  vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
  vector<float> scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale,
                       synthesisConfig.noiseW};

  vector<Ort::Value> inputTensors;
-  vector<int64_t> phonemeIdsShape{1,
-                                  (int64_t)synthesisConfig.phonemeIds.size()};
+  vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
-      memoryInfo, synthesisConfig.phonemeIds.data(),
-      synthesisConfig.phonemeIds.size(), phonemeIdsShape.data(),
+      memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
      phonemeIdsShape.size()));

  vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};