Add multispeaker support

This commit is contained in:
Michael Hansen
2023-01-05 21:47:08 -06:00
parent 06a154a4ed
commit a7fe73390e
8 changed files with 68 additions and 16 deletions

View File

@@ -17,9 +17,13 @@ pkg_check_modules(ESPEAK_NG REQUIRED espeak-ng<2)
# https://github.com/espeak-ng/pcaudiolib
check_include_file_cxx("pcaudiolib/audio.h" PCAUDIO_INCLUDE_FOUND)
if(PCAUDIO_INCLUDE_FOUND)
target_compile_definitions(larynx PUBLIC HAVE_PCAUDIO)
set(PCAUDIO_LIBRARIES "pcaudio")
option(USE_PCAUDIO "Build with pcaudiolib" ON)
if(USE_PCAUDIO)
target_compile_definitions(larynx PUBLIC HAVE_PCAUDIO)
set(PCAUDIO_LIBRARIES "pcaudio")
endif()
endif()
set(ONNXRUNTIME_ROOTDIR "/usr/local/include/onnxruntime")

View File

@@ -19,6 +19,7 @@ namespace larynx {
typedef char32_t Phoneme;
typedef int64_t PhonemeId;
typedef int64_t SpeakerId;
const string DefaultVoice = "en-gb-x-rp";
@@ -52,7 +53,11 @@ struct SynthesisConfig {
int sampleRate = 22050;
int sampleWidth = 2; // 16-bit
int channels = 1; // mono
filesystem::path outputPath;
optional<SpeakerId> speakerId;
};
struct ModelConfig {
int numSpeakers;
};
bool isSingleCodepoint(string s) {
@@ -84,14 +89,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
}
auto phonemeMapValue = configRoot["phoneme_map"];
for (auto& fromPhonemeItem : phonemeMapValue.items()) {
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto& toPhonemeValue : fromPhonemeItem.value()) {
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
string toPhoneme = toPhonemeValue.get<string>();
if (!isSingleCodepoint(toPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
@@ -106,14 +111,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
// phoneme to [id] map
if (configRoot.contains("phoneme_id_map")) {
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
for (auto& fromPhonemeItem : phonemeIdMapValue.items()) {
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto& toIdValue : fromPhonemeItem.value()) {
for (auto &toIdValue : fromPhonemeItem.value()) {
PhonemeId toId = toIdValue.get<PhonemeId>();
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
}
@@ -134,6 +139,12 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
} /* parseSynthesisConfig */
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
} /* parseModelConfig */
} // namespace larynx
#endif // CONFIG_H_

View File

@@ -1,5 +1,5 @@
#ifndef API_H_
#define API_H_
#ifndef LARYNX_H_
#define LARYNX_H_
#include <iostream>
#include <string>
@@ -22,6 +22,7 @@ struct Voice {
json configRoot;
PhonemizeConfig phonemizeConfig;
SynthesisConfig synthesisConfig;
ModelConfig modelConfig;
ModelSession session;
};
@@ -42,12 +43,24 @@ void terminate() {
}
// Load Onnx model and JSON config file
void loadVoice(string modelPath, string modelConfigPath, Voice &voice) {
void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
optional<SpeakerId> &speakerId) {
ifstream modelConfigFile(modelConfigPath.c_str());
voice.configRoot = json::parse(modelConfigFile);
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
parseModelConfig(voice.configRoot, voice.modelConfig);
if (voice.modelConfig.numSpeakers > 1) {
// Multispeaker model
if (speakerId) {
voice.synthesisConfig.speakerId = speakerId;
} else {
// Default speaker
voice.synthesisConfig.speakerId = 0;
}
}
loadModel(modelPath, voice.session);
@@ -83,8 +96,8 @@ void textToWavFile(Voice &voice, string text, ostream &audioFile,
audioFile.write((const char *)audioBuffer.data(),
sizeof(int16_t) * audioBuffer.size());
} /* textToAudio */
} /* textToWavFile */
} // namespace larynx
#endif // API_H_
#endif // LARYNX_H_

View File

@@ -12,7 +12,7 @@
#include <pcaudiolib/audio.h>
#endif
#include "api.hpp"
#include "larynx.hpp"
using namespace std;
@@ -23,6 +23,7 @@ struct RunConfig {
filesystem::path modelConfigPath;
OutputType outputType = OUTPUT_PLAY;
optional<filesystem::path> outputPath;
optional<larynx::SpeakerId> speakerId;
};
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
@@ -36,7 +37,7 @@ int main(int argc, char *argv[]) {
larynx::Voice voice;
auto startTime = chrono::steady_clock::now();
loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(),
voice);
voice, runConfig.speakerId);
auto endTime = chrono::steady_clock::now();
auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
cerr << "Load time: " << loadSeconds << " sec" << endl;
@@ -122,9 +123,11 @@ int main(int argc, char *argv[]) {
larynx::terminate();
#ifdef HAVE_PCAUDIO
audio_object_close(my_audio);
audio_object_destroy(my_audio);
my_audio = nullptr;
#endif
return EXIT_SUCCESS;
}
@@ -145,6 +148,7 @@ void printUsage(char *argv[]) {
cerr << " -d DIR --output_dir DIR path to output directory (default: "
"cwd)"
<< endl;
cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl;
cerr << endl;
}
@@ -182,6 +186,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
ensureArg(argc, argv, i);
runConfig.outputType = OUTPUT_DIRECTORY;
runConfig.outputPath = filesystem::path(argv[++i]);
} else if (arg == "-s" || arg == "--speaker") {
ensureArg(argc, argv, i);
runConfig.speakerId = (larynx::SpeakerId)stoi(argv[++i]);
} else if (arg == "-h" || arg == "--help") {
printUsage(argv);
exit(0);

View File

@@ -10,9 +10,9 @@
#include <vector>
#include <espeak-ng/speak_lib.h>
#include <utf8.h>
#include "config.hpp"
#include "utf8.h"
using namespace std;

View File

@@ -53,6 +53,15 @@ void synthesize(SynthesisConfig &synthesisConfig, ModelSession &session,
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
scalesShape.data(), scalesShape.size()));
if (synthesisConfig.speakerId) {
// Add speaker id
vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value()};
vector<int64_t> speakerIdShape{1};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
speakerIdShape.size()));
}
// Infer
auto startTime = chrono::steady_clock::now();
auto outputTensors =