Use spdlog

2026-04-29 19:24:49 +00:00 · 2023-06-08 15:42:49 -05:00
parent 8b3dfc20dd
commit 8289c0a6e5
5 changed files with 159 additions and 33 deletions
--- a/src/cpp/piper.cpp
+++ b/src/cpp/piper.cpp
@@ -2,10 +2,12 @@
 #include <chrono>
 #include <fstream>
 #include <limits>
+#include <sstream>
 #include <stdexcept>

 #include <espeak-ng/speak_lib.h>
 #include <onnxruntime_cxx_api.h>
+#include <spdlog/spdlog.h>

 #include "piper.hpp"
 #include "utf8.h"
@@ -114,7 +116,6 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {

 // Load JSON config for audio synthesis
 void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
-
  // {
  //     "audio": {
  //         "sample_rate": 22050
@@ -162,6 +163,7 @@ void initialize(PiperConfig &config) {
  if (config.useESpeak) {
    // Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
    // See: https://github.com/rhasspy/espeak-ng
+    spdlog::debug("Initializing eSpeak");
    int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
                                   /*buflength*/ 0,
                                   /*path*/ config.eSpeakDataPath.c_str(),
@@ -169,18 +171,26 @@ void initialize(PiperConfig &config) {
    if (result < 0) {
      throw std::runtime_error("Failed to initialize eSpeak-ng");
    }
+
+    spdlog::debug("Initialized eSpeak");
  }
+
+  spdlog::info("Initialized piper");
 }

 void terminate(PiperConfig &config) {
  if (config.useESpeak) {
    // Clean up espeak-ng
+    spdlog::debug("Terminating eSpeak");
    espeak_Terminate();
+    spdlog::debug("Terminated eSpeak");
  }
+
+  spdlog::info("Terminated piper");
 }

 void loadModel(std::string modelPath, ModelSession &session) {
-
+  spdlog::debug("Loading onnx model from {}", modelPath);
  session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
                         instanceName.c_str());
  session.env.DisableTelemetryEvents();
@@ -205,13 +215,15 @@ void loadModel(std::string modelPath, ModelSession &session) {
  auto startTime = std::chrono::steady_clock::now();
  session.onnx = Ort::Session(session.env, modelPath.c_str(), session.options);
  auto endTime = std::chrono::steady_clock::now();
-  auto loadDuration = std::chrono::duration<double>(endTime - startTime);
+  spdlog::debug("Loaded onnx model in {} second(s)",
+                std::chrono::duration<double>(endTime - startTime).count());
 }

 // Load Onnx model and JSON config file
 void loadVoice(PiperConfig &config, std::string modelPath,
               std::string modelConfigPath, Voice &voice,
               std::optional<SpeakerId> &speakerId) {
+  spdlog::debug("Parsing voice config at {}", modelConfigPath);
  std::ifstream modelConfigFile(modelConfigPath);
  voice.configRoot = json::parse(modelConfigFile);

@@ -229,6 +241,8 @@ void loadVoice(PiperConfig &config, std::string modelPath,
    }
  }

+  spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers);
+
  loadModel(modelPath, voice.session);

 } /* loadVoice */
@@ -237,6 +251,8 @@ void loadVoice(PiperConfig &config, std::string modelPath,
 void synthesize(std::vector<PhonemeId> &phonemeIds,
                SynthesisConfig &synthesisConfig, ModelSession &session,
                std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
+  spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size());
+
  auto memoryInfo = Ort::MemoryInfo::CreateCpu(
      OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);

@@ -302,6 +318,8 @@ void synthesize(std::vector<PhonemeId> &phonemeIds,
  if (result.audioSeconds > 0) {
    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
  }
+  spdlog::debug("Synthesized {} second(s) of audio in {} second(s)",
+                result.audioSeconds, result.inferSeconds);

  // Get max audio value for scaling
  float maxAudioValue = 0.01f;
@@ -351,6 +369,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
  }

  // Phonemes for each sentence
+  spdlog::debug("Phonemizing text: {}", text);
  std::vector<std::vector<Phoneme>> phonemes;

  if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
@@ -370,11 +389,24 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
  for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
       ++phonemesIter) {
    std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
+
+    if (spdlog::should_log(spdlog::level::debug)) {
+      // DEBUG log for phonemes
+      std::string phonemesStr;
+      for (auto phoneme : sentencePhonemes) {
+        utf8::append(phoneme, phonemesStr);
+      }
+
+      spdlog::debug("Converting {} phoneme(s) to ids: {}",
+                    sentencePhonemes.size(), phonemesStr);
+    }
+
    SynthesisResult sentenceResult;

    PhonemeIdConfig idConfig;
    if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
      auto &language = voice.phonemizeConfig.eSpeak->voice;
+      spdlog::debug("Text phoneme language: {}", language);
      if (DEFAULT_ALPHABET.count(language) < 1) {
        throw std::runtime_error(
            "Text phoneme language for voice is not supported");
@@ -387,6 +419,17 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,

    // phonemes -> ids
    phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
+    if (spdlog::should_log(spdlog::level::debug)) {
+      // DEBUG log for phoneme ids
+      std::stringstream phonemeIdsStr;
+      for (auto phonemeId : phonemeIds) {
+        phonemeIdsStr << phonemeId << ", ";
+      }
+
+      spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
+                    sentencePhonemes.size(), phonemeIds.size(),
+                    phonemeIdsStr.str());
+    }

    // ids -> audio
    synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
@@ -411,6 +454,18 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
    phonemeIds.clear();
  }

+  if (missingPhonemes.size() > 0) {
+    spdlog::warn("Missing {} phoneme(s) from phoneme/id map!",
+                 missingPhonemes.size());
+
+    for (auto phonemeCount : missingPhonemes) {
+      std::string phonemeStr;
+      utf8::append(phonemeCount.first, phonemeStr);
+      spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr,
+                   (uint32_t)phonemeCount.first, phonemeCount.second);
+    }
+  }
+
  if (result.audioSeconds > 0) {
    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
  }