Use spdlog

This commit is contained in:
Michael Hansen
2023-06-08 15:42:49 -05:00
parent 8b3dfc20dd
commit 8289c0a6e5
5 changed files with 159 additions and 33 deletions

View File

@@ -1,9 +1,10 @@
cmake_minimum_required(VERSION 3.13)
include(CheckIncludeFileCXX)
project(piper C CXX)
find_package(PkgConfig)
pkg_check_modules(SPDLOG REQUIRED spdlog)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -19,7 +20,8 @@ target_link_libraries(piper
piper_phonemize
espeak-ng
onnxruntime
pthread)
pthread
${SPDLOG_LIBRARIES})
if(NOT APPLE)
# Statically link the GCC runtime and libstdc++ into the piper executable.
# The first argument of target_link_libraries() must be the target name.
target_link_libraries(piper -static-libgcc -static-libstdc++)
@@ -31,4 +33,8 @@ target_link_directories(piper PUBLIC
target_include_directories(piper PUBLIC
${PIPER_PHONEMIZE_ROOTDIR}/include
${ONNXRUNTIME_ROOTDIR}/include)
${ONNXRUNTIME_ROOTDIR}/include
${SPDLOG_INCLUDE_DIRS})
target_compile_options(piper PUBLIC
${SPDLOG_CFLAGS_OTHER})

View File

@@ -21,6 +21,9 @@
#include <mach-o/dyld.h>
#endif
#include <spdlog/sinks/stdout_color_sinks.h>
#include <spdlog/spdlog.h>
#include "piper.hpp"
using namespace std;
@@ -28,15 +31,36 @@ using namespace std;
// Selects the type of output to produce (stored in RunConfig::outputType).
enum OutputType {
  OUTPUT_FILE,
  OUTPUT_DIRECTORY,
  OUTPUT_STDOUT,
  OUTPUT_RAW
};
// Run-time settings for one invocation of piper; populated by parseArgs().
struct RunConfig {
  // Location of the .onnx voice file
  filesystem::path modelPath;

  // Location of the JSON voice config file
  filesystem::path modelConfigPath;

  // Kind of output to produce.
  // Unless overridden, a WAV file is written in the current directory.
  OutputType outputType{OUTPUT_DIRECTORY};

  // Output path (defaults to the current directory)
  optional<filesystem::path> outputPath{filesystem::path(".")};

  // Numeric id of the default speaker (for multi-speaker voices)
  optional<piper::SpeakerId> speakerId;

  // How much noise is added during audio generation
  optional<float> noiseScale;

  // Speaking speed (1 = normal, below 1 is faster, above 1 is slower)
  optional<float> lengthScale;

  // Variation in phoneme lengths
  optional<float> noiseW;

  // Seconds of silence appended after each sentence
  optional<float> sentenceSilenceSeconds;

  // espeak-ng data directory (by default it is looked up next to the piper
  // executable)
  optional<filesystem::path> eSpeakDataPath;
};
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
@@ -44,19 +68,45 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
condition_variable &cvAudio, bool &audioReady,
bool &audioFinished);
// ----------------------------------------------------------------------------
int main(int argc, char *argv[]) {
spdlog::set_default_logger(spdlog::stderr_color_st("piper"));
RunConfig runConfig;
parseArgs(argc, argv, runConfig);
// NOTE: This won't work for Windows (need GetModuleFileName)
piper::PiperConfig piperConfig;
piper::Voice voice;
spdlog::debug("Loading voice from {} (config={})",
runConfig.modelPath.string(),
runConfig.modelConfigPath.string());
auto startTime = chrono::steady_clock::now();
loadVoice(piperConfig, runConfig.modelPath.string(),
runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
auto endTime = chrono::steady_clock::now();
spdlog::info("Loaded voice in {} second(s)",
chrono::duration<double>(endTime - startTime).count());
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
spdlog::debug("Voice uses eSpeak phonemes ({})",
voice.phonemizeConfig.eSpeak->voice);
if (runConfig.eSpeakDataPath) {
// User provided path
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
} else {
// Get the path to the piper executable so we can locate espeak-ng-data
// next to it.
#ifdef _MSC_VER
auto exePath = []() {
wchar_t moduleFileName[MAX_PATH] = {0};
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
return filesystem::path(moduleFileName);
}();
#else
#ifdef __APPLE__
auto exePath = []() {
wchar_t moduleFileName[MAX_PATH] = {0};
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
return filesystem::path(moduleFileName);
}();
#elifdef __APPLE__
auto exePath = []() {
char moduleFileName[PATH_MAX] = { 0 };
uint32_t moduleFileNameSize = std::size(moduleFileName);
@@ -64,21 +114,21 @@ int main(int argc, char *argv[]) {
return filesystem::path(moduleFileName);
}();
#else
auto exePath = filesystem::canonical("/proc/self/exe");
auto exePath = filesystem::canonical("/proc/self/exe");
#endif
piper::PiperConfig piperConfig;
piperConfig.eSpeakDataPath =
std::filesystem::absolute(exePath.parent_path().append("espeak-ng-data"))
.string();
piperConfig.eSpeakDataPath =
std::filesystem::absolute(
exePath.parent_path().append("espeak-ng-data"))
.string();
piper::Voice voice;
auto startTime = chrono::steady_clock::now();
loadVoice(piperConfig, runConfig.modelPath.string(),
runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
auto endTime = chrono::steady_clock::now();
auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
cerr << "Load time: " << loadSeconds << " sec" << endl;
spdlog::debug("espeak-ng-data directory is expected at {}",
piperConfig.eSpeakDataPath);
}
} else {
// Not using eSpeak
piperConfig.useESpeak = false;
}
piper::initialize(piperConfig);
@@ -102,7 +152,7 @@ int main(int argc, char *argv[]) {
if (runConfig.outputType == OUTPUT_DIRECTORY) {
runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
cerr << "Output directory: " << runConfig.outputPath.value() << endl;
spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
}
string line;
@@ -175,13 +225,13 @@ int main(int argc, char *argv[]) {
}
// Wait for audio output to finish
cerr << "Waiting for audio..." << endl;
spdlog::info("Waiting for audio to finish playing...");
rawOutputThread.join();
}
cerr << "Real-time factor: " << result.realTimeFactor
<< " (infer=" << result.inferSeconds
<< " sec, audio=" << result.audioSeconds << " sec)" << endl;
spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)",
result.realTimeFactor, result.inferSeconds,
result.audioSeconds);
}
piper::terminate(piperConfig);
@@ -189,6 +239,8 @@ int main(int argc, char *argv[]) {
return EXIT_SUCCESS;
}
// ----------------------------------------------------------------------------
void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
condition_variable &cvAudio, bool &audioReady,
bool &audioFinished) {
@@ -220,6 +272,8 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
} // rawOutputProc
// ----------------------------------------------------------------------------
void printUsage(char *argv[]) {
cerr << endl;
cerr << "usage: " << argv[0] << " [options]" << endl;
@@ -249,6 +303,10 @@ void printUsage(char *argv[]) {
cerr << " --silence_seconds NUM seconds of silence after each "
"sentence (default: 0.2)"
<< endl;
cerr << " --espeak_data DIR path to espeak-ng data directory"
<< endl;
cerr << " --debug print DEBUG messages to the console"
<< endl;
cerr << endl;
}
@@ -304,6 +362,12 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
} else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
ensureArg(argc, argv, i);
runConfig.sentenceSilenceSeconds = stof(argv[++i]);
} else if (arg == "--espeak_data" || arg == "--espeak-data") {
ensureArg(argc, argv, i);
runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
} else if (arg == "--debug") {
// Set DEBUG logging
spdlog::set_level(spdlog::level::debug);
} else if (arg == "-h" || arg == "--help") {
printUsage(argv);
exit(0);

View File

@@ -2,10 +2,12 @@
#include <chrono>
#include <fstream>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <espeak-ng/speak_lib.h>
#include <onnxruntime_cxx_api.h>
#include <spdlog/spdlog.h>
#include "piper.hpp"
#include "utf8.h"
@@ -114,7 +116,6 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
// Load JSON config for audio synthesis
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
// {
// "audio": {
// "sample_rate": 22050
@@ -162,6 +163,7 @@ void initialize(PiperConfig &config) {
if (config.useESpeak) {
// Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
// See: https://github.com/rhasspy/espeak-ng
spdlog::debug("Initializing eSpeak");
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
/*buflength*/ 0,
/*path*/ config.eSpeakDataPath.c_str(),
@@ -169,18 +171,26 @@ void initialize(PiperConfig &config) {
if (result < 0) {
throw std::runtime_error("Failed to initialize eSpeak-ng");
}
spdlog::debug("Initialized eSpeak");
}
spdlog::info("Initialized piper");
}
// Shut piper down. When the config says eSpeak is in use, espeak-ng is
// terminated first; the final "Terminated piper" message is always logged.
void terminate(PiperConfig &config) {
  const bool eSpeakInUse = config.useESpeak;

  if (eSpeakInUse) {
    // Release espeak-ng, logging before and after at DEBUG level
    spdlog::debug("Terminating eSpeak");
    espeak_Terminate();
    spdlog::debug("Terminated eSpeak");
  }

  spdlog::info("Terminated piper");
}
void loadModel(std::string modelPath, ModelSession &session) {
spdlog::debug("Loading onnx model from {}", modelPath);
session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
instanceName.c_str());
session.env.DisableTelemetryEvents();
@@ -205,13 +215,15 @@ void loadModel(std::string modelPath, ModelSession &session) {
auto startTime = std::chrono::steady_clock::now();
session.onnx = Ort::Session(session.env, modelPath.c_str(), session.options);
auto endTime = std::chrono::steady_clock::now();
auto loadDuration = std::chrono::duration<double>(endTime - startTime);
spdlog::debug("Loaded onnx model in {} second(s)",
std::chrono::duration<double>(endTime - startTime).count());
}
// Load Onnx model and JSON config file
void loadVoice(PiperConfig &config, std::string modelPath,
std::string modelConfigPath, Voice &voice,
std::optional<SpeakerId> &speakerId) {
spdlog::debug("Parsing voice config at {}", modelConfigPath);
std::ifstream modelConfigFile(modelConfigPath);
voice.configRoot = json::parse(modelConfigFile);
@@ -229,6 +241,8 @@ void loadVoice(PiperConfig &config, std::string modelPath,
}
}
spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers);
loadModel(modelPath, voice.session);
} /* loadVoice */
@@ -237,6 +251,8 @@ void loadVoice(PiperConfig &config, std::string modelPath,
void synthesize(std::vector<PhonemeId> &phonemeIds,
SynthesisConfig &synthesisConfig, ModelSession &session,
std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size());
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
@@ -302,6 +318,8 @@ void synthesize(std::vector<PhonemeId> &phonemeIds,
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
spdlog::debug("Synthesized {} second(s) of audio in {} second(s)",
result.audioSeconds, result.inferSeconds);
// Get max audio value for scaling
float maxAudioValue = 0.01f;
@@ -351,6 +369,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
}
// Phonemes for each sentence
spdlog::debug("Phonemizing text: {}", text);
std::vector<std::vector<Phoneme>> phonemes;
if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
@@ -370,11 +389,24 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
++phonemesIter) {
std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
if (spdlog::should_log(spdlog::level::debug)) {
// DEBUG log for phonemes
std::string phonemesStr;
for (auto phoneme : sentencePhonemes) {
utf8::append(phoneme, phonemesStr);
}
spdlog::debug("Converting {} phoneme(s) to ids: {}",
sentencePhonemes.size(), phonemesStr);
}
SynthesisResult sentenceResult;
PhonemeIdConfig idConfig;
if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
auto &language = voice.phonemizeConfig.eSpeak->voice;
spdlog::debug("Text phoneme language: {}", language);
if (DEFAULT_ALPHABET.count(language) < 1) {
throw std::runtime_error(
"Text phoneme language for voice is not supported");
@@ -387,6 +419,17 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
// phonemes -> ids
phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
if (spdlog::should_log(spdlog::level::debug)) {
// DEBUG log for phoneme ids
std::stringstream phonemeIdsStr;
for (auto phonemeId : phonemeIds) {
phonemeIdsStr << phonemeId << ", ";
}
spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
sentencePhonemes.size(), phonemeIds.size(),
phonemeIdsStr.str());
}
// ids -> audio
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
@@ -411,6 +454,18 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
phonemeIds.clear();
}
if (missingPhonemes.size() > 0) {
spdlog::warn("Missing {} phoneme(s) from phoneme/id map!",
missingPhonemes.size());
for (auto phonemeCount : missingPhonemes) {
std::string phonemeStr;
utf8::append(phonemeCount.first, phonemeStr);
spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr,
(uint32_t)phonemeCount.first, phonemeCount.second);
}
}
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}