mirror of
https://github.com/pstrueb/piper.git
synced 2026-05-02 04:18:01 +00:00
Use spdlog
This commit is contained in:
@@ -1,9 +1,10 @@
|
||||
cmake_minimum_required(VERSION 3.13)

include(CheckIncludeFileCXX)

project(piper C CXX)

# spdlog is located through pkg-config. Mark the PkgConfig package itself as
# REQUIRED so a missing pkg-config tool is reported at this line instead of
# surfacing later from inside pkg_check_modules().
find_package(PkgConfig REQUIRED)
pkg_check_modules(SPDLOG REQUIRED spdlog)

# Build as C++17 and refuse to fall back to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
@@ -19,7 +20,8 @@ target_link_libraries(piper
|
||||
piper_phonemize
|
||||
espeak-ng
|
||||
onnxruntime
|
||||
pthread)
|
||||
pthread
|
||||
${SPDLOG_LIBRARIES})
|
||||
|
||||
if(NOT APPLE)
|
||||
target_link_libraries(-static-libgcc -static-libstdc++)
|
||||
@@ -31,4 +33,8 @@ target_link_directories(piper PUBLIC
|
||||
|
||||
# Header search paths for the piper target: piper-phonemize, onnxruntime and
# the include directories reported by pkg-config for spdlog.
target_include_directories(piper PUBLIC
  ${PIPER_PHONEMIZE_ROOTDIR}/include
  ${ONNXRUNTIME_ROOTDIR}/include
  ${SPDLOG_INCLUDE_DIRS})

# Extra (non-include) compiler flags that pkg-config reports for spdlog.
target_compile_options(piper PUBLIC
  ${SPDLOG_CFLAGS_OTHER})
|
||||
|
||||
114
src/cpp/main.cpp
114
src/cpp/main.cpp
@@ -21,6 +21,9 @@
|
||||
#include <mach-o/dyld.h>
|
||||
#endif
|
||||
|
||||
#include <spdlog/sinks/stdout_color_sinks.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "piper.hpp"
|
||||
|
||||
using namespace std;
|
||||
@@ -28,15 +31,36 @@ using namespace std;
|
||||
enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW };
|
||||
|
||||
struct RunConfig {
|
||||
// Path to .onnx voice file
|
||||
filesystem::path modelPath;
|
||||
|
||||
// Path to JSON voice config file
|
||||
filesystem::path modelConfigPath;
|
||||
|
||||
// Type of output to produce.
|
||||
// Default is to write a WAV file in the current directory.
|
||||
OutputType outputType = OUTPUT_DIRECTORY;
|
||||
|
||||
// Path for output
|
||||
optional<filesystem::path> outputPath = filesystem::path(".");
|
||||
|
||||
// Numerical id of the default speaker (multi-speaker voices)
|
||||
optional<piper::SpeakerId> speakerId;
|
||||
|
||||
// Amount of noise to add during audio generation
|
||||
optional<float> noiseScale;
|
||||
|
||||
// Speed of speaking (1 = normal, < 1 is faster, > 1 is slower)
|
||||
optional<float> lengthScale;
|
||||
|
||||
// Variation in phoneme lengths
|
||||
optional<float> noiseW;
|
||||
|
||||
// Seconds of silence to add after each sentence
|
||||
optional<float> sentenceSilenceSeconds;
|
||||
|
||||
// Path to espeak-ng data directory (default is next to piper executable)
|
||||
optional<filesystem::path> eSpeakDataPath;
|
||||
};
|
||||
|
||||
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
|
||||
@@ -44,19 +68,45 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
|
||||
condition_variable &cvAudio, bool &audioReady,
|
||||
bool &audioFinished);
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
spdlog::set_default_logger(spdlog::stderr_color_st("piper"));
|
||||
|
||||
RunConfig runConfig;
|
||||
parseArgs(argc, argv, runConfig);
|
||||
|
||||
// NOTE: This won't work for Windows (need GetModuleFileName)
|
||||
piper::PiperConfig piperConfig;
|
||||
piper::Voice voice;
|
||||
|
||||
spdlog::debug("Loading voice from {} (config={})",
|
||||
runConfig.modelPath.string(),
|
||||
runConfig.modelConfigPath.string());
|
||||
|
||||
auto startTime = chrono::steady_clock::now();
|
||||
loadVoice(piperConfig, runConfig.modelPath.string(),
|
||||
runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
|
||||
auto endTime = chrono::steady_clock::now();
|
||||
spdlog::info("Loaded voice in {} second(s)",
|
||||
chrono::duration<double>(endTime - startTime).count());
|
||||
|
||||
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
|
||||
spdlog::debug("Voice uses eSpeak phonemes ({})",
|
||||
voice.phonemizeConfig.eSpeak->voice);
|
||||
|
||||
if (runConfig.eSpeakDataPath) {
|
||||
// User provided path
|
||||
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
|
||||
} else {
|
||||
// Get the path to the piper executable so we can locate espeak-ng-data
|
||||
// next to it.
|
||||
#ifdef _MSC_VER
|
||||
auto exePath = []() {
|
||||
wchar_t moduleFileName[MAX_PATH] = {0};
|
||||
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
|
||||
return filesystem::path(moduleFileName);
|
||||
}();
|
||||
#else
|
||||
#ifdef __APPLE__
|
||||
auto exePath = []() {
|
||||
wchar_t moduleFileName[MAX_PATH] = {0};
|
||||
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
|
||||
return filesystem::path(moduleFileName);
|
||||
}();
|
||||
#elifdef __APPLE__
|
||||
auto exePath = []() {
|
||||
char moduleFileName[PATH_MAX] = { 0 };
|
||||
uint32_t moduleFileNameSize = std::size(moduleFileName);
|
||||
@@ -64,21 +114,21 @@ int main(int argc, char *argv[]) {
|
||||
return filesystem::path(moduleFileName);
|
||||
}();
|
||||
#else
|
||||
auto exePath = filesystem::canonical("/proc/self/exe");
|
||||
auto exePath = filesystem::canonical("/proc/self/exe");
|
||||
#endif
|
||||
|
||||
piper::PiperConfig piperConfig;
|
||||
piperConfig.eSpeakDataPath =
|
||||
std::filesystem::absolute(exePath.parent_path().append("espeak-ng-data"))
|
||||
.string();
|
||||
piperConfig.eSpeakDataPath =
|
||||
std::filesystem::absolute(
|
||||
exePath.parent_path().append("espeak-ng-data"))
|
||||
.string();
|
||||
|
||||
piper::Voice voice;
|
||||
auto startTime = chrono::steady_clock::now();
|
||||
loadVoice(piperConfig, runConfig.modelPath.string(),
|
||||
runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
|
||||
auto endTime = chrono::steady_clock::now();
|
||||
auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
|
||||
cerr << "Load time: " << loadSeconds << " sec" << endl;
|
||||
spdlog::debug("espeak-ng-data directory is expected at {}",
|
||||
piperConfig.eSpeakDataPath);
|
||||
}
|
||||
} else {
|
||||
// Not using eSpeak
|
||||
piperConfig.useESpeak = false;
|
||||
}
|
||||
|
||||
piper::initialize(piperConfig);
|
||||
|
||||
@@ -102,7 +152,7 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
if (runConfig.outputType == OUTPUT_DIRECTORY) {
|
||||
runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
|
||||
cerr << "Output directory: " << runConfig.outputPath.value() << endl;
|
||||
spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
|
||||
}
|
||||
|
||||
string line;
|
||||
@@ -175,13 +225,13 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
|
||||
// Wait for audio output to finish
|
||||
cerr << "Waiting for audio..." << endl;
|
||||
spdlog::info("Waiting for audio to finish playing...");
|
||||
rawOutputThread.join();
|
||||
}
|
||||
|
||||
cerr << "Real-time factor: " << result.realTimeFactor
|
||||
<< " (infer=" << result.inferSeconds
|
||||
<< " sec, audio=" << result.audioSeconds << " sec)" << endl;
|
||||
spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)",
|
||||
result.realTimeFactor, result.inferSeconds,
|
||||
result.audioSeconds);
|
||||
}
|
||||
|
||||
piper::terminate(piperConfig);
|
||||
@@ -189,6 +239,8 @@ int main(int argc, char *argv[]) {
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
|
||||
condition_variable &cvAudio, bool &audioReady,
|
||||
bool &audioFinished) {
|
||||
@@ -220,6 +272,8 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
|
||||
|
||||
} // rawOutputProc
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void printUsage(char *argv[]) {
|
||||
cerr << endl;
|
||||
cerr << "usage: " << argv[0] << " [options]" << endl;
|
||||
@@ -249,6 +303,10 @@ void printUsage(char *argv[]) {
|
||||
cerr << " --silence_seconds NUM seconds of silence after each "
|
||||
"sentence (default: 0.2)"
|
||||
<< endl;
|
||||
cerr << " --espeak_data DIR path to espeak-ng data directory"
|
||||
<< endl;
|
||||
cerr << " --debug print DEBUG messages to the console"
|
||||
<< endl;
|
||||
cerr << endl;
|
||||
}
|
||||
|
||||
@@ -304,6 +362,12 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
|
||||
} else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.sentenceSilenceSeconds = stof(argv[++i]);
|
||||
} else if (arg == "--espeak_data" || arg == "--espeak-data") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "--debug") {
|
||||
// Set DEBUG logging
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
printUsage(argv);
|
||||
exit(0);
|
||||
|
||||
@@ -2,10 +2,12 @@
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include <espeak-ng/speak_lib.h>
|
||||
#include <onnxruntime_cxx_api.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "piper.hpp"
|
||||
#include "utf8.h"
|
||||
@@ -114,7 +116,6 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
|
||||
|
||||
// Load JSON config for audio synthesis
|
||||
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||
|
||||
// {
|
||||
// "audio": {
|
||||
// "sample_rate": 22050
|
||||
@@ -162,6 +163,7 @@ void initialize(PiperConfig &config) {
|
||||
if (config.useESpeak) {
|
||||
// Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
|
||||
// See: https://github.com/rhasspy/espeak-ng
|
||||
spdlog::debug("Initializing eSpeak");
|
||||
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
|
||||
/*buflength*/ 0,
|
||||
/*path*/ config.eSpeakDataPath.c_str(),
|
||||
@@ -169,18 +171,26 @@ void initialize(PiperConfig &config) {
|
||||
if (result < 0) {
|
||||
throw std::runtime_error("Failed to initialize eSpeak-ng");
|
||||
}
|
||||
|
||||
spdlog::debug("Initialized eSpeak");
|
||||
}
|
||||
|
||||
spdlog::info("Initialized piper");
|
||||
}
|
||||
|
||||
// Shut piper down. When the configuration uses eSpeak, espeak_Terminate() is
// called first, bracketed by DEBUG log messages.
void terminate(PiperConfig &config) {
  const auto shutDownESpeak = [] {
    spdlog::debug("Terminating eSpeak");
    espeak_Terminate();
    spdlog::debug("Terminated eSpeak");
  };

  if (config.useESpeak) {
    shutDownESpeak();
  }

  spdlog::info("Terminated piper");
}
|
||||
|
||||
void loadModel(std::string modelPath, ModelSession &session) {
|
||||
|
||||
spdlog::debug("Loading onnx model from {}", modelPath);
|
||||
session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
|
||||
instanceName.c_str());
|
||||
session.env.DisableTelemetryEvents();
|
||||
@@ -205,13 +215,15 @@ void loadModel(std::string modelPath, ModelSession &session) {
|
||||
auto startTime = std::chrono::steady_clock::now();
|
||||
session.onnx = Ort::Session(session.env, modelPath.c_str(), session.options);
|
||||
auto endTime = std::chrono::steady_clock::now();
|
||||
auto loadDuration = std::chrono::duration<double>(endTime - startTime);
|
||||
spdlog::debug("Loaded onnx model in {} second(s)",
|
||||
std::chrono::duration<double>(endTime - startTime).count());
|
||||
}
|
||||
|
||||
// Load Onnx model and JSON config file
|
||||
void loadVoice(PiperConfig &config, std::string modelPath,
|
||||
std::string modelConfigPath, Voice &voice,
|
||||
std::optional<SpeakerId> &speakerId) {
|
||||
spdlog::debug("Parsing voice config at {}", modelConfigPath);
|
||||
std::ifstream modelConfigFile(modelConfigPath);
|
||||
voice.configRoot = json::parse(modelConfigFile);
|
||||
|
||||
@@ -229,6 +241,8 @@ void loadVoice(PiperConfig &config, std::string modelPath,
|
||||
}
|
||||
}
|
||||
|
||||
spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers);
|
||||
|
||||
loadModel(modelPath, voice.session);
|
||||
|
||||
} /* loadVoice */
|
||||
@@ -237,6 +251,8 @@ void loadVoice(PiperConfig &config, std::string modelPath,
|
||||
void synthesize(std::vector<PhonemeId> &phonemeIds,
|
||||
SynthesisConfig &synthesisConfig, ModelSession &session,
|
||||
std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
|
||||
spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size());
|
||||
|
||||
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
|
||||
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
|
||||
|
||||
@@ -302,6 +318,8 @@ void synthesize(std::vector<PhonemeId> &phonemeIds,
|
||||
if (result.audioSeconds > 0) {
|
||||
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
||||
}
|
||||
spdlog::debug("Synthesized {} second(s) of audio in {} second(s)",
|
||||
result.audioSeconds, result.inferSeconds);
|
||||
|
||||
// Get max audio value for scaling
|
||||
float maxAudioValue = 0.01f;
|
||||
@@ -351,6 +369,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
}
|
||||
|
||||
// Phonemes for each sentence
|
||||
spdlog::debug("Phonemizing text: {}", text);
|
||||
std::vector<std::vector<Phoneme>> phonemes;
|
||||
|
||||
if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
|
||||
@@ -370,11 +389,24 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
|
||||
++phonemesIter) {
|
||||
std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
|
||||
|
||||
if (spdlog::should_log(spdlog::level::debug)) {
|
||||
// DEBUG log for phonemes
|
||||
std::string phonemesStr;
|
||||
for (auto phoneme : sentencePhonemes) {
|
||||
utf8::append(phoneme, phonemesStr);
|
||||
}
|
||||
|
||||
spdlog::debug("Converting {} phoneme(s) to ids: {}",
|
||||
sentencePhonemes.size(), phonemesStr);
|
||||
}
|
||||
|
||||
SynthesisResult sentenceResult;
|
||||
|
||||
PhonemeIdConfig idConfig;
|
||||
if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
|
||||
auto &language = voice.phonemizeConfig.eSpeak->voice;
|
||||
spdlog::debug("Text phoneme language: {}", language);
|
||||
if (DEFAULT_ALPHABET.count(language) < 1) {
|
||||
throw std::runtime_error(
|
||||
"Text phoneme language for voice is not supported");
|
||||
@@ -387,6 +419,17 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
|
||||
// phonemes -> ids
|
||||
phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
|
||||
if (spdlog::should_log(spdlog::level::debug)) {
|
||||
// DEBUG log for phoneme ids
|
||||
std::stringstream phonemeIdsStr;
|
||||
for (auto phonemeId : phonemeIds) {
|
||||
phonemeIdsStr << phonemeId << ", ";
|
||||
}
|
||||
|
||||
spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
|
||||
sentencePhonemes.size(), phonemeIds.size(),
|
||||
phonemeIdsStr.str());
|
||||
}
|
||||
|
||||
// ids -> audio
|
||||
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
|
||||
@@ -411,6 +454,18 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
phonemeIds.clear();
|
||||
}
|
||||
|
||||
if (missingPhonemes.size() > 0) {
|
||||
spdlog::warn("Missing {} phoneme(s) from phoneme/id map!",
|
||||
missingPhonemes.size());
|
||||
|
||||
for (auto phonemeCount : missingPhonemes) {
|
||||
std::string phonemeStr;
|
||||
utf8::append(phonemeCount.first, phonemeStr);
|
||||
spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr,
|
||||
(uint32_t)phonemeCount.first, phonemeCount.second);
|
||||
}
|
||||
}
|
||||
|
||||
if (result.audioSeconds > 0) {
|
||||
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user