diff --git a/Dockerfile b/Dockerfile
index 077d261..b1f2e56 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -56,6 +56,7 @@ WORKDIR /test
 COPY local/en-us/lessac/low/en-us-lessac-low.onnx \
      local/en-us/lessac/low/en-us-lessac-low.onnx.json ./
 
+# Run Piper on a test sentence and verify that the WAV file isn't empty
 COPY --from=build /dist/piper_*.tar.gz ./
 RUN tar -xzf piper*.tar.gz
 RUN echo 'This is a test.' | ./piper/piper -m en-us-lessac-low.onnx -f test.wav
diff --git a/VERSION b/VERSION
index 6e8bf73..3eefcb9 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.1.0
+1.0.0
diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index 064baa9..a5e89b5 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -1,9 +1,10 @@
 cmake_minimum_required(VERSION 3.13)
 
-include(CheckIncludeFileCXX)
-
 project(piper C CXX)
 
+find_package(PkgConfig)
+pkg_check_modules(SPDLOG REQUIRED spdlog)
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
@@ -19,7 +20,8 @@ target_link_libraries(piper
   piper_phonemize
   espeak-ng
   onnxruntime
-  pthread)
+  pthread
+  ${SPDLOG_LIBRARIES})
 
 if(NOT APPLE)
   target_link_libraries(-static-libgcc -static-libstdc++)
@@ -31,4 +33,8 @@ target_link_directories(piper PUBLIC
 
 target_include_directories(piper PUBLIC
   ${PIPER_PHONEMIZE_ROOTDIR}/include
-  ${ONNXRUNTIME_ROOTDIR}/include)
+  ${ONNXRUNTIME_ROOTDIR}/include
+  ${SPDLOG_INCLUDE_DIRS})
+
+target_compile_options(piper PUBLIC
+  ${SPDLOG_CFLAGS_OTHER})
diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp
index 0c4010a..d8cb085 100644
--- a/src/cpp/main.cpp
+++ b/src/cpp/main.cpp
@@ -21,6 +21,9 @@
 #include <mach-o/dyld.h>
 #endif
 
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/spdlog.h>
+
 #include "piper.hpp"
 
 using namespace std;
@@ -28,15 +31,36 @@ using namespace std;
 
 enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW };
 
 struct RunConfig {
+  // Path to .onnx voice file
   filesystem::path modelPath;
+
+  // Path to JSON voice config file
   filesystem::path modelConfigPath;
+
+  // Type of output to produce. 
+  // Default is to write a WAV file in the current directory.
   OutputType outputType = OUTPUT_DIRECTORY;
+
+  // Path for output
   optional<filesystem::path> outputPath = filesystem::path(".");
+
+  // Numerical id of the default speaker (multi-speaker voices)
   optional<piper::SpeakerId> speakerId;
+
+  // Amount of noise to add during audio generation
   optional<float> noiseScale;
+
+  // Speed of speaking (1 = normal, < 1 is faster, > 1 is slower)
   optional<float> lengthScale;
+
+  // Variation in phoneme lengths
   optional<float> noiseW;
+
+  // Seconds of silence to add after each sentence
   optional<float> sentenceSilenceSeconds;
+
+  // Path to espeak-ng data directory (default is next to piper executable)
+  optional<filesystem::path> eSpeakDataPath;
 };
 
 void parseArgs(int argc, char *argv[], RunConfig &runConfig);
@@ -44,19 +68,45 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
                    condition_variable &cvAudio, bool &audioReady,
                    bool &audioFinished);
 
+// ----------------------------------------------------------------------------
+
 int main(int argc, char *argv[]) {
+  spdlog::set_default_logger(spdlog::stderr_color_st("piper"));
+
   RunConfig runConfig;
   parseArgs(argc, argv, runConfig);
 
-  // NOTE: This won't work for Windows (need GetModuleFileName)
+  piper::PiperConfig piperConfig;
+  piper::Voice voice;
+
+  spdlog::debug("Loading voice from {} (config={})",
+                runConfig.modelPath.string(),
+                runConfig.modelConfigPath.string());
+
+  auto startTime = chrono::steady_clock::now();
+  loadVoice(piperConfig, runConfig.modelPath.string(),
+            runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
+  auto endTime = chrono::steady_clock::now();
+  spdlog::info("Loaded voice in {} second(s)",
+               chrono::duration<double>(endTime - startTime).count());
+
+  if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
+    spdlog::debug("Voice uses eSpeak phonemes ({})",
+                  voice.phonemizeConfig.eSpeak->voice);
+
+    if (runConfig.eSpeakDataPath) {
+      // User provided path
+      piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
+    } else {
+      // Get 
the path to the piper executable so we can locate espeak-ng-data
+      // next to it.
 #ifdef _MSC_VER
-  auto exePath = []() {
-    wchar_t moduleFileName[MAX_PATH] = {0};
-    GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
-    return filesystem::path(moduleFileName);
-  }();
-#else
-#ifdef __APPLE__
+      auto exePath = []() {
+        wchar_t moduleFileName[MAX_PATH] = {0};
+        GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
+        return filesystem::path(moduleFileName);
+      }();
+#elif defined(__APPLE__)
   auto exePath = []() {
     char moduleFileName[PATH_MAX] = { 0 };
     uint32_t moduleFileNameSize = std::size(moduleFileName);
     _NSGetExecutablePath(moduleFileName, &moduleFileNameSize);
     return filesystem::path(moduleFileName);
   }();
@@ -64,21 +114,21 @@ int main(int argc, char *argv[]) {
 #else
-  auto exePath = filesystem::canonical("/proc/self/exe");
+      auto exePath = filesystem::canonical("/proc/self/exe");
 #endif
 
-  piper::PiperConfig piperConfig;
-  piperConfig.eSpeakDataPath =
-      std::filesystem::absolute(exePath.parent_path().append("espeak-ng-data"))
-          .string();
+      piperConfig.eSpeakDataPath =
+          std::filesystem::absolute(
+              exePath.parent_path().append("espeak-ng-data"))
+              .string();
 
-  piper::Voice voice;
-  auto startTime = chrono::steady_clock::now();
-  loadVoice(piperConfig, runConfig.modelPath.string(),
-            runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
-  auto endTime = chrono::steady_clock::now();
-  auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
-  cerr << "Load time: " << loadSeconds << " sec" << endl;
+      spdlog::debug("espeak-ng-data directory is expected at {}",
+                    piperConfig.eSpeakDataPath);
+    }
+  } else {
+    // Not using eSpeak
+    piperConfig.useESpeak = false;
+  }
 
   piper::initialize(piperConfig);
 
@@ -102,7 +152,7 @@ int main(int argc, char *argv[]) {
 
   if (runConfig.outputType == OUTPUT_DIRECTORY) {
     runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
-    cerr << "Output directory: " << runConfig.outputPath.value() << endl;
+    spdlog::info("Output directory: 
{}", runConfig.outputPath.value().string());
   }
 
   string line;
@@ -175,13 +225,13 @@ int main(int argc, char *argv[]) {
     }
 
     // Wait for audio output to finish
-    cerr << "Waiting for audio..." << endl;
+    spdlog::info("Waiting for audio to finish playing...");
     rawOutputThread.join();
   }
 
-  cerr << "Real-time factor: " << result.realTimeFactor
-       << " (infer=" << result.inferSeconds
-       << " sec, audio=" << result.audioSeconds << " sec)" << endl;
+  spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)",
+               result.realTimeFactor, result.inferSeconds,
+               result.audioSeconds);
 }
 
   piper::terminate(piperConfig);
@@ -189,6 +239,8 @@ int main(int argc, char *argv[]) {
   return EXIT_SUCCESS;
 }
 
+// ----------------------------------------------------------------------------
+
 void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
                    condition_variable &cvAudio, bool &audioReady,
                    bool &audioFinished) {
@@ -220,6 +272,8 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
 }
 
 // rawOutputProc
+// ----------------------------------------------------------------------------
+
 void printUsage(char *argv[]) {
   cerr << endl;
   cerr << "usage: " << argv[0] << " [options]" << endl;
@@ -249,6 +303,10 @@ void printUsage(char *argv[]) {
   cerr << " --silence_seconds NUM seconds of silence after each "
           "sentence (default: 0.2)"
        << endl;
+  cerr << " --espeak_data DIR path to espeak-ng data directory"
+       << endl;
+  cerr << " --debug print DEBUG messages to the console"
+       << endl;
   cerr << endl;
 }
 
@@ -304,6 +362,12 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
     } else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
       ensureArg(argc, argv, i);
       runConfig.sentenceSilenceSeconds = stof(argv[++i]);
+    } else if (arg == "--espeak_data" || arg == "--espeak-data") {
+      ensureArg(argc, argv, i);
+      runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
+    } else if (arg == "--debug") {
+      // Set DEBUG logging
+      spdlog::set_level(spdlog::level::debug);
     } else if (arg 
== "-h" || arg == "--help") {
       printUsage(argv);
       exit(0);
diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp
index 7c4d98d..4345adb 100644
--- a/src/cpp/piper.cpp
+++ b/src/cpp/piper.cpp
@@ -2,10 +2,12 @@
 #include <chrono>
 #include <fstream>
 #include <functional>
+#include <limits>
 #include <map>
 #include <stdexcept>
 #include <string>
+#include <spdlog/spdlog.h>
 
 #include "piper.hpp"
 #include "utf8.h"
@@ -114,7 +116,6 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
 
 // Load JSON config for audio synthesis
 void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
-
   // {
   //     "audio": {
   //         "sample_rate": 22050
@@ -162,6 +163,7 @@ void initialize(PiperConfig &config) {
   if (config.useESpeak) {
     // Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
     // See: https://github.com/rhasspy/espeak-ng
+    spdlog::debug("Initializing eSpeak");
     int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
                                    /*buflength*/ 0,
                                    /*path*/ config.eSpeakDataPath.c_str(),
@@ -169,18 +171,26 @@ void initialize(PiperConfig &config) {
     if (result < 0) {
       throw std::runtime_error("Failed to initialize eSpeak-ng");
     }
+
+    spdlog::debug("Initialized eSpeak");
   }
+
+  spdlog::info("Initialized piper");
 }
 
 void terminate(PiperConfig &config) {
   if (config.useESpeak) {
     // Clean up espeak-ng
+    spdlog::debug("Terminating eSpeak");
     espeak_Terminate();
+    spdlog::debug("Terminated eSpeak");
   }
+
+  spdlog::info("Terminated piper");
 }
 
 void loadModel(std::string modelPath, ModelSession &session) {
-
+  spdlog::debug("Loading onnx model from {}", modelPath);
   session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
                          instanceName.c_str());
   session.env.DisableTelemetryEvents();
@@ -205,13 +215,15 @@ void loadModel(std::string modelPath, ModelSession &session) {
   auto startTime = std::chrono::steady_clock::now();
   session.onnx = Ort::Session(session.env, modelPath.c_str(), session.options);
   auto endTime = std::chrono::steady_clock::now();
-  auto loadDuration = std::chrono::duration<double>(endTime - startTime);
+  spdlog::debug("Loaded onnx model in 
{} second(s)",
                std::chrono::duration<double>(endTime - startTime).count());
 }
 
 // Load Onnx model and JSON config file
 void loadVoice(PiperConfig &config, std::string modelPath,
                std::string modelConfigPath, Voice &voice,
                std::optional<SpeakerId> &speakerId) {
+  spdlog::debug("Parsing voice config at {}", modelConfigPath);
   std::ifstream modelConfigFile(modelConfigPath);
   voice.configRoot = json::parse(modelConfigFile);
 
@@ -229,6 +241,8 @@ void loadVoice(PiperConfig &config, std::string modelPath,
     }
   }
 
+  spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers);
+
   loadModel(modelPath, voice.session);
 
 } /* loadVoice */
 
@@ -237,6 +251,8 @@ void loadVoice(PiperConfig &config, std::string modelPath,
 void synthesize(std::vector<PhonemeId> &phonemeIds,
                 SynthesisConfig &synthesisConfig, ModelSession &session,
                 std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
+  spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size());
+
   auto memoryInfo = Ort::MemoryInfo::CreateCpu(
       OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
 
@@ -302,6 +318,8 @@ void synthesize(std::vector<PhonemeId> &phonemeIds,
   if (result.audioSeconds > 0) {
     result.realTimeFactor = result.inferSeconds / result.audioSeconds;
   }
+  spdlog::debug("Synthesized {} second(s) of audio in {} second(s)",
+                result.audioSeconds, result.inferSeconds);
 
   // Get max audio value for scaling
   float maxAudioValue = 0.01f;
@@ -351,6 +369,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
   }
 
   // Phonemes for each sentence
+  spdlog::debug("Phonemizing text: {}", text);
   std::vector<std::vector<Phoneme>> phonemes;
 
   if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
@@ -370,11 +389,24 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
   for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
        ++phonemesIter) {
     std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
+
+    if (spdlog::should_log(spdlog::level::debug)) {
+      // DEBUG log for phonemes
+      std::string phonemesStr;
+      for (auto phoneme : 
sentencePhonemes) { + utf8::append(phoneme, phonemesStr); + } + + spdlog::debug("Converting {} phoneme(s) to ids: {}", + sentencePhonemes.size(), phonemesStr); + } + SynthesisResult sentenceResult; PhonemeIdConfig idConfig; if (voice.phonemizeConfig.phonemeType == TextPhonemes) { auto &language = voice.phonemizeConfig.eSpeak->voice; + spdlog::debug("Text phoneme language: {}", language); if (DEFAULT_ALPHABET.count(language) < 1) { throw std::runtime_error( "Text phoneme language for voice is not supported"); @@ -387,6 +419,17 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text, // phonemes -> ids phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes); + if (spdlog::should_log(spdlog::level::debug)) { + // DEBUG log for phoneme ids + std::stringstream phonemeIdsStr; + for (auto phonemeId : phonemeIds) { + phonemeIdsStr << phonemeId << ", "; + } + + spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}", + sentencePhonemes.size(), phonemeIds.size(), + phonemeIdsStr.str()); + } // ids -> audio synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer, @@ -411,6 +454,18 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text, phonemeIds.clear(); } + if (missingPhonemes.size() > 0) { + spdlog::warn("Missing {} phoneme(s) from phoneme/id map!", + missingPhonemes.size()); + + for (auto phonemeCount : missingPhonemes) { + std::string phonemeStr; + utf8::append(phonemeCount.first, phonemeStr); + spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr, + (uint32_t)phonemeCount.first, phonemeCount.second); + } + } + if (result.audioSeconds > 0) { result.realTimeFactor = result.inferSeconds / result.audioSeconds; }