diff --git a/Makefile b/Makefile index ee7d2aa..e3c27d8 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,6 @@ piper: mkdir -p build cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make cp -aR $(LIB_DIR)/piper_phonemize/lib/espeak-ng-data $(LIB_DIR)/piper_phonemize/lib/*.so* build/ - cp -a $(LIB_DIR)/onnxruntime/lib/*.so* build/ clean: rm -rf build/ dist/ diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 1e3523e..792b680 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -14,7 +14,6 @@ string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wl,-rpath,'$ORIGIN'") string(APPEND CMAKE_C_FLAGS " -Wall -Wextra") set(PIPER_PHONEMIZE_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/piper_phonemize) -set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/onnxruntime) target_link_libraries(piper piper_phonemize @@ -28,12 +27,10 @@ if(NOT APPLE) endif() target_link_directories(piper PUBLIC - ${PIPER_PHONEMIZE_ROOTDIR}/lib - ${ONNXRUNTIME_ROOTDIR}/lib) + ${PIPER_PHONEMIZE_ROOTDIR}/lib) target_include_directories(piper PUBLIC ${PIPER_PHONEMIZE_ROOTDIR}/include - ${ONNXRUNTIME_ROOTDIR}/include ${SPDLOG_INCLUDE_DIRS}) target_compile_options(piper PUBLIC diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp index d8cb085..b3fe24e 100644 --- a/src/cpp/main.cpp +++ b/src/cpp/main.cpp @@ -61,6 +61,10 @@ struct RunConfig { // Path to espeak-ng data directory (default is next to piper executable) optional eSpeakDataPath; + + // Path to libtashkeel ort model + // https://github.com/mush42/libtashkeel/ + optional tashkeelModelPath; }; void parseArgs(int argc, char *argv[], RunConfig &runConfig); @@ -90,33 +94,34 @@ int main(int argc, char *argv[]) { spdlog::info("Loaded voice in {} second(s)", chrono::duration(endTime - startTime).count()); - if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) { - spdlog::debug("Voice uses eSpeak phonemes ({})", - voice.phonemizeConfig.eSpeak->voice); - - if (runConfig.eSpeakDataPath) { - // User provided path - piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string(); - } else { - // Get the path to the piper executable so we can locate espeak-ng-data - // next to it. + // Get the path to the piper executable so we can locate espeak-ng-data, etc. + // next to it. #ifdef _MSC_VER - auto exePath = []() { - wchar_t moduleFileName[MAX_PATH] = {0}; - GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName)); - return filesystem::path(moduleFileName); - }(); + auto exePath = []() { + wchar_t moduleFileName[MAX_PATH] = {0}; + GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName)); + return filesystem::path(moduleFileName); + }(); #elifdef __APPLE__ auto exePath = []() { - char moduleFileName[PATH_MAX] = { 0 }; + char moduleFileName[PATH_MAX] = {0}; uint32_t moduleFileNameSize = std::size(moduleFileName); _NSGetExecutablePath(moduleFileName, &moduleFileNameSize); return filesystem::path(moduleFileName); }(); #else - auto exePath = filesystem::canonical("/proc/self/exe"); + auto exePath = filesystem::canonical("/proc/self/exe"); #endif + if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) { + spdlog::debug("Voice uses eSpeak phonemes ({})", + voice.phonemizeConfig.eSpeak.voice); + + if (runConfig.eSpeakDataPath) { + // User provided path + piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string(); + } else { + // Assume next to piper executable piperConfig.eSpeakDataPath = std::filesystem::absolute( exePath.parent_path().append("espeak-ng-data")) @@ -130,6 +135,25 @@ int main(int argc, char *argv[]) { piperConfig.useESpeak = false; } + // Enable libtashkeel for Arabic + if (voice.phonemizeConfig.eSpeak.voice == "ar") { + piperConfig.useTashkeel = true; + if (runConfig.tashkeelModelPath) { + // User provided path + piperConfig.tashkeelModelPath = + runConfig.tashkeelModelPath.value().string(); + } else { + // Assume next to piper executable + piperConfig.tashkeelModelPath = + std::filesystem::absolute( + exePath.parent_path().append("libtashkeel_model.ort")) + .string(); + + spdlog::debug("libtashkeel model is expected at {}", + piperConfig.tashkeelModelPath.value()); + } + } + piper::initialize(piperConfig); // Scales @@ -365,6 +389,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) { } else if (arg == "--espeak_data" || arg == "--espeak-data") { ensureArg(argc, argv, i); runConfig.eSpeakDataPath = filesystem::path(argv[++i]); + } else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") { + ensureArg(argc, argv, i); + runConfig.tashkeelModelPath = filesystem::path(argv[++i]); } else if (arg == "--debug") { // Set DEBUG logging spdlog::set_level(spdlog::level::debug); diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp index 4345adb..ede7bbb 100644 --- a/src/cpp/piper.cpp +++ b/src/cpp/piper.cpp @@ -47,13 +47,9 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) { // } if (configRoot.contains("espeak")) { - if (!phonemizeConfig.eSpeak) { - phonemizeConfig.eSpeak.emplace(); - } - auto espeakValue = configRoot["espeak"]; if (espeakValue.contains("voice")) { - phonemizeConfig.eSpeak->voice = espeakValue["voice"].get(); + phonemizeConfig.eSpeak.voice = espeakValue["voice"].get(); } } @@ -175,6 +171,22 @@ void initialize(PiperConfig &config) { spdlog::debug("Initialized eSpeak"); } + // Load onnx model for libtashkeel + // https://github.com/mush42/libtashkeel/ + if (config.useTashkeel) { + spdlog::debug("Using libtashkeel for diacritization"); + if (!config.tashkeelModelPath) { + throw std::runtime_error("No path to libtashkeel model"); + } + + spdlog::debug("Loading libtashkeel model from {}", + config.tashkeelModelPath.value()); + config.tashkeelState = std::make_unique(); + tashkeel::tashkeel_load(config.tashkeelModelPath.value(), + *config.tashkeelState); + spdlog::debug("Initialized libtashkeel"); + } + spdlog::info("Initialized piper"); } @@ -368,6 +380,15 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text, voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels); } + if (config.useTashkeel) { + if (!config.tashkeelState) { + throw std::runtime_error("Tashkeel model is not loaded"); + } + + spdlog::debug("Diacritizing text with libtashkeel: {}", text); + text = tashkeel::tashkeel_run(text, *config.tashkeelState); + } + // Phonemes for each sentence spdlog::debug("Phonemizing text: {}", text); std::vector> phonemes; @@ -375,7 +396,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text, if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) { // Use espeak-ng for phonemization eSpeakPhonemeConfig eSpeakConfig; - eSpeakConfig.voice = voice.phonemizeConfig.eSpeak->voice; + eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice; phonemize_eSpeak(text, eSpeakConfig, phonemes); } else { // Use UTF-8 codepoints as "phonemes" @@ -405,7 +426,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text, PhonemeIdConfig idConfig; if (voice.phonemizeConfig.phonemeType == TextPhonemes) { - auto &language = voice.phonemizeConfig.eSpeak->voice; + auto &language = voice.phonemizeConfig.eSpeak.voice; spdlog::debug("Text phoneme language: {}", language); if (DEFAULT_ALPHABET.count(language) < 1) { throw std::runtime_error( diff --git a/src/cpp/piper.hpp b/src/cpp/piper.hpp index 640e4b7..29a8bcf 100644 --- a/src/cpp/piper.hpp +++ b/src/cpp/piper.hpp @@ -1,8 +1,8 @@ #ifndef PIPER_H_ #define PIPER_H_ -#include #include +#include #include #include #include @@ -10,6 +10,7 @@ #include #include #include +#include #include "json.hpp" @@ -26,6 +27,10 @@ struct eSpeakConfig { struct PiperConfig { std::string eSpeakDataPath; bool useESpeak = true; + + bool useTashkeel = false; + std::optional tashkeelModelPath; + std::unique_ptr tashkeelState; }; enum PhonemeType { eSpeakPhonemes, TextPhonemes }; @@ -40,7 +45,7 @@ struct PhonemizeConfig { PhonemeId idEos = 2; // end of sentence bool interspersePad = true; - std::optional eSpeak; + eSpeakConfig eSpeak; }; struct SynthesisConfig {