mirror of
https://github.com/pstrueb/piper.git
synced 2026-06-02 09:57:02 +00:00
Use libtashkeel
This commit is contained in:
@@ -6,7 +6,6 @@ piper:
|
||||
mkdir -p build
|
||||
cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make
|
||||
cp -aR $(LIB_DIR)/piper_phonemize/lib/espeak-ng-data $(LIB_DIR)/piper_phonemize/lib/*.so* build/
|
||||
cp -a $(LIB_DIR)/onnxruntime/lib/*.so* build/
|
||||
|
||||
clean:
|
||||
rm -rf build/ dist/
|
||||
|
||||
@@ -14,7 +14,6 @@ string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wl,-rpath,'$ORIGIN'")
|
||||
string(APPEND CMAKE_C_FLAGS " -Wall -Wextra")
|
||||
|
||||
set(PIPER_PHONEMIZE_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/piper_phonemize)
|
||||
set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/onnxruntime)
|
||||
|
||||
target_link_libraries(piper
|
||||
piper_phonemize
|
||||
@@ -28,12 +27,10 @@ if(NOT APPLE)
|
||||
endif()
|
||||
|
||||
target_link_directories(piper PUBLIC
|
||||
${PIPER_PHONEMIZE_ROOTDIR}/lib
|
||||
${ONNXRUNTIME_ROOTDIR}/lib)
|
||||
${PIPER_PHONEMIZE_ROOTDIR}/lib)
|
||||
|
||||
target_include_directories(piper PUBLIC
|
||||
${PIPER_PHONEMIZE_ROOTDIR}/include
|
||||
${ONNXRUNTIME_ROOTDIR}/include
|
||||
${SPDLOG_INCLUDE_DIRS})
|
||||
|
||||
target_compile_options(piper PUBLIC
|
||||
|
||||
+44
-17
@@ -61,6 +61,10 @@ struct RunConfig {
|
||||
|
||||
// Path to espeak-ng data directory (default is next to piper executable)
|
||||
optional<filesystem::path> eSpeakDataPath;
|
||||
|
||||
// Path to libtashkeel ort model
|
||||
// https://github.com/mush42/libtashkeel/
|
||||
optional<filesystem::path> tashkeelModelPath;
|
||||
};
|
||||
|
||||
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
|
||||
@@ -90,33 +94,34 @@ int main(int argc, char *argv[]) {
|
||||
spdlog::info("Loaded voice in {} second(s)",
|
||||
chrono::duration<double>(endTime - startTime).count());
|
||||
|
||||
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
|
||||
spdlog::debug("Voice uses eSpeak phonemes ({})",
|
||||
voice.phonemizeConfig.eSpeak->voice);
|
||||
|
||||
if (runConfig.eSpeakDataPath) {
|
||||
// User provided path
|
||||
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
|
||||
} else {
|
||||
// Get the path to the piper executable so we can locate espeak-ng-data
|
||||
// next to it.
|
||||
// Get the path to the piper executable so we can locate espeak-ng-data, etc.
|
||||
// next to it.
|
||||
#ifdef _MSC_VER
|
||||
auto exePath = []() {
|
||||
wchar_t moduleFileName[MAX_PATH] = {0};
|
||||
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
|
||||
return filesystem::path(moduleFileName);
|
||||
}();
|
||||
auto exePath = []() {
|
||||
wchar_t moduleFileName[MAX_PATH] = {0};
|
||||
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
|
||||
return filesystem::path(moduleFileName);
|
||||
}();
|
||||
#elifdef __APPLE__
|
||||
auto exePath = []() {
|
||||
char moduleFileName[PATH_MAX] = { 0 };
|
||||
char moduleFileName[PATH_MAX] = {0};
|
||||
uint32_t moduleFileNameSize = std::size(moduleFileName);
|
||||
_NSGetExecutablePath(moduleFileName, &moduleFileNameSize);
|
||||
return filesystem::path(moduleFileName);
|
||||
}();
|
||||
#else
|
||||
auto exePath = filesystem::canonical("/proc/self/exe");
|
||||
auto exePath = filesystem::canonical("/proc/self/exe");
|
||||
#endif
|
||||
|
||||
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
|
||||
spdlog::debug("Voice uses eSpeak phonemes ({})",
|
||||
voice.phonemizeConfig.eSpeak.voice);
|
||||
|
||||
if (runConfig.eSpeakDataPath) {
|
||||
// User provided path
|
||||
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
|
||||
} else {
|
||||
// Assume next to piper executable
|
||||
piperConfig.eSpeakDataPath =
|
||||
std::filesystem::absolute(
|
||||
exePath.parent_path().append("espeak-ng-data"))
|
||||
@@ -130,6 +135,25 @@ int main(int argc, char *argv[]) {
|
||||
piperConfig.useESpeak = false;
|
||||
}
|
||||
|
||||
// Enable libtashkeel for Arabic
|
||||
if (voice.phonemizeConfig.eSpeak.voice == "ar") {
|
||||
piperConfig.useTashkeel = true;
|
||||
if (runConfig.tashkeelModelPath) {
|
||||
// User provided path
|
||||
piperConfig.tashkeelModelPath =
|
||||
runConfig.tashkeelModelPath.value().string();
|
||||
} else {
|
||||
// Assume next to piper executable
|
||||
piperConfig.tashkeelModelPath =
|
||||
std::filesystem::absolute(
|
||||
exePath.parent_path().append("libtashkeel_model.ort"))
|
||||
.string();
|
||||
|
||||
spdlog::debug("libtashkeel model is expected at {}",
|
||||
piperConfig.tashkeelModelPath.value());
|
||||
}
|
||||
}
|
||||
|
||||
piper::initialize(piperConfig);
|
||||
|
||||
// Scales
|
||||
@@ -365,6 +389,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
|
||||
} else if (arg == "--espeak_data" || arg == "--espeak-data") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.tashkeelModelPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "--debug") {
|
||||
// Set DEBUG logging
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
|
||||
+28
-7
@@ -47,13 +47,9 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
|
||||
// }
|
||||
|
||||
if (configRoot.contains("espeak")) {
|
||||
if (!phonemizeConfig.eSpeak) {
|
||||
phonemizeConfig.eSpeak.emplace();
|
||||
}
|
||||
|
||||
auto espeakValue = configRoot["espeak"];
|
||||
if (espeakValue.contains("voice")) {
|
||||
phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<std::string>();
|
||||
phonemizeConfig.eSpeak.voice = espeakValue["voice"].get<std::string>();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -175,6 +171,22 @@ void initialize(PiperConfig &config) {
|
||||
spdlog::debug("Initialized eSpeak");
|
||||
}
|
||||
|
||||
// Load onnx model for libtashkeel
|
||||
// https://github.com/mush42/libtashkeel/
|
||||
if (config.useTashkeel) {
|
||||
spdlog::debug("Using libtashkeel for diacritization");
|
||||
if (!config.tashkeelModelPath) {
|
||||
throw std::runtime_error("No path to libtashkeel model");
|
||||
}
|
||||
|
||||
spdlog::debug("Loading libtashkeel model from {}",
|
||||
config.tashkeelModelPath.value());
|
||||
config.tashkeelState = std::make_unique<tashkeel::State>();
|
||||
tashkeel::tashkeel_load(config.tashkeelModelPath.value(),
|
||||
*config.tashkeelState);
|
||||
spdlog::debug("Initialized libtashkeel");
|
||||
}
|
||||
|
||||
spdlog::info("Initialized piper");
|
||||
}
|
||||
|
||||
@@ -368,6 +380,15 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
|
||||
}
|
||||
|
||||
if (config.useTashkeel) {
|
||||
if (!config.tashkeelState) {
|
||||
throw std::runtime_error("Tashkeel model is not loaded");
|
||||
}
|
||||
|
||||
spdlog::debug("Diacritizing text with libtashkeel: {}", text);
|
||||
text = tashkeel::tashkeel_run(text, *config.tashkeelState);
|
||||
}
|
||||
|
||||
// Phonemes for each sentence
|
||||
spdlog::debug("Phonemizing text: {}", text);
|
||||
std::vector<std::vector<Phoneme>> phonemes;
|
||||
@@ -375,7 +396,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
|
||||
// Use espeak-ng for phonemization
|
||||
eSpeakPhonemeConfig eSpeakConfig;
|
||||
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak->voice;
|
||||
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice;
|
||||
phonemize_eSpeak(text, eSpeakConfig, phonemes);
|
||||
} else {
|
||||
// Use UTF-8 codepoints as "phonemes"
|
||||
@@ -405,7 +426,7 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
|
||||
PhonemeIdConfig idConfig;
|
||||
if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
|
||||
auto &language = voice.phonemizeConfig.eSpeak->voice;
|
||||
auto &language = voice.phonemizeConfig.eSpeak.voice;
|
||||
spdlog::debug("Text phoneme language: {}", language);
|
||||
if (DEFAULT_ALPHABET.count(language) < 1) {
|
||||
throw std::runtime_error(
|
||||
|
||||
+7
-2
@@ -1,8 +1,8 @@
|
||||
#ifndef PIPER_H_
|
||||
#define PIPER_H_
|
||||
|
||||
#include <functional>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <onnxruntime_cxx_api.h>
|
||||
#include <phoneme_ids.hpp>
|
||||
#include <phonemize.hpp>
|
||||
#include <tashkeel.hpp>
|
||||
|
||||
#include "json.hpp"
|
||||
|
||||
@@ -26,6 +27,10 @@ struct eSpeakConfig {
|
||||
// Process-wide configuration consumed by initialize() and textToAudio().
struct PiperConfig {
|
||||
// Path to the espeak-ng data directory. main() fills this from the
// user-supplied path or from "espeak-ng-data" next to the executable.
std::string eSpeakDataPath;
|
||||
// Defaults to true; main() sets it to false on one of its branches.
// NOTE(review): presumably gates eSpeak setup in initialize() -- the
// guarding condition is not visible in this view, confirm.
bool useESpeak = true;
|
||||
|
||||
// When true, initialize() loads the libtashkeel model and textToAudio()
// runs the input text through tashkeel_run() before phonemization.
// main() enables it when the eSpeak voice is "ar".
bool useTashkeel = false;
|
||||
// Path to the libtashkeel model file. initialize() throws
// std::runtime_error if this is empty while useTashkeel is true.
std::optional<std::string> tashkeelModelPath;
|
||||
// State object created by initialize() and passed to tashkeel_load();
// textToAudio() throws std::runtime_error if it is null while
// useTashkeel is true.
std::unique_ptr<tashkeel::State> tashkeelState;
|
||||
};
|
||||
|
||||
enum PhonemeType { eSpeakPhonemes, TextPhonemes };
|
||||
@@ -40,7 +45,7 @@ struct PhonemizeConfig {
|
||||
PhonemeId idEos = 2; // end of sentence
|
||||
bool interspersePad = true;
|
||||
|
||||
std::optional<eSpeakConfig> eSpeak;
|
||||
eSpeakConfig eSpeak;
|
||||
};
|
||||
|
||||
struct SynthesisConfig {
|
||||
|
||||
Reference in New Issue
Block a user