diff --git a/.gitignore b/.gitignore index 8e72809..e8bcba1 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ htmlcov /data/ /build/ /local/ +/dist/ *.so .venv/ diff --git a/Makefile b/Makefile index a2c40c6..7a68fc0 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,19 @@ -.PHONY: release debug clean +.PHONY: release no-pcaudio debug clean docker release: mkdir -p build cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make +no-pcaudio: + mkdir -p build + cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release -DUSE_PCAUDIO=OFF && make + debug: mkdir -p build cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Debug && make clean: rm -rf build/ dist/ + +docker: + docker buildx build . --platform 'linux/amd64,linux/arm64' --output 'type=local,dest=dist' diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 7c9ec65..1ebadd9 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -17,9 +17,13 @@ pkg_check_modules(ESPEAK_NG REQUIRED espeak-ng<2) # https://github.com/espeak-ng/pcaudiolib check_include_file_cxx("pcaudiolib/audio.h" PCAUDIO_INCLUDE_FOUND) + if(PCAUDIO_INCLUDE_FOUND) -target_compile_definitions(larynx PUBLIC HAVE_PCAUDIO) -set(PCAUDIO_LIBRARIES "pcaudio") + option(USE_PCAUDIO "Build with pcaudiolib" ON) + if(USE_PCAUDIO) + target_compile_definitions(larynx PUBLIC HAVE_PCAUDIO) + set(PCAUDIO_LIBRARIES "pcaudio") + endif() endif() set(ONNXRUNTIME_ROOTDIR "/usr/local/include/onnxruntime") diff --git a/src/cpp/config.hpp b/src/cpp/config.hpp index 4244879..a50f16c 100644 --- a/src/cpp/config.hpp +++ b/src/cpp/config.hpp @@ -19,6 +19,7 @@ namespace larynx { typedef char32_t Phoneme; typedef int64_t PhonemeId; +typedef int64_t SpeakerId; const string DefaultVoice = "en-gb-x-rp"; @@ -52,7 +53,11 @@ struct SynthesisConfig { int sampleRate = 22050; int sampleWidth = 2; // 16-bit int channels = 1; // mono - filesystem::path outputPath; + optional<SpeakerId> speakerId; +}; + +struct ModelConfig { + int numSpeakers = 1; /* speaker count read from the voice config; defaults to 1 so a not-yet-loaded Voice never holds an indeterminate value */ }; bool isSingleCodepoint(string s) 
{ @@ -84,14 +89,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) { } auto phonemeMapValue = configRoot["phoneme_map"]; - for (auto& fromPhonemeItem : phonemeMapValue.items()) { + for (auto &fromPhonemeItem : phonemeMapValue.items()) { string fromPhoneme = fromPhonemeItem.key(); if (!isSingleCodepoint(fromPhoneme)) { throw runtime_error("Phonemes must be one codepoint (phoneme map)"); } auto fromCodepoint = getCodepoint(fromPhoneme); - for (auto& toPhonemeValue : fromPhonemeItem.value()) { + for (auto &toPhonemeValue : fromPhonemeItem.value()) { string toPhoneme = toPhonemeValue.get(); if (!isSingleCodepoint(toPhoneme)) { throw runtime_error("Phonemes must be one codepoint (phoneme map)"); @@ -106,14 +111,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) { // phoneme to [id] map if (configRoot.contains("phoneme_id_map")) { auto phonemeIdMapValue = configRoot["phoneme_id_map"]; - for (auto& fromPhonemeItem : phonemeIdMapValue.items()) { + for (auto &fromPhonemeItem : phonemeIdMapValue.items()) { string fromPhoneme = fromPhonemeItem.key(); if (!isSingleCodepoint(fromPhoneme)) { throw runtime_error("Phonemes must be one codepoint (phoneme id map)"); } auto fromCodepoint = getCodepoint(fromPhoneme); - for (auto& toIdValue : fromPhonemeItem.value()) { + for (auto &toIdValue : fromPhonemeItem.value()) { PhonemeId toId = toIdValue.get(); phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId); } @@ -134,6 +139,12 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) { } /* parseSynthesisConfig */ +void parseModelConfig(json &configRoot, ModelConfig &modelConfig) { + /* Reads "num_speakers" into modelConfig; a config without that key is treated as single-speaker (1) instead of throwing on a null value */ + modelConfig.numSpeakers = configRoot.value("num_speakers", 1); + +} /* parseModelConfig */ + } // namespace larynx #endif // CONFIG_H_ diff --git a/src/cpp/api.hpp b/src/cpp/larynx.hpp similarity index 83% rename from src/cpp/api.hpp rename to src/cpp/larynx.hpp index 122bb80..32abb16 100644 --- a/src/cpp/api.hpp 
+++ b/src/cpp/larynx.hpp @@ -1,5 +1,5 @@ -#ifndef API_H_ -#define API_H_ +#ifndef LARYNX_H_ +#define LARYNX_H_ #include #include @@ -22,6 +22,7 @@ struct Voice { json configRoot; PhonemizeConfig phonemizeConfig; SynthesisConfig synthesisConfig; + ModelConfig modelConfig; ModelSession session; }; @@ -42,12 +43,24 @@ void terminate() { } // Load Onnx model and JSON config file -void loadVoice(string modelPath, string modelConfigPath, Voice &voice) { +void loadVoice(string modelPath, string modelConfigPath, Voice &voice, + optional &speakerId) { ifstream modelConfigFile(modelConfigPath.c_str()); voice.configRoot = json::parse(modelConfigFile); parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig); parseSynthesisConfig(voice.configRoot, voice.synthesisConfig); + parseModelConfig(voice.configRoot, voice.modelConfig); + + if (voice.modelConfig.numSpeakers > 1) { + // Multispeaker model + if (speakerId) { + voice.synthesisConfig.speakerId = speakerId; + } else { + // Default speaker + voice.synthesisConfig.speakerId = 0; + } + } loadModel(modelPath, voice.session); @@ -83,8 +96,8 @@ void textToWavFile(Voice &voice, string text, ostream &audioFile, audioFile.write((const char *)audioBuffer.data(), sizeof(int16_t) * audioBuffer.size()); -} /* textToAudio */ +} /* textToWavFile */ } // namespace larynx -#endif // API_H_ +#endif // LARYNX_H_ diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp index ffb266f..3204b2f 100644 --- a/src/cpp/main.cpp +++ b/src/cpp/main.cpp @@ -12,7 +12,7 @@ #include #endif -#include "api.hpp" +#include "larynx.hpp" using namespace std; @@ -23,6 +23,7 @@ struct RunConfig { filesystem::path modelConfigPath; OutputType outputType = OUTPUT_PLAY; optional outputPath; + optional speakerId; }; void parseArgs(int argc, char *argv[], RunConfig &runConfig); @@ -36,7 +37,7 @@ int main(int argc, char *argv[]) { larynx::Voice voice; auto startTime = chrono::steady_clock::now(); loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(), - 
voice); + voice, runConfig.speakerId); auto endTime = chrono::steady_clock::now(); auto loadSeconds = chrono::duration(endTime - startTime).count(); cerr << "Load time: " << loadSeconds << " sec" << endl; @@ -122,9 +123,11 @@ int main(int argc, char *argv[]) { larynx::terminate(); +#ifdef HAVE_PCAUDIO audio_object_close(my_audio); audio_object_destroy(my_audio); my_audio = nullptr; +#endif return EXIT_SUCCESS; } @@ -145,6 +148,7 @@ void printUsage(char *argv[]) { cerr << " -d DIR --output_dir DIR path to output directory (default: " "cwd)" << endl; + cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl; cerr << endl; } @@ -182,6 +186,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) { ensureArg(argc, argv, i); runConfig.outputType = OUTPUT_DIRECTORY; runConfig.outputPath = filesystem::path(argv[++i]); + } else if (arg == "-s" || arg == "--speaker") { + ensureArg(argc, argv, i); + runConfig.speakerId = (larynx::SpeakerId)stoi(argv[++i]); } else if (arg == "-h" || arg == "--help") { printUsage(argv); exit(0); diff --git a/src/cpp/phonemize.hpp b/src/cpp/phonemize.hpp index 80c62c6..1c89b53 100644 --- a/src/cpp/phonemize.hpp +++ b/src/cpp/phonemize.hpp @@ -10,9 +10,9 @@ #include #include -#include #include "config.hpp" +#include "utf8.h" using namespace std; diff --git a/src/cpp/synthesize.hpp b/src/cpp/synthesize.hpp index f75d77b..71070e2 100644 --- a/src/cpp/synthesize.hpp +++ b/src/cpp/synthesize.hpp @@ -53,6 +53,15 @@ void synthesize(SynthesisConfig &synthesisConfig, ModelSession &session, Ort::Value::CreateTensor(memoryInfo, scales.data(), scales.size(), scalesShape.data(), scalesShape.size())); + if (synthesisConfig.speakerId) { + // Add speaker id + vector speakerId{(int64_t)synthesisConfig.speakerId.value()}; + vector speakerIdShape{1}; + inputTensors.push_back(Ort::Value::CreateTensor( + memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(), + speakerIdShape.size())); + } + // Infer auto startTime = 
chrono::steady_clock::now(); auto outputTensors =