mirror of
https://github.com/pstrueb/piper.git
synced 2026-04-26 09:44:49 +00:00
Add multispeaker
This commit is contained in:
@@ -17,9 +17,13 @@ pkg_check_modules(ESPEAK_NG REQUIRED espeak-ng<2)
|
||||
|
||||
# https://github.com/espeak-ng/pcaudiolib
|
||||
check_include_file_cxx("pcaudiolib/audio.h" PCAUDIO_INCLUDE_FOUND)
|
||||
|
||||
if(PCAUDIO_INCLUDE_FOUND)
|
||||
target_compile_definitions(larynx PUBLIC HAVE_PCAUDIO)
|
||||
set(PCAUDIO_LIBRARIES "pcaudio")
|
||||
option(USE_PCAUDIO "Build with pcaudiolib" ON)
|
||||
if(USE_PCAUDIO)
|
||||
target_compile_definitions(larynx PUBLIC HAVE_PCAUDIO)
|
||||
set(PCAUDIO_LIBRARIES "pcaudio")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(ONNXRUNTIME_ROOTDIR "/usr/local/include/onnxruntime")
|
||||
|
||||
@@ -19,6 +19,7 @@ namespace larynx {
|
||||
|
||||
typedef char32_t Phoneme;
|
||||
typedef int64_t PhonemeId;
|
||||
typedef int64_t SpeakerId;
|
||||
|
||||
const string DefaultVoice = "en-gb-x-rp";
|
||||
|
||||
@@ -52,7 +53,11 @@ struct SynthesisConfig {
|
||||
int sampleRate = 22050;
|
||||
int sampleWidth = 2; // 16-bit
|
||||
int channels = 1; // mono
|
||||
filesystem::path outputPath;
|
||||
optional<SpeakerId> speakerId;
|
||||
};
|
||||
|
||||
// Model-level settings read from the voice's JSON config.
struct ModelConfig {
  // Number of speakers the model supports; parseModelConfig fills this in
  // from the "num_speakers" config entry. loadVoice only takes the
  // multispeaker path when this is greater than 1, so the default of 1 keeps
  // a default-constructed config well-defined and on the single-speaker path
  // instead of leaving the field uninitialized.
  int numSpeakers = 1;
};
|
||||
|
||||
bool isSingleCodepoint(string s) {
|
||||
@@ -84,14 +89,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
|
||||
}
|
||||
|
||||
auto phonemeMapValue = configRoot["phoneme_map"];
|
||||
for (auto& fromPhonemeItem : phonemeMapValue.items()) {
|
||||
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
|
||||
string fromPhoneme = fromPhonemeItem.key();
|
||||
if (!isSingleCodepoint(fromPhoneme)) {
|
||||
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
|
||||
}
|
||||
|
||||
auto fromCodepoint = getCodepoint(fromPhoneme);
|
||||
for (auto& toPhonemeValue : fromPhonemeItem.value()) {
|
||||
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
|
||||
string toPhoneme = toPhonemeValue.get<string>();
|
||||
if (!isSingleCodepoint(toPhoneme)) {
|
||||
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
|
||||
@@ -106,14 +111,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
|
||||
// phoneme to [id] map
|
||||
if (configRoot.contains("phoneme_id_map")) {
|
||||
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
|
||||
for (auto& fromPhonemeItem : phonemeIdMapValue.items()) {
|
||||
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
|
||||
string fromPhoneme = fromPhonemeItem.key();
|
||||
if (!isSingleCodepoint(fromPhoneme)) {
|
||||
throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
|
||||
}
|
||||
|
||||
auto fromCodepoint = getCodepoint(fromPhoneme);
|
||||
for (auto& toIdValue : fromPhonemeItem.value()) {
|
||||
for (auto &toIdValue : fromPhonemeItem.value()) {
|
||||
PhonemeId toId = toIdValue.get<PhonemeId>();
|
||||
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
|
||||
}
|
||||
@@ -134,6 +139,12 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||
|
||||
} /* parseSynthesisConfig */
|
||||
|
||||
// Read model-level settings from the JSON config root into modelConfig.
//
// configRoot:  parsed JSON config of the voice
// modelConfig: receives the number of speakers
//
// "num_speakers" is treated as optional. When the key is absent the speaker
// count falls back to 1, and configRoot is left untouched (indexing a missing
// key with the non-const operator[] would insert a null entry and the
// following get() would throw). A key that is present but not numeric still
// throws from get(), as before.
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {

  if (configRoot.contains("num_speakers")) {
    // Convert straight to the field's type instead of going through
    // SpeakerId (int64_t) and narrowing implicitly on assignment.
    modelConfig.numSpeakers = configRoot["num_speakers"].get<int>();
  } else {
    modelConfig.numSpeakers = 1;
  }

} /* parseModelConfig */
|
||||
|
||||
} // namespace larynx
|
||||
|
||||
#endif // CONFIG_H_
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#ifndef API_H_
|
||||
#define API_H_
|
||||
#ifndef LARYNX_H_
|
||||
#define LARYNX_H_
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
@@ -22,6 +22,7 @@ struct Voice {
|
||||
json configRoot;
|
||||
PhonemizeConfig phonemizeConfig;
|
||||
SynthesisConfig synthesisConfig;
|
||||
ModelConfig modelConfig;
|
||||
ModelSession session;
|
||||
};
|
||||
|
||||
@@ -42,12 +43,24 @@ void terminate() {
|
||||
}
|
||||
|
||||
// Load Onnx model and JSON config file
|
||||
void loadVoice(string modelPath, string modelConfigPath, Voice &voice) {
|
||||
void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
|
||||
optional<SpeakerId> &speakerId) {
|
||||
ifstream modelConfigFile(modelConfigPath.c_str());
|
||||
voice.configRoot = json::parse(modelConfigFile);
|
||||
|
||||
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
|
||||
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
|
||||
parseModelConfig(voice.configRoot, voice.modelConfig);
|
||||
|
||||
if (voice.modelConfig.numSpeakers > 1) {
|
||||
// Multispeaker model
|
||||
if (speakerId) {
|
||||
voice.synthesisConfig.speakerId = speakerId;
|
||||
} else {
|
||||
// Default speaker
|
||||
voice.synthesisConfig.speakerId = 0;
|
||||
}
|
||||
}
|
||||
|
||||
loadModel(modelPath, voice.session);
|
||||
|
||||
@@ -83,8 +96,8 @@ void textToWavFile(Voice &voice, string text, ostream &audioFile,
|
||||
audioFile.write((const char *)audioBuffer.data(),
|
||||
sizeof(int16_t) * audioBuffer.size());
|
||||
|
||||
} /* textToAudio */
|
||||
} /* textToWavFile */
|
||||
|
||||
} // namespace larynx
|
||||
|
||||
#endif // API_H_
|
||||
#endif // LARYNX_H_
|
||||
@@ -12,7 +12,7 @@
|
||||
#include <pcaudiolib/audio.h>
|
||||
#endif
|
||||
|
||||
#include "api.hpp"
|
||||
#include "larynx.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@@ -23,6 +23,7 @@ struct RunConfig {
|
||||
filesystem::path modelConfigPath;
|
||||
OutputType outputType = OUTPUT_PLAY;
|
||||
optional<filesystem::path> outputPath;
|
||||
optional<larynx::SpeakerId> speakerId;
|
||||
};
|
||||
|
||||
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
|
||||
@@ -36,7 +37,7 @@ int main(int argc, char *argv[]) {
|
||||
larynx::Voice voice;
|
||||
auto startTime = chrono::steady_clock::now();
|
||||
loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(),
|
||||
voice);
|
||||
voice, runConfig.speakerId);
|
||||
auto endTime = chrono::steady_clock::now();
|
||||
auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
|
||||
cerr << "Load time: " << loadSeconds << " sec" << endl;
|
||||
@@ -122,9 +123,11 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
larynx::terminate();
|
||||
|
||||
#ifdef HAVE_PCAUDIO
|
||||
audio_object_close(my_audio);
|
||||
audio_object_destroy(my_audio);
|
||||
my_audio = nullptr;
|
||||
#endif
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
@@ -145,6 +148,7 @@ void printUsage(char *argv[]) {
|
||||
cerr << " -d DIR --output_dir DIR path to output directory (default: "
|
||||
"cwd)"
|
||||
<< endl;
|
||||
cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl;
|
||||
cerr << endl;
|
||||
}
|
||||
|
||||
@@ -182,6 +186,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.outputType = OUTPUT_DIRECTORY;
|
||||
runConfig.outputPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "-s" || arg == "--speaker") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.speakerId = (larynx::SpeakerId)stoi(argv[++i]);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
printUsage(argv);
|
||||
exit(0);
|
||||
|
||||
@@ -10,9 +10,9 @@
|
||||
#include <vector>
|
||||
|
||||
#include <espeak-ng/speak_lib.h>
|
||||
#include <utf8.h>
|
||||
|
||||
#include "config.hpp"
|
||||
#include "utf8.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
@@ -53,6 +53,15 @@ void synthesize(SynthesisConfig &synthesisConfig, ModelSession &session,
|
||||
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
|
||||
scalesShape.data(), scalesShape.size()));
|
||||
|
||||
if (synthesisConfig.speakerId) {
|
||||
// Add speaker id
|
||||
vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value()};
|
||||
vector<int64_t> speakerIdShape{1};
|
||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
|
||||
speakerIdShape.size()));
|
||||
}
|
||||
|
||||
// Infer
|
||||
auto startTime = chrono::steady_clock::now();
|
||||
auto outputTensors =
|
||||
|
||||
Reference in New Issue
Block a user