Add multispeaker support

This commit is contained in:
Michael Hansen
2023-01-05 21:47:08 -06:00
parent 06a154a4ed
commit a7fe73390e
8 changed files with 68 additions and 16 deletions

View File

@@ -17,9 +17,13 @@ pkg_check_modules(ESPEAK_NG REQUIRED espeak-ng<2)
# https://github.com/espeak-ng/pcaudiolib
check_include_file_cxx("pcaudiolib/audio.h" PCAUDIO_INCLUDE_FOUND)
if(PCAUDIO_INCLUDE_FOUND)
target_compile_definitions(larynx PUBLIC HAVE_PCAUDIO)
set(PCAUDIO_LIBRARIES "pcaudio")
option(USE_PCAUDIO "Build with pcaudiolib" ON)
if(USE_PCAUDIO)
target_compile_definitions(larynx PUBLIC HAVE_PCAUDIO)
set(PCAUDIO_LIBRARIES "pcaudio")
endif()
endif()
set(ONNXRUNTIME_ROOTDIR "/usr/local/include/onnxruntime")

View File

@@ -19,6 +19,7 @@ namespace larynx {
typedef char32_t Phoneme;
typedef int64_t PhonemeId;
typedef int64_t SpeakerId;
const string DefaultVoice = "en-gb-x-rp";
@@ -52,7 +53,11 @@ struct SynthesisConfig {
int sampleRate = 22050;
int sampleWidth = 2; // 16-bit
int channels = 1; // mono
filesystem::path outputPath;
optional<SpeakerId> speakerId;
};
struct ModelConfig {
int numSpeakers;
};
bool isSingleCodepoint(string s) {
@@ -84,14 +89,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
}
auto phonemeMapValue = configRoot["phoneme_map"];
for (auto& fromPhonemeItem : phonemeMapValue.items()) {
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto& toPhonemeValue : fromPhonemeItem.value()) {
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
string toPhoneme = toPhonemeValue.get<string>();
if (!isSingleCodepoint(toPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme map)");
@@ -106,14 +111,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
// phoneme to [id] map
if (configRoot.contains("phoneme_id_map")) {
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
for (auto& fromPhonemeItem : phonemeIdMapValue.items()) {
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
string fromPhoneme = fromPhonemeItem.key();
if (!isSingleCodepoint(fromPhoneme)) {
throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
}
auto fromCodepoint = getCodepoint(fromPhoneme);
for (auto& toIdValue : fromPhonemeItem.value()) {
for (auto &toIdValue : fromPhonemeItem.value()) {
PhonemeId toId = toIdValue.get<PhonemeId>();
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
}
@@ -134,6 +139,12 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
} /* parseSynthesisConfig */
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
} /* parseModelConfig */
} // namespace larynx
#endif // CONFIG_H_

View File

@@ -1,5 +1,5 @@
#ifndef API_H_
#define API_H_
#ifndef LARYNX_H_
#define LARYNX_H_
#include <iostream>
#include <string>
@@ -22,6 +22,7 @@ struct Voice {
json configRoot;
PhonemizeConfig phonemizeConfig;
SynthesisConfig synthesisConfig;
ModelConfig modelConfig;
ModelSession session;
};
@@ -42,12 +43,24 @@ void terminate() {
}
// Load Onnx model and JSON config file
void loadVoice(string modelPath, string modelConfigPath, Voice &voice) {
void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
optional<SpeakerId> &speakerId) {
ifstream modelConfigFile(modelConfigPath.c_str());
voice.configRoot = json::parse(modelConfigFile);
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
parseModelConfig(voice.configRoot, voice.modelConfig);
if (voice.modelConfig.numSpeakers > 1) {
// Multispeaker model
if (speakerId) {
voice.synthesisConfig.speakerId = speakerId;
} else {
// Default speaker
voice.synthesisConfig.speakerId = 0;
}
}
loadModel(modelPath, voice.session);
@@ -83,8 +96,8 @@ void textToWavFile(Voice &voice, string text, ostream &audioFile,
audioFile.write((const char *)audioBuffer.data(),
sizeof(int16_t) * audioBuffer.size());
} /* textToAudio */
} /* textToWavFile */
} // namespace larynx
#endif // API_H_
#endif // LARYNX_H_

View File

@@ -12,7 +12,7 @@
#include <pcaudiolib/audio.h>
#endif
#include "api.hpp"
#include "larynx.hpp"
using namespace std;
@@ -23,6 +23,7 @@ struct RunConfig {
filesystem::path modelConfigPath;
OutputType outputType = OUTPUT_PLAY;
optional<filesystem::path> outputPath;
optional<larynx::SpeakerId> speakerId;
};
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
@@ -36,7 +37,7 @@ int main(int argc, char *argv[]) {
larynx::Voice voice;
auto startTime = chrono::steady_clock::now();
loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(),
voice);
voice, runConfig.speakerId);
auto endTime = chrono::steady_clock::now();
auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
cerr << "Load time: " << loadSeconds << " sec" << endl;
@@ -122,9 +123,11 @@ int main(int argc, char *argv[]) {
larynx::terminate();
#ifdef HAVE_PCAUDIO
audio_object_close(my_audio);
audio_object_destroy(my_audio);
my_audio = nullptr;
#endif
return EXIT_SUCCESS;
}
@@ -145,6 +148,7 @@ void printUsage(char *argv[]) {
cerr << " -d DIR --output_dir DIR path to output directory (default: "
"cwd)"
<< endl;
cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl;
cerr << endl;
}
@@ -182,6 +186,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
ensureArg(argc, argv, i);
runConfig.outputType = OUTPUT_DIRECTORY;
runConfig.outputPath = filesystem::path(argv[++i]);
} else if (arg == "-s" || arg == "--speaker") {
ensureArg(argc, argv, i);
runConfig.speakerId = (larynx::SpeakerId)stoi(argv[++i]);
} else if (arg == "-h" || arg == "--help") {
printUsage(argv);
exit(0);

View File

@@ -10,9 +10,9 @@
#include <vector>
#include <espeak-ng/speak_lib.h>
#include <utf8.h>
#include "config.hpp"
#include "utf8.h"
using namespace std;

View File

@@ -53,6 +53,15 @@ void synthesize(SynthesisConfig &synthesisConfig, ModelSession &session,
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
scalesShape.data(), scalesShape.size()));
if (synthesisConfig.speakerId) {
// Add speaker id
vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value()};
vector<int64_t> speakerIdShape{1};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
speakerIdShape.size()));
}
// Infer
auto startTime = chrono::steady_clock::now();
auto outputTensors =