First working version with libpiper_phonemize

This commit is contained in:
Michael Hansen
2023-06-08 15:40:37 -05:00
parent 7d27863b48
commit 810fad44cf
11 changed files with 505 additions and 778 deletions

View File

@@ -1,24 +1,78 @@
#ifndef PIPER_H_
#define PIPER_H_
#include <filesystem>
#include <iostream>
#include <functional>
#include <fstream>
#include <optional>
#include <string>
#include <vector>
#include "json.hpp"
#include <espeak-ng/speak_lib.h>
#include <onnxruntime_cxx_api.h>
#include <phoneme_ids.hpp>
#include <phonemize.hpp>
#include "config.hpp"
#include "model.hpp"
#include "phonemize.hpp"
#include "synthesize.hpp"
#include "wavfile.hpp"
#include "json.hpp"
using json = nlohmann::json;
namespace piper {
typedef int64_t SpeakerId;
// Settings for the espeak-ng phonemizer backend.
struct eSpeakConfig {
// espeak-ng voice name used to select phoneme rules
std::string voice = "en-us";
};
// Global library configuration passed to initialize()/terminate().
struct PiperConfig {
// Path to the espeak-ng-data directory (required when useESpeak is true)
std::string eSpeakDataPath;
// Whether to initialize espeak-ng for phonemization
bool useESpeak = true;
};
enum PhonemeType { eSpeakPhonemes, TextPhonemes };
// Controls text -> phoneme -> phoneme-id conversion for one voice.
struct PhonemizeConfig {
PhonemeType phonemeType = eSpeakPhonemes;
// Optional remapping applied before id lookup (one phoneme -> many)
std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;
// Maps each phoneme to the id sequence the model expects
std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;
PhonemeId idPad = 0; // padding (optionally interspersed)
PhonemeId idBos = 1; // beginning of sentence
PhonemeId idEos = 2; // end of sentence
// Insert idPad between every pair of phoneme ids when true
bool interspersePad = true;
// Present only when phonemeType == eSpeakPhonemes
std::optional<eSpeakConfig> eSpeak;
};
// Per-voice audio synthesis parameters (VITS inference knobs + WAV format).
struct SynthesisConfig {
float noiseScale = 0.667f; // generator noise scale
float lengthScale = 1.0f; // phoneme duration scale (>1 = slower speech)
float noiseW = 0.8f; // duration-predictor noise width
int sampleRate = 22050;
int sampleWidth = 2; // 16-bit
int channels = 1; // mono
// Set for multi-speaker models; empty for single-speaker
std::optional<SpeakerId> speakerId;
// Silence appended after each synthesized sentence
float sentenceSilenceSeconds = 0.2f;
};
// Model metadata parsed from the voice's JSON config.
struct ModelConfig {
  // Number of speakers the model supports. Default-initialized to 1
  // (single speaker) so the value is never read uninitialized if config
  // parsing does not set it.
  int numSpeakers = 1;
};
// Holds all ONNX Runtime state for one loaded voice model.
// Member order matters: members are destroyed in reverse declaration
// order, so env is declared FIRST to guarantee it is destroyed LAST —
// an Ort::Session must not outlive the Ort::Env it was created under.
struct ModelSession {
  Ort::Env env;
  Ort::SessionOptions options;
  Ort::AllocatorWithDefaultOptions allocator;
  // Constructed empty; loadModel() creates the real session.
  Ort::Session onnx;

  ModelSession() : onnx(nullptr){};
};
// Timing statistics accumulated over one textToAudio() call.
// Fields are zero-initialized: textToAudio accumulates with +=, which
// would read indeterminate values on a default-constructed result
// otherwise.
struct SynthesisResult {
  double inferSeconds = 0.0;    // total time spent in model inference
  double audioSeconds = 0.0;    // total duration of generated audio
  double realTimeFactor = 0.0;  // inferSeconds / audioSeconds (0 if no audio)
};
struct Voice {
json configRoot;
PhonemizeConfig phonemizeConfig;
@@ -27,122 +81,25 @@ struct Voice {
ModelSession session;
};
// NOTE(review): diff artifact — this span interleaves the REMOVED inline
// initialize(cwd) definition with the ADDED initialize(PiperConfig&)
// declaration (next two lines below); it is not valid C++ as rendered
// and must be untangled against the real pre/post-commit files.
void initialize(std::filesystem::path cwd) {
string dataPath;
// Must be called before using textTo* functions
void initialize(PiperConfig &config);
// Old behavior: look for espeak-ng-data next to the working directory
auto cwdDataPath = std::filesystem::absolute(cwd.append("espeak-ng-data"));
if (std::filesystem::is_directory(cwdDataPath)) {
dataPath = cwdDataPath.string();
}
cerr << "dataPath: " << dataPath << endl;
// Set up espeak-ng for calling espeak_TextToPhonemes
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
/*buflength*/ 0,
/*path*/ dataPath.c_str(),
/*options*/ 0);
// espeak_Initialize returns the sample rate on success, negative on error
if (result < 0) {
throw runtime_error("Failed to initialize eSpeak-ng");
}
}
// NOTE(review): diff artifact — removed terminate() definition shown
// together with the added terminate(PiperConfig&) declaration below.
void terminate() {
// Clean up espeak-ng
espeak_Terminate();
}
// Clean up
void terminate(PiperConfig &config);
// Load Onnx model and JSON config file
// NOTE(review): diff artifact — the removed inline loadVoice definition
// and the added config-aware declaration (at the end of this span) are
// interleaved; not valid C++ as rendered.
void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
optional<SpeakerId> &speakerId) {
// Parse the JSON voice config into its three sub-configs
ifstream modelConfigFile(modelConfigPath.c_str());
voice.configRoot = json::parse(modelConfigFile);
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
parseModelConfig(voice.configRoot, voice.modelConfig);
if (voice.modelConfig.numSpeakers > 1) {
// Multispeaker model
if (speakerId) {
voice.synthesisConfig.speakerId = speakerId;
} else {
// Default speaker
voice.synthesisConfig.speakerId = 0;
}
}
loadModel(modelPath, voice.session);
} /* loadVoice */
// Declaration added by this commit: takes the global PiperConfig
void loadVoice(PiperConfig &config, std::string modelPath,
std::string modelConfigPath, Voice &voice,
std::optional<SpeakerId> &speakerId);
// Phonemize text and synthesize audio
// NOTE(review): diff artifact — removed inline definition interleaved
// with the added PiperConfig-taking declaration at the end of this span.
void textToAudio(Voice &voice, string text, vector<int16_t> &audioBuffer,
SynthesisResult &result,
const function<void()> &audioCallback) {
// Precompute how many zero samples make up the inter-sentence pause
size_t sentenceSilenceSamples = 0;
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
sentenceSilenceSamples = (size_t)(
voice.synthesisConfig.sentenceSilenceSeconds *
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
}
// Phonemes for each sentence
vector<vector<Phoneme>> phonemes;
phonemize(text, voice.phonemizeConfig, phonemes);
vector<PhonemeId> phonemeIds;
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
++phonemesIter) {
vector<Phoneme> &sentencePhonemes = *phonemesIter;
SynthesisResult sentenceResult;
phonemes2ids(sentencePhonemes, voice.phonemizeConfig, phonemeIds);
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
sentenceResult);
// Add end of sentence silence
if (sentenceSilenceSamples > 0) {
for (size_t i = 0; i < sentenceSilenceSamples; i++) {
audioBuffer.push_back(0);
}
}
if (audioCallback) {
// Call back must copy audio since it is cleared afterwards.
audioCallback();
audioBuffer.clear();
}
// NOTE(review): += assumes caller's result fields start at zero —
// SynthesisResult members should be default-initialized; confirm.
result.audioSeconds += sentenceResult.audioSeconds;
result.inferSeconds += sentenceResult.inferSeconds;
phonemeIds.clear();
}
// Guard against divide-by-zero when no audio was produced
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
} /* textToAudio */
// Declaration added by this commit: takes the global PiperConfig
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
const std::function<void()> &audioCallback);
// Phonemize text and synthesize audio to WAV file
// NOTE(review): diff artifact — removed inline definition interleaved
// with the added PiperConfig-taking declaration at the end of this span.
void textToWavFile(Voice &voice, string text, ostream &audioFile,
SynthesisResult &result) {
vector<int16_t> audioBuffer;
// NULL callback: accumulate all audio in the buffer, then write once
textToAudio(voice, text, audioBuffer, result, NULL);
// Write WAV
auto synthesisConfig = voice.synthesisConfig;
writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
synthesisConfig.channels, (int32_t)audioBuffer.size(),
audioFile);
audioFile.write((const char *)audioBuffer.data(),
sizeof(int16_t) * audioBuffer.size());
} /* textToWavFile */
// Declaration added by this commit: takes the global PiperConfig
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
std::ostream &audioFile, SynthesisResult &result);
} // namespace piper