mirror of
https://github.com/pstrueb/piper.git
synced 2026-05-04 04:58:01 +00:00
First working version with libpiper_phonemize
This commit is contained in:
@@ -1,24 +1,78 @@
|
||||
#ifndef PIPER_H_
|
||||
#define PIPER_H_
|
||||
|
||||
#include <filesystem>
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <fstream>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "json.hpp"
|
||||
#include <espeak-ng/speak_lib.h>
|
||||
#include <onnxruntime_cxx_api.h>
|
||||
#include <phoneme_ids.hpp>
|
||||
#include <phonemize.hpp>
|
||||
|
||||
#include "config.hpp"
|
||||
#include "model.hpp"
|
||||
#include "phonemize.hpp"
|
||||
#include "synthesize.hpp"
|
||||
#include "wavfile.hpp"
|
||||
#include "json.hpp"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace piper {
|
||||
|
||||
// Identifier for a single speaker within a (possibly multi-speaker) voice
// model. Alias declaration (`using`) is the modern C++11+ form of the old
// `typedef`; the file already relies on C++17 (<filesystem>, <optional>).
using SpeakerId = int64_t;
|
||||
|
||||
// Settings for espeak-ng when it is used as the phonemizer backend.
struct eSpeakConfig {
  // espeak-ng voice name (selects language/accent for text-to-phoneme
  // conversion); defaults to US English.
  std::string voice = "en-us";
};
|
||||
|
||||
// Library-wide configuration shared across all loaded voices.
struct PiperConfig {
  // Filesystem path to the espeak-ng-data directory; required when
  // espeak-ng is used for phonemization.
  std::string eSpeakDataPath;

  // When true, espeak-ng is initialized/terminated along with the library.
  bool useESpeak = true;
};
|
||||
|
||||
// How input text becomes phonemes: via espeak-ng, or by treating the text
// itself as phonemes directly. (Unscoped enum kept for source compatibility
// with callers that use the bare enumerator names.)
enum PhonemeType { eSpeakPhonemes, TextPhonemes };
|
||||
|
||||
// Configuration for converting text into a sequence of phoneme ids.
struct PhonemizeConfig {
  // Which phonemization strategy to use (espeak-ng by default).
  PhonemeType phonemeType = eSpeakPhonemes;

  // Optional phoneme -> replacement phoneme(s) mapping applied before the
  // id lookup (absent when the voice defines no remapping).
  std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;

  // Phoneme -> model input id(s) mapping; presumably loaded from the voice's
  // JSON config by parsePhonemizeConfig -- TODO confirm (parser not in view).
  std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;

  PhonemeId idPad = 0; // padding (optionally interspersed)
  PhonemeId idBos = 1; // beginning of sentence
  PhonemeId idEos = 2; // end of sentence

  // When true, idPad is inserted between consecutive phoneme ids.
  bool interspersePad = true;

  // espeak-specific settings; expected to be set when
  // phonemeType == eSpeakPhonemes.
  std::optional<eSpeakConfig> eSpeak;
};
|
||||
|
||||
// Per-voice audio synthesis settings: inference knobs plus output format.
struct SynthesisConfig {
  // Inference noise/variation controls (defaults match common VITS-style
  // settings -- semantics live in synthesize(), not visible here).
  float noiseScale = 0.667f;
  // Phoneme duration scale; larger values produce slower speech.
  float lengthScale = 1.0f;
  float noiseW = 0.8f;

  // Output audio format.
  int sampleRate = 22050;
  int sampleWidth = 2; // 16-bit
  int channels = 1;    // mono

  // Speaker to use; only meaningful for multi-speaker models (set by
  // loadVoice when modelConfig.numSpeakers > 1).
  std::optional<SpeakerId> speakerId;

  // Silence appended after each sentence, in seconds.
  float sentenceSilenceSeconds = 0.2f;
};
|
||||
|
||||
// Model metadata parsed from the voice's JSON config.
struct ModelConfig {
  // Number of speakers the model supports. Default-initialized to 1
  // (single-speaker) so the value is never read indeterminate: loadVoice
  // branches on `numSpeakers > 1`, which was undefined behavior if
  // parseModelConfig ever left the field unset.
  int numSpeakers = 1;
};
|
||||
|
||||
// Bundles all onnxruntime state needed to run one loaded voice model.
struct ModelSession {
  Ort::Session onnx;
  Ort::AllocatorWithDefaultOptions allocator;
  Ort::SessionOptions options;
  Ort::Env env;

  // Ort::Session has no default constructor; start with a null session and
  // let loadModel() (declared in model.hpp) create the real one.
  // NOTE(review): members are destroyed in reverse declaration order, so
  // `env` is destroyed before `onnx` -- confirm onnxruntime allows a session
  // to outlive its environment; otherwise declare `env` first.
  ModelSession() : onnx(nullptr){};
};
|
||||
|
||||
// Timing statistics produced by a synthesis call.
struct SynthesisResult {
  // Seconds spent in model inference. Zero-initialized because textToAudio()
  // accumulates into a caller-provided instance with `+=`; without defaults,
  // a default-constructed SynthesisResult held indeterminate values and the
  // accumulation was undefined behavior.
  double inferSeconds = 0.0;
  // Seconds of audio produced.
  double audioSeconds = 0.0;
  // inferSeconds / audioSeconds; values below 1 mean faster than real time.
  double realTimeFactor = 0.0;
};
|
||||
|
||||
struct Voice {
|
||||
json configRoot;
|
||||
PhonemizeConfig phonemizeConfig;
|
||||
@@ -27,122 +81,25 @@ struct Voice {
|
||||
ModelSession session;
|
||||
};
|
||||
|
||||
// NOTE(review): this span is a diff artifact -- it interleaves the removed
// in-header implementation (taking a cwd path) with the declaration the
// commit adds (taking PiperConfig). It is not compilable as-is; code tokens
// are preserved unchanged below.
void initialize(std::filesystem::path cwd) {
  // Unqualified `string`/`cerr`/`runtime_error` -- presumably a
  // using-directive for std existed earlier in the old header (out of view).
  string dataPath;
  // Must be called before using textTo* functions
  void initialize(PiperConfig &config);

  // Resolve <cwd>/espeak-ng-data; used only if the directory exists,
  // otherwise espeak_Initialize is handed an empty path.
  auto cwdDataPath = std::filesystem::absolute(cwd.append("espeak-ng-data"));
  if (std::filesystem::is_directory(cwdDataPath)) {
    dataPath = cwdDataPath.string();
  }

  cerr << "dataPath: " << dataPath << endl;

  // Set up espeak-ng for calling espeak_TextToPhonemes
  int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
                                 /*buflength*/ 0,
                                 /*path*/ dataPath.c_str(),
                                 /*options*/ 0);
  // espeak_Initialize returns a negative value on failure.
  if (result < 0) {
    throw runtime_error("Failed to initialize eSpeak-ng");
  }
}
|
||||
|
||||
// Old in-header implementation (removed by this commit): shut down espeak-ng.
void terminate() {
  // Clean up espeak-ng
  espeak_Terminate();
}
// Clean up
// Declaration added by this commit; implementation moves out of the header.
void terminate(PiperConfig &config);
|
||||
|
||||
// Load Onnx model and JSON config file
// Old in-header implementation (removed by this commit), followed by the
// declaration the commit adds. Parses the voice's JSON config, fills the
// voice's sub-configs, resolves the speaker id, and loads the ONNX model.
void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
               optional<SpeakerId> &speakerId) {
  // NOTE(review): no check that the file opened; json::parse will throw on
  // an empty/invalid stream if modelConfigPath is wrong.
  ifstream modelConfigFile(modelConfigPath.c_str());
  voice.configRoot = json::parse(modelConfigFile);

  // Populate the voice's sub-configs from the parsed JSON (parsers declared
  // in the included config headers; not visible here).
  parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
  parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
  parseModelConfig(voice.configRoot, voice.modelConfig);

  if (voice.modelConfig.numSpeakers > 1) {
    // Multispeaker model
    if (speakerId) {
      voice.synthesisConfig.speakerId = speakerId;
    } else {
      // Default speaker
      voice.synthesisConfig.speakerId = 0;
    }
  }

  loadModel(modelPath, voice.session);

} /* loadVoice */
// Declaration added by this commit; implementation moves out of the header.
void loadVoice(PiperConfig &config, std::string modelPath,
               std::string modelConfigPath, Voice &voice,
               std::optional<SpeakerId> &speakerId);
|
||||
|
||||
// Phonemize text and synthesize audio
// Old in-header implementation (removed by this commit), followed by the
// declaration the commit adds. Splits text into sentences of phonemes,
// synthesizes each sentence into audioBuffer, optionally invoking
// audioCallback per sentence, and accumulates timing stats into result.
void textToAudio(Voice &voice, string text, vector<int16_t> &audioBuffer,
                 SynthesisResult &result,
                 const function<void()> &audioCallback) {

  // Samples of silence to append after each sentence, derived from the
  // configured silence duration, sample rate, and channel count.
  size_t sentenceSilenceSamples = 0;
  if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
    sentenceSilenceSamples = (size_t)(
        voice.synthesisConfig.sentenceSilenceSeconds *
        voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
  }

  // Phonemes for each sentence
  vector<vector<Phoneme>> phonemes;
  phonemize(text, voice.phonemizeConfig, phonemes);

  // Reused across sentences; cleared at the end of each iteration.
  vector<PhonemeId> phonemeIds;
  for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
       ++phonemesIter) {
    vector<Phoneme> &sentencePhonemes = *phonemesIter;
    SynthesisResult sentenceResult;
    phonemes2ids(sentencePhonemes, voice.phonemizeConfig, phonemeIds);
    synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
               sentenceResult);

    // Add end of sentence silence
    if (sentenceSilenceSamples > 0) {
      for (size_t i = 0; i < sentenceSilenceSamples; i++) {
        audioBuffer.push_back(0);
      }
    }

    if (audioCallback) {
      // Call back must copy audio since it is cleared afterwards.
      audioCallback();
      audioBuffer.clear();
    }

    // NOTE(review): `+=` assumes the caller zero-initialized result's
    // fields; SynthesisResult declares no default member initializers here.
    result.audioSeconds += sentenceResult.audioSeconds;
    result.inferSeconds += sentenceResult.inferSeconds;

    phonemeIds.clear();
  }

  // Guard against division by zero when no audio was produced.
  if (result.audioSeconds > 0) {
    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
  }

} /* textToAudio */
// Declaration added by this commit; implementation moves out of the header.
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
                 std::vector<int16_t> &audioBuffer, SynthesisResult &result,
                 const std::function<void()> &audioCallback);
|
||||
|
||||
// Phonemize text and synthesize audio to WAV file
// Old in-header implementation (removed by this commit), followed by the
// declaration the commit adds. Synthesizes the full text into a buffer,
// then writes a WAV header plus raw 16-bit samples to audioFile.
void textToWavFile(Voice &voice, string text, ostream &audioFile,
                   SynthesisResult &result) {

  vector<int16_t> audioBuffer;
  // NOTE(review): NULL as the std::function callback relies on the implicit
  // nullptr conversion; nullptr would be clearer. No per-sentence callback
  // means the buffer accumulates the whole utterance.
  textToAudio(voice, text, audioBuffer, result, NULL);

  // Write WAV
  auto synthesisConfig = voice.synthesisConfig;
  writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
                 synthesisConfig.channels, (int32_t)audioBuffer.size(),
                 audioFile);

  // Raw PCM samples follow the header.
  audioFile.write((const char *)audioBuffer.data(),
                  sizeof(int16_t) * audioBuffer.size());

} /* textToWavFile */
// Declaration added by this commit; implementation moves out of the header.
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
                   std::ostream &audioFile, SynthesisResult &result);
|
||||
|
||||
} // namespace piper
|
||||
|
||||
|
||||
Reference in New Issue
Block a user