Split into sentences and output audio as available

This commit is contained in:
Michael Hansen
2023-04-12 15:56:06 -05:00
parent f8386b1984
commit e1d34f14fb
6 changed files with 255 additions and 51 deletions
+1
View File
@@ -30,6 +30,7 @@ set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_
# Link the piper target against onnxruntime, pthread and the libraries listed
# in ESPEAK_NG_LIBRARIES and PCAUDIO_LIBRARIES.
# -static-libgcc / -static-libstdc++ are GNU toolchain link flags requesting
# that libgcc and libstdc++ be linked statically into the binary.
target_link_libraries(piper
onnxruntime
pthread
-static-libgcc -static-libstdc++
${ESPEAK_NG_LIBRARIES}
${PCAUDIO_LIBRARIES})
+7 -4
View File
@@ -21,19 +21,22 @@ typedef char32_t Phoneme;
typedef int64_t PhonemeId;
typedef int64_t SpeakerId;
const string DefaultVoice = "en-gb-x-rp";
const string DefaultVoice = "en-us";
// Input mode stored in eSpeakConfig::mode (which defaults to Text).
// NOTE(review): the enumerator names suggest plain text, text with embedded
// phonemes, and SSML input — their exact handling is not visible here; confirm.
enum eSpeakMode { Text, TextWithPhonemes, SSML };
// Settings used when phonemizing text with eSpeak-ng.
struct eSpeakConfig {
// Voice name handed to espeak_SetVoiceByName() by phonemize(); defaults to
// DefaultVoice.
string voice = DefaultVoice;
// Input mode; defaults to Text (see eSpeakMode).
eSpeakMode mode = Text;
// Characters that eSpeak uses to break apart paragraphs/sentences
set<Phoneme> clauseBreakers{U'.', U'?', U'!', U',', U';', U':'};
// Characters that piper will use to split utterances
// (the sentence-final subset of clauseBreakers: '.', '?' and '!').
set<Phoneme> sentenceBreakers{U'.', U'?', U'!'};
};
struct PhonemizeConfig {
string text;
optional<vector<Phoneme>> phonemes;
optional<map<Phoneme, vector<Phoneme>>> phonemeMap;
map<Phoneme, vector<PhonemeId>> phonemeIdMap;
@@ -46,7 +49,6 @@ struct PhonemizeConfig {
};
struct SynthesisConfig {
vector<PhonemeId> phonemeIds;
float noiseScale = 0.667f;
float lengthScale = 1.0f;
float noiseW = 0.8f;
@@ -54,6 +56,7 @@ struct SynthesisConfig {
int sampleWidth = 2; // 16-bit
int channels = 1; // mono
optional<SpeakerId> speakerId;
float sentenceSilenceSeconds = 0.2f;
};
struct ModelConfig {
+159 -7
View File
@@ -1,10 +1,13 @@
#include <chrono>
#include <condition_variable>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>
#ifdef HAVE_PCAUDIO
@@ -16,7 +19,13 @@
using namespace std;
enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_PLAY };
// Where synthesized audio is sent; stored in RunConfig::outputType and
// selected by parseArgs().
enum OutputType {
// NOTE(review): presumably a single WAV file at outputPath — the handling
// branch is not visible in this view; confirm.
OUTPUT_FILE,
// Selected by -d/--output_dir; outputPath holds the directory.
OUTPUT_DIRECTORY,
// WAV written to stdout.
OUTPUT_STDOUT,
// Audio played through pcaudiolib; main() throws if built without
// HAVE_PCAUDIO.
OUTPUT_PLAY,
// Selected by --output_raw; raw audio written to stdout as it becomes
// available.
OUTPUT_RAW
};
struct RunConfig {
filesystem::path modelPath;
@@ -30,6 +39,15 @@ struct RunConfig {
};
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
// Thread entry point for OUTPUT_RAW: waits on cvAudio for audioReady, drains
// sharedAudioBuffer under mutAudio and writes the samples to stdout; returns
// once audioFinished is set and the shared buffer is empty.
void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
condition_variable &cvAudio, bool &audioReady,
bool &audioFinished);
#ifdef HAVE_PCAUDIO
// Thread entry point for OUTPUT_PLAY: same hand-off protocol as
// rawOutputProc, but the drained samples are written to my_audio with
// audio_object_write() instead of stdout.
void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
bool &audioFinished);
#endif
int main(int argc, char *argv[]) {
RunConfig runConfig;
@@ -118,17 +136,78 @@ int main(int argc, char *argv[]) {
} else if (runConfig.outputType == OUTPUT_STDOUT) {
// Output WAV to stdout
piper::textToWavFile(voice, line, cout, result);
} else if (runConfig.outputType == OUTPUT_RAW) {
// Raw output to stdout
mutex mutAudio;
condition_variable cvAudio;
bool audioReady = false;
bool audioFinished = false;
vector<int16_t> audioBuffer;
vector<int16_t> sharedAudioBuffer;
thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer),
ref(mutAudio), ref(cvAudio), ref(audioReady),
ref(audioFinished));
auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio,
&cvAudio, &audioReady]() {
// Signal thread that audio is ready
{
unique_lock lockAudio(mutAudio);
copy(audioBuffer.begin(), audioBuffer.end(),
back_inserter(sharedAudioBuffer));
audioReady = true;
cvAudio.notify_one();
}
};
piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
// Signal thread that there is no more audio
{
unique_lock lockAudio(mutAudio);
audioReady = true;
audioFinished = true;
cvAudio.notify_one();
}
// Wait for audio output to finish
cerr << "Waiting for audio..." << endl;
rawOutputThread.join();
} else if (runConfig.outputType == OUTPUT_PLAY) {
#ifdef HAVE_PCAUDIO
mutex mutAudio;
condition_variable cvAudio;
bool audioReady = false;
bool audioFinished = false;
vector<int16_t> audioBuffer;
piper::textToAudio(voice, line, audioBuffer, result);
vector<int16_t> sharedAudioBuffer;
int error = audio_object_write(my_audio, (const char *)audioBuffer.data(),
sizeof(int16_t) * audioBuffer.size());
if (error != 0) {
throw runtime_error(audio_object_strerror(my_audio, error));
thread playThread(playProc, my_audio, ref(sharedAudioBuffer),
ref(mutAudio), ref(cvAudio), ref(audioReady),
ref(audioFinished));
auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio,
&cvAudio, &audioReady]() {
// Signal thread that audio is ready
{
unique_lock lockAudio(mutAudio);
copy(audioBuffer.begin(), audioBuffer.end(),
back_inserter(sharedAudioBuffer));
audioReady = true;
cvAudio.notify_one();
}
};
piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
// Signal thread that there is no more audio
{
unique_lock lockAudio(mutAudio);
audioReady = true;
audioFinished = true;
cvAudio.notify_one();
}
audio_object_flush(my_audio);
// Wait for audio output to finish
cerr << "Waiting for audio..." << endl;
playThread.join();
#else
throw runtime_error("Cannot play audio! Not compiled with pcaudiolib.");
#endif
@@ -150,6 +229,74 @@ int main(int argc, char *argv[]) {
return EXIT_SUCCESS;
}
// Consumer loop for raw output: drains audio handed over by the synthesis
// thread and writes it to stdout as raw 16-bit samples.
//
// sharedAudioBuffer  samples appended by the producer; emptied here.
// mutAudio           guards sharedAudioBuffer, audioReady and audioFinished.
// cvAudio            notified by the producer after it sets audioReady.
// audioReady         set by the producer when there is something to look at;
//                    reset here after each drain unless audioFinished is set.
// audioFinished      set by the producer when no more audio will arrive.
//
// Returns once audioFinished is set and sharedAudioBuffer is empty.
void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
                   condition_variable &cvAudio, bool &audioReady,
                   bool &audioFinished) {
  // Invariant: empty at the top of every iteration (cleared after each write).
  vector<int16_t> internalAudioBuffer;

  while (true) {
    {
      unique_lock lockAudio{mutAudio};
      cvAudio.wait(lockAudio, [&audioReady] { return audioReady; });

      if (sharedAudioBuffer.empty() && audioFinished) {
        break;
      }

      // Take the pending samples with an O(1) swap instead of copying them
      // element by element, so the producer is blocked on the mutex for as
      // short a time as possible. The local buffer is empty here, so the
      // shared buffer ends up empty exactly as if it had been cleared.
      internalAudioBuffer.swap(sharedAudioBuffer);

      // Once finished, leave audioReady set so the next wait() falls straight
      // through and the loop can observe the empty buffer and exit.
      if (!audioFinished) {
        audioReady = false;
      }
    }

    // Write outside the lock so synthesis can continue during output.
    cout.write(reinterpret_cast<const char *>(internalAudioBuffer.data()),
               sizeof(int16_t) * internalAudioBuffer.size());
    cout.flush();
    internalAudioBuffer.clear();
  }

} // rawOutputProc
#ifdef HAVE_PCAUDIO
// Consumer loop for audio playback: drains audio handed over by the synthesis
// thread and writes it to the pcaudiolib device.
//
// my_audio           audio object the samples are written to.
// sharedAudioBuffer  samples appended by the producer; emptied here.
// mutAudio           guards sharedAudioBuffer, audioReady and audioFinished.
// cvAudio            notified by the producer after it sets audioReady.
// audioReady         set by the producer when there is something to look at;
//                    reset here after each drain unless audioFinished is set.
// audioFinished      set by the producer when no more audio will arrive.
//
// Returns once audioFinished is set and sharedAudioBuffer is empty.
// Throws runtime_error if audio_object_write() reports an error.
// NOTE(review): main() runs this function as a std::thread entry point, and an
// exception leaving a thread's entry function calls std::terminate — confirm
// that aborting the process is the intended reaction to a playback error.
void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
              mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
              bool &audioFinished) {
  // Invariant: empty at the top of every iteration (cleared after each write).
  vector<int16_t> internalAudioBuffer;

  while (true) {
    {
      unique_lock lockAudio{mutAudio};
      cvAudio.wait(lockAudio, [&audioReady] { return audioReady; });

      if (sharedAudioBuffer.empty() && audioFinished) {
        break;
      }

      // Take the pending samples with an O(1) swap instead of copying them
      // element by element, so the producer is blocked on the mutex for as
      // short a time as possible. The local buffer is empty here, so the
      // shared buffer ends up empty exactly as if it had been cleared.
      internalAudioBuffer.swap(sharedAudioBuffer);

      // Once finished, leave audioReady set so the next wait() falls straight
      // through and the loop can observe the empty buffer and exit.
      if (!audioFinished) {
        audioReady = false;
      }
    }

    // Play outside the lock so synthesis can continue during playback.
    int error = audio_object_write(
        my_audio, reinterpret_cast<const char *>(internalAudioBuffer.data()),
        sizeof(int16_t) * internalAudioBuffer.size());
    if (error != 0) {
      throw runtime_error(audio_object_strerror(my_audio, error));
    }

    audio_object_flush(my_audio);
    internalAudioBuffer.clear();
  }

} // playProc
#endif
void printUsage(char *argv[]) {
cerr << endl;
cerr << "usage: " << argv[0] << " [options]" << endl;
@@ -166,6 +313,9 @@ void printUsage(char *argv[]) {
cerr << " -d DIR --output_dir DIR path to output directory (default: "
"cwd)"
<< endl;
cerr << " --output_raw output raw audio to stdout as it "
"becomes available"
<< endl;
cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl;
cerr << " --noise-scale NUM generator noise (default: 0.667)"
<< endl;
@@ -210,6 +360,8 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
ensureArg(argc, argv, i);
runConfig.outputType = OUTPUT_DIRECTORY;
runConfig.outputPath = filesystem::path(argv[++i]);
} else if (arg == "--output_raw") {
runConfig.outputType = OUTPUT_RAW;
} else if (arg == "-s" || arg == "--speaker") {
ensureArg(argc, argv, i);
runConfig.speakerId = (piper::SpeakerId)stol(argv[++i]);
+38 -25
View File
@@ -4,6 +4,7 @@
#include <filesystem>
#include <iostream>
#include <map>
#include <optional>
#include <set>
#include <stdexcept>
#include <string>
@@ -19,27 +20,29 @@ using namespace std;
namespace piper {
// Text to phonemes using eSpeak-ng
void phonemize(PhonemizeConfig &phonemizeConfig) {
void phonemize(string text, PhonemizeConfig &phonemizeConfig,
vector<vector<Phoneme>> &phonemes) {
if (!phonemizeConfig.eSpeak) {
throw runtime_error("Missing eSpeak config");
}
if (!phonemizeConfig.phonemes) {
phonemizeConfig.phonemes.emplace();
}
auto voice = phonemizeConfig.eSpeak->voice;
int result = espeak_SetVoiceByName(voice.c_str());
if (result != 0) {
throw runtime_error("Failed to set eSpeak-ng voice");
}
string text(phonemizeConfig.text);
// Modified by eSpeak
string textCopy(text);
utf8::iterator textIter(textCopy.begin(), textCopy.begin(), textCopy.end());
utf8::iterator textIterEnd(textCopy.end(), textCopy.begin(), textCopy.end());
vector<char32_t> textClauseBreakers;
utf8::iterator textIter(text.begin(), text.begin(), text.end());
utf8::iterator textIterEnd(text.end(), text.begin(), text.end());
// Identify clause breakers in the sentence, since eSpeak removes them during
// phonemization.
//
// This will unfortunately do the wrong thing with abbreviations, etc.
while (textIter != textIterEnd) {
auto codepoint = *textIter;
if (phonemizeConfig.eSpeak->clauseBreakers.contains(codepoint)) {
@@ -49,7 +52,8 @@ void phonemize(PhonemizeConfig &phonemizeConfig) {
textIter++;
}
const char *inputTextPointer = text.c_str();
vector<Phoneme> *sentencePhonemes = nullptr;
const char *inputTextPointer = textCopy.c_str();
size_t clauseBreakerIndex = 0;
while (inputTextPointer != NULL) {
@@ -63,11 +67,21 @@ void phonemize(PhonemizeConfig &phonemizeConfig) {
utf8::iterator phonemeEnd(clausePhonemes.end(), clausePhonemes.begin(),
clausePhonemes.end());
phonemizeConfig.phonemes->insert(phonemizeConfig.phonemes->end(),
phonemeIter, phonemeEnd);
if (!sentencePhonemes) {
// Start new sentence
phonemes.emplace_back();
sentencePhonemes = &phonemes[phonemes.size() - 1];
}
sentencePhonemes->insert(sentencePhonemes->end(), phonemeIter, phonemeEnd);
if (clauseBreakerIndex < textClauseBreakers.size()) {
phonemizeConfig.phonemes->push_back(
textClauseBreakers[clauseBreakerIndex]);
auto clauseBreaker = textClauseBreakers[clauseBreakerIndex];
sentencePhonemes->push_back(clauseBreaker);
if (phonemizeConfig.eSpeak->sentenceBreakers.contains(clauseBreaker)) {
// End of sentence
sentencePhonemes = nullptr;
}
clauseBreakerIndex++;
}
}
@@ -75,31 +89,30 @@ void phonemize(PhonemizeConfig &phonemizeConfig) {
} /* phonemize */
// Phonemes to ids using JSON map
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// stripped, so it appears to contain both the removed and the added lines.
// Lines that go through synthesisConfig.phonemeIds or phonemizeConfig.phonemes
// look like the removed versions; each is followed by a replacement that uses
// the `phonemes` / `phonemeIds` parameters — confirm against the commit.
//
// Behaviour of the parameter-based lines: throws runtime_error when `phonemes`
// is empty; otherwise appends idBos, then the ids that phonemeIdMap holds for
// each phoneme (phonemes with no entry in the map are skipped), then idEos.
// When interspersePad is set, idPad is appended after idBos and after every
// mapped id. Ids are appended to `phonemeIds`; it is not cleared here.
void phonemes2ids(PhonemizeConfig &phonemizeConfig,
SynthesisConfig &synthesisConfig) {
if (!phonemizeConfig.phonemes) {
throw runtime_error("No phonemes present");
// NOTE(review): the next signature appears to be the replacement for the one
// above.
void phonemes2ids(vector<Phoneme> &phonemes, PhonemizeConfig &phonemizeConfig,
vector<PhonemeId> &phonemeIds) {
if (phonemes.empty()) {
throw runtime_error("No phonemes");
}
// Beginning-of-sequence marker.
synthesisConfig.phonemeIds.push_back(phonemizeConfig.idBos);
phonemeIds.push_back(phonemizeConfig.idBos);
if (phonemizeConfig.interspersePad) {
synthesisConfig.phonemeIds.push_back(phonemizeConfig.idPad);
phonemeIds.push_back(phonemizeConfig.idPad);
}
for (auto phoneme = phonemizeConfig.phonemes->begin();
phoneme != phonemizeConfig.phonemes->end(); phoneme++) {
for (auto phoneme = phonemes.begin(); phoneme != phonemes.end(); phoneme++) {
// Phonemes without an entry in the id map are silently dropped.
if (phonemizeConfig.phonemeIdMap.contains(*phoneme)) {
// A single phoneme may map to several ids.
for (auto id : phonemizeConfig.phonemeIdMap[*phoneme]) {
synthesisConfig.phonemeIds.push_back(id);
phonemeIds.push_back(id);
if (phonemizeConfig.interspersePad) {
synthesisConfig.phonemeIds.push_back(phonemizeConfig.idPad);
phonemeIds.push_back(phonemizeConfig.idPad);
}
}
}
}
// End-of-sequence marker.
synthesisConfig.phonemeIds.push_back(phonemizeConfig.idEos);
phonemeIds.push_back(phonemizeConfig.idEos);
} /* phonemes2ids */
+44 -8
View File
@@ -76,15 +76,51 @@ void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
// Phonemize text and synthesize audio
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// stripped, so it appears to contain both the removed and the added lines.
// The single-shot lines (phonemizeConfig.text / phonemes.reset(), the
// one-argument phonemize(), synthesisConfig.phonemeIds, the four-argument
// synthesize()) look like the removed version; the per-sentence loop looks
// like the replacement — confirm against the commit.
//
// Behaviour of the per-sentence lines: phonemize `text` into one phoneme list
// per sentence, then for each sentence convert to ids, synthesize into
// audioBuffer and append sentenceSilenceSeconds of zero samples. If
// audioCallback is non-empty it is invoked after each sentence and audioBuffer
// is cleared afterwards; otherwise audio for all sentences accumulates in
// audioBuffer. Per-sentence timings are added to `result`, which is not reset
// here.
void textToAudio(Voice &voice, string text, vector<int16_t> &audioBuffer,
SynthesisResult &result) {
voice.phonemizeConfig.text = text;
voice.phonemizeConfig.phonemes.reset();
phonemize(voice.phonemizeConfig);
SynthesisResult &result,
const function<void()> &audioCallback) {
voice.synthesisConfig.phonemeIds.clear();
phonemes2ids(voice.phonemizeConfig, voice.synthesisConfig);
// Silence length in samples: seconds * sample rate * channel count.
size_t sentenceSilenceSamples = 0;
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
sentenceSilenceSamples = (size_t)(
voice.synthesisConfig.sentenceSilenceSeconds *
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
}
synthesize(voice.synthesisConfig, voice.session, audioBuffer, result);
// Phonemes for each sentence
vector<vector<Phoneme>> phonemes;
phonemize(text, voice.phonemizeConfig, phonemes);
// Reused across sentences; cleared at the end of each iteration.
vector<PhonemeId> phonemeIds;
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
++phonemesIter) {
vector<Phoneme> &sentencePhonemes = *phonemesIter;
// Timing for this sentence only; folded into `result` below.
SynthesisResult sentenceResult;
phonemes2ids(sentencePhonemes, voice.phonemizeConfig, phonemeIds);
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
sentenceResult);
// Add end of sentence silence
if (sentenceSilenceSamples > 0) {
for (size_t i = 0; i < sentenceSilenceSamples; i++) {
audioBuffer.push_back(0);
}
}
if (audioCallback) {
// Call back must copy audio since it is cleared afterwards.
audioCallback();
audioBuffer.clear();
}
result.audioSeconds += sentenceResult.audioSeconds;
result.inferSeconds += sentenceResult.inferSeconds;
phonemeIds.clear();
}
// Guard against division by zero when no audio was produced.
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
} /* textToAudio */
@@ -93,7 +129,7 @@ void textToWavFile(Voice &voice, string text, ostream &audioFile,
SynthesisResult &result) {
vector<int16_t> audioBuffer;
textToAudio(voice, text, audioBuffer, result);
textToAudio(voice, text, audioBuffer, result, NULL);
// Write WAV
auto synthesisConfig = voice.synthesisConfig;
+6 -7
View File
@@ -26,22 +26,21 @@ struct SynthesisResult {
};
// Phoneme ids to WAV audio
void synthesize(SynthesisConfig &synthesisConfig, ModelSession &session,
vector<int16_t> &audioBuffer, SynthesisResult &result) {
void synthesize(vector<PhonemeId> &phonemeIds, SynthesisConfig &synthesisConfig,
ModelSession &session, vector<int16_t> &audioBuffer,
SynthesisResult &result) {
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
// Allocate
vector<int64_t> phonemeIdLengths{(int64_t)synthesisConfig.phonemeIds.size()};
vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
vector<float> scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale,
synthesisConfig.noiseW};
vector<Ort::Value> inputTensors;
vector<int64_t> phonemeIdsShape{1,
(int64_t)synthesisConfig.phonemeIds.size()};
vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, synthesisConfig.phonemeIds.data(),
synthesisConfig.phonemeIds.size(), phonemeIdsShape.data(),
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
phonemeIdsShape.size()));
vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};