mirror of
https://github.com/pstrueb/piper.git
synced 2026-04-29 19:24:49 +00:00
Add phoneme-silence
This commit is contained in:
@@ -30,9 +30,7 @@ const float MAX_WAV_VALUE = 32767.0f;
|
||||
|
||||
const std::string instanceName{"piper"};
|
||||
|
||||
std::string getVersion() {
|
||||
return VERSION;
|
||||
}
|
||||
std::string getVersion() { return VERSION; }
|
||||
|
||||
// True if the string is a single UTF-8 codepoint
|
||||
bool isSingleCodepoint(std::string s) {
|
||||
@@ -458,30 +456,90 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
sentencePhonemes.size(), phonemesStr);
|
||||
}
|
||||
|
||||
SynthesisResult sentenceResult;
|
||||
std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
|
||||
std::vector<SynthesisResult> phraseResults;
|
||||
std::vector<size_t> phraseSilenceSamples;
|
||||
|
||||
// Use phoneme/id map from config
|
||||
PhonemeIdConfig idConfig;
|
||||
idConfig.phonemeIdMap =
|
||||
std::make_shared<PhonemeIdMap>(voice.phonemizeConfig.phonemeIdMap);
|
||||
|
||||
// phonemes -> ids
|
||||
phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
|
||||
if (spdlog::should_log(spdlog::level::debug)) {
|
||||
// DEBUG log for phoneme ids
|
||||
std::stringstream phonemeIdsStr;
|
||||
for (auto phonemeId : phonemeIds) {
|
||||
phonemeIdsStr << phonemeId << ", ";
|
||||
}
|
||||
if (voice.synthesisConfig.phonemeSilenceSeconds) {
|
||||
// Split into phrases
|
||||
std::map<Phoneme, float> &phonemeSilenceSeconds =
|
||||
*voice.synthesisConfig.phonemeSilenceSeconds;
|
||||
|
||||
spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
|
||||
sentencePhonemes.size(), phonemeIds.size(),
|
||||
phonemeIdsStr.str());
|
||||
auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
|
||||
phrasePhonemes.push_back(currentPhrasePhonemes);
|
||||
|
||||
for (auto sentencePhonemesIter = sentencePhonemes.begin();
|
||||
sentencePhonemesIter != sentencePhonemes.end();
|
||||
sentencePhonemesIter++) {
|
||||
Phoneme ¤tPhoneme = *sentencePhonemesIter;
|
||||
currentPhrasePhonemes->push_back(currentPhoneme);
|
||||
|
||||
if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
|
||||
// Split at phrase boundary
|
||||
phraseSilenceSamples.push_back(
|
||||
(std::size_t)(phonemeSilenceSeconds[currentPhoneme] *
|
||||
voice.synthesisConfig.sampleRate *
|
||||
voice.synthesisConfig.channels));
|
||||
|
||||
currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
|
||||
phrasePhonemes.push_back(currentPhrasePhonemes);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Use all phonemes
|
||||
phrasePhonemes.push_back(
|
||||
std::make_shared<std::vector<Phoneme>>(sentencePhonemes));
|
||||
}
|
||||
|
||||
// ids -> audio
|
||||
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
|
||||
sentenceResult);
|
||||
// Ensure results/samples are the same size
|
||||
while (phraseResults.size() < phrasePhonemes.size()) {
|
||||
phraseResults.emplace_back();
|
||||
}
|
||||
|
||||
while (phraseSilenceSamples.size() < phrasePhonemes.size()) {
|
||||
phraseSilenceSamples.push_back(0);
|
||||
}
|
||||
|
||||
// phonemes -> ids -> audio
|
||||
for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) {
|
||||
if (phrasePhonemes[phraseIdx]->size() <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// phonemes -> ids
|
||||
phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds,
|
||||
missingPhonemes);
|
||||
if (spdlog::should_log(spdlog::level::debug)) {
|
||||
// DEBUG log for phoneme ids
|
||||
std::stringstream phonemeIdsStr;
|
||||
for (auto phonemeId : phonemeIds) {
|
||||
phonemeIdsStr << phonemeId << ", ";
|
||||
}
|
||||
|
||||
spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
|
||||
phrasePhonemes[phraseIdx]->size(), phonemeIds.size(),
|
||||
phonemeIdsStr.str());
|
||||
}
|
||||
|
||||
// ids -> audio
|
||||
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
|
||||
phraseResults[phraseIdx]);
|
||||
|
||||
// Add end of phrase silence
|
||||
for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) {
|
||||
audioBuffer.push_back(0);
|
||||
}
|
||||
|
||||
result.audioSeconds += phraseResults[phraseIdx].audioSeconds;
|
||||
result.inferSeconds += phraseResults[phraseIdx].inferSeconds;
|
||||
|
||||
phonemeIds.clear();
|
||||
}
|
||||
|
||||
// Add end of sentence silence
|
||||
if (sentenceSilenceSamples > 0) {
|
||||
@@ -496,9 +554,6 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
audioBuffer.clear();
|
||||
}
|
||||
|
||||
result.audioSeconds += sentenceResult.audioSeconds;
|
||||
result.inferSeconds += sentenceResult.inferSeconds;
|
||||
|
||||
phonemeIds.clear();
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user