Mirror of https://github.com/pstrueb/piper.git

Commit: Add phoneme-silence
@@ -4,6 +4,7 @@
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <mutex>
#include <sstream>
#include <stdexcept>
@@ -76,6 +77,9 @@ struct RunConfig {
  //     "output_file": str, (optional)
  // }
  bool jsonInput = false;

  // Seconds of extra silence to insert after a single phoneme
  optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
};

void parseArgs(int argc, char *argv[], RunConfig &runConfig);
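
The new RunConfig field is an optional map from a phoneme (a single codepoint) to extra seconds of silence. The following is a minimal, self-contained sketch of that data shape only, not code from the commit; the phonemes and durations are invented, and piper::Phoneme is assumed to be a char32_t codepoint, as the getCodepoint usage later in the diff suggests:

#include <map>
#include <optional>

int main() {
  // Stand-in for piper::Phoneme; assumed to be a single Unicode codepoint.
  using Phoneme = char32_t;

  // Mirrors RunConfig::phonemeSilenceSeconds: empty until --phoneme_silence is seen.
  std::optional<std::map<Phoneme, float>> phonemeSilenceSeconds;

  phonemeSilenceSeconds.emplace();          // create the map on first use
  (*phonemeSilenceSeconds)[U'.'] = 0.5f;    // 0.5 s pause after a '.' phoneme (example value)
  (*phonemeSilenceSeconds)[U','] = 0.2f;    // 0.2 s pause after a ',' phoneme (example value)
  return 0;
}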
@@ -185,6 +189,8 @@ int main(int argc, char *argv[]) {
        runConfig.sentenceSilenceSeconds.value();
  }

  voice.synthesisConfig.phonemeSilenceSeconds = runConfig.phonemeSilenceSeconds;

  if (runConfig.outputType == OUTPUT_DIRECTORY) {
    runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
    spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
@@ -453,6 +459,23 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
    } else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
      ensureArg(argc, argv, i);
      runConfig.sentenceSilenceSeconds = stof(argv[++i]);
    } else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") {
      ensureArg(argc, argv, i);
      ensureArg(argc, argv, i + 1);
      auto phonemeStr = std::string(argv[++i]);
      if (!piper::isSingleCodepoint(phonemeStr)) {
        std::cerr << "Phoneme '" << phonemeStr
                  << "' is not a single codepoint (--phoneme_silence)"
                  << std::endl;
        exit(1);
      }

      if (!runConfig.phonemeSilenceSeconds) {
        runConfig.phonemeSilenceSeconds.emplace();
      }

      auto phoneme = piper::getCodepoint(phonemeStr);
      (*runConfig.phonemeSilenceSeconds)[phoneme] = stof(argv[++i]);
    } else if (arg == "--espeak_data" || arg == "--espeak-data") {
      ensureArg(argc, argv, i);
      runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
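
From the parsing above, --phoneme_silence (or --phoneme-silence) consumes two arguments, a single-codepoint phoneme and a duration in seconds, and the option can be repeated to add more entries to the map. A phoneme string that is not exactly one UTF-8 codepoint is rejected with an error. A hypothetical invocation could look like the following; the model, input, and output names are placeholders and the phonemes and durations are example values only:

piper --model voice.onnx --output_file out.wav \
      --phoneme_silence '.' 0.5 \
      --phoneme_silence ',' 0.2 < input.txt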
@@ -30,9 +30,7 @@ const float MAX_WAV_VALUE = 32767.0f;

const std::string instanceName{"piper"};

std::string getVersion() {
  return VERSION;
}
std::string getVersion() { return VERSION; }

// True if the string is a single UTF-8 codepoint
bool isSingleCodepoint(std::string s) {
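
The bodies of isSingleCodepoint and getCodepoint are not included in this diff, only the opening line of the former. For orientation, here is a standalone sketch of how such helpers are commonly written, decoding just the leading UTF-8 sequence. This is an assumption about the approach, not piper's actual implementation:

#include <cstdio>
#include <string>

// Length in bytes of a UTF-8 sequence, from its first byte (0 if invalid).
static std::size_t utf8Length(unsigned char firstByte) {
  if (firstByte < 0x80) return 1;             // 0xxxxxxx
  if ((firstByte & 0xE0) == 0xC0) return 2;   // 110xxxxx
  if ((firstByte & 0xF0) == 0xE0) return 3;   // 1110xxxx
  if ((firstByte & 0xF8) == 0xF0) return 4;   // 11110xxx
  return 0;                                   // continuation byte or invalid
}

// Sketch of an isSingleCodepoint-style check: the whole string must be
// exactly one UTF-8 sequence.
static bool isSingleCodepointSketch(const std::string &s) {
  return !s.empty() && utf8Length((unsigned char)s[0]) == s.size();
}

// Sketch of a getCodepoint-style decoder: decode only the first sequence.
static char32_t getCodepointSketch(const std::string &s) {
  std::size_t len = utf8Length((unsigned char)s[0]);
  char32_t cp = 0;
  switch (len) {
  case 1: return (char32_t)(unsigned char)s[0];
  case 2: cp = s[0] & 0x1F; break;
  case 3: cp = s[0] & 0x0F; break;
  case 4: cp = s[0] & 0x07; break;
  default: return 0;
  }
  for (std::size_t i = 1; i < len && i < s.size(); i++) {
    cp = (cp << 6) | (char32_t)(s[i] & 0x3F);
  }
  return cp;
}

int main() {
  std::printf("'a'  single=%d cp=U+%04X\n", isSingleCodepointSketch("a"),
              (unsigned)getCodepointSketch("a"));       // single=1 cp=U+0061
  std::printf("'\u0259' single=%d cp=U+%04X\n", isSingleCodepointSketch("\u0259"),
              (unsigned)getCodepointSketch("\u0259"));  // single=1 cp=U+0259
  std::printf("'ab' single=%d\n", isSingleCodepointSketch("ab"));  // single=0
}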
@@ -458,30 +456,90 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
                    sentencePhonemes.size(), phonemesStr);
    }

    SynthesisResult sentenceResult;
    std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
    std::vector<SynthesisResult> phraseResults;
    std::vector<size_t> phraseSilenceSamples;

    // Use phoneme/id map from config
    PhonemeIdConfig idConfig;
    idConfig.phonemeIdMap =
        std::make_shared<PhonemeIdMap>(voice.phonemizeConfig.phonemeIdMap);

    // phonemes -> ids
    phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
    if (spdlog::should_log(spdlog::level::debug)) {
      // DEBUG log for phoneme ids
      std::stringstream phonemeIdsStr;
      for (auto phonemeId : phonemeIds) {
        phonemeIdsStr << phonemeId << ", ";
      }
    if (voice.synthesisConfig.phonemeSilenceSeconds) {
      // Split into phrases
      std::map<Phoneme, float> &phonemeSilenceSeconds =
          *voice.synthesisConfig.phonemeSilenceSeconds;

      spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
                    sentencePhonemes.size(), phonemeIds.size(),
                    phonemeIdsStr.str());
      auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
      phrasePhonemes.push_back(currentPhrasePhonemes);

      for (auto sentencePhonemesIter = sentencePhonemes.begin();
           sentencePhonemesIter != sentencePhonemes.end();
           sentencePhonemesIter++) {
        Phoneme &currentPhoneme = *sentencePhonemesIter;
        currentPhrasePhonemes->push_back(currentPhoneme);

        if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
          // Split at phrase boundary
          phraseSilenceSamples.push_back(
              (std::size_t)(phonemeSilenceSeconds[currentPhoneme] *
                            voice.synthesisConfig.sampleRate *
                            voice.synthesisConfig.channels));

          currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
          phrasePhonemes.push_back(currentPhrasePhonemes);
        }
      }
    } else {
      // Use all phonemes
      phrasePhonemes.push_back(
          std::make_shared<std::vector<Phoneme>>(sentencePhonemes));
    }

    // ids -> audio
    synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
               sentenceResult);
    // Ensure results/samples are the same size
    while (phraseResults.size() < phrasePhonemes.size()) {
      phraseResults.emplace_back();
    }

    while (phraseSilenceSamples.size() < phrasePhonemes.size()) {
      phraseSilenceSamples.push_back(0);
    }

    // phonemes -> ids -> audio
    for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) {
      if (phrasePhonemes[phraseIdx]->size() <= 0) {
        continue;
      }

      // phonemes -> ids
      phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds,
                      missingPhonemes);
      if (spdlog::should_log(spdlog::level::debug)) {
        // DEBUG log for phoneme ids
        std::stringstream phonemeIdsStr;
        for (auto phonemeId : phonemeIds) {
          phonemeIdsStr << phonemeId << ", ";
        }

        spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
                      phrasePhonemes[phraseIdx]->size(), phonemeIds.size(),
                      phonemeIdsStr.str());
      }

      // ids -> audio
      synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
                 phraseResults[phraseIdx]);

      // Add end of phrase silence
      for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) {
        audioBuffer.push_back(0);
      }

      result.audioSeconds += phraseResults[phraseIdx].audioSeconds;
      result.inferSeconds += phraseResults[phraseIdx].inferSeconds;

      phonemeIds.clear();
    }

    // Add end of sentence silence
    if (sentenceSilenceSamples > 0) {
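
The number of silence samples appended after a phrase is the configured seconds multiplied by sampleRate and channels, so with the defaults shown later in SynthesisConfig (22050 Hz, mono) a 0.5 s pause becomes 11025 zero samples pushed onto audioBuffer. Below is a standalone sketch of just the splitting loop above, not piper's API; the phonemes, durations, and defaults are illustrative values:

#include <cstddef>
#include <map>
#include <memory>
#include <vector>

int main() {
  using Phoneme = char32_t;                       // assumed codepoint type
  const int sampleRate = 22050, channels = 1;     // SynthesisConfig defaults

  // Example sentence phonemes and a silence map with one entry for '.'.
  std::vector<Phoneme> sentencePhonemes = {U'h', U'\u0259', U'l', U'o', U'.',
                                           U'w', U'\u025A', U'l', U'd', U'.'};
  std::map<Phoneme, float> phonemeSilenceSeconds = {{U'.', 0.5f}};

  std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
  std::vector<std::size_t> phraseSilenceSamples;

  auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
  phrasePhonemes.push_back(currentPhrasePhonemes);

  for (Phoneme currentPhoneme : sentencePhonemes) {
    currentPhrasePhonemes->push_back(currentPhoneme);

    if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
      // 0.5 s * 22050 Hz * 1 channel = 11025 samples of silence after this phrase
      phraseSilenceSamples.push_back((std::size_t)(
          phonemeSilenceSeconds[currentPhoneme] * sampleRate * channels));
      currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
      phrasePhonemes.push_back(currentPhrasePhonemes);
    }
  }

  // Result: 3 phrases (two ending in '.', plus a trailing empty one that the
  // synthesis loop skips) and phraseSilenceSamples = {11025, 11025}.
  return 0;
}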
@@ -496,9 +554,6 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
      audioBuffer.clear();
    }

    result.audioSeconds += sentenceResult.audioSeconds;
    result.inferSeconds += sentenceResult.inferSeconds;

    phonemeIds.clear();
  }
@@ -3,6 +3,7 @@

#include <fstream>
#include <functional>
#include <map>
#include <optional>
#include <string>
#include <vector>
@@ -49,14 +50,22 @@ struct PhonemizeConfig {
};

struct SynthesisConfig {
  // VITS inference settings
  float noiseScale = 0.667f;
  float lengthScale = 1.0f;
  float noiseW = 0.8f;

  // Audio settings
  int sampleRate = 22050;
  int sampleWidth = 2; // 16-bit
  int channels = 1;    // mono

  // Speaker id from 0 to numSpeakers - 1
  std::optional<SpeakerId> speakerId;

  // Extra silence
  float sentenceSilenceSeconds = 0.2f;
  std::optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
};

struct ModelConfig {
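
Given the audio settings above, every second of extra silence becomes sampleRate * channels zero samples, each sampleWidth bytes wide. A quick worked check of those defaults follows; this is only an illustration (the 0.5 s per-phoneme pause is an invented example value):

#include <cstdio>

int main() {
  const int sampleRate = 22050;   // Hz, default
  const int sampleWidth = 2;      // bytes per sample (16-bit), default
  const int channels = 1;         // mono, default

  const float sentenceSilenceSeconds = 0.2f;  // default
  const float phonemeSilenceSeconds = 0.5f;   // example value for one phoneme

  const int sentenceSamples = (int)(sentenceSilenceSeconds * sampleRate * channels);
  const int phonemeSamples = (int)(phonemeSilenceSeconds * sampleRate * channels);

  // sentence pause: 4410 samples (8820 bytes); phoneme pause: 11025 samples (22050 bytes)
  std::printf("sentence pause: %d samples (%d bytes)\n",
              sentenceSamples, sentenceSamples * sampleWidth);
  std::printf("phoneme pause:  %d samples (%d bytes)\n",
              phonemeSamples, phonemeSamples * sampleWidth);
}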
@@ -89,6 +98,12 @@ struct Voice {
  ModelSession session;
};

// True if the string is a single UTF-8 codepoint
bool isSingleCodepoint(std::string s);

// Get the first UTF-8 codepoint of a string
Phoneme getCodepoint(std::string s);

// Get version of Piper
std::string getVersion();