diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp
index 1242b87..aad42af 100644
--- a/src/cpp/main.cpp
+++ b/src/cpp/main.cpp
@@ -4,6 +4,7 @@
 #include <fstream>
 #include <functional>
 #include <iostream>
+#include <map>
 #include <mutex>
 #include <optional>
 #include <sstream>
@@ -76,6 +77,9 @@ struct RunConfig {
   // "output_file": str, (optional)
   // }
   bool jsonInput = false;
+
+  // Seconds of extra silence to insert after a single phoneme
+  optional<map<piper::Phoneme, float>> phonemeSilenceSeconds;
 };
 
 void parseArgs(int argc, char *argv[], RunConfig &runConfig);
@@ -185,6 +189,8 @@ int main(int argc, char *argv[]) {
         runConfig.sentenceSilenceSeconds.value();
   }
 
+  voice.synthesisConfig.phonemeSilenceSeconds = runConfig.phonemeSilenceSeconds;
+
   if (runConfig.outputType == OUTPUT_DIRECTORY) {
     runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
     spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
@@ -453,6 +459,23 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
     } else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
       ensureArg(argc, argv, i);
       runConfig.sentenceSilenceSeconds = stof(argv[++i]);
+    } else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") {
+      ensureArg(argc, argv, i);
+      ensureArg(argc, argv, i + 1);
+      auto phonemeStr = std::string(argv[++i]);
+      if (!piper::isSingleCodepoint(phonemeStr)) {
+        std::cerr << "Phoneme '" << phonemeStr
+                  << "' is not a single codepoint (--phoneme_silence)"
+                  << std::endl;
+        exit(1);
+      }
+
+      if (!runConfig.phonemeSilenceSeconds) {
+        runConfig.phonemeSilenceSeconds.emplace();
+      }
+
+      auto phoneme = piper::getCodepoint(phonemeStr);
+      (*runConfig.phonemeSilenceSeconds)[phoneme] = stof(argv[++i]);
     } else if (arg == "--espeak_data" || arg == "--espeak-data") {
       ensureArg(argc, argv, i);
       runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp
index d83dd3f..6da95cd 100644
--- a/src/cpp/piper.cpp
+++ b/src/cpp/piper.cpp
@@ -30,9 +30,7 @@
 const float MAX_WAV_VALUE = 32767.0f;
 
 const std::string instanceName{"piper"};
-std::string getVersion() {
-  return VERSION;
-}
+std::string getVersion() { return VERSION; }
 
 // True if the string is a single UTF-8 codepoint
 bool isSingleCodepoint(std::string s) {
@@ -458,30 +456,90 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
                     sentencePhonemes.size(), phonemesStr);
     }
 
-    SynthesisResult sentenceResult;
+    std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
+    std::vector<SynthesisResult> phraseResults;
+    std::vector<size_t> phraseSilenceSamples;
 
     // Use phoneme/id map from config
     PhonemeIdConfig idConfig;
     idConfig.phonemeIdMap =
         std::make_shared<PhonemeIdMap>(voice.phonemizeConfig.phonemeIdMap);
 
-    // phonemes -> ids
-    phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
-    if (spdlog::should_log(spdlog::level::debug)) {
-      // DEBUG log for phoneme ids
-      std::stringstream phonemeIdsStr;
-      for (auto phonemeId : phonemeIds) {
-        phonemeIdsStr << phonemeId << ", ";
-      }
+    if (voice.synthesisConfig.phonemeSilenceSeconds) {
+      // Split into phrases
+      std::map<Phoneme, float> &phonemeSilenceSeconds =
+          *voice.synthesisConfig.phonemeSilenceSeconds;
 
-      spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
-                    sentencePhonemes.size(), phonemeIds.size(),
-                    phonemeIdsStr.str());
+      auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
+      phrasePhonemes.push_back(currentPhrasePhonemes);
+
+      for (auto sentencePhonemesIter = sentencePhonemes.begin();
+           sentencePhonemesIter != sentencePhonemes.end();
+           sentencePhonemesIter++) {
+        Phoneme &currentPhoneme = *sentencePhonemesIter;
+        currentPhrasePhonemes->push_back(currentPhoneme);
+
+        if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
+          // Split at phrase boundary
+          phraseSilenceSamples.push_back(
+              (std::size_t)(phonemeSilenceSeconds[currentPhoneme] *
+                            voice.synthesisConfig.sampleRate *
+                            voice.synthesisConfig.channels));
+
+          currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
+          phrasePhonemes.push_back(currentPhrasePhonemes);
+        }
+      }
+    } else {
+      // Use all phonemes
+      phrasePhonemes.push_back(
+          std::make_shared<std::vector<Phoneme>>(sentencePhonemes));
     }
 
-    // ids -> audio
-    synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
-               sentenceResult);
+    // Ensure results/samples are the same size
+    while (phraseResults.size() < phrasePhonemes.size()) {
+      phraseResults.emplace_back();
+    }
+
+    while (phraseSilenceSamples.size() < phrasePhonemes.size()) {
+      phraseSilenceSamples.push_back(0);
+    }
+
+    // phonemes -> ids -> audio
+    for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) {
+      if (phrasePhonemes[phraseIdx]->size() <= 0) {
+        continue;
+      }
+
+      // phonemes -> ids
+      phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds,
+                      missingPhonemes);
+      if (spdlog::should_log(spdlog::level::debug)) {
+        // DEBUG log for phoneme ids
+        std::stringstream phonemeIdsStr;
+        for (auto phonemeId : phonemeIds) {
+          phonemeIdsStr << phonemeId << ", ";
+        }
+
+        spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
+                      phrasePhonemes[phraseIdx]->size(), phonemeIds.size(),
+                      phonemeIdsStr.str());
+      }
+
+      // ids -> audio
+      synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
+                 phraseResults[phraseIdx]);
+
+      // Add end of phrase silence
+      for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) {
+        audioBuffer.push_back(0);
+      }
+
+      result.audioSeconds += phraseResults[phraseIdx].audioSeconds;
+      result.inferSeconds += phraseResults[phraseIdx].inferSeconds;
+
+      phonemeIds.clear();
+    }
 
     // Add end of sentence silence
     if (sentenceSilenceSamples > 0) {
@@ -496,9 +554,6 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
       audioBuffer.clear();
     }
 
-    result.audioSeconds += sentenceResult.audioSeconds;
-    result.inferSeconds += sentenceResult.inferSeconds;
-
     phonemeIds.clear();
   }
 
diff --git a/src/cpp/piper.hpp b/src/cpp/piper.hpp
index 9e7c222..332a619 100644
--- a/src/cpp/piper.hpp
+++ b/src/cpp/piper.hpp
@@ -3,6 +3,7 @@
 
 #include <fstream>
 #include <functional>
+#include <map>
 #include <memory>
 #include <optional>
 #include <string>
@@ -49,14 +50,22 @@ struct PhonemizeConfig {
 };
 
 struct SynthesisConfig {
+  // VITS inference settings
   float noiseScale = 0.667f;
   float lengthScale = 1.0f;
   float noiseW = 0.8f;
+
+  // Audio settings
   int sampleRate = 22050;
   int sampleWidth = 2; // 16-bit
   int channels = 1;    // mono
+
+  // Speaker id from 0 to numSpeakers - 1
   std::optional<SpeakerId> speakerId;
+
+  // Extra silence
   float sentenceSilenceSeconds = 0.2f;
+  std::optional<std::map<Phoneme, float>> phonemeSilenceSeconds;
 };
 
 struct ModelConfig {
@@ -89,6 +98,12 @@ struct Voice {
   ModelSession session;
 };
 
+// True if the string is a single UTF-8 codepoint
+bool isSingleCodepoint(std::string s);
+
+// Get the first UTF-8 codepoint of a string
+Phoneme getCodepoint(std::string s);
+
 // Get version of Piper
 std::string getVersion();