mirror of
https://github.com/pstrueb/piper.git
synced 2026-06-18 17:22:27 +00:00
Merge branch 'rhasspy:master' into master
This commit is contained in:
+17
-27
@@ -1,47 +1,37 @@
|
||||
cmake_minimum_required(VERSION 3.13)
|
||||
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
project(piper C CXX)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
find_package(PkgConfig)
|
||||
pkg_check_modules(SPDLOG REQUIRED spdlog)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
ADD_EXECUTABLE(piper main.cpp)
|
||||
ADD_EXECUTABLE(piper main.cpp piper.cpp)
|
||||
|
||||
string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wl,-rpath,'$ORIGIN'")
|
||||
string(APPEND CMAKE_C_FLAGS " -Wall -Wextra")
|
||||
|
||||
find_package(PkgConfig)
|
||||
pkg_check_modules(ESPEAK_NG REQUIRED espeak-ng<2)
|
||||
|
||||
# https://github.com/espeak-ng/pcaudiolib
|
||||
check_include_file_cxx("pcaudiolib/audio.h" PCAUDIO_INCLUDE_FOUND)
|
||||
|
||||
if(PCAUDIO_INCLUDE_FOUND)
|
||||
option(USE_PCAUDIO "Build with pcaudiolib" ON)
|
||||
if(USE_PCAUDIO)
|
||||
target_compile_definitions(piper PUBLIC HAVE_PCAUDIO)
|
||||
set(PCAUDIO_LIBRARIES "pcaudio")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR})
|
||||
set(PIPER_PHONEMIZE_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/piper_phonemize)
|
||||
|
||||
target_link_libraries(piper
|
||||
piper_phonemize
|
||||
espeak-ng
|
||||
onnxruntime
|
||||
pthread
|
||||
-static-libgcc -static-libstdc++
|
||||
${ESPEAK_NG_LIBRARIES}
|
||||
${PCAUDIO_LIBRARIES})
|
||||
${SPDLOG_LIBRARIES})
|
||||
|
||||
if(NOT APPLE)
|
||||
target_link_libraries(piper -static-libgcc -static-libstdc++)
|
||||
endif()
|
||||
|
||||
target_link_directories(piper PUBLIC
|
||||
${ESPEAK_NG_LIBRARY_DIRS}
|
||||
${ONNXRUNTIME_ROOTDIR}/lib)
|
||||
${PIPER_PHONEMIZE_ROOTDIR}/lib)
|
||||
|
||||
target_include_directories(piper PUBLIC
|
||||
${ONNXRUNTIME_ROOTDIR}/include
|
||||
${ESPEAK_NG_INCLUDE_DIRS})
|
||||
${PIPER_PHONEMIZE_ROOTDIR}/include
|
||||
${SPDLOG_INCLUDE_DIRS})
|
||||
|
||||
target_compile_options(piper PUBLIC
|
||||
${ESPEAK_NG_CFLAGS_OTHER})
|
||||
${SPDLOG_CFLAGS_OTHER})
|
||||
|
||||
@@ -1,155 +0,0 @@
|
||||
#ifndef CONFIG_H_
|
||||
#define CONFIG_H_
|
||||
|
||||
#include <filesystem>
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <set>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "json.hpp"
|
||||
#include "utf8.h"
|
||||
|
||||
using namespace std;
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace piper {
|
||||
|
||||
typedef char32_t Phoneme;
|
||||
typedef int64_t PhonemeId;
|
||||
typedef int64_t SpeakerId;
|
||||
|
||||
const string DefaultVoice = "en-us";
|
||||
|
||||
enum eSpeakMode { Text, TextWithPhonemes, SSML };
|
||||
|
||||
struct eSpeakConfig {
|
||||
string voice = DefaultVoice;
|
||||
eSpeakMode mode = Text;
|
||||
|
||||
// Characters that eSpeak uses to break apart paragraphs/sentences
|
||||
set<Phoneme> clauseBreakers{U'.', U'?', U'!', U',', U';', U':'};
|
||||
|
||||
Phoneme fullStop = U'.';
|
||||
Phoneme comma = U',';
|
||||
Phoneme question = U'?';
|
||||
Phoneme exclamation = U'!';
|
||||
};
|
||||
|
||||
struct PhonemizeConfig {
|
||||
optional<map<Phoneme, vector<Phoneme>>> phonemeMap;
|
||||
map<Phoneme, vector<PhonemeId>> phonemeIdMap;
|
||||
|
||||
PhonemeId idPad = 0; // padding (optionally interspersed)
|
||||
PhonemeId idBos = 1; // beginning of sentence
|
||||
PhonemeId idEos = 2; // end of sentence
|
||||
bool interspersePad = true;
|
||||
|
||||
optional<eSpeakConfig> eSpeak;
|
||||
};
|
||||
|
||||
struct SynthesisConfig {
|
||||
float noiseScale = 0.667f;
|
||||
float lengthScale = 1.0f;
|
||||
float noiseW = 0.8f;
|
||||
int sampleRate = 22050;
|
||||
int sampleWidth = 2; // 16-bit
|
||||
int channels = 1; // mono
|
||||
optional<SpeakerId> speakerId;
|
||||
float sentenceSilenceSeconds = 0.2f;
|
||||
};
|
||||
|
||||
// Properties of the loaded voice model.
struct ModelConfig {
  // Number of speakers the model supports.
  // Initialized to 0 so that a default-constructed ModelConfig never holds an
  // indeterminate value (this matches what `ModelConfig{}` already produced).
  int numSpeakers = 0;
};
|
||||
|
||||
bool isSingleCodepoint(string s) {
|
||||
return utf8::distance(s.begin(), s.end()) == 1;
|
||||
}
|
||||
|
||||
// Decodes and returns the first UTF-8 codepoint of `s`.
//
// Declared inline because the definition lives in a header: without it,
// including this header from more than one translation unit produces
// multiple-definition link errors.
//
// NOTE(review): callers are expected to pass a non-empty string; behavior for
// an empty string depends on the utf8 iterator — confirm before relying on it.
inline Phoneme getCodepoint(std::string s) {
  utf8::iterator character_iter(s.begin(), s.begin(), s.end());
  return *character_iter;
}
|
||||
|
||||
// Fills `phonemizeConfig` from the voice's JSON config.
//
// Keys read from `configRoot` (each is optional):
//   "espeak": { "voice": "<name>" }
//       Creates the eSpeak config if absent and sets its voice.
//   "phoneme_map": { "<phoneme>": ["<phoneme>", ...] }
//       Creates phonemeMap if absent and appends the target phonemes.
//   "phoneme_id_map": { "<phoneme>": [<id>, ...] }
//       Appends the ids to phonemeIdMap.
//
// Throws runtime_error when a phoneme string in either map is not exactly one
// codepoint.
//
// Declared inline because the definition lives in a header.
inline void parsePhonemizeConfig(json &configRoot,
                                 PhonemizeConfig &phonemizeConfig) {

  if (configRoot.contains("espeak")) {
    if (!phonemizeConfig.eSpeak) {
      phonemizeConfig.eSpeak.emplace();
    }

    // Bind by reference; `auto x = configRoot[...]` would copy the subtree
    const json &espeakValue = configRoot["espeak"];
    if (espeakValue.contains("voice")) {
      phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<string>();
    }
  }

  // phoneme to [phoneme] map
  if (configRoot.contains("phoneme_map")) {
    if (!phonemizeConfig.phonemeMap) {
      phonemizeConfig.phonemeMap.emplace();
    }

    const json &phonemeMapValue = configRoot["phoneme_map"];
    for (auto &fromPhonemeItem : phonemeMapValue.items()) {
      string fromPhoneme = fromPhonemeItem.key();
      if (!isSingleCodepoint(fromPhoneme)) {
        throw runtime_error("Phonemes must be one codepoint (phoneme map)");
      }

      auto fromCodepoint = getCodepoint(fromPhoneme);
      for (const auto &toPhonemeValue : fromPhonemeItem.value()) {
        string toPhoneme = toPhonemeValue.get<string>();
        if (!isSingleCodepoint(toPhoneme)) {
          throw runtime_error("Phonemes must be one codepoint (phoneme map)");
        }

        auto toCodepoint = getCodepoint(toPhoneme);
        (*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
      }
    }
  }

  // phoneme to [id] map
  if (configRoot.contains("phoneme_id_map")) {
    const json &phonemeIdMapValue = configRoot["phoneme_id_map"];
    for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
      string fromPhoneme = fromPhonemeItem.key();
      if (!isSingleCodepoint(fromPhoneme)) {
        throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
      }

      auto fromCodepoint = getCodepoint(fromPhoneme);
      for (const auto &toIdValue : fromPhonemeItem.value()) {
        PhonemeId toId = toIdValue.get<PhonemeId>();
        phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
      }
    }
  }

} /* parsePhonemizeConfig */
|
||||
|
||||
// Fills `synthesisConfig` from the voice's JSON config.
//
// Reads "audio"."sample_rate" when both keys are present; otherwise
// `synthesisConfig` is left untouched, so its existing sampleRate stays in
// effect.
//
// Declared inline because the definition lives in a header.
inline void parseSynthesisConfig(json &configRoot,
                                 SynthesisConfig &synthesisConfig) {

  if (configRoot.contains("audio")) {
    // Bind by reference; `auto x = configRoot[...]` would copy the subtree
    const json &audioValue = configRoot["audio"];
    if (audioValue.contains("sample_rate")) {
      // The key is known to exist here, so no fallback value is needed
      synthesisConfig.sampleRate = audioValue["sample_rate"].get<int>();
    }
  }

} /* parseSynthesisConfig */
|
||||
|
||||
// Fills `modelConfig` from the voice's JSON config.
//
// Reads "num_speakers" as a SpeakerId and stores it in numSpeakers.
// NOTE(review): the key is read unconditionally; what happens when it is
// missing depends on the JSON library's operator[]/get — confirm.
//
// Declared inline because the definition lives in a header.
inline void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {

  // numSpeakers is an int while SpeakerId is 64-bit: narrow explicitly
  modelConfig.numSpeakers =
      static_cast<int>(configRoot["num_speakers"].get<SpeakerId>());

} /* parseModelConfig */
|
||||
|
||||
} // namespace piper
|
||||
|
||||
#endif // CONFIG_H_
|
||||
+160
-154
@@ -2,6 +2,7 @@
|
||||
#include <condition_variable>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
@@ -10,38 +11,60 @@
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#ifdef HAVE_PCAUDIO
|
||||
// https://github.com/espeak-ng/pcaudiolib
|
||||
#include <pcaudiolib/audio.h>
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <mach-o/dyld.h>
|
||||
#endif
|
||||
|
||||
#include <spdlog/sinks/stdout_color_sinks.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "piper.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
enum OutputType {
|
||||
OUTPUT_FILE,
|
||||
OUTPUT_DIRECTORY,
|
||||
OUTPUT_STDOUT,
|
||||
OUTPUT_PLAY,
|
||||
OUTPUT_RAW
|
||||
};
|
||||
enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW };
|
||||
|
||||
struct RunConfig {
|
||||
// Path to .onnx voice file
|
||||
filesystem::path modelPath;
|
||||
|
||||
// Path to JSON voice config file
|
||||
filesystem::path modelConfigPath;
|
||||
OutputType outputType = OUTPUT_PLAY;
|
||||
optional<filesystem::path> outputPath;
|
||||
|
||||
// Type of output to produce.
|
||||
// Default is to write a WAV file in the current directory.
|
||||
OutputType outputType = OUTPUT_DIRECTORY;
|
||||
|
||||
// Path for output
|
||||
optional<filesystem::path> outputPath = filesystem::path(".");
|
||||
|
||||
// Numerical id of the default speaker (multi-speaker voices)
|
||||
optional<piper::SpeakerId> speakerId;
|
||||
|
||||
// Amount of noise to add during audio generation
|
||||
optional<float> noiseScale;
|
||||
|
||||
// Speed of speaking (1 = normal, < 1 is faster, > 1 is slower)
|
||||
optional<float> lengthScale;
|
||||
|
||||
// Variation in phoneme lengths
|
||||
optional<float> noiseW;
|
||||
|
||||
// Seconds of silence to add after each sentence
|
||||
optional<float> sentenceSilenceSeconds;
|
||||
|
||||
// Path to espeak-ng data directory (default is next to piper executable)
|
||||
optional<filesystem::path> eSpeakDataPath;
|
||||
|
||||
// Path to libtashkeel ort model
|
||||
// https://github.com/mush42/libtashkeel/
|
||||
optional<filesystem::path> tashkeelModelPath;
|
||||
};
|
||||
|
||||
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
|
||||
@@ -49,35 +72,89 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
|
||||
condition_variable &cvAudio, bool &audioReady,
|
||||
bool &audioFinished);
|
||||
|
||||
#ifdef HAVE_PCAUDIO
|
||||
void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
|
||||
mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
|
||||
bool &audioFinished);
|
||||
#endif
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
spdlog::set_default_logger(spdlog::stderr_color_st("piper"));
|
||||
|
||||
RunConfig runConfig;
|
||||
parseArgs(argc, argv, runConfig);
|
||||
|
||||
// NOTE: This won't work for Windows (need GetModuleFileName)
|
||||
piper::PiperConfig piperConfig;
|
||||
piper::Voice voice;
|
||||
|
||||
spdlog::debug("Loading voice from {} (config={})",
|
||||
runConfig.modelPath.string(),
|
||||
runConfig.modelConfigPath.string());
|
||||
|
||||
auto startTime = chrono::steady_clock::now();
|
||||
loadVoice(piperConfig, runConfig.modelPath.string(),
|
||||
runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
|
||||
auto endTime = chrono::steady_clock::now();
|
||||
spdlog::info("Loaded voice in {} second(s)",
|
||||
chrono::duration<double>(endTime - startTime).count());
|
||||
|
||||
// Get the path to the piper executable so we can locate espeak-ng-data, etc.
|
||||
// next to it.
|
||||
#ifdef _MSC_VER
|
||||
auto exePath = []() {
|
||||
wchar_t moduleFileName[MAX_PATH] = { 0 };
|
||||
wchar_t moduleFileName[MAX_PATH] = {0};
|
||||
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
|
||||
return filesystem::path(moduleFileName);
|
||||
}();
|
||||
#elifdef __APPLE__
|
||||
auto exePath = []() {
|
||||
char moduleFileName[PATH_MAX] = {0};
|
||||
uint32_t moduleFileNameSize = std::size(moduleFileName);
|
||||
_NSGetExecutablePath(moduleFileName, &moduleFileNameSize);
|
||||
return filesystem::path(moduleFileName);
|
||||
}();
|
||||
#else
|
||||
auto exePath = filesystem::canonical("/proc/self/exe");
|
||||
#endif
|
||||
piper::initialize(exePath.parent_path());
|
||||
|
||||
piper::Voice voice;
|
||||
auto startTime = chrono::steady_clock::now();
|
||||
loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(),
|
||||
voice, runConfig.speakerId);
|
||||
auto endTime = chrono::steady_clock::now();
|
||||
auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
|
||||
cerr << "Load time: " << loadSeconds << " sec" << endl;
|
||||
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
|
||||
spdlog::debug("Voice uses eSpeak phonemes ({})",
|
||||
voice.phonemizeConfig.eSpeak.voice);
|
||||
|
||||
if (runConfig.eSpeakDataPath) {
|
||||
// User provided path
|
||||
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
|
||||
} else {
|
||||
// Assume next to piper executable
|
||||
piperConfig.eSpeakDataPath =
|
||||
std::filesystem::absolute(
|
||||
exePath.parent_path().append("espeak-ng-data"))
|
||||
.string();
|
||||
|
||||
spdlog::debug("espeak-ng-data directory is expected at {}",
|
||||
piperConfig.eSpeakDataPath);
|
||||
}
|
||||
} else {
|
||||
// Not using eSpeak
|
||||
piperConfig.useESpeak = false;
|
||||
}
|
||||
|
||||
// Enable libtashkeel for Arabic
|
||||
if (voice.phonemizeConfig.eSpeak.voice == "ar") {
|
||||
piperConfig.useTashkeel = true;
|
||||
if (runConfig.tashkeelModelPath) {
|
||||
// User provided path
|
||||
piperConfig.tashkeelModelPath =
|
||||
runConfig.tashkeelModelPath.value().string();
|
||||
} else {
|
||||
// Assume next to piper executable
|
||||
piperConfig.tashkeelModelPath =
|
||||
std::filesystem::absolute(
|
||||
exePath.parent_path().append("libtashkeel_model.ort"))
|
||||
.string();
|
||||
|
||||
spdlog::debug("libtashkeel model is expected at {}",
|
||||
piperConfig.tashkeelModelPath.value());
|
||||
}
|
||||
}
|
||||
|
||||
piper::initialize(piperConfig);
|
||||
|
||||
// Scales
|
||||
if (runConfig.noiseScale) {
|
||||
@@ -92,36 +169,14 @@ int main(int argc, char *argv[]) {
|
||||
voice.synthesisConfig.noiseW = runConfig.noiseW.value();
|
||||
}
|
||||
|
||||
#ifdef HAVE_PCAUDIO
|
||||
audio_object *my_audio = nullptr;
|
||||
|
||||
if (runConfig.outputType == OUTPUT_PLAY) {
|
||||
// Output audio to the default audio device
|
||||
my_audio = create_audio_device_object(NULL, "piper", "Text-to-Speech");
|
||||
|
||||
// TODO: Support 32-bit sample widths
|
||||
auto audioFormat = AUDIO_OBJECT_FORMAT_S16LE;
|
||||
int error = audio_object_open(my_audio, audioFormat,
|
||||
voice.synthesisConfig.sampleRate,
|
||||
voice.synthesisConfig.channels);
|
||||
if (error != 0) {
|
||||
throw runtime_error(audio_object_strerror(my_audio, error));
|
||||
}
|
||||
if (runConfig.sentenceSilenceSeconds) {
|
||||
voice.synthesisConfig.sentenceSilenceSeconds =
|
||||
runConfig.sentenceSilenceSeconds.value();
|
||||
}
|
||||
#else
|
||||
if (runConfig.outputType == OUTPUT_PLAY) {
|
||||
// Cannot play audio directly
|
||||
cerr << "WARNING: Piper was not compiled with pcaudiolib. Output audio "
|
||||
"will be written to the current directory."
|
||||
<< endl;
|
||||
runConfig.outputType = OUTPUT_DIRECTORY;
|
||||
runConfig.outputPath = filesystem::path(".");
|
||||
}
|
||||
#endif
|
||||
|
||||
if (runConfig.outputType == OUTPUT_DIRECTORY) {
|
||||
runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
|
||||
cerr << "Output directory: " << runConfig.outputPath.value() << endl;
|
||||
spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
|
||||
}
|
||||
|
||||
string line;
|
||||
@@ -142,15 +197,23 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
// Output audio to automatically-named WAV file in a directory
|
||||
ofstream audioFile(outputPath.string(), ios::binary);
|
||||
piper::textToWavFile(voice, line, audioFile, result);
|
||||
piper::textToWavFile(piperConfig, voice, line, audioFile, result);
|
||||
cout << outputPath.string() << endl;
|
||||
} else if (runConfig.outputType == OUTPUT_FILE) {
|
||||
// Read all of standard input before synthesizing.
|
||||
// Otherwise, we would overwrite the output file for each line.
|
||||
stringstream text;
|
||||
text << line;
|
||||
while (getline(cin, line)) {
|
||||
text << " " << line;
|
||||
}
|
||||
|
||||
// Output audio to WAV file
|
||||
ofstream audioFile(runConfig.outputPath.value().string(), ios::binary);
|
||||
piper::textToWavFile(voice, line, audioFile, result);
|
||||
piper::textToWavFile(piperConfig, voice, text.str(), audioFile, result);
|
||||
} else if (runConfig.outputType == OUTPUT_STDOUT) {
|
||||
// Output WAV to stdout
|
||||
piper::textToWavFile(voice, line, cout, result);
|
||||
piper::textToWavFile(piperConfig, voice, line, cout, result);
|
||||
} else if (runConfig.outputType == OUTPUT_RAW) {
|
||||
// Raw output to stdout
|
||||
mutex mutAudio;
|
||||
@@ -174,7 +237,8 @@ int main(int argc, char *argv[]) {
|
||||
cvAudio.notify_one();
|
||||
}
|
||||
};
|
||||
piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
|
||||
piper::textToAudio(piperConfig, voice, line, audioBuffer, result,
|
||||
audioCallback);
|
||||
|
||||
// Signal thread that there is no more audio
|
||||
{
|
||||
@@ -185,65 +249,22 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
|
||||
// Wait for audio output to finish
|
||||
cerr << "Waiting for audio..." << endl;
|
||||
spdlog::info("Waiting for audio to finish playing...");
|
||||
rawOutputThread.join();
|
||||
} else if (runConfig.outputType == OUTPUT_PLAY) {
|
||||
#ifdef HAVE_PCAUDIO
|
||||
mutex mutAudio;
|
||||
condition_variable cvAudio;
|
||||
bool audioReady = false;
|
||||
bool audioFinished = false;
|
||||
vector<int16_t> audioBuffer;
|
||||
vector<int16_t> sharedAudioBuffer;
|
||||
|
||||
thread playThread(playProc, my_audio, ref(sharedAudioBuffer),
|
||||
ref(mutAudio), ref(cvAudio), ref(audioReady),
|
||||
ref(audioFinished));
|
||||
auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio,
|
||||
&cvAudio, &audioReady]() {
|
||||
// Signal thread that audio is ready
|
||||
{
|
||||
unique_lock lockAudio(mutAudio);
|
||||
copy(audioBuffer.begin(), audioBuffer.end(),
|
||||
back_inserter(sharedAudioBuffer));
|
||||
audioReady = true;
|
||||
cvAudio.notify_one();
|
||||
}
|
||||
};
|
||||
piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
|
||||
|
||||
// Signal thread that there is no more audio
|
||||
{
|
||||
unique_lock lockAudio(mutAudio);
|
||||
audioReady = true;
|
||||
audioFinished = true;
|
||||
cvAudio.notify_one();
|
||||
}
|
||||
|
||||
// Wait for audio output to finish
|
||||
cerr << "Waiting for audio..." << endl;
|
||||
playThread.join();
|
||||
#else
|
||||
throw runtime_error("Cannot play audio! Not compiled with pcaudiolib.");
|
||||
#endif
|
||||
}
|
||||
|
||||
cerr << "Real-time factor: " << result.realTimeFactor
|
||||
<< " (infer=" << result.inferSeconds
|
||||
<< " sec, audio=" << result.audioSeconds << " sec)" << endl;
|
||||
spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)",
|
||||
result.realTimeFactor, result.inferSeconds,
|
||||
result.audioSeconds);
|
||||
}
|
||||
|
||||
piper::terminate();
|
||||
|
||||
#ifdef HAVE_PCAUDIO
|
||||
audio_object_close(my_audio);
|
||||
audio_object_destroy(my_audio);
|
||||
my_audio = nullptr;
|
||||
#endif
|
||||
piper::terminate(piperConfig);
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
|
||||
condition_variable &cvAudio, bool &audioReady,
|
||||
bool &audioFinished) {
|
||||
@@ -275,42 +296,7 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
|
||||
|
||||
} // rawOutputProc
|
||||
|
||||
#ifdef HAVE_PCAUDIO
|
||||
// Consumer loop: takes samples out of `sharedAudioBuffer` whenever the
// producer sets `audioReady`, and writes them to `my_audio`.
//
// The shared buffer and both flags are only touched while `mutAudio` is held.
// The loop ends once `audioFinished` is set and the shared buffer is empty.
// Throws runtime_error when the audio write reports an error.
void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
              mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
              bool &audioFinished) {
  vector<int16_t> pendingSamples;

  for (;;) {
    {
      unique_lock<mutex> guard(mutAudio);

      // Sleep until the producer reports new audio (or completion)
      while (!audioReady) {
        cvAudio.wait(guard);
      }

      if (audioFinished && sharedAudioBuffer.empty()) {
        break;
      }

      // Move the samples into a private buffer so the lock can be released
      // before the write below
      pendingSamples.insert(pendingSamples.end(), sharedAudioBuffer.begin(),
                            sharedAudioBuffer.end());
      sharedAudioBuffer.clear();

      // Once finished, leave the flag set so the next pass does not wait
      if (!audioFinished) {
        audioReady = false;
      }
    }

    const int writeStatus = audio_object_write(
        my_audio, reinterpret_cast<const char *>(pendingSamples.data()),
        pendingSamples.size() * sizeof(int16_t));
    if (writeStatus != 0) {
      throw runtime_error(audio_object_strerror(my_audio, writeStatus));
    }

    audio_object_flush(my_audio);
    pendingSamples.clear();
  }

} // playProc
|
||||
#endif
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void printUsage(char *argv[]) {
|
||||
cerr << endl;
|
||||
@@ -332,11 +318,18 @@ void printUsage(char *argv[]) {
|
||||
"becomes available"
|
||||
<< endl;
|
||||
cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl;
|
||||
cerr << " --noise-scale NUM generator noise (default: 0.667)"
|
||||
cerr << " --noise_scale NUM generator noise (default: 0.667)"
|
||||
<< endl;
|
||||
cerr << " --length-scale NUM phoneme length (default: 1.0)"
|
||||
cerr << " --length_scale NUM phoneme length (default: 1.0)"
|
||||
<< endl;
|
||||
cerr << " --noise-w NUM phonene width noise (default: 0.8)"
|
||||
cerr << " --noise_w NUM phoneme width noise (default: 0.8)"
|
||||
<< endl;
|
||||
cerr << " --silence_seconds NUM seconds of silence after each "
|
||||
"sentence (default: 0.2)"
|
||||
<< endl;
|
||||
cerr << " --espeak_data DIR path to espeak-ng data directory"
|
||||
<< endl;
|
||||
cerr << " --debug print DEBUG messages to the console"
|
||||
<< endl;
|
||||
cerr << endl;
|
||||
}
|
||||
@@ -361,7 +354,8 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
|
||||
} else if (arg == "-c" || arg == "--config") {
|
||||
ensureArg(argc, argv, i);
|
||||
modelConfigPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "-f" || arg == "--output_file") {
|
||||
} else if (arg == "-f" || arg == "--output_file" ||
|
||||
arg == "--output-file") {
|
||||
ensureArg(argc, argv, i);
|
||||
std::string filePath = argv[++i];
|
||||
if (filePath == "-") {
|
||||
@@ -371,24 +365,36 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
|
||||
runConfig.outputType = OUTPUT_FILE;
|
||||
runConfig.outputPath = filesystem::path(filePath);
|
||||
}
|
||||
} else if (arg == "-d" || arg == "--output_dir") {
|
||||
} else if (arg == "-d" || arg == "--output_dir" || arg == "output-dir") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.outputType = OUTPUT_DIRECTORY;
|
||||
runConfig.outputPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "--output_raw") {
|
||||
} else if (arg == "--output_raw" || arg == "--output-raw") {
|
||||
runConfig.outputType = OUTPUT_RAW;
|
||||
} else if (arg == "-s" || arg == "--speaker") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.speakerId = (piper::SpeakerId)stol(argv[++i]);
|
||||
} else if (arg == "--noise-scale") {
|
||||
} else if (arg == "--noise_scale" || arg == "--noise-scale") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.noiseScale = stof(argv[++i]);
|
||||
} else if (arg == "--length-scale") {
|
||||
} else if (arg == "--length_scale" || arg == "--length-scale") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.lengthScale = stof(argv[++i]);
|
||||
} else if (arg == "--noise-w") {
|
||||
} else if (arg == "--noise_w" || arg == "--noise-w") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.noiseW = stof(argv[++i]);
|
||||
} else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.sentenceSilenceSeconds = stof(argv[++i]);
|
||||
} else if (arg == "--espeak_data" || arg == "--espeak-data") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.tashkeelModelPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "--debug") {
|
||||
// Set DEBUG logging
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
printUsage(argv);
|
||||
exit(0);
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
#ifndef MODEL_H_
|
||||
#define MODEL_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include <onnxruntime_cxx_api.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace piper {
|
||||
const string instanceName{"piper"};
|
||||
|
||||
struct ModelSession {
|
||||
Ort::Session onnx;
|
||||
Ort::AllocatorWithDefaultOptions allocator;
|
||||
Ort::SessionOptions options;
|
||||
Ort::Env env;
|
||||
|
||||
ModelSession() : onnx(nullptr){};
|
||||
};
|
||||
|
||||
// Creates the ONNX Runtime environment and session for the model stored at
// `modelPath`, writing both into `session`.
//
// Declared inline because the definition lives in a header.
inline void loadModel(string modelPath, ModelSession &session) {

  session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
                         instanceName.c_str());
  session.env.DisableTelemetryEvents();

  // Settings deliberately left at their defaults:
  //  - SetIntraOpNumThreads(1) slows down performance by ~2x
  //  - ORT_ENABLE_EXTENDED graph optimization roughly doubles load time for
  //    no visible inference benefit
  //  - ExecutionMode::ORT_PARALLEL slows down performance very slightly
  session.options.SetGraphOptimizationLevel(
      GraphOptimizationLevel::ORT_DISABLE_ALL);

  session.options.DisableCpuMemArena();
  session.options.DisableMemPattern();
  session.options.DisableProfiling();

  // The temporary path lives until the end of this statement, so c_str()
  // stays valid for the duration of the constructor call
  session.onnx = Ort::Session(session.env, filesystem::path(modelPath).c_str(),
                              session.options);
}
|
||||
|
||||
} // namespace piper
|
||||
|
||||
#endif // MODEL_H_
|
||||
@@ -1,138 +0,0 @@
|
||||
#ifndef PHONEMIZE_H_
|
||||
#define PHONEMIZE_H_
|
||||
|
||||
#include <filesystem>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <set>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <espeak-ng/speak_lib.h>
|
||||
|
||||
#include "config.hpp"
|
||||
#include "utf8.h"
|
||||
|
||||
#define CLAUSE_INTONATION_FULL_STOP 0x00000000
|
||||
#define CLAUSE_INTONATION_COMMA 0x00001000
|
||||
#define CLAUSE_INTONATION_QUESTION 0x00002000
|
||||
#define CLAUSE_INTONATION_EXCLAMATION 0x00003000
|
||||
|
||||
#define CLAUSE_TYPE_SENTENCE 0x00080000
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace piper {
|
||||
|
||||
// Text to phonemes using eSpeak-ng
|
||||
void phonemize(string text, PhonemizeConfig &phonemizeConfig,
|
||||
vector<vector<Phoneme>> &phonemes) {
|
||||
if (!phonemizeConfig.eSpeak) {
|
||||
throw runtime_error("Missing eSpeak config");
|
||||
}
|
||||
|
||||
auto voice = phonemizeConfig.eSpeak->voice;
|
||||
int result = espeak_SetVoiceByName(voice.c_str());
|
||||
if (result != 0) {
|
||||
throw runtime_error("Failed to set eSpeak-ng voice");
|
||||
}
|
||||
|
||||
// Modified by eSpeak
|
||||
string textCopy(text);
|
||||
|
||||
utf8::iterator textIter(textCopy.begin(), textCopy.begin(), textCopy.end());
|
||||
utf8::iterator textIterEnd(textCopy.end(), textCopy.begin(), textCopy.end());
|
||||
vector<char32_t> textClauseBreakers;
|
||||
|
||||
// Identify clause breakers in the sentence, since eSpeak removes them during
|
||||
// phonemization.
|
||||
//
|
||||
// This will unfortunately do the wrong thing with abbreviations, etc.
|
||||
while (textIter != textIterEnd) {
|
||||
auto codepoint = *textIter;
|
||||
if (phonemizeConfig.eSpeak->clauseBreakers.contains(codepoint)) {
|
||||
textClauseBreakers.push_back(codepoint);
|
||||
}
|
||||
|
||||
textIter++;
|
||||
}
|
||||
|
||||
vector<Phoneme> *sentencePhonemes = nullptr;
|
||||
const char *inputTextPointer = textCopy.c_str();
|
||||
int terminator = 0;
|
||||
|
||||
while (inputTextPointer != NULL) {
|
||||
// Modified espeak-ng API to get access to clause terminator
|
||||
string clausePhonemes(
|
||||
espeak_TextToPhonemes2((const void **)&inputTextPointer,
|
||||
/*textmode*/ espeakCHARS_AUTO,
|
||||
/*phonememode = IPA*/ 0x02,
|
||||
&terminator));
|
||||
|
||||
utf8::iterator phonemeIter(clausePhonemes.begin(), clausePhonemes.begin(),
|
||||
clausePhonemes.end());
|
||||
utf8::iterator phonemeEnd(clausePhonemes.end(), clausePhonemes.begin(),
|
||||
clausePhonemes.end());
|
||||
|
||||
if (!sentencePhonemes) {
|
||||
// Start new sentence
|
||||
phonemes.emplace_back();
|
||||
sentencePhonemes = &phonemes[phonemes.size() - 1];
|
||||
}
|
||||
|
||||
sentencePhonemes->insert(sentencePhonemes->end(), phonemeIter, phonemeEnd);
|
||||
|
||||
// Add appropriate puntuation depending on terminator type
|
||||
int intonation = terminator & 0x0000F000;
|
||||
if (intonation == CLAUSE_INTONATION_FULL_STOP) {
|
||||
sentencePhonemes->push_back(phonemizeConfig.eSpeak->fullStop);
|
||||
} else if (intonation == CLAUSE_INTONATION_COMMA) {
|
||||
sentencePhonemes->push_back(phonemizeConfig.eSpeak->comma);
|
||||
} else if (intonation == CLAUSE_INTONATION_QUESTION) {
|
||||
sentencePhonemes->push_back(phonemizeConfig.eSpeak->question);
|
||||
} else if (intonation == CLAUSE_INTONATION_EXCLAMATION) {
|
||||
sentencePhonemes->push_back(phonemizeConfig.eSpeak->exclamation);
|
||||
}
|
||||
|
||||
if ((terminator & CLAUSE_TYPE_SENTENCE) == CLAUSE_TYPE_SENTENCE) {
|
||||
// End of sentence
|
||||
sentencePhonemes = nullptr;
|
||||
}
|
||||
|
||||
} // while inputTextPointer != NULL
|
||||
|
||||
} /* phonemize */
|
||||
|
||||
// Phonemes to ids using JSON map
|
||||
void phonemes2ids(vector<Phoneme> &phonemes, PhonemizeConfig &phonemizeConfig,
|
||||
vector<PhonemeId> &phonemeIds) {
|
||||
if (phonemes.empty()) {
|
||||
throw runtime_error("No phonemes");
|
||||
}
|
||||
|
||||
phonemeIds.push_back(phonemizeConfig.idBos);
|
||||
if (phonemizeConfig.interspersePad) {
|
||||
phonemeIds.push_back(phonemizeConfig.idPad);
|
||||
}
|
||||
|
||||
for (auto phoneme = phonemes.begin(); phoneme != phonemes.end(); phoneme++) {
|
||||
if (phonemizeConfig.phonemeIdMap.contains(*phoneme)) {
|
||||
for (auto id : phonemizeConfig.phonemeIdMap[*phoneme]) {
|
||||
phonemeIds.push_back(id);
|
||||
|
||||
if (phonemizeConfig.interspersePad) {
|
||||
phonemeIds.push_back(phonemizeConfig.idPad);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
phonemeIds.push_back(phonemizeConfig.idEos);
|
||||
|
||||
} /* phonemes2ids */
|
||||
|
||||
} // namespace piper
|
||||
|
||||
#endif // PHONEMIZE_H_
|
||||
@@ -0,0 +1,514 @@
|
||||
#include <array>
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include <espeak-ng/speak_lib.h>
|
||||
#include <onnxruntime_cxx_api.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "piper.hpp"
|
||||
#include "utf8.h"
|
||||
#include "wavfile.hpp"
|
||||
|
||||
namespace piper {
|
||||
|
||||
// Maximum value for 16-bit signed WAV sample
|
||||
const float MAX_WAV_VALUE = 32767.0f;
|
||||
|
||||
const std::string instanceName{"piper"};
|
||||
|
||||
// True if the string is a single UTF-8 codepoint
|
||||
bool isSingleCodepoint(std::string s) {
|
||||
return utf8::distance(s.begin(), s.end()) == 1;
|
||||
}
|
||||
|
||||
// Get the first UTF-8 codepoint of a string
|
||||
Phoneme getCodepoint(std::string s) {
|
||||
utf8::iterator character_iter(s.begin(), s.begin(), s.end());
|
||||
return *character_iter;
|
||||
}
|
||||
|
||||
// Load JSON config information for phonemization
|
||||
void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
|
||||
// {
|
||||
// "espeak": {
|
||||
// "voice": "<language code>"
|
||||
// },
|
||||
// "phoneme_type": "<espeak or text>",
|
||||
// "phoneme_map": {
|
||||
// "<from phoneme>": ["<to phoneme 1>", "<to phoneme 2>", ...]
|
||||
// },
|
||||
// "phoneme_id_map": {
|
||||
// "<phoneme>": [<id1>, <id2>, ...]
|
||||
// }
|
||||
// }
|
||||
|
||||
if (configRoot.contains("espeak")) {
|
||||
auto espeakValue = configRoot["espeak"];
|
||||
if (espeakValue.contains("voice")) {
|
||||
phonemizeConfig.eSpeak.voice = espeakValue["voice"].get<std::string>();
|
||||
}
|
||||
}
|
||||
|
||||
if (configRoot.contains("phoneme_type")) {
|
||||
auto phonemeTypeStr = configRoot["phoneme_type"].get<std::string>();
|
||||
if (phonemeTypeStr == "text") {
|
||||
phonemizeConfig.phonemeType = TextPhonemes;
|
||||
}
|
||||
}
|
||||
|
||||
// phoneme to [id] map
|
||||
// Maps phonemes to one or more phoneme ids (required).
|
||||
if (configRoot.contains("phoneme_id_map")) {
|
||||
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
|
||||
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
|
||||
std::string fromPhoneme = fromPhonemeItem.key();
|
||||
if (!isSingleCodepoint(fromPhoneme)) {
|
||||
throw std::runtime_error(
|
||||
"Phonemes must be one codepoint (phoneme id map)");
|
||||
}
|
||||
|
||||
auto fromCodepoint = getCodepoint(fromPhoneme);
|
||||
for (auto &toIdValue : fromPhonemeItem.value()) {
|
||||
PhonemeId toId = toIdValue.get<PhonemeId>();
|
||||
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// phoneme to [phoneme] map
|
||||
// Maps phonemes to one or more other phonemes (not normally used).
|
||||
if (configRoot.contains("phoneme_map")) {
|
||||
if (!phonemizeConfig.phonemeMap) {
|
||||
phonemizeConfig.phonemeMap.emplace();
|
||||
}
|
||||
|
||||
auto phonemeMapValue = configRoot["phoneme_map"];
|
||||
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
|
||||
std::string fromPhoneme = fromPhonemeItem.key();
|
||||
if (!isSingleCodepoint(fromPhoneme)) {
|
||||
throw std::runtime_error(
|
||||
"Phonemes must be one codepoint (phoneme map)");
|
||||
}
|
||||
|
||||
auto fromCodepoint = getCodepoint(fromPhoneme);
|
||||
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
|
||||
std::string toPhoneme = toPhonemeValue.get<std::string>();
|
||||
if (!isSingleCodepoint(toPhoneme)) {
|
||||
throw std::runtime_error(
|
||||
"Phonemes must be one codepoint (phoneme map)");
|
||||
}
|
||||
|
||||
auto toCodepoint = getCodepoint(toPhoneme);
|
||||
(*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} /* parsePhonemizeConfig */
|
||||
|
||||
// Load JSON config for audio synthesis
|
||||
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||
// {
|
||||
// "audio": {
|
||||
// "sample_rate": 22050
|
||||
// },
|
||||
// "inference": {
|
||||
// "noise_scale": 0.667,
|
||||
// "length_scale": 1,
|
||||
// "noise_w": 0.8
|
||||
// }
|
||||
// }
|
||||
|
||||
if (configRoot.contains("audio")) {
|
||||
auto audioValue = configRoot["audio"];
|
||||
if (audioValue.contains("sample_rate")) {
|
||||
// Default sample rate is 22050 Hz
|
||||
synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
|
||||
}
|
||||
}
|
||||
|
||||
if (configRoot.contains("inference")) {
|
||||
// Overrides default inference settings
|
||||
auto inferenceValue = configRoot["inference"];
|
||||
if (inferenceValue.contains("noise_scale")) {
|
||||
synthesisConfig.noiseScale = inferenceValue.value("noise_scale", 0.667f);
|
||||
}
|
||||
|
||||
if (inferenceValue.contains("length_scale")) {
|
||||
synthesisConfig.lengthScale = inferenceValue.value("length_scale", 1.0f);
|
||||
}
|
||||
|
||||
if (inferenceValue.contains("noise_w")) {
|
||||
synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f);
|
||||
}
|
||||
}
|
||||
|
||||
} /* parseSynthesisConfig */
|
||||
|
||||
// Load model-level settings from the voice's JSON config.
//
// Reads "num_speakers" into modelConfig.numSpeakers. When the key is missing
// (or null) the model is treated as single-speaker. The lookup uses find(), so
// configRoot is never modified: the previous operator[] access inserted a null
// entry for a missing key and then failed with an opaque type error.
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
  const auto numSpeakersIter = configRoot.find("num_speakers");
  if (numSpeakersIter == configRoot.end() || numSpeakersIter->is_null()) {
    modelConfig.numSpeakers = 1;
    return;
  }

  // Make the narrowing from SpeakerId to the field's type explicit.
  modelConfig.numSpeakers = static_cast<decltype(modelConfig.numSpeakers)>(
      numSpeakersIter->get<SpeakerId>());

} /* parseModelConfig */
|
||||
|
||||
// Prepare the backends selected in `config`.
//
// - config.useESpeak:   initializes eSpeak-ng with config.eSpeakDataPath.
// - config.useTashkeel: loads the libtashkeel model into config.tashkeelState.
//
// Throws std::runtime_error when eSpeak-ng returns a negative status, or when
// libtashkeel is requested but config.tashkeelModelPath is not set.
void initialize(PiperConfig &config) {
  if (config.useESpeak) {
    // Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
    // See: https://github.com/rhasspy/espeak-ng
    spdlog::debug("Initializing eSpeak");

    const char *espeakDataDir = config.eSpeakDataPath.c_str();
    const int espeakStatus = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
                                               /*buflength*/ 0,
                                               /*path*/ espeakDataDir,
                                               /*options*/ 0);
    if (espeakStatus < 0) {
      throw std::runtime_error("Failed to initialize eSpeak-ng");
    }

    spdlog::debug("Initialized eSpeak");
  }

  // Load onnx model for libtashkeel
  // https://github.com/mush42/libtashkeel/
  if (config.useTashkeel) {
    spdlog::debug("Using libtashkeel for diacritization");

    if (!config.tashkeelModelPath.has_value()) {
      throw std::runtime_error("No path to libtashkeel model");
    }

    std::string &tashkeelPath = *config.tashkeelModelPath;
    spdlog::debug("Loading libtashkeel model from {}", tashkeelPath);

    config.tashkeelState = std::make_unique<tashkeel::State>();
    tashkeel::tashkeel_load(tashkeelPath, *config.tashkeelState);
    spdlog::debug("Initialized libtashkeel");
  }

  spdlog::info("Initialized piper");
}
|
||||
|
||||
// Counterpart of initialize(): shuts eSpeak-ng down when config.useESpeak is
// set, then logs that piper has terminated.
void terminate(PiperConfig &config) {
  const bool espeakInUse = config.useESpeak;
  if (espeakInUse) {
    // Clean up espeak-ng
    spdlog::debug("Terminating eSpeak");
    espeak_Terminate();
    spdlog::debug("Terminated eSpeak");
  }

  spdlog::info("Terminated piper");
}
|
||||
|
||||
void loadModel(std::string modelPath, ModelSession &session) {
|
||||
spdlog::debug("Loading onnx model from {}", modelPath);
|
||||
session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
|
||||
instanceName.c_str());
|
||||
session.env.DisableTelemetryEvents();
|
||||
|
||||
// Slows down performance by ~2x
|
||||
// session.options.SetIntraOpNumThreads(1);
|
||||
|
||||
// Roughly doubles load time for no visible inference benefit
|
||||
// session.options.SetGraphOptimizationLevel(
|
||||
// GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
|
||||
|
||||
session.options.SetGraphOptimizationLevel(
|
||||
GraphOptimizationLevel::ORT_DISABLE_ALL);
|
||||
|
||||
// Slows down performance very slightly
|
||||
// session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
|
||||
|
||||
session.options.DisableCpuMemArena();
|
||||
session.options.DisableMemPattern();
|
||||
session.options.DisableProfiling();
|
||||
|
||||
auto startTime = std::chrono::steady_clock::now();
|
||||
session.onnx = Ort::Session(session.env, modelPath.c_str(), session.options);
|
||||
auto endTime = std::chrono::steady_clock::now();
|
||||
spdlog::debug("Loaded onnx model in {} second(s)",
|
||||
std::chrono::duration<double>(endTime - startTime).count());
|
||||
}
|
||||
|
||||
// Load Onnx model and JSON config file
|
||||
void loadVoice(PiperConfig &config, std::string modelPath,
|
||||
std::string modelConfigPath, Voice &voice,
|
||||
std::optional<SpeakerId> &speakerId) {
|
||||
spdlog::debug("Parsing voice config at {}", modelConfigPath);
|
||||
std::ifstream modelConfigFile(modelConfigPath);
|
||||
voice.configRoot = json::parse(modelConfigFile);
|
||||
|
||||
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
|
||||
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
|
||||
parseModelConfig(voice.configRoot, voice.modelConfig);
|
||||
|
||||
if (voice.modelConfig.numSpeakers > 1) {
|
||||
// Multi-speaker model
|
||||
if (speakerId) {
|
||||
voice.synthesisConfig.speakerId = speakerId;
|
||||
} else {
|
||||
// Default speaker
|
||||
voice.synthesisConfig.speakerId = 0;
|
||||
}
|
||||
}
|
||||
|
||||
spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers);
|
||||
|
||||
loadModel(modelPath, voice.session);
|
||||
|
||||
} /* loadVoice */
|
||||
|
||||
// Phoneme ids to WAV audio
|
||||
void synthesize(std::vector<PhonemeId> &phonemeIds,
|
||||
SynthesisConfig &synthesisConfig, ModelSession &session,
|
||||
std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
|
||||
spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size());
|
||||
|
||||
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
|
||||
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
|
||||
|
||||
// Allocate
|
||||
std::vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
|
||||
std::vector<float> scales{synthesisConfig.noiseScale,
|
||||
synthesisConfig.lengthScale,
|
||||
synthesisConfig.noiseW};
|
||||
|
||||
std::vector<Ort::Value> inputTensors;
|
||||
std::vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
|
||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
|
||||
phonemeIdsShape.size()));
|
||||
|
||||
std::vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
|
||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
|
||||
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
|
||||
|
||||
std::vector<int64_t> scalesShape{(int64_t)scales.size()};
|
||||
inputTensors.push_back(
|
||||
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
|
||||
scalesShape.data(), scalesShape.size()));
|
||||
|
||||
// Add speaker id.
|
||||
// NOTE: These must be kept outside the "if" below to avoid being deallocated.
|
||||
std::vector<int64_t> speakerId{
|
||||
(int64_t)synthesisConfig.speakerId.value_or(0)};
|
||||
std::vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
|
||||
|
||||
if (synthesisConfig.speakerId) {
|
||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
|
||||
speakerIdShape.size()));
|
||||
}
|
||||
|
||||
// From export_onnx.py
|
||||
std::array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
|
||||
"sid"};
|
||||
std::array<const char *, 1> outputNames = {"output"};
|
||||
|
||||
// Infer
|
||||
auto startTime = std::chrono::steady_clock::now();
|
||||
auto outputTensors = session.onnx.Run(
|
||||
Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
|
||||
inputTensors.size(), outputNames.data(), outputNames.size());
|
||||
auto endTime = std::chrono::steady_clock::now();
|
||||
|
||||
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
|
||||
throw std::runtime_error("Invalid output tensors");
|
||||
}
|
||||
auto inferDuration = std::chrono::duration<double>(endTime - startTime);
|
||||
result.inferSeconds = inferDuration.count();
|
||||
|
||||
const float *audio = outputTensors.front().GetTensorData<float>();
|
||||
auto audioShape =
|
||||
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
|
||||
int64_t audioCount = audioShape[audioShape.size() - 1];
|
||||
|
||||
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
|
||||
result.realTimeFactor = 0.0;
|
||||
if (result.audioSeconds > 0) {
|
||||
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
||||
}
|
||||
spdlog::debug("Synthesized {} second(s) of audio in {} second(s)",
|
||||
result.audioSeconds, result.inferSeconds);
|
||||
|
||||
// Get max audio value for scaling
|
||||
float maxAudioValue = 0.01f;
|
||||
for (int64_t i = 0; i < audioCount; i++) {
|
||||
float audioValue = abs(audio[i]);
|
||||
if (audioValue > maxAudioValue) {
|
||||
maxAudioValue = audioValue;
|
||||
}
|
||||
}
|
||||
|
||||
// We know the size up front
|
||||
audioBuffer.reserve(audioCount);
|
||||
|
||||
// Scale audio to fill range and convert to int16
|
||||
float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
|
||||
for (int64_t i = 0; i < audioCount; i++) {
|
||||
int16_t intAudioValue = static_cast<int16_t>(
|
||||
std::clamp(audio[i] * audioScale,
|
||||
static_cast<float>(std::numeric_limits<int16_t>::min()),
|
||||
static_cast<float>(std::numeric_limits<int16_t>::max())));
|
||||
|
||||
audioBuffer.push_back(intAudioValue);
|
||||
}
|
||||
|
||||
// Clean up
|
||||
for (std::size_t i = 0; i < outputTensors.size(); i++) {
|
||||
Ort::detail::OrtRelease(outputTensors[i].release());
|
||||
}
|
||||
|
||||
for (std::size_t i = 0; i < inputTensors.size(); i++) {
|
||||
Ort::detail::OrtRelease(inputTensors[i].release());
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// Phonemize text and synthesize audio
|
||||
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
|
||||
const std::function<void()> &audioCallback) {
|
||||
|
||||
std::size_t sentenceSilenceSamples = 0;
|
||||
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
|
||||
sentenceSilenceSamples = (std::size_t)(
|
||||
voice.synthesisConfig.sentenceSilenceSeconds *
|
||||
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
|
||||
}
|
||||
|
||||
if (config.useTashkeel) {
|
||||
if (!config.tashkeelState) {
|
||||
throw std::runtime_error("Tashkeel model is not loaded");
|
||||
}
|
||||
|
||||
spdlog::debug("Diacritizing text with libtashkeel: {}", text);
|
||||
text = tashkeel::tashkeel_run(text, *config.tashkeelState);
|
||||
}
|
||||
|
||||
// Phonemes for each sentence
|
||||
spdlog::debug("Phonemizing text: {}", text);
|
||||
std::vector<std::vector<Phoneme>> phonemes;
|
||||
|
||||
if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
|
||||
// Use espeak-ng for phonemization
|
||||
eSpeakPhonemeConfig eSpeakConfig;
|
||||
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice;
|
||||
phonemize_eSpeak(text, eSpeakConfig, phonemes);
|
||||
} else {
|
||||
// Use UTF-8 codepoints as "phonemes"
|
||||
CodepointsPhonemeConfig codepointsConfig;
|
||||
phonemize_codepoints(text, codepointsConfig, phonemes);
|
||||
}
|
||||
|
||||
// Synthesize each sentence independently.
|
||||
std::vector<PhonemeId> phonemeIds;
|
||||
std::map<Phoneme, std::size_t> missingPhonemes;
|
||||
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
|
||||
++phonemesIter) {
|
||||
std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
|
||||
|
||||
if (spdlog::should_log(spdlog::level::debug)) {
|
||||
// DEBUG log for phonemes
|
||||
std::string phonemesStr;
|
||||
for (auto phoneme : sentencePhonemes) {
|
||||
utf8::append(phoneme, phonemesStr);
|
||||
}
|
||||
|
||||
spdlog::debug("Converting {} phoneme(s) to ids: {}",
|
||||
sentencePhonemes.size(), phonemesStr);
|
||||
}
|
||||
|
||||
SynthesisResult sentenceResult;
|
||||
|
||||
PhonemeIdConfig idConfig;
|
||||
if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
|
||||
auto &language = voice.phonemizeConfig.eSpeak.voice;
|
||||
spdlog::debug("Text phoneme language: {}", language);
|
||||
if (DEFAULT_ALPHABET.count(language) < 1) {
|
||||
throw std::runtime_error(
|
||||
"Text phoneme language for voice is not supported");
|
||||
}
|
||||
|
||||
// Use alphabet for language
|
||||
idConfig.phonemeIdMap =
|
||||
std::make_shared<PhonemeIdMap>(DEFAULT_ALPHABET[language]);
|
||||
}
|
||||
|
||||
// phonemes -> ids
|
||||
phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
|
||||
if (spdlog::should_log(spdlog::level::debug)) {
|
||||
// DEBUG log for phoneme ids
|
||||
std::stringstream phonemeIdsStr;
|
||||
for (auto phonemeId : phonemeIds) {
|
||||
phonemeIdsStr << phonemeId << ", ";
|
||||
}
|
||||
|
||||
spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
|
||||
sentencePhonemes.size(), phonemeIds.size(),
|
||||
phonemeIdsStr.str());
|
||||
}
|
||||
|
||||
// ids -> audio
|
||||
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
|
||||
sentenceResult);
|
||||
|
||||
// Add end of sentence silence
|
||||
if (sentenceSilenceSamples > 0) {
|
||||
for (std::size_t i = 0; i < sentenceSilenceSamples; i++) {
|
||||
audioBuffer.push_back(0);
|
||||
}
|
||||
}
|
||||
|
||||
if (audioCallback) {
|
||||
// Call back must copy audio since it is cleared afterwards.
|
||||
audioCallback();
|
||||
audioBuffer.clear();
|
||||
}
|
||||
|
||||
result.audioSeconds += sentenceResult.audioSeconds;
|
||||
result.inferSeconds += sentenceResult.inferSeconds;
|
||||
|
||||
phonemeIds.clear();
|
||||
}
|
||||
|
||||
if (missingPhonemes.size() > 0) {
|
||||
spdlog::warn("Missing {} phoneme(s) from phoneme/id map!",
|
||||
missingPhonemes.size());
|
||||
|
||||
for (auto phonemeCount : missingPhonemes) {
|
||||
std::string phonemeStr;
|
||||
utf8::append(phonemeCount.first, phonemeStr);
|
||||
spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr,
|
||||
(uint32_t)phonemeCount.first, phonemeCount.second);
|
||||
}
|
||||
}
|
||||
|
||||
if (result.audioSeconds > 0) {
|
||||
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
||||
}
|
||||
|
||||
} /* textToAudio */
|
||||
|
||||
// Phonemize text and synthesize audio to WAV file
|
||||
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
|
||||
std::ostream &audioFile, SynthesisResult &result) {
|
||||
|
||||
std::vector<int16_t> audioBuffer;
|
||||
textToAudio(config, voice, text, audioBuffer, result, NULL);
|
||||
|
||||
// Write WAV
|
||||
auto synthesisConfig = voice.synthesisConfig;
|
||||
writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
|
||||
synthesisConfig.channels, (int32_t)audioBuffer.size(),
|
||||
audioFile);
|
||||
|
||||
audioFile.write((const char *)audioBuffer.data(),
|
||||
sizeof(int16_t) * audioBuffer.size());
|
||||
|
||||
} /* textToWavFile */
|
||||
|
||||
} // namespace piper
|
||||
+80
-118
@@ -1,24 +1,83 @@
|
||||
#ifndef PIPER_H_
|
||||
#define PIPER_H_
|
||||
|
||||
#include <filesystem>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "json.hpp"
|
||||
#include <espeak-ng/speak_lib.h>
|
||||
#include <onnxruntime_cxx_api.h>
|
||||
#include <phoneme_ids.hpp>
|
||||
#include <phonemize.hpp>
|
||||
#include <tashkeel.hpp>
|
||||
|
||||
#include "config.hpp"
|
||||
#include "model.hpp"
|
||||
#include "phonemize.hpp"
|
||||
#include "synthesize.hpp"
|
||||
#include "wavfile.hpp"
|
||||
#include "json.hpp"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace piper {
|
||||
|
||||
typedef int64_t SpeakerId;
|
||||
|
||||
// eSpeak-ng settings attached to a voice.
struct eSpeakConfig {
  // eSpeak voice identifier; "en-us" unless overridden.
  std::string voice{"en-us"};
};
|
||||
|
||||
struct PiperConfig {
|
||||
std::string eSpeakDataPath;
|
||||
bool useESpeak = true;
|
||||
|
||||
bool useTashkeel = false;
|
||||
std::optional<std::string> tashkeelModelPath;
|
||||
std::unique_ptr<tashkeel::State> tashkeelState;
|
||||
};
|
||||
|
||||
enum PhonemeType { eSpeakPhonemes, TextPhonemes };
|
||||
|
||||
struct PhonemizeConfig {
|
||||
PhonemeType phonemeType = eSpeakPhonemes;
|
||||
std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;
|
||||
std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;
|
||||
|
||||
PhonemeId idPad = 0; // padding (optionally interspersed)
|
||||
PhonemeId idBos = 1; // beginning of sentence
|
||||
PhonemeId idEos = 2; // end of sentence
|
||||
bool interspersePad = true;
|
||||
|
||||
eSpeakConfig eSpeak;
|
||||
};
|
||||
|
||||
struct SynthesisConfig {
|
||||
float noiseScale = 0.667f;
|
||||
float lengthScale = 1.0f;
|
||||
float noiseW = 0.8f;
|
||||
int sampleRate = 22050;
|
||||
int sampleWidth = 2; // 16-bit
|
||||
int channels = 1; // mono
|
||||
std::optional<SpeakerId> speakerId;
|
||||
float sentenceSilenceSeconds = 0.2f;
|
||||
};
|
||||
|
||||
// Model-level settings read from the voice's JSON config.
struct ModelConfig {
  // Number of speakers the model was trained with. Defaults to 1 so that a
  // default-constructed config never holds an indeterminate value.
  int numSpeakers = 1;
};
|
||||
|
||||
struct ModelSession {
|
||||
Ort::Session onnx;
|
||||
Ort::AllocatorWithDefaultOptions allocator;
|
||||
Ort::SessionOptions options;
|
||||
Ort::Env env;
|
||||
|
||||
ModelSession() : onnx(nullptr){};
|
||||
};
|
||||
|
||||
// Timing figures reported by synthesis.
//
// All members start at zero: textToAudio() accumulates into audioSeconds and
// inferSeconds with +=, so a default-constructed result must not hold
// indeterminate values.
struct SynthesisResult {
  double inferSeconds = 0.0;   // time spent running the model
  double audioSeconds = 0.0;   // duration of the generated audio
  double realTimeFactor = 0.0; // inferSeconds / audioSeconds
};
|
||||
|
||||
struct Voice {
|
||||
json configRoot;
|
||||
PhonemizeConfig phonemizeConfig;
|
||||
@@ -27,122 +86,25 @@ struct Voice {
|
||||
ModelSession session;
|
||||
};
|
||||
|
||||
void initialize(std::filesystem::path cwd) {
|
||||
string dataPath;
|
||||
// Must be called before using textTo* functions
|
||||
void initialize(PiperConfig &config);
|
||||
|
||||
auto cwdDataPath = std::filesystem::absolute(cwd.append("espeak-ng-data"));
|
||||
if (std::filesystem::is_directory(cwdDataPath)) {
|
||||
dataPath = cwdDataPath.string();
|
||||
}
|
||||
|
||||
cerr << "dataPath: " << dataPath << endl;
|
||||
|
||||
// Set up espeak-ng for calling espeak_TextToPhonemes
|
||||
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
|
||||
/*buflength*/ 0,
|
||||
/*path*/ dataPath.c_str(),
|
||||
/*options*/ 0);
|
||||
if (result < 0) {
|
||||
throw runtime_error("Failed to initialize eSpeak-ng");
|
||||
}
|
||||
}
|
||||
|
||||
void terminate() {
|
||||
// Clean up espeak-ng
|
||||
espeak_Terminate();
|
||||
}
|
||||
// Clean up
|
||||
void terminate(PiperConfig &config);
|
||||
|
||||
// Load Onnx model and JSON config file
|
||||
void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
|
||||
optional<SpeakerId> &speakerId) {
|
||||
ifstream modelConfigFile(modelConfigPath.c_str());
|
||||
voice.configRoot = json::parse(modelConfigFile);
|
||||
|
||||
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
|
||||
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
|
||||
parseModelConfig(voice.configRoot, voice.modelConfig);
|
||||
|
||||
if (voice.modelConfig.numSpeakers > 1) {
|
||||
// Multispeaker model
|
||||
if (speakerId) {
|
||||
voice.synthesisConfig.speakerId = speakerId;
|
||||
} else {
|
||||
// Default speaker
|
||||
voice.synthesisConfig.speakerId = 0;
|
||||
}
|
||||
}
|
||||
|
||||
loadModel(modelPath, voice.session);
|
||||
|
||||
} /* loadVoice */
|
||||
void loadVoice(PiperConfig &config, std::string modelPath,
|
||||
std::string modelConfigPath, Voice &voice,
|
||||
std::optional<SpeakerId> &speakerId);
|
||||
|
||||
// Phonemize text and synthesize audio
|
||||
void textToAudio(Voice &voice, string text, vector<int16_t> &audioBuffer,
|
||||
SynthesisResult &result,
|
||||
const function<void()> &audioCallback) {
|
||||
|
||||
size_t sentenceSilenceSamples = 0;
|
||||
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
|
||||
sentenceSilenceSamples = (size_t)(
|
||||
voice.synthesisConfig.sentenceSilenceSeconds *
|
||||
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
|
||||
}
|
||||
|
||||
// Phonemes for each sentence
|
||||
vector<vector<Phoneme>> phonemes;
|
||||
phonemize(text, voice.phonemizeConfig, phonemes);
|
||||
|
||||
vector<PhonemeId> phonemeIds;
|
||||
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
|
||||
++phonemesIter) {
|
||||
vector<Phoneme> &sentencePhonemes = *phonemesIter;
|
||||
SynthesisResult sentenceResult;
|
||||
phonemes2ids(sentencePhonemes, voice.phonemizeConfig, phonemeIds);
|
||||
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
|
||||
sentenceResult);
|
||||
|
||||
// Add end of sentence silence
|
||||
if (sentenceSilenceSamples > 0) {
|
||||
for (size_t i = 0; i < sentenceSilenceSamples; i++) {
|
||||
audioBuffer.push_back(0);
|
||||
}
|
||||
}
|
||||
|
||||
if (audioCallback) {
|
||||
// Call back must copy audio since it is cleared afterwards.
|
||||
audioCallback();
|
||||
audioBuffer.clear();
|
||||
}
|
||||
|
||||
result.audioSeconds += sentenceResult.audioSeconds;
|
||||
result.inferSeconds += sentenceResult.inferSeconds;
|
||||
|
||||
phonemeIds.clear();
|
||||
}
|
||||
|
||||
if (result.audioSeconds > 0) {
|
||||
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
||||
}
|
||||
|
||||
} /* textToAudio */
|
||||
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
|
||||
const std::function<void()> &audioCallback);
|
||||
|
||||
// Phonemize text and synthesize audio to WAV file
|
||||
void textToWavFile(Voice &voice, string text, ostream &audioFile,
|
||||
SynthesisResult &result) {
|
||||
|
||||
vector<int16_t> audioBuffer;
|
||||
textToAudio(voice, text, audioBuffer, result, NULL);
|
||||
|
||||
// Write WAV
|
||||
auto synthesisConfig = voice.synthesisConfig;
|
||||
writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
|
||||
synthesisConfig.channels, (int32_t)audioBuffer.size(),
|
||||
audioFile);
|
||||
|
||||
audioFile.write((const char *)audioBuffer.data(),
|
||||
sizeof(int16_t) * audioBuffer.size());
|
||||
|
||||
} /* textToWavFile */
|
||||
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
|
||||
std::ostream &audioFile, SynthesisResult &result);
|
||||
|
||||
} // namespace piper
|
||||
|
||||
|
||||
@@ -1,130 +0,0 @@
|
||||
#ifndef SYNTHESIZE_H_
|
||||
#define SYNTHESIZE_H_
|
||||
|
||||
#include <array>
|
||||
#include <chrono>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include <onnxruntime_cxx_api.h>
|
||||
|
||||
#include "config.hpp"
|
||||
#include "model.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace piper {
|
||||
|
||||
// Maximum value for 16-bit signed WAV sample
|
||||
const float MAX_WAV_VALUE = 32767.0f;
|
||||
|
||||
// Timing figures reported by synthesize().
//
// Members start at zero so that callers which accumulate into a
// default-constructed result (+=) never read indeterminate values.
struct SynthesisResult {
  double inferSeconds = 0.0;   // time spent running the model
  double audioSeconds = 0.0;   // duration of the generated audio
  double realTimeFactor = 0.0; // inferSeconds / audioSeconds
};
|
||||
|
||||
// Phoneme ids to WAV audio
|
||||
void synthesize(vector<PhonemeId> &phonemeIds, SynthesisConfig &synthesisConfig,
|
||||
ModelSession &session, vector<int16_t> &audioBuffer,
|
||||
SynthesisResult &result) {
|
||||
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
|
||||
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
|
||||
|
||||
// Allocate
|
||||
vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
|
||||
vector<float> scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale,
|
||||
synthesisConfig.noiseW};
|
||||
|
||||
vector<Ort::Value> inputTensors;
|
||||
vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
|
||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
|
||||
phonemeIdsShape.size()));
|
||||
|
||||
vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
|
||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
|
||||
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
|
||||
|
||||
vector<int64_t> scalesShape{(int64_t)scales.size()};
|
||||
inputTensors.push_back(
|
||||
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
|
||||
scalesShape.data(), scalesShape.size()));
|
||||
|
||||
// Add speaker id.
|
||||
// NOTE: These must be kept outside the "if" below to avoid being deallocated.
|
||||
vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value_or(0)};
|
||||
vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
|
||||
|
||||
if (synthesisConfig.speakerId) {
|
||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
|
||||
speakerIdShape.size()));
|
||||
}
|
||||
|
||||
// From export_onnx.py
|
||||
array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
|
||||
"sid"};
|
||||
array<const char *, 1> outputNames = {"output"};
|
||||
|
||||
// Infer
|
||||
auto startTime = chrono::steady_clock::now();
|
||||
auto outputTensors = session.onnx.Run(
|
||||
Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
|
||||
inputTensors.size(), outputNames.data(), outputNames.size());
|
||||
auto endTime = chrono::steady_clock::now();
|
||||
|
||||
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
|
||||
throw runtime_error("Invalid output tensors");
|
||||
}
|
||||
auto inferDuration = chrono::duration<double>(endTime - startTime);
|
||||
result.inferSeconds = inferDuration.count();
|
||||
|
||||
const float *audio = outputTensors.front().GetTensorData<float>();
|
||||
auto audioShape =
|
||||
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
|
||||
int64_t audioCount = audioShape[audioShape.size() - 1];
|
||||
|
||||
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
|
||||
result.realTimeFactor = 0.0;
|
||||
if (result.audioSeconds > 0) {
|
||||
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
||||
}
|
||||
|
||||
// Get max audio value for scaling
|
||||
float maxAudioValue = 0.01f;
|
||||
for (int64_t i = 0; i < audioCount; i++) {
|
||||
float audioValue = abs(audio[i]);
|
||||
if (audioValue > maxAudioValue) {
|
||||
maxAudioValue = audioValue;
|
||||
}
|
||||
}
|
||||
|
||||
// We know the size up front
|
||||
audioBuffer.reserve(audioCount);
|
||||
|
||||
// Scale audio to fill range and convert to int16
|
||||
float audioScale = (MAX_WAV_VALUE / max(0.01f, maxAudioValue));
|
||||
for (int64_t i = 0; i < audioCount; i++) {
|
||||
int16_t intAudioValue = static_cast<int16_t>(
|
||||
clamp(audio[i] * audioScale,
|
||||
static_cast<float>(numeric_limits<int16_t>::min()),
|
||||
static_cast<float>(numeric_limits<int16_t>::max())));
|
||||
|
||||
audioBuffer.push_back(intAudioValue);
|
||||
}
|
||||
|
||||
// Clean up
|
||||
for (size_t i = 0; i < outputTensors.size(); i++) {
|
||||
Ort::detail::OrtRelease(outputTensors[i].release());
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < inputTensors.size(); i++) {
|
||||
Ort::detail::OrtRelease(inputTensors[i].release());
|
||||
}
|
||||
}
|
||||
} // namespace piper
|
||||
|
||||
#endif // SYNTHESIZE_H_
|
||||
+1
-5
@@ -3,8 +3,6 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
namespace piper {
|
||||
|
||||
struct WavHeader {
|
||||
uint8_t RIFF[4] = {'R', 'I', 'F', 'F'};
|
||||
uint32_t chunkSize;
|
||||
@@ -14,7 +12,7 @@ struct WavHeader {
|
||||
uint8_t fmt[4] = {'f', 'm', 't', ' '};
|
||||
uint32_t fmtSize = 16; // bytes
|
||||
uint16_t audioFormat = 1; // PCM
|
||||
uint16_t numChannels; // mono
|
||||
uint16_t numChannels; // mono
|
||||
uint32_t sampleRate; // Hertz
|
||||
uint32_t bytesPerSec; // sampleRate * sampleWidth
|
||||
uint16_t blockAlign = 2; // 16-bit mono
|
||||
@@ -39,6 +37,4 @@ void writeWavHeader(int sampleRate, int sampleWidth, int channels,
|
||||
|
||||
} /* writeWavHeader */
|
||||
|
||||
} // namespace piper
|
||||
|
||||
#endif // WAVFILE_H_
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import sys
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
|
||||
from .phonemize import DEFAULT_PHONEME_ID_MAP
|
||||
|
||||
|
||||
def main() -> None:
    """Summarise phoneme usage for a JSONL dataset read from stdin.

    Each non-blank input line is parsed as a JSON object whose "phonemes"
    entry is iterated one phoneme at a time. Every phoneme is counted, and
    phonemes absent from DEFAULT_PHONEME_ID_MAP are counted separately. A JSON
    report with "used" and "missing" sections is written to stdout; the number
    of distinct missing phonemes is reported on stderr.
    """
    used_phonemes: "Counter[str]" = Counter()
    missing_phonemes: "Counter[str]" = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        utt = json.loads(line)
        for phoneme in utt["phonemes"]:
            used_phonemes[phoneme] += 1

            if phoneme not in DEFAULT_PHONEME_ID_MAP:
                missing_phonemes[phoneme] += 1

    if missing_phonemes:
        print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)

    def describe(counts: "Counter[str]") -> dict:
        """Build the per-phoneme report, most frequent phoneme first."""
        return {
            phoneme: {
                "count": count,
                # NOTE(review): this renders as e.g. "\u0x259" because hex()
                # already adds a "0x" prefix; left unchanged in case a consumer
                # parses it.
                "hex": f"\\u{hex(ord(phoneme))}",
                # Was unicodedata.category(), which duplicated "category".
                # The default avoids ValueError for unnamed code points.
                "name": unicodedata.name(phoneme, ""),
                "category": unicodedata.category(phoneme),
            }
            for phoneme, count in counts.most_common()
        }

    json.dump(
        {
            "used": describe(used_phonemes),
            "missing": describe(missing_phonemes),
        },
        sys.stdout,
    )
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -2,7 +2,6 @@
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
@@ -41,7 +40,6 @@ def main():
|
||||
model_g = model.model_g
|
||||
|
||||
num_symbols = model_g.n_vocab
|
||||
num_speakers = model_g.n_speakers
|
||||
|
||||
# Inference only
|
||||
model_g.eval()
|
||||
|
||||
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import statistics
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import asdict, dataclass
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .norm_audio import make_silence_detector, trim_silence
|
||||
|
||||
_DIR = Path(__file__).parent
|
||||
|
||||
# Removed from the speaking rate calculation
|
||||
_PUNCTUATION = re.compile(".。,,?¿?؟!!;;::-—")
|
||||
|
||||
|
||||
class ExcludeReason(str, Enum):
|
||||
MISSING = "file_missing"
|
||||
EMPTY = "file_empty"
|
||||
LOW = "rate_low"
|
||||
HIGH = "rate_high"
|
||||
|
||||
|
||||
@dataclass
|
||||
class Utterance:
|
||||
id: str
|
||||
text: str
|
||||
duration_sec: float
|
||||
speaker: str
|
||||
exclude_reason: Optional[ExcludeReason] = None
|
||||
rate: float = 0.0
|
||||
|
||||
def __post_init__(self):
|
||||
if self.duration_sec > 0:
|
||||
# Don't include punctuation is speaking rate calculation since we
|
||||
# remove silence.
|
||||
text_nopunct = _PUNCTUATION.sub("", self.text)
|
||||
self.rate = len(text_nopunct) / self.duration_sec
|
||||
|
||||
|
||||
def main():
    """Drop utterances whose speaking rate is an outlier for their speaker.

    Reads "|"-delimited metadata rows from stdin (first column is the file
    name, last column is the text, and the second column is the speaker when
    there are more than two columns). Each utterance is measured with
    ProcessUtterance, and rows whose rate lies within
    [Q1 - scale_lower * IQR, Q3 + scale_upper * IQR] for their speaker are
    written back to stdout. With --write-json, per-speaker statistics and the
    excluded utterances are written to that path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--write-json", help="Path to write information about excluded utterances"
    )
    parser.add_argument(
        "--dataset-dir", default=Path.cwd(), help="Path to dataset directory"
    )
    parser.add_argument("--scale-lower", type=float, default=2.0)
    parser.add_argument("--scale-upper", type=float, default=2.0)
    args = parser.parse_args()

    # Durations are measured by running the ffmpeg executable, so that is the
    # tool that must be present (the check previously looked for ffprobe).
    if not shutil.which("ffmpeg"):
        raise RuntimeError("ffmpeg not found (is ffmpeg installed?)")

    dataset_dir = Path(args.dataset_dir)
    wav_dir = dataset_dir / "wav"
    if not wav_dir.is_dir():
        wav_dir = dataset_dir / "wavs"

    text_and_audio = []
    for row in csv.reader(sys.stdin, delimiter="|"):
        if not row:
            # Blank line in the metadata
            continue

        filename, text = row[0], row[-1]
        speaker = row[1] if len(row) > 2 else "default"

        # Try, in order: relative to the dataset, with ".wav" appended, then
        # the same two forms inside wav/ (or wavs/). If nothing exists the last
        # candidate is kept so the utterance is reported as missing.
        candidates = (
            dataset_dir / filename,
            dataset_dir / f"{filename}.wav",
            wav_dir / filename,
            wav_dir / f"{filename}.wav",
        )
        wav_path = next((path for path in candidates if path.exists()), candidates[-1])

        text_and_audio.append((filename, text, wav_path, speaker))

    # speaker -> [utterance]
    utts_by_speaker = defaultdict(list)
    process_utterance = ProcessUtterance()
    with ThreadPoolExecutor() as executor:
        for utt in executor.map(
            lambda item: process_utterance(*item), text_and_audio
        ):
            utts_by_speaker[utt.speaker].append(utt)

    is_multispeaker = len(utts_by_speaker) > 1
    writer = csv.writer(sys.stdout, delimiter="|")

    speaker_details = {}
    for speaker, utts in utts_by_speaker.items():
        # Utterances already excluded (missing/empty file) carry a rate of 0.0.
        # Keep them out of the statistics and never write them to the output.
        candidates = [utt for utt in utts if utt.exclude_reason is None]
        rates = [utt.rate for utt in candidates]
        if not rates:
            continue

        # Exclude rates well outside the 25%/75% quantiles.
        # statistics.quantiles() needs at least two data points; with a single
        # utterance the bounds collapse onto its own rate, so it is kept.
        if len(rates) > 1:
            rate_qs = statistics.quantiles(rates, n=4)
        else:
            rate_qs = [rates[0]] * 3

        q1 = rate_qs[0]  # 25%
        q3 = rate_qs[-1]  # 75%
        iqr = q3 - q1
        lower = q1 - (args.scale_lower * iqr)
        upper = q3 + (args.scale_upper * iqr)
        speaker_details[speaker] = {
            "min": min(rates),
            "max": max(rates),
            # Key spelling kept as-is so existing readers of the JSON still work
            "quanties": rate_qs,
            "lower": lower,
            "upper": upper,
        }

        for utt in candidates:
            if utt.rate < lower:
                utt.exclude_reason = ExcludeReason.LOW
            elif utt.rate > upper:
                utt.exclude_reason = ExcludeReason.HIGH
            elif is_multispeaker:
                writer.writerow((utt.id, utt.speaker, utt.text))
            else:
                writer.writerow((utt.id, utt.text))

    if args.write_json:
        report = {}
        for speaker, utts in utts_by_speaker.items():
            excluded = [asdict(utt) for utt in utts if utt.exclude_reason is not None]
            report[speaker] = {
                # None when the speaker had no measurable utterances at all
                "details": speaker_details.get(speaker),
                "num_utterances": len(utts),
                "num_excluded": len(excluded),
                "excluded": excluded,
            }

        # ensure_ascii=False emits raw non-ASCII text, so fix the encoding
        # instead of relying on the platform default.
        with open(args.write_json, "w", encoding="utf-8") as json_file:
            json.dump(
                report,
                json_file,
                indent=4,
                ensure_ascii=False,
            )
|
||||
|
||||
|
||||
class ProcessUtterance:
    """Callable that turns one metadata row into a measured Utterance.

    The silence detector is created on first use and stored in thread-local
    data, so each worker thread that calls get_duration() gets its own.
    """

    def __init__(self):
        self.thread_data = threading.local()

    def __call__(
        self, utt_id: str, text: str, wav_path: Path, speaker: str
    ) -> Utterance:
        """Return an Utterance for the row.

        A missing or zero-byte audio file gives a duration of 0.0 and the
        matching exclude reason; otherwise the duration is measured.
        """
        if not wav_path.exists():
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.MISSING,
            )

        if wav_path.stat().st_size == 0:
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.EMPTY,
            )

        return Utterance(utt_id, text, self.get_duration(wav_path), speaker)

    def get_duration(self, audio_path: Path) -> float:
        """Uses ffmpeg to get the speaking duration of an audio file in seconds.

        The file is decoded by ffmpeg to 16 kHz mono 16-bit PCM, peak
        normalised, and passed to trim_silence(). Raises
        subprocess.CalledProcessError if ffmpeg exits with an error.
        """
        if not hasattr(self.thread_data, "detector"):
            self.thread_data.detector = make_silence_detector()

        vad_sample_rate = 16000
        audio_16khz_bytes = subprocess.check_output(
            [
                "ffmpeg",
                "-i",
                str(audio_path),
                "-f",
                "s16le",
                "-acodec",
                "pcm_s16le",
                "-ac",
                "1",
                "-ar",
                str(vad_sample_rate),
                "pipe:",
            ],
            stderr=subprocess.DEVNULL,
        )

        audio_16khz = np.frombuffer(audio_16khz_bytes, dtype=np.int16).astype(
            np.float32
        )
        if audio_16khz.size == 0:
            # Nothing was decoded; np.max() would raise on an empty array
            return 0.0

        # Normalize by the largest magnitude. abs(max(x)) is wrong when the
        # loudest sample is negative, and an all-zero signal must not be
        # divided by zero.
        peak = np.max(np.abs(audio_16khz))
        if peak > 0:
            audio_16khz /= peak

        # Get speaking duration
        offset_sec, duration_sec = trim_silence(
            audio_16khz,
            self.thread_data.detector,
            threshold=0.8,
            samples_per_chunk=480,
            sample_rate=vad_sample_rate,
            keep_chunks_before=2,
            keep_chunks_after=2,
        )

        if duration_sec is None:
            # Speech goes to end of audio
            duration_sec = (len(audio_16khz) / float(vad_sample_rate)) - offset_sec

        return duration_sec
|
||||
|
||||
# return float(
|
||||
# subprocess.check_output(
|
||||
# [
|
||||
# "ffprobe",
|
||||
# "-i",
|
||||
# str(audio_path),
|
||||
# "-show_entries",
|
||||
# "format=duration",
|
||||
# "-v",
|
||||
# "quiet",
|
||||
# "-of",
|
||||
# "csv=p=0",
|
||||
# ],
|
||||
# stderr=subprocess.DEVNULL,
|
||||
# universal_newlines=True,
|
||||
# ).strip()
|
||||
# )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,9 +1,23 @@
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from enum import Enum
|
||||
from typing import Dict, Iterable, List, Mapping, Optional
|
||||
|
||||
from espeak_phonemizer import Phonemizer
|
||||
|
||||
|
||||
class PhonemeType(str, Enum):
|
||||
ESPEAK = "espeak"
|
||||
"""Phonemes come from espeak-ng"""
|
||||
|
||||
TEXT = "text"
|
||||
"""Phonemes come from text itself"""
|
||||
|
||||
|
||||
MAX_PHONEMES = 256
|
||||
DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
||||
"_": [0],
|
||||
"^": [1],
|
||||
@@ -135,14 +149,115 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
|
||||
"χ": [127],
|
||||
"ᵻ": [128],
|
||||
"ⱱ": [129],
|
||||
"0": [130], # tones
|
||||
"1": [131],
|
||||
"2": [132],
|
||||
"3": [133],
|
||||
"4": [134],
|
||||
"5": [135],
|
||||
"6": [136],
|
||||
"7": [137],
|
||||
"8": [138],
|
||||
"9": [139],
|
||||
"\u0327": [140], # combining cedilla
|
||||
"\u0303": [141], # combining tilde
|
||||
"\u032a": [142], # combining bridge below
|
||||
"\u032f": [143], # combining inverted breve below
|
||||
"\u0329": [144], # combining vertical line below
|
||||
"ʰ": [145],
|
||||
"ˤ": [146],
|
||||
"ε": [147],
|
||||
"↓": [148],
|
||||
"#": [149], # Icelandic
|
||||
'"': [150], # Russian
|
||||
"↑": [151],
|
||||
"\u033a": [152], # Basque
|
||||
"\u033b": [153],
|
||||
}
|
||||
|
||||
PHONEME_MAPS = {
|
||||
# Brazilian Portuguese
|
||||
"pt-br": {"c": ["k"]}
|
||||
}
|
||||
|
||||
ALPHABETS = {
|
||||
# Ukrainian
|
||||
"uk": {
|
||||
"_": [0],
|
||||
"^": [1],
|
||||
"$": [2],
|
||||
" ": [3],
|
||||
"!": [4],
|
||||
"'": [5],
|
||||
",": [6],
|
||||
"-": [7],
|
||||
".": [8],
|
||||
":": [9],
|
||||
";": [10],
|
||||
"?": [11],
|
||||
"а": [12],
|
||||
"б": [13],
|
||||
"в": [14],
|
||||
"г": [15],
|
||||
"ґ": [16],
|
||||
"д": [17],
|
||||
"е": [18],
|
||||
"є": [19],
|
||||
"ж": [20],
|
||||
"з": [21],
|
||||
"и": [22],
|
||||
"і": [23],
|
||||
"ї": [24],
|
||||
"й": [25],
|
||||
"к": [26],
|
||||
"л": [27],
|
||||
"м": [28],
|
||||
"н": [29],
|
||||
"о": [30],
|
||||
"п": [31],
|
||||
"р": [32],
|
||||
"с": [33],
|
||||
"т": [34],
|
||||
"у": [35],
|
||||
"ф": [36],
|
||||
"х": [37],
|
||||
"ц": [38],
|
||||
"ч": [39],
|
||||
"ш": [40],
|
||||
"щ": [41],
|
||||
"ь": [42],
|
||||
"ю": [43],
|
||||
"я": [44],
|
||||
"\u0301": [45], # combining acute accent
|
||||
"\u0306": [46], # combining breve
|
||||
"\u0308": [47], # combining diaeresis
|
||||
"—": [48], # em dash
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def phonemize(text: str, phonemizer: Phonemizer) -> List[str]:
|
||||
def phonemize(
|
||||
text: str,
|
||||
phonemizer: Phonemizer,
|
||||
phoneme_map: Optional[Dict[str, List[str]]] = None,
|
||||
) -> List[str]:
|
||||
phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)
|
||||
|
||||
# Phonemes are decomposed into unicode codepoints
|
||||
return list(unicodedata.normalize("NFD", phonemes_str))
|
||||
unmapped_phonemes = list(unicodedata.normalize("NFD", phonemes_str))
|
||||
if not phoneme_map:
|
||||
return unmapped_phonemes
|
||||
|
||||
# Phonemes can be mapped to lists of other phonemes
|
||||
mapped_phonemes = []
|
||||
for phoneme in unmapped_phonemes:
|
||||
sub_phonemes = phoneme_map.get(phoneme)
|
||||
if sub_phonemes:
|
||||
mapped_phonemes.extend(sub_phonemes)
|
||||
else:
|
||||
mapped_phonemes.append(phoneme)
|
||||
|
||||
return mapped_phonemes
|
||||
|
||||
|
||||
def phonemes_to_ids(
|
||||
@@ -179,3 +294,79 @@ def phonemes_to_ids(
|
||||
phoneme_ids.extend(phoneme_id_map[eos])
|
||||
|
||||
return phoneme_ids
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> None:
    """Phonemize lines from stdin and print one JSON object per line.

    Each non-blank input line produces an object with "text", "phonemes" and
    "phoneme_ids" on stdout. With --phoneme-type text the (cased, NFD
    decomposed) characters of the line are used as phonemes and ids come from
    ALPHABETS[language]; otherwise the line is phonemized with eSpeak and ids
    come from DEFAULT_PHONEME_ID_MAP. Phonemes without an id are summarised on
    stderr at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("language")
    parser.add_argument(
        "--phoneme-type",
        choices=list(PhonemeType),
        default=PhonemeType.ESPEAK,
        help="Type of phonemes to use (default: espeak)",
    )
    parser.add_argument(
        "--text-casing",
        choices=("ignore", "lower", "upper", "casefold"),
        default="ignore",
        help="Casing applied to utterance text",
    )
    args = parser.parse_args()

    phonemizer: Optional[Phonemizer] = None

    if args.text_casing == "lower":
        casing = str.lower
    elif args.text_casing == "upper":
        casing = str.upper
    elif args.text_casing == "casefold":
        # Offered as a choice above but previously fell through to "ignore"
        casing = str.casefold
    else:
        # ignore
        casing = lambda s: s

    if args.phoneme_type == PhonemeType.TEXT:
        # Use text directly
        if args.language not in ALPHABETS:
            parser.error(f"No alphabet available for language: {args.language}")

        phoneme_id_map = ALPHABETS[args.language]
    else:
        # Use eSpeak
        phonemizer = Phonemizer(args.language)
        phoneme_id_map = DEFAULT_PHONEME_ID_MAP

    phoneme_map = PHONEME_MAPS.get(args.language)
    missing_phonemes: "Counter[str]" = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        # Apply the requested casing in both modes so --text-casing is not
        # silently ignored when eSpeak is used.
        cased_line = casing(line)

        if args.phoneme_type == PhonemeType.TEXT:
            phonemes = list(unicodedata.normalize("NFD", cased_line))
        else:
            assert phonemizer is not None
            phonemes = phonemize(cased_line, phonemizer, phoneme_map=phoneme_map)

        phoneme_ids = phonemes_to_ids(
            phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes
        )
        json.dump(
            {
                "text": line,
                "phonemes": phonemes,
                "phoneme_ids": phoneme_ids,
            },
            sys.stdout,
            ensure_ascii=False,
        )
        print("")

    if missing_phonemes:
        print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr)
        for phoneme, count in missing_phonemes.most_common():
            print(phoneme, count, file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -6,9 +6,9 @@ import itertools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from multiprocessing import JoinableQueue, Process, Queue
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional
|
||||
@@ -16,7 +16,15 @@ from typing import Dict, Iterable, List, Optional
|
||||
from espeak_phonemizer import Phonemizer
|
||||
|
||||
from .norm_audio import cache_norm_audio, make_silence_detector
|
||||
from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize
|
||||
from .phonemize import (
|
||||
ALPHABETS,
|
||||
DEFAULT_PHONEME_ID_MAP,
|
||||
MAX_PHONEMES,
|
||||
PHONEME_MAPS,
|
||||
PhonemeType,
|
||||
phonemes_to_ids,
|
||||
phonemize,
|
||||
)
|
||||
|
||||
_LOGGER = logging.getLogger("preprocess")
|
||||
|
||||
@@ -49,6 +57,23 @@ def main() -> None:
|
||||
parser.add_argument(
|
||||
"--speaker-id", type=int, help="Add speaker id to single speaker dataset"
|
||||
)
|
||||
#
|
||||
parser.add_argument(
|
||||
"--phoneme-type",
|
||||
choices=list(PhonemeType),
|
||||
default=PhonemeType.ESPEAK,
|
||||
help="Type of phonemes to use (default: espeak)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--text-casing",
|
||||
choices=("ignore", "lower", "upper", "casefold"),
|
||||
default="ignore",
|
||||
help="Casing applied to utterance text",
|
||||
)
|
||||
#
|
||||
parser.add_argument(
|
||||
"--skip-audio", action="store_true", help="Don't preprocess audio"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--debug", action="store_true", help="Print DEBUG messages to the console"
|
||||
)
|
||||
@@ -84,9 +109,9 @@ def main() -> None:
|
||||
|
||||
# Count speakers
|
||||
_LOGGER.debug("Counting number of speakers/utterances in the dataset")
|
||||
speaker_counts: Counter[str] = Counter()
|
||||
speaker_counts: "Counter[str]" = Counter()
|
||||
num_utterances = 0
|
||||
for utt in make_dataset(args.input_dir, args.single_speaker, args.speaker_id):
|
||||
for utt in make_dataset(args):
|
||||
speaker = utt.speaker or ""
|
||||
speaker_counts[speaker] += 1
|
||||
num_utterances += 1
|
||||
@@ -118,11 +143,12 @@ def main() -> None:
|
||||
"voice": args.language,
|
||||
},
|
||||
"inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
|
||||
"phoneme_type": str(args.phoneme_type),
|
||||
"phoneme_map": {},
|
||||
"phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
|
||||
"num_symbols": len(
|
||||
set(itertools.chain.from_iterable(DEFAULT_PHONEME_ID_MAP.values()))
|
||||
),
|
||||
"phoneme_id_map": ALPHABETS[args.language]
|
||||
if args.phoneme_type == PhonemeType.TEXT
|
||||
else DEFAULT_PHONEME_ID_MAP,
|
||||
"num_symbols": MAX_PHONEMES,
|
||||
"num_speakers": len(speaker_counts),
|
||||
"speaker_id_map": speaker_ids,
|
||||
},
|
||||
@@ -142,8 +168,13 @@ def main() -> None:
|
||||
queue_out: "Queue[Optional[Utterance]]" = Queue()
|
||||
|
||||
# Start workers
|
||||
if args.phoneme_type == PhonemeType.TEXT:
|
||||
target = phonemize_batch_text
|
||||
else:
|
||||
target = phonemize_batch_espeak
|
||||
|
||||
processes = [
|
||||
Process(target=process_batch, args=(args, queue_in, queue_out))
|
||||
Process(target=target, args=(args, queue_in, queue_out))
|
||||
for _ in range(args.max_workers)
|
||||
]
|
||||
for proc in processes:
|
||||
@@ -154,27 +185,39 @@ def main() -> None:
|
||||
)
|
||||
with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
|
||||
for utt_batch in batched(
|
||||
make_dataset(args.input_dir, args.single_speaker, args.speaker_id),
|
||||
make_dataset(args),
|
||||
batch_size,
|
||||
):
|
||||
queue_in.put(utt_batch)
|
||||
|
||||
_LOGGER.debug("Waiting for jobs to finish")
|
||||
missing_phonemes: "Counter[str]" = Counter()
|
||||
for _ in range(num_utterances):
|
||||
utt = queue_out.get()
|
||||
if utt is not None:
|
||||
if utt.speaker is not None:
|
||||
utt.speaker_id = speaker_ids[utt.speaker]
|
||||
|
||||
utt_dict = dataclasses.asdict(utt)
|
||||
utt_dict.pop("missing_phonemes")
|
||||
|
||||
# JSONL
|
||||
json.dump(
|
||||
dataclasses.asdict(utt),
|
||||
utt_dict,
|
||||
dataset_file,
|
||||
ensure_ascii=False,
|
||||
cls=PathEncoder,
|
||||
)
|
||||
print("", file=dataset_file)
|
||||
|
||||
missing_phonemes.update(utt.missing_phonemes)
|
||||
|
||||
if missing_phonemes:
|
||||
for phoneme, count in missing_phonemes.most_common():
|
||||
_LOGGER.warning("Missing %s (%s)", phoneme, count)
|
||||
|
||||
_LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))
|
||||
|
||||
# Signal workers to stop
|
||||
for proc in processes:
|
||||
queue_in.put(None)
|
||||
@@ -187,10 +230,27 @@ def main() -> None:
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue):
|
||||
def get_text_casing(casing: str):
    """Return the callable that applies the named casing to a string.

    "lower", "upper" and "casefold" map to the matching ``str`` method; any
    other name yields a function that returns its argument unchanged.
    """
    transforms = {
        "lower": str.lower,
        "upper": str.upper,
        "casefold": str.casefold,
    }

    transform = transforms.get(casing)
    if transform is None:
        return lambda s: s

    return transform
|
||||
|
||||
|
||||
def phonemize_batch_espeak(
|
||||
args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
|
||||
):
|
||||
try:
|
||||
casing = get_text_casing(args.text_casing)
|
||||
silence_detector = make_silence_detector()
|
||||
phonemizer = Phonemizer(default_voice=args.language)
|
||||
phoneme_map = PHONEME_MAPS.get(args.language)
|
||||
|
||||
while True:
|
||||
utt_batch = queue_in.get()
|
||||
@@ -200,14 +260,20 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
|
||||
for utt in utt_batch:
|
||||
try:
|
||||
_LOGGER.debug(utt)
|
||||
utt.phonemes = phonemize(utt.text, phonemizer)
|
||||
utt.phoneme_ids = phonemes_to_ids(utt.phonemes)
|
||||
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
|
||||
utt.audio_path,
|
||||
args.cache_dir,
|
||||
silence_detector,
|
||||
args.sample_rate,
|
||||
utt.phonemes = phonemize(
|
||||
casing(utt.text), phonemizer, phoneme_map=phoneme_map
|
||||
)
|
||||
utt.phoneme_ids = phonemes_to_ids(
|
||||
utt.phonemes,
|
||||
missing_phonemes=utt.missing_phonemes,
|
||||
)
|
||||
if not args.skip_audio:
|
||||
utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
|
||||
utt.audio_path,
|
||||
args.cache_dir,
|
||||
silence_detector,
|
||||
args.sample_rate,
|
||||
)
|
||||
queue_out.put(utt)
|
||||
except TimeoutError:
|
||||
_LOGGER.error("Skipping utterance due to timeout: %s", utt)
|
||||
@@ -217,7 +283,48 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
|
||||
|
||||
queue_in.task_done()
|
||||
except Exception:
|
||||
_LOGGER.exception("process_batch")
|
||||
_LOGGER.exception("phonemize_batch_espeak")
|
||||
|
||||
|
||||
def phonemize_batch_text(
    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
    """Worker loop that uses the text itself as phonemes.

    Batches of utterances are taken from ``queue_in`` until a ``None`` batch
    arrives. For every utterance the cased, NFD-decomposed characters of its
    text become the phonemes, ids are looked up in ALPHABETS[args.language],
    and audio is normalised unless ``args.skip_audio`` is set. Exactly one item
    is put on ``queue_out`` per utterance: the utterance on success, ``None``
    on any failure.
    """
    try:
        casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()
        alphabet = ALPHABETS[args.language]

        while True:
            utt_batch = queue_in.get()
            if utt_batch is None:
                break

            for utt in utt_batch:
                try:
                    _LOGGER.debug(utt)
                    utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
                    utt.phoneme_ids = phonemes_to_ids(
                        utt.phonemes,
                        phoneme_id_map=alphabet,
                        missing_phonemes=utt.missing_phonemes,
                    )
                    if not args.skip_audio:
                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
                            utt.audio_path,
                            args.cache_dir,
                            silence_detector,
                            args.sample_rate,
                        )
                    queue_out.put(utt)
                except TimeoutError:
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
                    # The consumer reads one result per utterance, so a skipped
                    # utterance must still produce a placeholder or it would
                    # wait forever.
                    queue_out.put(None)
                except Exception:
                    _LOGGER.exception("Failed to process utterance: %s", utt)
                    queue_out.put(None)

            queue_in.task_done()
    except Exception:
        _LOGGER.exception("phonemize_batch_text")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@@ -233,6 +340,7 @@ class Utterance:
|
||||
phoneme_ids: Optional[List[int]] = None
|
||||
audio_norm_path: Optional[Path] = None
|
||||
audio_spec_path: Optional[Path] = None
|
||||
missing_phonemes: "Counter[str]" = field(default_factory=Counter)
|
||||
|
||||
|
||||
class PathEncoder(json.JSONEncoder):
|
||||
@@ -242,9 +350,12 @@ class PathEncoder(json.JSONEncoder):
|
||||
return super().default(o)
|
||||
|
||||
|
||||
def ljspeech_dataset(
|
||||
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
|
||||
) -> Iterable[Utterance]:
|
||||
def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
|
||||
dataset_dir = args.input_dir
|
||||
is_single_speaker = args.single_speaker
|
||||
speaker_id = args.speaker_id
|
||||
skip_audio = args.skip_audio
|
||||
|
||||
# filename|speaker|text
|
||||
# speaker is optional
|
||||
metadata_path = dataset_dir / "metadata.csv"
|
||||
@@ -257,7 +368,7 @@ def ljspeech_dataset(
|
||||
with open(metadata_path, "r", encoding="utf-8") as csv_file:
|
||||
reader = csv.reader(csv_file, delimiter="|")
|
||||
for row in reader:
|
||||
assert len(row) >= 2, "Not enough colums"
|
||||
assert len(row) >= 2, "Not enough columns"
|
||||
|
||||
speaker: Optional[str] = None
|
||||
if is_single_speaker or (len(row) == 2):
|
||||
@@ -280,18 +391,25 @@ def ljspeech_dataset(
|
||||
# Try with .wav
|
||||
wav_path = wav_dir / f"{filename}.wav"
|
||||
|
||||
if not wav_path.exists():
|
||||
_LOGGER.warning("Missing %s", filename)
|
||||
continue
|
||||
if not skip_audio:
|
||||
if not wav_path.exists():
|
||||
_LOGGER.warning("Missing %s", filename)
|
||||
continue
|
||||
|
||||
if wav_path.stat().st_size == 0:
|
||||
_LOGGER.warning("Empty file: %s", wav_path)
|
||||
continue
|
||||
|
||||
yield Utterance(
|
||||
text=text, audio_path=wav_path, speaker=speaker, speaker_id=speaker_id
|
||||
)
|
||||
|
||||
|
||||
def mycroft_dataset(
|
||||
dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
|
||||
) -> Iterable[Utterance]:
|
||||
def mycroft_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
|
||||
dataset_dir = args.input_dir
|
||||
is_single_speaker = args.single_speaker
|
||||
skip_audio = args.skip_audio
|
||||
|
||||
speaker_id = 0
|
||||
for metadata_path in dataset_dir.glob("**/*-metadata.txt"):
|
||||
speaker = metadata_path.parent.name if not is_single_speaker else None
|
||||
@@ -301,15 +419,15 @@ def mycroft_dataset(
|
||||
for row in reader:
|
||||
filename, text = row[0], row[1]
|
||||
wav_path = metadata_path.parent / filename
|
||||
yield Utterance(
|
||||
text=text,
|
||||
audio_path=wav_path,
|
||||
speaker=speaker,
|
||||
speaker_id=speaker_id if not is_single_speaker else None,
|
||||
)
|
||||
if skip_audio or (wav_path.exists() and (wav_path.stat().st_size > 0)):
|
||||
yield Utterance(
|
||||
text=text,
|
||||
audio_path=wav_path,
|
||||
speaker=speaker,
|
||||
speaker_id=speaker_id if not is_single_speaker else None,
|
||||
)
|
||||
speaker_id += 1
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import csv
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
|
||||
def main():
    """Extract one speaker's utterances from multi-speaker metadata.

    Reads "|"-delimited rows from stdin (first column audio, second column
    speaker, last column text) and writes "audio|text" rows to stdout for the
    chosen speaker. The speaker is selected by --speaker-name, or by
    --speaker-number, which is a zero-based index into the speakers ordered by
    utterance count, highest first. When selecting by number, the chosen
    speaker's name is printed to stderr. If both options are given, the name
    wins.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--speaker-number", type=int)
    parser.add_argument("--speaker-name")
    args = parser.parse_args()

    # Not an assert: assertions are stripped when Python runs with -O
    if (args.speaker_number is None) and (args.speaker_name is None):
        parser.error("one of --speaker-number or --speaker-name is required")

    reader = csv.reader(sys.stdin, delimiter="|")
    writer = csv.writer(sys.stdout, delimiter="|")

    if args.speaker_name is not None:
        for row in reader:
            if len(row) < 2:
                # Blank or malformed line
                continue

            audio, speaker_id, text = row[0], row[1], row[-1]
            if args.speaker_name == speaker_id:
                writer.writerow((audio, text))
    else:
        utterances = defaultdict(list)
        counts = Counter()
        for row in reader:
            if len(row) < 2:
                # Blank or malformed line
                continue

            audio, speaker_id, text = row[0], row[1], row[-1]
            utterances[speaker_id].append((audio, text))
            counts[speaker_id] += 1

        for i, (speaker_id, _count) in enumerate(counts.most_common()):
            if i == args.speaker_number:
                for row in utterances[speaker_id]:
                    writer.writerow(row)

                print(speaker_id, file=sys.stderr)
                break
        else:
            # Loop finished without a match: say so instead of silently
            # producing empty output.
            print(
                "No speaker with number",
                args.speaker_number,
                "(speakers found:",
                str(len(counts)) + ")",
                file=sys.stderr,
            )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -8,7 +8,8 @@ docker run \
|
||||
--user "$(id -u):$(id -g)" \
|
||||
--ipc=host \
|
||||
-v "${HOME}:${HOME}" \
|
||||
-v /media/cache:/media/cache:ro \
|
||||
-v /etc/hostname:/etc/hostname:ro \
|
||||
-v /etc/localtime:/etc/localtime:ro \
|
||||
piper-train \
|
||||
larynx2-train \
|
||||
"$@"
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import wave
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
@@ -9,6 +10,8 @@ import numpy as np
|
||||
import onnxruntime
|
||||
from espeak_phonemizer import Phonemizer
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
_BOS = "^"
|
||||
_EOS = "$"
|
||||
_PAD = "_"
|
||||
@@ -69,8 +72,11 @@ class Piper:
|
||||
phoneme_ids: List[int] = []
|
||||
|
||||
for phoneme in phonemes:
|
||||
phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
|
||||
phoneme_ids.extend(self.config.phoneme_id_map[_PAD])
|
||||
if phoneme in self.config.phoneme_id_map:
|
||||
phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
|
||||
phoneme_ids.extend(self.config.phoneme_id_map[_PAD])
|
||||
else:
|
||||
_LOGGER.warning("No id for phoneme: %s", phoneme)
|
||||
|
||||
phoneme_ids.extend(self.config.phoneme_id_map[_EOS])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user