#include #include #include #include #include #include #include #include #include #include #include #include #ifdef _MSC_VER #define WIN32_LEAN_AND_MEAN #define NOMINMAX #include #endif #ifdef __APPLE__ #include #endif #include #include #include "piper.hpp" using namespace std; enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW }; struct RunConfig { // Path to .onnx voice file filesystem::path modelPath; // Path to JSON voice config file filesystem::path modelConfigPath; // Type of output to produce. // Default is to write a WAV file in the current directory. OutputType outputType = OUTPUT_DIRECTORY; // Path for output optional outputPath = filesystem::path("."); // Numerical id of the default speaker (multi-speaker voices) optional speakerId; // Amount of noise to add during audio generation optional noiseScale; // Speed of speaking (1 = normal, < 1 is faster, > 1 is slower) optional lengthScale; // Variation in phoneme lengths optional noiseW; // Seconds of silence to add after each sentence optional sentenceSilenceSeconds; // Path to espeak-ng data directory (default is next to piper executable) optional eSpeakDataPath; // Path to libtashkeel ort model // https://github.com/mush42/libtashkeel/ optional tashkeelModelPath; }; void parseArgs(int argc, char *argv[], RunConfig &runConfig); void rawOutputProc(vector &sharedAudioBuffer, mutex &mutAudio, condition_variable &cvAudio, bool &audioReady, bool &audioFinished); // ---------------------------------------------------------------------------- int main(int argc, char *argv[]) { spdlog::set_default_logger(spdlog::stderr_color_st("piper")); RunConfig runConfig; parseArgs(argc, argv, runConfig); piper::PiperConfig piperConfig; piper::Voice voice; spdlog::debug("Loading voice from {} (config={})", runConfig.modelPath.string(), runConfig.modelConfigPath.string()); auto startTime = chrono::steady_clock::now(); loadVoice(piperConfig, runConfig.modelPath.string(), runConfig.modelConfigPath.string(), voice, runConfig.speakerId); auto endTime = chrono::steady_clock::now(); spdlog::info("Loaded voice in {} second(s)", chrono::duration(endTime - startTime).count()); // Get the path to the piper executable so we can locate espeak-ng-data, etc. // next to it. #ifdef _MSC_VER auto exePath = []() { wchar_t moduleFileName[MAX_PATH] = {0}; GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName)); return filesystem::path(moduleFileName); }(); #elifdef __APPLE__ auto exePath = []() { char moduleFileName[PATH_MAX] = {0}; uint32_t moduleFileNameSize = std::size(moduleFileName); _NSGetExecutablePath(moduleFileName, &moduleFileNameSize); return filesystem::path(moduleFileName); }(); #else auto exePath = filesystem::canonical("/proc/self/exe"); #endif if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) { spdlog::debug("Voice uses eSpeak phonemes ({})", voice.phonemizeConfig.eSpeak.voice); if (runConfig.eSpeakDataPath) { // User provided path piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string(); } else { // Assume next to piper executable piperConfig.eSpeakDataPath = std::filesystem::absolute( exePath.parent_path().append("espeak-ng-data")) .string(); spdlog::debug("espeak-ng-data directory is expected at {}", piperConfig.eSpeakDataPath); } } else { // Not using eSpeak piperConfig.useESpeak = false; } // Enable libtashkeel for Arabic if (voice.phonemizeConfig.eSpeak.voice == "ar") { piperConfig.useTashkeel = true; if (runConfig.tashkeelModelPath) { // User provided path piperConfig.tashkeelModelPath = runConfig.tashkeelModelPath.value().string(); } else { // Assume next to piper executable piperConfig.tashkeelModelPath = std::filesystem::absolute( exePath.parent_path().append("libtashkeel_model.ort")) .string(); spdlog::debug("libtashkeel model is expected at {}", piperConfig.tashkeelModelPath.value()); } } piper::initialize(piperConfig); // Scales if (runConfig.noiseScale) { voice.synthesisConfig.noiseScale = runConfig.noiseScale.value(); } if (runConfig.lengthScale) { voice.synthesisConfig.lengthScale = runConfig.lengthScale.value(); } if (runConfig.noiseW) { voice.synthesisConfig.noiseW = runConfig.noiseW.value(); } if (runConfig.sentenceSilenceSeconds) { voice.synthesisConfig.sentenceSilenceSeconds = runConfig.sentenceSilenceSeconds.value(); } if (runConfig.outputType == OUTPUT_DIRECTORY) { runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value()); spdlog::info("Output directory: {}", runConfig.outputPath.value().string()); } string line; piper::SynthesisResult result; while (getline(cin, line)) { // Path to output WAV file const auto now = chrono::system_clock::now(); const auto timestamp = chrono::duration_cast(now.time_since_epoch()) .count(); if (runConfig.outputType == OUTPUT_DIRECTORY) { stringstream outputName; outputName << timestamp << ".wav"; filesystem::path outputPath = runConfig.outputPath.value(); outputPath.append(outputName.str()); // Output audio to automatically-named WAV file in a directory ofstream audioFile(outputPath.string(), ios::binary); piper::textToWavFile(piperConfig, voice, line, audioFile, result); cout << outputPath.string() << endl; } else if (runConfig.outputType == OUTPUT_FILE) { // Read all of standard input before synthesizing. // Otherwise, we would overwrite the output file for each line. stringstream text; text << line; while (getline(cin, line)) { text << " " << line; } // Output audio to WAV file ofstream audioFile(runConfig.outputPath.value().string(), ios::binary); piper::textToWavFile(piperConfig, voice, text.str(), audioFile, result); } else if (runConfig.outputType == OUTPUT_STDOUT) { // Output WAV to stdout piper::textToWavFile(piperConfig, voice, line, cout, result); } else if (runConfig.outputType == OUTPUT_RAW) { // Raw output to stdout mutex mutAudio; condition_variable cvAudio; bool audioReady = false; bool audioFinished = false; vector audioBuffer; vector sharedAudioBuffer; thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer), ref(mutAudio), ref(cvAudio), ref(audioReady), ref(audioFinished)); auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio, &cvAudio, &audioReady]() { // Signal thread that audio is ready { unique_lock lockAudio(mutAudio); copy(audioBuffer.begin(), audioBuffer.end(), back_inserter(sharedAudioBuffer)); audioReady = true; cvAudio.notify_one(); } }; piper::textToAudio(piperConfig, voice, line, audioBuffer, result, audioCallback); // Signal thread that there is no more audio { unique_lock lockAudio(mutAudio); audioReady = true; audioFinished = true; cvAudio.notify_one(); } // Wait for audio output to finish spdlog::info("Waiting for audio to finish playing..."); rawOutputThread.join(); } spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)", result.realTimeFactor, result.inferSeconds, result.audioSeconds); } piper::terminate(piperConfig); return EXIT_SUCCESS; } // ---------------------------------------------------------------------------- void rawOutputProc(vector &sharedAudioBuffer, mutex &mutAudio, condition_variable &cvAudio, bool &audioReady, bool &audioFinished) { vector internalAudioBuffer; while (true) { { unique_lock lockAudio{mutAudio}; cvAudio.wait(lockAudio, [&audioReady] { return audioReady; }); if (sharedAudioBuffer.empty() && audioFinished) { break; } copy(sharedAudioBuffer.begin(), sharedAudioBuffer.end(), back_inserter(internalAudioBuffer)); sharedAudioBuffer.clear(); if (!audioFinished) { audioReady = false; } } cout.write((const char *)internalAudioBuffer.data(), sizeof(int16_t) * internalAudioBuffer.size()); cout.flush(); internalAudioBuffer.clear(); } } // rawOutputProc // ---------------------------------------------------------------------------- void printUsage(char *argv[]) { cerr << endl; cerr << "usage: " << argv[0] << " [options]" << endl; cerr << endl; cerr << "options:" << endl; cerr << " -h --help show this message and exit" << endl; cerr << " -m FILE --model FILE path to onnx model file" << endl; cerr << " -c FILE --config FILE path to model config file " "(default: model path + .json)" << endl; cerr << " -f FILE --output_file FILE path to output WAV file ('-' for " "stdout)" << endl; cerr << " -d DIR --output_dir DIR path to output directory (default: " "cwd)" << endl; cerr << " --output_raw output raw audio to stdout as it " "becomes available" << endl; cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << endl; cerr << " --noise_scale NUM generator noise (default: 0.667)" << endl; cerr << " --length_scale NUM phoneme length (default: 1.0)" << endl; cerr << " --noise_w NUM phoneme width noise (default: 0.8)" << endl; cerr << " --silence_seconds NUM seconds of silence after each " "sentence (default: 0.2)" << endl; cerr << " --espeak_data DIR path to espeak-ng data directory" << endl; cerr << " --tashkeel_model FILE path to libtashkeel onnx model " "(arabic)" << endl; cerr << " --debug print DEBUG messages to the console" << endl; cerr << endl; } void ensureArg(int argc, char *argv[], int argi) { if ((argi + 1) >= argc) { printUsage(argv); exit(0); } } // Parse command-line arguments void parseArgs(int argc, char *argv[], RunConfig &runConfig) { optional modelConfigPath; for (int i = 1; i < argc; i++) { std::string arg = argv[i]; if (arg == "-m" || arg == "--model") { ensureArg(argc, argv, i); runConfig.modelPath = filesystem::path(argv[++i]); } else if (arg == "-c" || arg == "--config") { ensureArg(argc, argv, i); modelConfigPath = filesystem::path(argv[++i]); } else if (arg == "-f" || arg == "--output_file" || arg == "--output-file") { ensureArg(argc, argv, i); std::string filePath = argv[++i]; if (filePath == "-") { runConfig.outputType = OUTPUT_STDOUT; runConfig.outputPath = nullopt; } else { runConfig.outputType = OUTPUT_FILE; runConfig.outputPath = filesystem::path(filePath); } } else if (arg == "-d" || arg == "--output_dir" || arg == "output-dir") { ensureArg(argc, argv, i); runConfig.outputType = OUTPUT_DIRECTORY; runConfig.outputPath = filesystem::path(argv[++i]); } else if (arg == "--output_raw" || arg == "--output-raw") { runConfig.outputType = OUTPUT_RAW; } else if (arg == "-s" || arg == "--speaker") { ensureArg(argc, argv, i); runConfig.speakerId = (piper::SpeakerId)stol(argv[++i]); } else if (arg == "--noise_scale" || arg == "--noise-scale") { ensureArg(argc, argv, i); runConfig.noiseScale = stof(argv[++i]); } else if (arg == "--length_scale" || arg == "--length-scale") { ensureArg(argc, argv, i); runConfig.lengthScale = stof(argv[++i]); } else if (arg == "--noise_w" || arg == "--noise-w") { ensureArg(argc, argv, i); runConfig.noiseW = stof(argv[++i]); } else if (arg == "--sentence_silence" || arg == "--sentence-silence") { ensureArg(argc, argv, i); runConfig.sentenceSilenceSeconds = stof(argv[++i]); } else if (arg == "--espeak_data" || arg == "--espeak-data") { ensureArg(argc, argv, i); runConfig.eSpeakDataPath = filesystem::path(argv[++i]); } else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") { ensureArg(argc, argv, i); runConfig.tashkeelModelPath = filesystem::path(argv[++i]); } else if (arg == "--debug") { // Set DEBUG logging spdlog::set_level(spdlog::level::debug); } else if (arg == "-h" || arg == "--help") { printUsage(argv); exit(0); } } // Verify model file exists ifstream modelFile(runConfig.modelPath.c_str(), ios::binary); if (!modelFile.good()) { throw runtime_error("Model file doesn't exist"); } if (!modelConfigPath) { runConfig.modelConfigPath = filesystem::path(runConfig.modelPath.string() + ".json"); } else { runConfig.modelConfigPath = modelConfigPath.value(); } // Verify model config exists ifstream modelConfigFile(runConfig.modelConfigPath.c_str()); if (!modelConfigFile.good()) { throw runtime_error("Model config doesn't exist"); } }