piper/src/cpp/main.cpp

#include <chrono>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#ifdef HAVE_PCAUDIO
// https://github.com/espeak-ng/pcaudiolib
#include <pcaudiolib/audio.h>
#endif

#include "larynx.hpp"

using namespace std;

enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_PLAY };

struct RunConfig {
  filesystem::path modelPath;
  filesystem::path modelConfigPath;
  OutputType outputType = OUTPUT_PLAY;
  optional<filesystem::path> outputPath;
  optional<larynx::SpeakerId> speakerId;
};

void parseArgs(int argc, char *argv[], RunConfig &runConfig);

int main(int argc, char *argv[]) {
  RunConfig runConfig;
  parseArgs(argc, argv, runConfig);

  larynx::initialize();

  larynx::Voice voice;
  auto startTime = chrono::steady_clock::now();
  loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(),
            voice, runConfig.speakerId);
  auto endTime = chrono::steady_clock::now();
  auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
  cerr << "Load time: " << loadSeconds << " sec" << endl;

#ifdef HAVE_PCAUDIO
  audio_object *my_audio = nullptr;

  if (runConfig.outputType == OUTPUT_PLAY) {
    // Output audio to the default audio device
    my_audio = create_audio_device_object(NULL, "larynx", "Text-to-Speech");

    // TODO: Support 32-bit sample widths
    auto audioFormat = AUDIO_OBJECT_FORMAT_S16LE;
    int error = audio_object_open(my_audio, audioFormat,
                                  voice.synthesisConfig.sampleRate,
                                  voice.synthesisConfig.channels);
    if (error != 0) {
      throw runtime_error(audio_object_strerror(my_audio, error));
    }
  }
#else
  if (runConfig.outputType == OUTPUT_PLAY) {
    // Cannot play audio directly
    cerr << "WARNING: Larynx was not compiled with pcaudiolib. Output audio "
            "will be written to the current directory."
         << endl;
    runConfig.outputType = OUTPUT_DIRECTORY;
    runConfig.outputPath = filesystem::path(".");
  }
#endif

  if (runConfig.outputType == OUTPUT_DIRECTORY) {
    runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
    cerr << "Output directory: " << runConfig.outputPath.value() << endl;
  }

  string line;
  larynx::SynthesisResult result;
  while (getline(cin, line)) {

    // Path to output WAV file
    const auto now = chrono::system_clock::now();
    const auto timestamp =
        chrono::duration_cast<chrono::seconds>(now.time_since_epoch()).count();

    if (runConfig.outputType == OUTPUT_DIRECTORY) {
      stringstream outputName;
      outputName << timestamp << ".wav";
      filesystem::path outputPath = runConfig.outputPath.value();
      outputPath.append(outputName.str());

      // Output audio to automatically-named WAV file in a directory
      ofstream audioFile(outputPath.string(), ios::binary);
      larynx::textToWavFile(voice, line, audioFile, result);
      cout << outputPath.string() << endl;
    } else if (runConfig.outputType == OUTPUT_FILE) {
      // Output audio to WAV file
      ofstream audioFile(runConfig.outputPath.value().string(), ios::binary);
      larynx::textToWavFile(voice, line, audioFile, result);
    } else if (runConfig.outputType == OUTPUT_STDOUT) {
      // Output WAV to stdout
      larynx::textToWavFile(voice, line, cout, result);
    } else if (runConfig.outputType == OUTPUT_PLAY) {
#ifdef HAVE_PCAUDIO
      vector<int16_t> audioBuffer;
      larynx::textToAudio(voice, line, audioBuffer, result);

      int error = audio_object_write(my_audio, (const char *)audioBuffer.data(),
                                     sizeof(int16_t) * audioBuffer.size());
      if (error != 0) {
        throw runtime_error(audio_object_strerror(my_audio, error));
      }
      audio_object_flush(my_audio);
#else
      throw runtime_error("Cannot play audio! Not compiled with pcaudiolib.");
#endif
    }

    cerr << "Real-time factor: " << result.realTimeFactor
         << " (infer=" << result.inferSeconds
         << " sec, audio=" << result.audioSeconds << " sec)" << endl;
  }

  larynx::terminate();

#ifdef HAVE_PCAUDIO
  audio_object_close(my_audio);
  audio_object_destroy(my_audio);
  my_audio = nullptr;
#endif

  return EXIT_SUCCESS;
}

void printUsage(char *argv[]) {
  cerr << endl;
  cerr << "usage: " << argv[0] << " [options]" << endl;
  cerr << endl;
  cerr << "options:" << endl;
  cerr << "   -h        --help              show this message and exit" << endl;
  cerr << "   -m  FILE  --model       FILE  path to onnx model file" << endl;
  cerr << "   -c  FILE  --config      FILE  path to model config file "
          "(default: model path + .json)"
       << endl;
  cerr << "   -f  FILE  --output_file FILE  path to output WAV file ('-' for "
          "stdout)"
       << endl;
  cerr << "   -d  DIR   --output_dir  DIR   path to output directory (default: "
          "cwd)"
       << endl;
  cerr << "   -s  NUM   --speaker     NUM   id of speaker (default: 0)" << endl;
  cerr << endl;
}

void ensureArg(int argc, char *argv[], int argi) {
  if ((argi + 1) >= argc) {
    printUsage(argv);
    exit(0);
  }
}

// Parse command-line arguments
void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
  optional<filesystem::path> modelConfigPath;

  for (int i = 1; i < argc; i++) {
    std::string arg = argv[i];

    if (arg == "-m" || arg == "--model") {
      ensureArg(argc, argv, i);
      runConfig.modelPath = filesystem::path(argv[++i]);
    } else if (arg == "-c" || arg == "--config") {
      ensureArg(argc, argv, i);
      modelConfigPath = filesystem::path(argv[++i]);
    } else if (arg == "-f" || arg == "--output_file") {
      ensureArg(argc, argv, i);
      std::string filePath = argv[++i];
      if (filePath == "-") {
        runConfig.outputType = OUTPUT_STDOUT;
        runConfig.outputPath = nullopt;
      } else {
        runConfig.outputType = OUTPUT_FILE;
        runConfig.outputPath = filesystem::path(filePath);
      }
    } else if (arg == "-d" || arg == "--output_dir") {
      ensureArg(argc, argv, i);
      runConfig.outputType = OUTPUT_DIRECTORY;
      runConfig.outputPath = filesystem::path(argv[++i]);
    } else if (arg == "-s" || arg == "--speaker") {
      ensureArg(argc, argv, i);
      runConfig.speakerId = (larynx::SpeakerId)stoll(argv[++i]);
    } else if (arg == "-h" || arg == "--help") {
      printUsage(argv);
      exit(0);
    }
  }

  // Verify model file exists
  ifstream modelFile(runConfig.modelPath.c_str(), ios::binary);
  if (!modelFile.good()) {
    throw runtime_error("Model file doesn't exist");
  }

  if (!modelConfigPath) {
    runConfig.modelConfigPath =
        filesystem::path(runConfig.modelPath.string() + ".json");
  } else {
    runConfig.modelConfigPath = modelConfigPath.value();
  }

  // Verify model config exists
  ifstream modelConfigFile(runConfig.modelConfigPath.c_str());
  if (!modelConfigFile.good()) {
    throw runtime_error("Model config doesn't exist");
  }
}