Merge branch 'rhasspy:master' into master

2026-06-18 17:22:27 +00:00 · 2023-06-10 09:15:39 -05:00
parent 1052b7e38c d94387374f
commit 49cb95f157
53 changed files with 1967 additions and 888 deletions
@@ -1,47 +1,37 @@
 cmake_minimum_required(VERSION 3.13)

-include(CheckIncludeFileCXX)
-
 project(piper C CXX)

-set(CMAKE_CXX_STANDARD 20)
+find_package(PkgConfig REQUIRED)  # pkg_check_modules() below only exists when PkgConfig is found
+pkg_check_modules(SPDLOG REQUIRED spdlog)
+
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)

-ADD_EXECUTABLE(piper main.cpp)
+ADD_EXECUTABLE(piper main.cpp piper.cpp)

 string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wl,-rpath,'$ORIGIN'")
 string(APPEND CMAKE_C_FLAGS " -Wall -Wextra")

-find_package(PkgConfig)
-pkg_check_modules(ESPEAK_NG REQUIRED espeak-ng<2)
-
-# https://github.com/espeak-ng/pcaudiolib
-check_include_file_cxx("pcaudiolib/audio.h" PCAUDIO_INCLUDE_FOUND)
-
-if(PCAUDIO_INCLUDE_FOUND)
-  option(USE_PCAUDIO "Build with pcaudiolib" ON)
-  if(USE_PCAUDIO)
-    target_compile_definitions(piper PUBLIC HAVE_PCAUDIO)
-    set(PCAUDIO_LIBRARIES "pcaudio")
-  endif()
-endif()
-
-set(ONNXRUNTIME_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR})
+set(PIPER_PHONEMIZE_ROOTDIR ${CMAKE_CURRENT_LIST_DIR}/../../lib/${CMAKE_HOST_SYSTEM_NAME}-${CMAKE_HOST_SYSTEM_PROCESSOR}/piper_phonemize)

 target_link_libraries(piper
+                      piper_phonemize
+                      espeak-ng
                      onnxruntime
                      pthread
-                      -static-libgcc -static-libstdc++
-                      ${ESPEAK_NG_LIBRARIES}
-                      ${PCAUDIO_LIBRARIES})
+                      ${SPDLOG_LIBRARIES})
+
+if(NOT APPLE)
+  target_link_libraries(piper -static-libgcc -static-libstdc++)
+endif()

 target_link_directories(piper PUBLIC
-                        ${ESPEAK_NG_LIBRARY_DIRS}
-                        ${ONNXRUNTIME_ROOTDIR}/lib)
+                        ${PIPER_PHONEMIZE_ROOTDIR}/lib)

 target_include_directories(piper PUBLIC
-                           ${ONNXRUNTIME_ROOTDIR}/include
-                           ${ESPEAK_NG_INCLUDE_DIRS})
+                           ${PIPER_PHONEMIZE_ROOTDIR}/include
+                           ${SPDLOG_INCLUDE_DIRS})

 target_compile_options(piper PUBLIC
-                       ${ESPEAK_NG_CFLAGS_OTHER})
+                       ${SPDLOG_CFLAGS_OTHER})
@@ -1,155 +0,0 @@
-#ifndef CONFIG_H_
-#define CONFIG_H_
-
-#include <filesystem>
-#include <map>
-#include <optional>
-#include <set>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-#include "json.hpp"
-#include "utf8.h"
-
-using namespace std;
-using json = nlohmann::json;
-
-namespace piper {
-
-typedef char32_t Phoneme;
-typedef int64_t PhonemeId;
-typedef int64_t SpeakerId;
-
-const string DefaultVoice = "en-us";
-
-enum eSpeakMode { Text, TextWithPhonemes, SSML };
-
-struct eSpeakConfig {
-  string voice = DefaultVoice;
-  eSpeakMode mode = Text;
-
-  // Characters that eSpeak uses to break apart paragraphs/sentences
-  set<Phoneme> clauseBreakers{U'.', U'?', U'!', U',', U';', U':'};
-
-  Phoneme fullStop = U'.';
-  Phoneme comma = U',';
-  Phoneme question = U'?';
-  Phoneme exclamation = U'!';
-};
-
-struct PhonemizeConfig {
-  optional<map<Phoneme, vector<Phoneme>>> phonemeMap;
-  map<Phoneme, vector<PhonemeId>> phonemeIdMap;
-
-  PhonemeId idPad = 0; // padding (optionally interspersed)
-  PhonemeId idBos = 1; // beginning of sentence
-  PhonemeId idEos = 2; // end of sentence
-  bool interspersePad = true;
-
-  optional<eSpeakConfig> eSpeak;
-};
-
-struct SynthesisConfig {
-  float noiseScale = 0.667f;
-  float lengthScale = 1.0f;
-  float noiseW = 0.8f;
-  int sampleRate = 22050;
-  int sampleWidth = 2; // 16-bit
-  int channels = 1;    // mono
-  optional<SpeakerId> speakerId;
-  float sentenceSilenceSeconds = 0.2f;
-};
-
-struct ModelConfig {
-  int numSpeakers;
-};
-
-bool isSingleCodepoint(string s) {
-  return utf8::distance(s.begin(), s.end()) == 1;
-}
-
-Phoneme getCodepoint(string s) {
-  utf8::iterator character_iter(s.begin(), s.begin(), s.end());
-  return *character_iter;
-}
-
-void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
-
-  if (configRoot.contains("espeak")) {
-    if (!phonemizeConfig.eSpeak) {
-      phonemizeConfig.eSpeak.emplace();
-    }
-
-    auto espeakValue = configRoot["espeak"];
-    if (espeakValue.contains("voice")) {
-      phonemizeConfig.eSpeak->voice = espeakValue["voice"].get<string>();
-    }
-  }
-
-  // phoneme to [phoneme] map
-  if (configRoot.contains("phoneme_map")) {
-    if (!phonemizeConfig.phonemeMap) {
-      phonemizeConfig.phonemeMap.emplace();
-    }
-
-    auto phonemeMapValue = configRoot["phoneme_map"];
-    for (auto &fromPhonemeItem : phonemeMapValue.items()) {
-      string fromPhoneme = fromPhonemeItem.key();
-      if (!isSingleCodepoint(fromPhoneme)) {
-        throw runtime_error("Phonemes must be one codepoint (phoneme map)");
-      }
-
-      auto fromCodepoint = getCodepoint(fromPhoneme);
-      for (auto &toPhonemeValue : fromPhonemeItem.value()) {
-        string toPhoneme = toPhonemeValue.get<string>();
-        if (!isSingleCodepoint(toPhoneme)) {
-          throw runtime_error("Phonemes must be one codepoint (phoneme map)");
-        }
-
-        auto toCodepoint = getCodepoint(toPhoneme);
-        (*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
-      }
-    }
-  }
-
-  // phoneme to [id] map
-  if (configRoot.contains("phoneme_id_map")) {
-    auto phonemeIdMapValue = configRoot["phoneme_id_map"];
-    for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
-      string fromPhoneme = fromPhonemeItem.key();
-      if (!isSingleCodepoint(fromPhoneme)) {
-        throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
-      }
-
-      auto fromCodepoint = getCodepoint(fromPhoneme);
-      for (auto &toIdValue : fromPhonemeItem.value()) {
-        PhonemeId toId = toIdValue.get<PhonemeId>();
-        phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
-      }
-    }
-  }
-
-} /* parsePhonemizeConfig */
-
-void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
-
-  if (configRoot.contains("audio")) {
-    auto audioValue = configRoot["audio"];
-    if (audioValue.contains("sample_rate")) {
-      // Default sample rate is 22050 Hz
-      synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
-    }
-  }
-
-} /* parseSynthesisConfig */
-
-void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
-
-  modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
-
-} /* parseModelConfig */
-
-} // namespace piper
-
-#endif // CONFIG_H_
@@ -2,6 +2,7 @@
 #include <condition_variable>
 #include <filesystem>
 #include <fstream>
+#include <functional>
 #include <iostream>
 #include <mutex>
 #include <sstream>
@@ -10,38 +11,60 @@
 #include <thread>
 #include <vector>

-#ifdef HAVE_PCAUDIO
-// https://github.com/espeak-ng/pcaudiolib
-#include <pcaudiolib/audio.h>
-#endif
-
 #ifdef _MSC_VER
 #define WIN32_LEAN_AND_MEAN
 #define NOMINMAX
 #include <windows.h>
 #endif

+#ifdef __APPLE__
+#include <mach-o/dyld.h>
+#endif
+
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/spdlog.h>
+
 #include "piper.hpp"

 using namespace std;

-enum OutputType {
-  OUTPUT_FILE,
-  OUTPUT_DIRECTORY,
-  OUTPUT_STDOUT,
-  OUTPUT_PLAY,
-  OUTPUT_RAW
-};
+enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW };

 struct RunConfig {
+  // Path to .onnx voice file
  filesystem::path modelPath;
+
+  // Path to JSON voice config file
  filesystem::path modelConfigPath;
-  OutputType outputType = OUTPUT_PLAY;
-  optional<filesystem::path> outputPath;
+
+  // Type of output to produce.
+  // Default is to write a WAV file in the current directory.
+  OutputType outputType = OUTPUT_DIRECTORY;
+
+  // Path for output
+  optional<filesystem::path> outputPath = filesystem::path(".");
+
+  // Numerical id of the default speaker (multi-speaker voices)
  optional<piper::SpeakerId> speakerId;
+
+  // Amount of noise to add during audio generation
  optional<float> noiseScale;
+
+  // Speed of speaking (1 = normal, < 1 is faster, > 1 is slower)
  optional<float> lengthScale;
+
+  // Variation in phoneme lengths
  optional<float> noiseW;
+
+  // Seconds of silence to add after each sentence
+  optional<float> sentenceSilenceSeconds;
+
+  // Path to espeak-ng data directory (default is next to piper executable)
+  optional<filesystem::path> eSpeakDataPath;
+
+  // Path to libtashkeel ort model
+  // https://github.com/mush42/libtashkeel/
+  optional<filesystem::path> tashkeelModelPath;
 };

 void parseArgs(int argc, char *argv[], RunConfig &runConfig);
@@ -49,35 +72,89 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
                   condition_variable &cvAudio, bool &audioReady,
                   bool &audioFinished);

-#ifdef HAVE_PCAUDIO
-void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
-              mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
-              bool &audioFinished);
-#endif
+// ----------------------------------------------------------------------------

 int main(int argc, char *argv[]) {
+  spdlog::set_default_logger(spdlog::stderr_color_st("piper"));
+
  RunConfig runConfig;
  parseArgs(argc, argv, runConfig);

-  // NOTE: This won't work for Windows (need GetModuleFileName)
+  piper::PiperConfig piperConfig;
+  piper::Voice voice;
+
+  spdlog::debug("Loading voice from {} (config={})",
+                runConfig.modelPath.string(),
+                runConfig.modelConfigPath.string());
+
+  auto startTime = chrono::steady_clock::now();
+  loadVoice(piperConfig, runConfig.modelPath.string(),
+            runConfig.modelConfigPath.string(), voice, runConfig.speakerId);
+  auto endTime = chrono::steady_clock::now();
+  spdlog::info("Loaded voice in {} second(s)",
+               chrono::duration<double>(endTime - startTime).count());
+
+  // Get the path to the piper executable so we can locate espeak-ng-data, etc.
+  // next to it.
 #ifdef _MSC_VER
  auto exePath = []() {
-    wchar_t moduleFileName[MAX_PATH] = { 0 };
+    wchar_t moduleFileName[MAX_PATH] = {0};
    GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
    return filesystem::path(moduleFileName);
  }();
+#elif defined(__APPLE__) // portable spelling; #elifdef requires C++23
+  auto exePath = []() {
+    char moduleFileName[PATH_MAX] = {0};
+    uint32_t moduleFileNameSize = std::size(moduleFileName);
+    _NSGetExecutablePath(moduleFileName, &moduleFileNameSize);
+    return filesystem::path(moduleFileName);
+  }();
 #else
  auto exePath = filesystem::canonical("/proc/self/exe");
 #endif
-  piper::initialize(exePath.parent_path());

-  piper::Voice voice;
-  auto startTime = chrono::steady_clock::now();
-  loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(),
-            voice, runConfig.speakerId);
-  auto endTime = chrono::steady_clock::now();
-  auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
-  cerr << "Load time: " << loadSeconds << " sec" << endl;
+  if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
+    spdlog::debug("Voice uses eSpeak phonemes ({})",
+                  voice.phonemizeConfig.eSpeak.voice);
+
+    if (runConfig.eSpeakDataPath) {
+      // User provided path
+      piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
+    } else {
+      // Assume next to piper executable
+      piperConfig.eSpeakDataPath =
+          std::filesystem::absolute(
+              exePath.parent_path().append("espeak-ng-data"))
+              .string();
+
+      spdlog::debug("espeak-ng-data directory is expected at {}",
+                    piperConfig.eSpeakDataPath);
+    }
+  } else {
+    // Not using eSpeak
+    piperConfig.useESpeak = false;
+  }
+
+  // Enable libtashkeel for Arabic
+  if (voice.phonemizeConfig.eSpeak.voice == "ar") {
+    piperConfig.useTashkeel = true;
+    if (runConfig.tashkeelModelPath) {
+      // User provided path
+      piperConfig.tashkeelModelPath =
+          runConfig.tashkeelModelPath.value().string();
+    } else {
+      // Assume next to piper executable
+      piperConfig.tashkeelModelPath =
+          std::filesystem::absolute(
+              exePath.parent_path().append("libtashkeel_model.ort"))
+              .string();
+
+      spdlog::debug("libtashkeel model is expected at {}",
+                    piperConfig.tashkeelModelPath.value());
+    }
+  }
+
+  piper::initialize(piperConfig);

  // Scales
  if (runConfig.noiseScale) {
@@ -92,36 +169,14 @@ int main(int argc, char *argv[]) {
    voice.synthesisConfig.noiseW = runConfig.noiseW.value();
  }

-#ifdef HAVE_PCAUDIO
-  audio_object *my_audio = nullptr;
-
-  if (runConfig.outputType == OUTPUT_PLAY) {
-    // Output audio to the default audio device
-    my_audio = create_audio_device_object(NULL, "piper", "Text-to-Speech");
-
-    // TODO: Support 32-bit sample widths
-    auto audioFormat = AUDIO_OBJECT_FORMAT_S16LE;
-    int error = audio_object_open(my_audio, audioFormat,
-                                  voice.synthesisConfig.sampleRate,
-                                  voice.synthesisConfig.channels);
-    if (error != 0) {
-      throw runtime_error(audio_object_strerror(my_audio, error));
-    }
+  if (runConfig.sentenceSilenceSeconds) {
+    voice.synthesisConfig.sentenceSilenceSeconds =
+        runConfig.sentenceSilenceSeconds.value();
  }
-#else
-  if (runConfig.outputType == OUTPUT_PLAY) {
-    // Cannot play audio directly
-    cerr << "WARNING: Piper was not compiled with pcaudiolib. Output audio "
-            "will be written to the current directory."
-         << endl;
-    runConfig.outputType = OUTPUT_DIRECTORY;
-    runConfig.outputPath = filesystem::path(".");
-  }
-#endif

  if (runConfig.outputType == OUTPUT_DIRECTORY) {
    runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
-    cerr << "Output directory: " << runConfig.outputPath.value() << endl;
+    spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
  }

  string line;
@@ -142,15 +197,23 @@ int main(int argc, char *argv[]) {

      // Output audio to automatically-named WAV file in a directory
      ofstream audioFile(outputPath.string(), ios::binary);
-      piper::textToWavFile(voice, line, audioFile, result);
+      piper::textToWavFile(piperConfig, voice, line, audioFile, result);
      cout << outputPath.string() << endl;
    } else if (runConfig.outputType == OUTPUT_FILE) {
+      // Read all of standard input before synthesizing.
+      // Otherwise, we would overwrite the output file for each line.
+      stringstream text;
+      text << line;
+      while (getline(cin, line)) {
+        text << " " << line;
+      }
+
      // Output audio to WAV file
      ofstream audioFile(runConfig.outputPath.value().string(), ios::binary);
-      piper::textToWavFile(voice, line, audioFile, result);
+      piper::textToWavFile(piperConfig, voice, text.str(), audioFile, result);
    } else if (runConfig.outputType == OUTPUT_STDOUT) {
      // Output WAV to stdout
-      piper::textToWavFile(voice, line, cout, result);
+      piper::textToWavFile(piperConfig, voice, line, cout, result);
    } else if (runConfig.outputType == OUTPUT_RAW) {
      // Raw output to stdout
      mutex mutAudio;
@@ -174,7 +237,8 @@ int main(int argc, char *argv[]) {
          cvAudio.notify_one();
        }
      };
-      piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
+      piper::textToAudio(piperConfig, voice, line, audioBuffer, result,
+                         audioCallback);

      // Signal thread that there is no more audio
      {
@@ -185,65 +249,22 @@ int main(int argc, char *argv[]) {
      }

      // Wait for audio output to finish
-      cerr << "Waiting for audio..." << endl;
+      spdlog::info("Waiting for audio to finish playing...");
      rawOutputThread.join();
-    } else if (runConfig.outputType == OUTPUT_PLAY) {
-#ifdef HAVE_PCAUDIO
-      mutex mutAudio;
-      condition_variable cvAudio;
-      bool audioReady = false;
-      bool audioFinished = false;
-      vector<int16_t> audioBuffer;
-      vector<int16_t> sharedAudioBuffer;
-
-      thread playThread(playProc, my_audio, ref(sharedAudioBuffer),
-                        ref(mutAudio), ref(cvAudio), ref(audioReady),
-                        ref(audioFinished));
-      auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio,
-                            &cvAudio, &audioReady]() {
-        // Signal thread that audio is ready
-        {
-          unique_lock lockAudio(mutAudio);
-          copy(audioBuffer.begin(), audioBuffer.end(),
-               back_inserter(sharedAudioBuffer));
-          audioReady = true;
-          cvAudio.notify_one();
-        }
-      };
-      piper::textToAudio(voice, line, audioBuffer, result, audioCallback);
-
-      // Signal thread that there is no more audio
-      {
-        unique_lock lockAudio(mutAudio);
-        audioReady = true;
-        audioFinished = true;
-        cvAudio.notify_one();
-      }
-
-      // Wait for audio output to finish
-      cerr << "Waiting for audio..." << endl;
-      playThread.join();
-#else
-      throw runtime_error("Cannot play audio! Not compiled with pcaudiolib.");
-#endif
    }

-    cerr << "Real-time factor: " << result.realTimeFactor
-         << " (infer=" << result.inferSeconds
-         << " sec, audio=" << result.audioSeconds << " sec)" << endl;
+    spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)",
+                 result.realTimeFactor, result.inferSeconds,
+                 result.audioSeconds);
  }

-  piper::terminate();
-
-#ifdef HAVE_PCAUDIO
-  audio_object_close(my_audio);
-  audio_object_destroy(my_audio);
-  my_audio = nullptr;
-#endif
+  piper::terminate(piperConfig);

  return EXIT_SUCCESS;
 }

+// ----------------------------------------------------------------------------
+
 void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
                   condition_variable &cvAudio, bool &audioReady,
                   bool &audioFinished) {
@@ -275,42 +296,7 @@ void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,

 } // rawOutputProc

-#ifdef HAVE_PCAUDIO
-void playProc(audio_object *my_audio, vector<int16_t> &sharedAudioBuffer,
-              mutex &mutAudio, condition_variable &cvAudio, bool &audioReady,
-              bool &audioFinished) {
-  vector<int16_t> internalAudioBuffer;
-  while (true) {
-    {
-      unique_lock lockAudio{mutAudio};
-      cvAudio.wait(lockAudio, [&audioReady] { return audioReady; });
-
-      if (sharedAudioBuffer.empty() && audioFinished) {
-        break;
-      }
-
-      copy(sharedAudioBuffer.begin(), sharedAudioBuffer.end(),
-           back_inserter(internalAudioBuffer));
-
-      sharedAudioBuffer.clear();
-
-      if (!audioFinished) {
-        audioReady = false;
-      }
-    }
-
-    int error =
-        audio_object_write(my_audio, (const char *)internalAudioBuffer.data(),
-                           sizeof(int16_t) * internalAudioBuffer.size());
-    if (error != 0) {
-      throw runtime_error(audio_object_strerror(my_audio, error));
-    }
-    audio_object_flush(my_audio);
-    internalAudioBuffer.clear();
-  }
-
-} // playProc
-#endif
+// ----------------------------------------------------------------------------

 void printUsage(char *argv[]) {
  cerr << endl;
@@ -332,11 +318,18 @@ void printUsage(char *argv[]) {
          "becomes available"
       << endl;
  cerr << "   -s  NUM   --speaker     NUM   id of speaker (default: 0)" << endl;
-  cerr << "   --noise-scale           NUM   generator noise (default: 0.667)"
+  cerr << "   --noise_scale           NUM   generator noise (default: 0.667)"
       << endl;
-  cerr << "   --length-scale          NUM   phoneme length (default: 1.0)"
+  cerr << "   --length_scale          NUM   phoneme length (default: 1.0)"
       << endl;
-  cerr << "   --noise-w               NUM   phonene width noise (default: 0.8)"
+  cerr << "   --noise_w               NUM   phoneme width noise (default: 0.8)"
+       << endl;
+  cerr << "   --sentence_silence      NUM   seconds of silence after each "
+          "sentence (default: 0.2)"
+       << endl;
+  cerr << "   --espeak_data           DIR   path to espeak-ng data directory"
+       << endl;
+  cerr << "   --debug                       print DEBUG messages to the console"
       << endl;
  cerr << endl;
 }
@@ -361,7 +354,8 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
    } else if (arg == "-c" || arg == "--config") {
      ensureArg(argc, argv, i);
      modelConfigPath = filesystem::path(argv[++i]);
-    } else if (arg == "-f" || arg == "--output_file") {
+    } else if (arg == "-f" || arg == "--output_file" ||
+               arg == "--output-file") {
      ensureArg(argc, argv, i);
      std::string filePath = argv[++i];
      if (filePath == "-") {
@@ -371,24 +365,36 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
        runConfig.outputType = OUTPUT_FILE;
        runConfig.outputPath = filesystem::path(filePath);
      }
-    } else if (arg == "-d" || arg == "--output_dir") {
+    } else if (arg == "-d" || arg == "--output_dir" || arg == "--output-dir") {
      ensureArg(argc, argv, i);
      runConfig.outputType = OUTPUT_DIRECTORY;
      runConfig.outputPath = filesystem::path(argv[++i]);
-    } else if (arg == "--output_raw") {
+    } else if (arg == "--output_raw" || arg == "--output-raw") {
      runConfig.outputType = OUTPUT_RAW;
    } else if (arg == "-s" || arg == "--speaker") {
      ensureArg(argc, argv, i);
      runConfig.speakerId = (piper::SpeakerId)stol(argv[++i]);
-    } else if (arg == "--noise-scale") {
+    } else if (arg == "--noise_scale" || arg == "--noise-scale") {
      ensureArg(argc, argv, i);
      runConfig.noiseScale = stof(argv[++i]);
-    } else if (arg == "--length-scale") {
+    } else if (arg == "--length_scale" || arg == "--length-scale") {
      ensureArg(argc, argv, i);
      runConfig.lengthScale = stof(argv[++i]);
-    } else if (arg == "--noise-w") {
+    } else if (arg == "--noise_w" || arg == "--noise-w") {
      ensureArg(argc, argv, i);
      runConfig.noiseW = stof(argv[++i]);
+    } else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
+      ensureArg(argc, argv, i);
+      runConfig.sentenceSilenceSeconds = stof(argv[++i]);
+    } else if (arg == "--espeak_data" || arg == "--espeak-data") {
+      ensureArg(argc, argv, i);
+      runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
+    } else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") {
+      ensureArg(argc, argv, i);
+      runConfig.tashkeelModelPath = filesystem::path(argv[++i]);
+    } else if (arg == "--debug") {
+      // Set DEBUG logging
+      spdlog::set_level(spdlog::level::debug);
    } else if (arg == "-h" || arg == "--help") {
      printUsage(argv);
      exit(0);
@@ -1,53 +0,0 @@
-#ifndef MODEL_H_
-#define MODEL_H_
-
-#include <string>
-
-#include <onnxruntime_cxx_api.h>
-
-using namespace std;
-
-namespace piper {
-const string instanceName{"piper"};
-
-struct ModelSession {
-  Ort::Session onnx;
-  Ort::AllocatorWithDefaultOptions allocator;
-  Ort::SessionOptions options;
-  Ort::Env env;
-
-  ModelSession() : onnx(nullptr){};
-};
-
-void loadModel(string modelPath, ModelSession &session) {
-
-  session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
-                         instanceName.c_str());
-  session.env.DisableTelemetryEvents();
-
-  // Slows down performance by ~2x
-  // session.options.SetIntraOpNumThreads(1);
-
-  // Roughly doubles load time for no visible inference benefit
-  // session.options.SetGraphOptimizationLevel(
-  //     GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
-
-  session.options.SetGraphOptimizationLevel(
-      GraphOptimizationLevel::ORT_DISABLE_ALL);
-
-  // Slows down performance very slightly
-  // session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
-
-  session.options.DisableCpuMemArena();
-  session.options.DisableMemPattern();
-  session.options.DisableProfiling();
-
-  auto startTime = chrono::steady_clock::now();
-  session.onnx = Ort::Session(session.env, filesystem::path(modelPath).c_str(), session.options);
-  auto endTime = chrono::steady_clock::now();
-  auto loadDuration = chrono::duration<double>(endTime - startTime);
-}
-
-} // namespace piper
-
-#endif // MODEL_H_
@@ -1,138 +0,0 @@
-#ifndef PHONEMIZE_H_
-#define PHONEMIZE_H_
-
-#include <filesystem>
-#include <iostream>
-#include <map>
-#include <optional>
-#include <set>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-#include <espeak-ng/speak_lib.h>
-
-#include "config.hpp"
-#include "utf8.h"
-
-#define CLAUSE_INTONATION_FULL_STOP   0x00000000
-#define CLAUSE_INTONATION_COMMA       0x00001000
-#define CLAUSE_INTONATION_QUESTION    0x00002000
-#define CLAUSE_INTONATION_EXCLAMATION 0x00003000
-
-#define CLAUSE_TYPE_SENTENCE          0x00080000
-
-using namespace std;
-
-namespace piper {
-
-// Text to phonemes using eSpeak-ng
-void phonemize(string text, PhonemizeConfig &phonemizeConfig,
-               vector<vector<Phoneme>> &phonemes) {
-  if (!phonemizeConfig.eSpeak) {
-    throw runtime_error("Missing eSpeak config");
-  }
-
-  auto voice = phonemizeConfig.eSpeak->voice;
-  int result = espeak_SetVoiceByName(voice.c_str());
-  if (result != 0) {
-    throw runtime_error("Failed to set eSpeak-ng voice");
-  }
-
-  // Modified by eSpeak
-  string textCopy(text);
-
-  utf8::iterator textIter(textCopy.begin(), textCopy.begin(), textCopy.end());
-  utf8::iterator textIterEnd(textCopy.end(), textCopy.begin(), textCopy.end());
-  vector<char32_t> textClauseBreakers;
-
-  // Identify clause breakers in the sentence, since eSpeak removes them during
-  // phonemization.
-  //
-  // This will unfortunately do the wrong thing with abbreviations, etc.
-  while (textIter != textIterEnd) {
-    auto codepoint = *textIter;
-    if (phonemizeConfig.eSpeak->clauseBreakers.contains(codepoint)) {
-      textClauseBreakers.push_back(codepoint);
-    }
-
-    textIter++;
-  }
-
-  vector<Phoneme> *sentencePhonemes = nullptr;
-  const char *inputTextPointer = textCopy.c_str();
-  int terminator = 0;
-
-  while (inputTextPointer != NULL) {
-    // Modified espeak-ng API to get access to clause terminator
-    string clausePhonemes(
-        espeak_TextToPhonemes2((const void **)&inputTextPointer,
-                              /*textmode*/ espeakCHARS_AUTO,
-                              /*phonememode = IPA*/ 0x02,
-                               &terminator));
-
-    utf8::iterator phonemeIter(clausePhonemes.begin(), clausePhonemes.begin(),
-                               clausePhonemes.end());
-    utf8::iterator phonemeEnd(clausePhonemes.end(), clausePhonemes.begin(),
-                              clausePhonemes.end());
-
-    if (!sentencePhonemes) {
-      // Start new sentence
-      phonemes.emplace_back();
-      sentencePhonemes = &phonemes[phonemes.size() - 1];
-    }
-
-    sentencePhonemes->insert(sentencePhonemes->end(), phonemeIter, phonemeEnd);
-
-    // Add appropriate puntuation depending on terminator type
-    int intonation = terminator & 0x0000F000;
-    if (intonation == CLAUSE_INTONATION_FULL_STOP) {
-      sentencePhonemes->push_back(phonemizeConfig.eSpeak->fullStop);
-    } else if (intonation == CLAUSE_INTONATION_COMMA) {
-      sentencePhonemes->push_back(phonemizeConfig.eSpeak->comma);
-    } else if (intonation == CLAUSE_INTONATION_QUESTION) {
-      sentencePhonemes->push_back(phonemizeConfig.eSpeak->question);
-    } else if (intonation == CLAUSE_INTONATION_EXCLAMATION) {
-      sentencePhonemes->push_back(phonemizeConfig.eSpeak->exclamation);
-    }
-
-    if ((terminator & CLAUSE_TYPE_SENTENCE) == CLAUSE_TYPE_SENTENCE) {
-        // End of sentence
-        sentencePhonemes = nullptr;
-    }
-
-  }  // while inputTextPointer != NULL
-
-} /* phonemize */
-
-// Phonemes to ids using JSON map
-void phonemes2ids(vector<Phoneme> &phonemes, PhonemizeConfig &phonemizeConfig,
-                  vector<PhonemeId> &phonemeIds) {
-  if (phonemes.empty()) {
-    throw runtime_error("No phonemes");
-  }
-
-  phonemeIds.push_back(phonemizeConfig.idBos);
-  if (phonemizeConfig.interspersePad) {
-    phonemeIds.push_back(phonemizeConfig.idPad);
-  }
-
-  for (auto phoneme = phonemes.begin(); phoneme != phonemes.end(); phoneme++) {
-    if (phonemizeConfig.phonemeIdMap.contains(*phoneme)) {
-      for (auto id : phonemizeConfig.phonemeIdMap[*phoneme]) {
-        phonemeIds.push_back(id);
-
-        if (phonemizeConfig.interspersePad) {
-          phonemeIds.push_back(phonemizeConfig.idPad);
-        }
-      }
-    }
-  }
-
-  phonemeIds.push_back(phonemizeConfig.idEos);
-
-} /* phonemes2ids */
-
-} // namespace piper
-
-#endif // PHONEMIZE_H_
@@ -0,0 +1,514 @@
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <fstream>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+
+#include <espeak-ng/speak_lib.h>
+#include <onnxruntime_cxx_api.h>
+#include <spdlog/spdlog.h>
+
+#include "piper.hpp"
+#include "utf8.h"
+#include "wavfile.hpp"
+
+namespace piper {
+
+// Maximum value for 16-bit signed WAV sample
+const float MAX_WAV_VALUE = 32767.0f;
+
+const std::string instanceName{"piper"};
+
+// Returns true when `s` decodes to exactly one UTF-8 codepoint.
+bool isSingleCodepoint(std::string s) {
+  const auto codepointCount = utf8::distance(s.begin(), s.end());
+  return codepointCount == 1;
+}
+
+// Decode and return the first UTF-8 codepoint of `s`.
+// Callers in this file check isSingleCodepoint() before calling.
+Phoneme getCodepoint(std::string s) {
+  utf8::iterator<std::string::iterator> firstChar(s.begin(), s.begin(),
+                                                  s.end());
+  return *firstChar;
+}
+
+// Load JSON config information for phonemization
+//
+// Reads the optional "espeak", "phoneme_type", "phoneme_id_map" and
+// "phoneme_map" sections of configRoot into phonemizeConfig. Sections that
+// are absent leave the corresponding fields untouched.
+//
+// Throws std::runtime_error when a key of either map, or a target phoneme of
+// "phoneme_map", is not exactly one UTF-8 codepoint.
+void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
+  // {
+  //     "espeak": {
+  //         "voice": "<language code>"
+  //     },
+  //     "phoneme_type": "<espeak or text>",
+  //     "phoneme_map": {
+  //         "<from phoneme>": ["<to phoneme 1>", "<to phoneme 2>", ...]
+  //     },
+  //     "phoneme_id_map": {
+  //         "<phoneme>": [<id1>, <id2>, ...]
+  //     }
+  // }
+
+  if (configRoot.contains("espeak")) {
+    auto espeakValue = configRoot["espeak"];
+    if (espeakValue.contains("voice")) {
+      phonemizeConfig.eSpeak.voice = espeakValue["voice"].get<std::string>();
+    }
+  }
+
+  if (configRoot.contains("phoneme_type")) {
+    // Only "text" changes the phoneme type; any other value keeps the
+    // current setting.
+    auto phonemeTypeStr = configRoot["phoneme_type"].get<std::string>();
+    if (phonemeTypeStr == "text") {
+      phonemizeConfig.phonemeType = TextPhonemes;
+    }
+  }
+
+  // phoneme to [id] map
+  // Maps phonemes to one or more phoneme ids.
+  // NOTE: described as required, but this function does not enforce its
+  // presence.
+  if (configRoot.contains("phoneme_id_map")) {
+    auto phonemeIdMapValue = configRoot["phoneme_id_map"];
+    for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
+      std::string fromPhoneme = fromPhonemeItem.key();
+      if (!isSingleCodepoint(fromPhoneme)) {
+        throw std::runtime_error(
+            "Phonemes must be one codepoint (phoneme id map)");
+      }
+
+      // Ids are appended, so entries already present for this codepoint are
+      // kept.
+      auto fromCodepoint = getCodepoint(fromPhoneme);
+      for (auto &toIdValue : fromPhonemeItem.value()) {
+        PhonemeId toId = toIdValue.get<PhonemeId>();
+        phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
+      }
+    }
+  }
+
+  // phoneme to [phoneme] map
+  // Maps phonemes to one or more other phonemes (not normally used).
+  if (configRoot.contains("phoneme_map")) {
+    // The map is optional; create it on first use.
+    if (!phonemizeConfig.phonemeMap) {
+      phonemizeConfig.phonemeMap.emplace();
+    }
+
+    auto phonemeMapValue = configRoot["phoneme_map"];
+    for (auto &fromPhonemeItem : phonemeMapValue.items()) {
+      std::string fromPhoneme = fromPhonemeItem.key();
+      if (!isSingleCodepoint(fromPhoneme)) {
+        throw std::runtime_error(
+            "Phonemes must be one codepoint (phoneme map)");
+      }
+
+      auto fromCodepoint = getCodepoint(fromPhoneme);
+      for (auto &toPhonemeValue : fromPhonemeItem.value()) {
+        std::string toPhoneme = toPhonemeValue.get<std::string>();
+        if (!isSingleCodepoint(toPhoneme)) {
+          throw std::runtime_error(
+              "Phonemes must be one codepoint (phoneme map)");
+        }
+
+        auto toCodepoint = getCodepoint(toPhoneme);
+        (*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
+      }
+    }
+  }
+
+} /* parsePhonemizeConfig */
+
+// Load JSON config for audio synthesis
+//
+// Recognised keys (all optional; a missing key leaves the matching
+// SynthesisConfig field untouched):
+//
+// {
+//     "audio": {
+//         "sample_rate": 22050
+//     },
+//     "inference": {
+//         "noise_scale": 0.667,
+//         "length_scale": 1,
+//         "noise_w": 0.8
+//     }
+// }
+void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
+  if (configRoot.contains("audio")) {
+    auto &audioSection = configRoot["audio"];
+    if (audioSection.contains("sample_rate")) {
+      // Default sample rate is 22050 Hz
+      synthesisConfig.sampleRate = audioSection.value("sample_rate", 22050);
+    }
+  }
+
+  if (!configRoot.contains("inference")) {
+    // Nothing overrides the default inference settings
+    return;
+  }
+
+  auto &inferenceSection = configRoot["inference"];
+
+  if (inferenceSection.contains("noise_scale")) {
+    synthesisConfig.noiseScale = inferenceSection.value("noise_scale", 0.667f);
+  }
+
+  if (inferenceSection.contains("length_scale")) {
+    synthesisConfig.lengthScale = inferenceSection.value("length_scale", 1.0f);
+  }
+
+  if (inferenceSection.contains("noise_w")) {
+    synthesisConfig.noiseW = inferenceSection.value("noise_w", 0.8f);
+  }
+
+} /* parseSynthesisConfig */
+
+// Load JSON config describing the model itself
+//
+// {
+//     "num_speakers": <count>
+// }
+//
+// "num_speakers" is optional: when it is absent the voice is treated as a
+// single-speaker model, in line with the other parse* functions, which all
+// tolerate missing keys. Checking with contains() first also avoids
+// operator[] inserting a null "num_speakers" entry into configRoot.
+void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
+
+  if (configRoot.contains("num_speakers")) {
+    modelConfig.numSpeakers =
+        static_cast<int>(configRoot["num_speakers"].get<SpeakerId>());
+  } else {
+    modelConfig.numSpeakers = 1;
+  }
+
+} /* parseModelConfig */
+
+// Prepare the optional back ends selected in `config`.
+//
+// - config.useESpeak: initializes espeak-ng with config.eSpeakDataPath.
+// - config.useTashkeel: loads the libtashkeel model from
+//   config.tashkeelModelPath into config.tashkeelState.
+//
+// Throws std::runtime_error if espeak-ng reports a negative result or if
+// tashkeel is enabled without a model path.
+void initialize(PiperConfig &config) {
+  if (config.useESpeak) {
+    // Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
+    // See: https://github.com/rhasspy/espeak-ng
+    spdlog::debug("Initializing eSpeak");
+    int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
+                                   /*buflength*/ 0,
+                                   /*path*/ config.eSpeakDataPath.c_str(),
+                                   /*options*/ 0);
+    if (result < 0) {
+      throw std::runtime_error("Failed to initialize eSpeak-ng");
+    }
+
+    spdlog::debug("Initialized eSpeak");
+  }
+
+  // Load onnx model for libtashkeel
+  // https://github.com/mush42/libtashkeel/
+  if (config.useTashkeel) {
+    spdlog::debug("Using libtashkeel for diacritization");
+    if (!config.tashkeelModelPath) {
+      throw std::runtime_error("No path to libtashkeel model");
+    }
+
+    spdlog::debug("Loading libtashkeel model from {}",
+                  config.tashkeelModelPath.value());
+    // The state is owned by config and is read later by textToAudio().
+    config.tashkeelState = std::make_unique<tashkeel::State>();
+    tashkeel::tashkeel_load(config.tashkeelModelPath.value(),
+                            *config.tashkeelState);
+    spdlog::debug("Initialized libtashkeel");
+  }
+
+  spdlog::info("Initialized piper");
+}
+
+// Shut down what initialize() started.
+//
+// Only espeak-ng is torn down explicitly here; config.tashkeelState is not
+// touched by this function.
+void terminate(PiperConfig &config) {
+  if (config.useESpeak) {
+    // Clean up espeak-ng
+    spdlog::debug("Terminating eSpeak");
+    espeak_Terminate();
+    spdlog::debug("Terminated eSpeak");
+  }
+
+  spdlog::info("Terminated piper");
+}
+
+// Create the onnxruntime environment and session for the model at modelPath.
+//
+// Fills session.env, configures session.options and assigns session.onnx.
+// The time taken to construct the session is logged at debug level.
+void loadModel(std::string modelPath, ModelSession &session) {
+  spdlog::debug("Loading onnx model from {}", modelPath);
+  session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
+                         instanceName.c_str());
+  session.env.DisableTelemetryEvents();
+
+  // Slows down performance by ~2x
+  // session.options.SetIntraOpNumThreads(1);
+
+  // Roughly doubles load time for no visible inference benefit
+  // session.options.SetGraphOptimizationLevel(
+  //     GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
+
+  session.options.SetGraphOptimizationLevel(
+      GraphOptimizationLevel::ORT_DISABLE_ALL);
+
+  // Slows down performance very slightly
+  // session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
+
+  session.options.DisableCpuMemArena();
+  session.options.DisableMemPattern();
+  session.options.DisableProfiling();
+
+  // Time only the session construction (model load).
+  auto startTime = std::chrono::steady_clock::now();
+  session.onnx = Ort::Session(session.env, modelPath.c_str(), session.options);
+  auto endTime = std::chrono::steady_clock::now();
+  spdlog::debug("Loaded onnx model in {} second(s)",
+                std::chrono::duration<double>(endTime - startTime).count());
+}
+
+// Load Onnx model and JSON config file
+//
+// Parses the voice's JSON config at modelConfigPath into voice.configRoot,
+// fills the phonemize / synthesis / model config structs from it, selects the
+// speaker for multi-speaker models and finally loads the onnx model.
+//
+// speakerId is only consulted when the config reports more than one speaker;
+// if it is empty in that case, speaker 0 is used.
+//
+// Throws std::runtime_error if the config file cannot be opened, so a wrong
+// path is reported as such instead of as a JSON parse error.
+void loadVoice(PiperConfig &config, std::string modelPath,
+               std::string modelConfigPath, Voice &voice,
+               std::optional<SpeakerId> &speakerId) {
+  spdlog::debug("Parsing voice config at {}", modelConfigPath);
+  std::ifstream modelConfigFile(modelConfigPath);
+  if (!modelConfigFile.is_open()) {
+    throw std::runtime_error("Failed to open voice config file: " +
+                             modelConfigPath);
+  }
+
+  voice.configRoot = json::parse(modelConfigFile);
+
+  parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
+  parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
+  parseModelConfig(voice.configRoot, voice.modelConfig);
+
+  if (voice.modelConfig.numSpeakers > 1) {
+    // Multi-speaker model
+    if (speakerId) {
+      voice.synthesisConfig.speakerId = speakerId;
+    } else {
+      // Default speaker
+      voice.synthesisConfig.speakerId = 0;
+    }
+  }
+
+  spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers);
+
+  loadModel(modelPath, voice.session);
+
+} /* loadVoice */
+
+// Phoneme ids to WAV audio
+//
+// Runs the onnx model in `session` on phonemeIds and APPENDS the generated
+// 16-bit samples to audioBuffer (existing contents are kept). Timing
+// information for this call is written to `result`.
+//
+// The float output is peak-normalised: it is scaled so that the loudest
+// sample maps to MAX_WAV_VALUE, with the peak floored at 0.01 so that
+// near-silent output is not amplified without bound.
+//
+// Throws std::runtime_error if the model does not return exactly one tensor
+// or the tensor has no dimensions.
+void synthesize(std::vector<PhonemeId> &phonemeIds,
+                SynthesisConfig &synthesisConfig, ModelSession &session,
+                std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
+  spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size());
+
+  auto memoryInfo = Ort::MemoryInfo::CreateCpu(
+      OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
+
+  // Allocate
+  // NOTE: the tensors below wrap these vectors' storage without copying, so
+  // the vectors must stay alive until inference has finished.
+  std::vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
+  std::vector<float> scales{synthesisConfig.noiseScale,
+                            synthesisConfig.lengthScale,
+                            synthesisConfig.noiseW};
+
+  std::vector<Ort::Value> inputTensors;
+  std::vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
+  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
+      memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
+      phonemeIdsShape.size()));
+
+  std::vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
+  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
+      memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
+      phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
+
+  std::vector<int64_t> scalesShape{(int64_t)scales.size()};
+  inputTensors.push_back(
+      Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
+                                      scalesShape.data(), scalesShape.size()));
+
+  // Add speaker id.
+  // NOTE: These must be kept outside the "if" below to avoid being deallocated.
+  std::vector<int64_t> speakerId{
+      (int64_t)synthesisConfig.speakerId.value_or(0)};
+  std::vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
+
+  if (synthesisConfig.speakerId) {
+    inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
+        memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
+        speakerIdShape.size()));
+  }
+
+  // From export_onnx.py
+  // "sid" is only consumed when a speaker id tensor was added above, because
+  // Run() is given inputTensors.size() as the input count.
+  std::array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
+                                            "sid"};
+  std::array<const char *, 1> outputNames = {"output"};
+
+  // Infer
+  auto startTime = std::chrono::steady_clock::now();
+  auto outputTensors = session.onnx.Run(
+      Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
+      inputTensors.size(), outputNames.data(), outputNames.size());
+  auto endTime = std::chrono::steady_clock::now();
+
+  if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
+    throw std::runtime_error("Invalid output tensors");
+  }
+  auto inferDuration = std::chrono::duration<double>(endTime - startTime);
+  result.inferSeconds = inferDuration.count();
+
+  const float *audio = outputTensors.front().GetTensorData<float>();
+  auto audioShape =
+      outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
+  if (audioShape.empty()) {
+    // Guard the "size() - 1" index below against a dimensionless output
+    throw std::runtime_error("Invalid output tensors");
+  }
+
+  // The number of samples is the size of the last dimension
+  int64_t audioCount = audioShape[audioShape.size() - 1];
+
+  result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
+  result.realTimeFactor = 0.0;
+  if (result.audioSeconds > 0) {
+    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
+  }
+  spdlog::debug("Synthesized {} second(s) of audio in {} second(s)",
+                result.audioSeconds, result.inferSeconds);
+
+  // Get max audio value for scaling
+  float maxAudioValue = 0.01f;
+  for (int64_t i = 0; i < audioCount; i++) {
+    // std::abs selects the float overload. An unqualified abs() may resolve
+    // to the C int abs(int), which would truncate every sample in (-1, 1) to
+    // zero and defeat the normalisation.
+    float audioValue = std::abs(audio[i]);
+    if (audioValue > maxAudioValue) {
+      maxAudioValue = audioValue;
+    }
+  }
+
+  // We know the size up front
+  audioBuffer.reserve(audioCount);
+
+  // Scale audio to fill range and convert to int16
+  float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
+  for (int64_t i = 0; i < audioCount; i++) {
+    int16_t intAudioValue = static_cast<int16_t>(
+        std::clamp(audio[i] * audioScale,
+                   static_cast<float>(std::numeric_limits<int16_t>::min()),
+                   static_cast<float>(std::numeric_limits<int16_t>::max())));
+
+    audioBuffer.push_back(intAudioValue);
+  }
+
+  // Clean up
+  for (std::size_t i = 0; i < outputTensors.size(); i++) {
+    Ort::detail::OrtRelease(outputTensors[i].release());
+  }
+
+  for (std::size_t i = 0; i < inputTensors.size(); i++) {
+    Ort::detail::OrtRelease(inputTensors[i].release());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Phonemize text and synthesize audio
+//
+// Pipeline: optional libtashkeel diacritization -> phonemization (espeak-ng
+// or raw codepoints, per voice.phonemizeConfig.phonemeType) -> phoneme ids ->
+// synthesize(), one sentence at a time.
+//
+// Samples are appended to audioBuffer. If audioCallback is set, it is invoked
+// after every sentence and audioBuffer is cleared afterwards; otherwise the
+// buffer holds the audio of all sentences on return.
+//
+// result.audioSeconds and result.inferSeconds are accumulated with "+=", so
+// they add to whatever values the caller passed in.
+//
+// Throws std::runtime_error if tashkeel is enabled but not loaded, or if a
+// text-phoneme voice names a language missing from DEFAULT_ALPHABET.
+void textToAudio(PiperConfig &config, Voice &voice, std::string text,
+                 std::vector<int16_t> &audioBuffer, SynthesisResult &result,
+                 const std::function<void()> &audioCallback) {
+
+  // Number of zero samples appended after each sentence
+  std::size_t sentenceSilenceSamples = 0;
+  if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
+    sentenceSilenceSamples = (std::size_t)(
+        voice.synthesisConfig.sentenceSilenceSeconds *
+        voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
+  }
+
+  if (config.useTashkeel) {
+    if (!config.tashkeelState) {
+      throw std::runtime_error("Tashkeel model is not loaded");
+    }
+
+    spdlog::debug("Diacritizing text with libtashkeel: {}", text);
+    text = tashkeel::tashkeel_run(text, *config.tashkeelState);
+  }
+
+  // Phonemes for each sentence
+  spdlog::debug("Phonemizing text: {}", text);
+  std::vector<std::vector<Phoneme>> phonemes;
+
+  if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
+    // Use espeak-ng for phonemization
+    eSpeakPhonemeConfig eSpeakConfig;
+    eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice;
+    phonemize_eSpeak(text, eSpeakConfig, phonemes);
+  } else {
+    // Use UTF-8 codepoints as "phonemes"
+    CodepointsPhonemeConfig codepointsConfig;
+    phonemize_codepoints(text, codepointsConfig, phonemes);
+  }
+
+  // Synthesize each sentence independently.
+  std::vector<PhonemeId> phonemeIds;
+  // Phonemes that could not be mapped to ids, with occurrence counts
+  // (collected across all sentences and reported once at the end).
+  std::map<Phoneme, std::size_t> missingPhonemes;
+  for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
+       ++phonemesIter) {
+    std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
+
+    if (spdlog::should_log(spdlog::level::debug)) {
+      // DEBUG log for phonemes
+      std::string phonemesStr;
+      for (auto phoneme : sentencePhonemes) {
+        utf8::append(phoneme, phonemesStr);
+      }
+
+      spdlog::debug("Converting {} phoneme(s) to ids: {}",
+                    sentencePhonemes.size(), phonemesStr);
+    }
+
+    SynthesisResult sentenceResult;
+
+    // NOTE(review): idConfig is default-constructed here and only receives a
+    // custom map for text phonemes; voice.phonemizeConfig.phonemeIdMap is not
+    // passed on in this function - confirm that is intended.
+    PhonemeIdConfig idConfig;
+    if (voice.phonemizeConfig.phonemeType == TextPhonemes) {
+      auto &language = voice.phonemizeConfig.eSpeak.voice;
+      spdlog::debug("Text phoneme language: {}", language);
+      if (DEFAULT_ALPHABET.count(language) < 1) {
+        throw std::runtime_error(
+            "Text phoneme language for voice is not supported");
+      }
+
+      // Use alphabet for language
+      idConfig.phonemeIdMap =
+          std::make_shared<PhonemeIdMap>(DEFAULT_ALPHABET[language]);
+    }
+
+    // phonemes -> ids
+    phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
+    if (spdlog::should_log(spdlog::level::debug)) {
+      // DEBUG log for phoneme ids
+      std::stringstream phonemeIdsStr;
+      for (auto phonemeId : phonemeIds) {
+        phonemeIdsStr << phonemeId << ", ";
+      }
+
+      spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
+                    sentencePhonemes.size(), phonemeIds.size(),
+                    phonemeIdsStr.str());
+    }
+
+    // ids -> audio
+    synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
+               sentenceResult);
+
+    // Add end of sentence silence
+    if (sentenceSilenceSamples > 0) {
+      for (std::size_t i = 0; i < sentenceSilenceSamples; i++) {
+        audioBuffer.push_back(0);
+      }
+    }
+
+    if (audioCallback) {
+      // Call back must copy audio since it is cleared afterwards.
+      audioCallback();
+      audioBuffer.clear();
+    }
+
+    result.audioSeconds += sentenceResult.audioSeconds;
+    result.inferSeconds += sentenceResult.inferSeconds;
+
+    // Ids are reused as a scratch buffer for the next sentence
+    phonemeIds.clear();
+  }
+
+  if (missingPhonemes.size() > 0) {
+    spdlog::warn("Missing {} phoneme(s) from phoneme/id map!",
+                 missingPhonemes.size());
+
+    for (auto phonemeCount : missingPhonemes) {
+      std::string phonemeStr;
+      utf8::append(phonemeCount.first, phonemeStr);
+      spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr,
+                   (uint32_t)phonemeCount.first, phonemeCount.second);
+    }
+  }
+
+  // Real-time factor over all sentences; left untouched when no audio was
+  // produced.
+  if (result.audioSeconds > 0) {
+    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
+  }
+
+} /* textToAudio */
+
+// Phonemize text and synthesize audio to WAV file
+//
+// Synthesizes the whole text into memory first (no per-sentence callback),
+// then writes a WAV header followed by the raw 16-bit samples to audioFile.
+void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
+                   std::ostream &audioFile, SynthesisResult &result) {
+
+  std::vector<int16_t> samples;
+  textToAudio(config, voice, text, samples, result, nullptr);
+
+  // Write WAV
+  const auto &synth = voice.synthesisConfig;
+  writeWavHeader(synth.sampleRate, synth.sampleWidth, synth.channels,
+                 (int32_t)samples.size(), audioFile);
+
+  const std::size_t byteCount = sizeof(int16_t) * samples.size();
+  audioFile.write((const char *)samples.data(), byteCount);
+
+} /* textToWavFile */
+
+} // namespace piper
@@ -1,24 +1,83 @@
 #ifndef PIPER_H_
 #define PIPER_H_

-#include <filesystem>
-#include <iostream>
+#include <fstream>
+#include <functional>
+#include <optional>
 #include <string>
 #include <vector>

-#include "json.hpp"
-#include <espeak-ng/speak_lib.h>
+#include <onnxruntime_cxx_api.h>
+#include <phoneme_ids.hpp>
+#include <phonemize.hpp>
+#include <tashkeel.hpp>

-#include "config.hpp"
-#include "model.hpp"
-#include "phonemize.hpp"
-#include "synthesize.hpp"
-#include "wavfile.hpp"
+#include "json.hpp"

 using json = nlohmann::json;

 namespace piper {

+typedef int64_t SpeakerId;
+
+// espeak-ng settings taken from the voice's JSON config ("espeak" section).
+struct eSpeakConfig {
+  // Voice / language code; overridden by "espeak.voice" when present
+  std::string voice = "en-us";
+};
+
+// Process-wide settings consumed by initialize(), terminate() and the textTo*
+// functions.
+struct PiperConfig {
+  // Data directory handed to espeak_Initialize()
+  std::string eSpeakDataPath;
+  // When true, initialize()/terminate() set up and tear down espeak-ng
+  bool useESpeak = true;
+
+  // When true, text is diacritized with libtashkeel before phonemization
+  bool useTashkeel = false;
+  // Path to the libtashkeel model; required when useTashkeel is true
+  std::optional<std::string> tashkeelModelPath;
+  // Loaded libtashkeel state, created by initialize()
+  std::unique_ptr<tashkeel::State> tashkeelState;
+};
+
+enum PhonemeType { eSpeakPhonemes, TextPhonemes };
+
+// Phonemization settings for one voice, filled from its JSON config.
+struct PhonemizeConfig {
+  // How text is turned into phonemes (espeak-ng or raw codepoints)
+  PhonemeType phonemeType = eSpeakPhonemes;
+  // Optional phoneme -> phonemes substitution map ("phoneme_map")
+  std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;
+  // Phoneme -> ids map ("phoneme_id_map")
+  std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;
+
+  PhonemeId idPad = 0; // padding (optionally interspersed)
+  PhonemeId idBos = 1; // beginning of sentence
+  PhonemeId idEos = 2; // end of sentence
+  bool interspersePad = true;
+
+  eSpeakConfig eSpeak;
+};
+
+// Synthesis settings for one voice. The defaults below are used for any key
+// that the voice's JSON config does not provide.
+struct SynthesisConfig {
+  // Passed to the model together as the "scales" input
+  float noiseScale = 0.667f;
+  float lengthScale = 1.0f;
+  float noiseW = 0.8f;
+  int sampleRate = 22050;
+  int sampleWidth = 2; // 16-bit
+  int channels = 1;    // mono
+  // Set for multi-speaker models; adds the "sid" model input when present
+  std::optional<SpeakerId> speakerId;
+  // Silence appended after every sentence; <= 0 disables it
+  float sentenceSilenceSeconds = 0.2f;
+};
+
+// Properties of the model itself, read from the voice's JSON config.
+struct ModelConfig {
+  // Number of speakers in the model ("num_speakers"). Defaults to a single
+  // speaker so that the field never holds an indeterminate value before the
+  // config has been parsed.
+  int numSpeakers = 1;
+};
+
+// onnxruntime objects belonging to one loaded voice model.
+struct ModelSession {
+  // Inference session; starts out empty and is assigned by loadModel()
+  Ort::Session onnx;
+  Ort::AllocatorWithDefaultOptions allocator;
+  // Session options, configured by loadModel() before the session is created
+  Ort::SessionOptions options;
+  // NOTE(review): members are destroyed in reverse declaration order, so env
+  // is destroyed before onnx - confirm that onnxruntime permits this.
+  Ort::Env env;
+
+  ModelSession() : onnx(nullptr){};
+};
+
+// Timing information about a synthesis run.
+//
+// All fields start at zero: textToAudio() accumulates into inferSeconds and
+// audioSeconds with "+=", so a default-constructed result must not contain
+// indeterminate values.
+struct SynthesisResult {
+  double inferSeconds = 0.0;   // time spent in model inference
+  double audioSeconds = 0.0;   // duration of the generated audio
+  double realTimeFactor = 0.0; // inferSeconds / audioSeconds
+};
+
 struct Voice {
  json configRoot;
  PhonemizeConfig phonemizeConfig;
@@ -27,122 +86,25 @@ struct Voice {
  ModelSession session;
 };

-void initialize(std::filesystem::path cwd) {
-  string dataPath;
+// Must be called before using textTo* functions
+void initialize(PiperConfig &config);

-  auto cwdDataPath = std::filesystem::absolute(cwd.append("espeak-ng-data"));
-  if (std::filesystem::is_directory(cwdDataPath)) {
-    dataPath = cwdDataPath.string();
-  }
-
-	cerr << "dataPath: " << dataPath << endl;
-
-  // Set up espeak-ng for calling espeak_TextToPhonemes
-  int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
-                                 /*buflength*/ 0,
-                                 /*path*/ dataPath.c_str(),
-                                 /*options*/ 0);
-  if (result < 0) {
-    throw runtime_error("Failed to initialize eSpeak-ng");
-  }
-}
-
-void terminate() {
-  // Clean up espeak-ng
-  espeak_Terminate();
-}
+// Clean up
+void terminate(PiperConfig &config);

 // Load Onnx model and JSON config file
-void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
-               optional<SpeakerId> &speakerId) {
-  ifstream modelConfigFile(modelConfigPath.c_str());
-  voice.configRoot = json::parse(modelConfigFile);
-
-  parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
-  parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
-  parseModelConfig(voice.configRoot, voice.modelConfig);
-
-  if (voice.modelConfig.numSpeakers > 1) {
-    // Multispeaker model
-    if (speakerId) {
-      voice.synthesisConfig.speakerId = speakerId;
-    } else {
-      // Default speaker
-      voice.synthesisConfig.speakerId = 0;
-    }
-  }
-
-  loadModel(modelPath, voice.session);
-
-} /* loadVoice */
+void loadVoice(PiperConfig &config, std::string modelPath,
+               std::string modelConfigPath, Voice &voice,
+               std::optional<SpeakerId> &speakerId);

 // Phonemize text and synthesize audio
-void textToAudio(Voice &voice, string text, vector<int16_t> &audioBuffer,
-                 SynthesisResult &result,
-                 const function<void()> &audioCallback) {
-
-  size_t sentenceSilenceSamples = 0;
-  if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
-    sentenceSilenceSamples = (size_t)(
-        voice.synthesisConfig.sentenceSilenceSeconds *
-        voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
-  }
-
-  // Phonemes for each sentence
-  vector<vector<Phoneme>> phonemes;
-  phonemize(text, voice.phonemizeConfig, phonemes);
-
-  vector<PhonemeId> phonemeIds;
-  for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
-       ++phonemesIter) {
-    vector<Phoneme> &sentencePhonemes = *phonemesIter;
-    SynthesisResult sentenceResult;
-    phonemes2ids(sentencePhonemes, voice.phonemizeConfig, phonemeIds);
-    synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
-               sentenceResult);
-
-    // Add end of sentence silence
-    if (sentenceSilenceSamples > 0) {
-      for (size_t i = 0; i < sentenceSilenceSamples; i++) {
-        audioBuffer.push_back(0);
-      }
-    }
-
-    if (audioCallback) {
-      // Call back must copy audio since it is cleared afterwards.
-      audioCallback();
-      audioBuffer.clear();
-    }
-
-    result.audioSeconds += sentenceResult.audioSeconds;
-    result.inferSeconds += sentenceResult.inferSeconds;
-
-    phonemeIds.clear();
-  }
-
-  if (result.audioSeconds > 0) {
-    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
-  }
-
-} /* textToAudio */
+void textToAudio(PiperConfig &config, Voice &voice, std::string text,
+                 std::vector<int16_t> &audioBuffer, SynthesisResult &result,
+                 const std::function<void()> &audioCallback);

 // Phonemize text and synthesize audio to WAV file
-void textToWavFile(Voice &voice, string text, ostream &audioFile,
-                   SynthesisResult &result) {
-
-  vector<int16_t> audioBuffer;
-  textToAudio(voice, text, audioBuffer, result, NULL);
-
-  // Write WAV
-  auto synthesisConfig = voice.synthesisConfig;
-  writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
-                 synthesisConfig.channels, (int32_t)audioBuffer.size(),
-                 audioFile);
-
-  audioFile.write((const char *)audioBuffer.data(),
-                  sizeof(int16_t) * audioBuffer.size());
-
-} /* textToWavFile */
+void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
+                   std::ostream &audioFile, SynthesisResult &result);

 } // namespace piper

@@ -1,130 +0,0 @@
-#ifndef SYNTHESIZE_H_
-#define SYNTHESIZE_H_
-
-#include <array>
-#include <chrono>
-#include <limits>
-#include <memory>
-#include <vector>
-
-#include <onnxruntime_cxx_api.h>
-
-#include "config.hpp"
-#include "model.hpp"
-
-using namespace std;
-
-namespace piper {
-
-// Maximum value for 16-bit signed WAV sample
-const float MAX_WAV_VALUE = 32767.0f;
-
-struct SynthesisResult {
-  double inferSeconds;
-  double audioSeconds;
-  double realTimeFactor;
-};
-
-// Phoneme ids to WAV audio
-void synthesize(vector<PhonemeId> &phonemeIds, SynthesisConfig &synthesisConfig,
-                ModelSession &session, vector<int16_t> &audioBuffer,
-                SynthesisResult &result) {
-  auto memoryInfo = Ort::MemoryInfo::CreateCpu(
-      OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
-
-  // Allocate
-  vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
-  vector<float> scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale,
-                       synthesisConfig.noiseW};
-
-  vector<Ort::Value> inputTensors;
-  vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
-  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
-      memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
-      phonemeIdsShape.size()));
-
-  vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
-  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
-      memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
-      phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
-
-  vector<int64_t> scalesShape{(int64_t)scales.size()};
-  inputTensors.push_back(
-      Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
-                                      scalesShape.data(), scalesShape.size()));
-
-  // Add speaker id.
-  // NOTE: These must be kept outside the "if" below to avoid being deallocated.
-  vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value_or(0)};
-  vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
-
-  if (synthesisConfig.speakerId) {
-    inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
-        memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
-        speakerIdShape.size()));
-  }
-
-  // From export_onnx.py
-  array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
-                                       "sid"};
-  array<const char *, 1> outputNames = {"output"};
-
-  // Infer
-  auto startTime = chrono::steady_clock::now();
-  auto outputTensors = session.onnx.Run(
-      Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
-      inputTensors.size(), outputNames.data(), outputNames.size());
-  auto endTime = chrono::steady_clock::now();
-
-  if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
-    throw runtime_error("Invalid output tensors");
-  }
-  auto inferDuration = chrono::duration<double>(endTime - startTime);
-  result.inferSeconds = inferDuration.count();
-
-  const float *audio = outputTensors.front().GetTensorData<float>();
-  auto audioShape =
-      outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
-  int64_t audioCount = audioShape[audioShape.size() - 1];
-
-  result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
-  result.realTimeFactor = 0.0;
-  if (result.audioSeconds > 0) {
-    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
-  }
-
-  // Get max audio value for scaling
-  float maxAudioValue = 0.01f;
-  for (int64_t i = 0; i < audioCount; i++) {
-    float audioValue = abs(audio[i]);
-    if (audioValue > maxAudioValue) {
-      maxAudioValue = audioValue;
-    }
-  }
-
-  // We know the size up front
-  audioBuffer.reserve(audioCount);
-
-  // Scale audio to fill range and convert to int16
-  float audioScale = (MAX_WAV_VALUE / max(0.01f, maxAudioValue));
-  for (int64_t i = 0; i < audioCount; i++) {
-    int16_t intAudioValue = static_cast<int16_t>(
-        clamp(audio[i] * audioScale,
-              static_cast<float>(numeric_limits<int16_t>::min()),
-              static_cast<float>(numeric_limits<int16_t>::max())));
-
-    audioBuffer.push_back(intAudioValue);
-  }
-
-  // Clean up
-  for (size_t i = 0; i < outputTensors.size(); i++) {
-    Ort::detail::OrtRelease(outputTensors[i].release());
-  }
-
-  for (size_t i = 0; i < inputTensors.size(); i++) {
-    Ort::detail::OrtRelease(inputTensors[i].release());
-  }
-}
-} // namespace piper
-
-#endif // SYNTHESIZE_H_
@@ -3,8 +3,6 @@

 #include <iostream>

-namespace piper {
-
 struct WavHeader {
  uint8_t RIFF[4] = {'R', 'I', 'F', 'F'};
  uint32_t chunkSize;
@@ -14,7 +12,7 @@ struct WavHeader {
  uint8_t fmt[4] = {'f', 'm', 't', ' '};
  uint32_t fmtSize = 16;    // bytes
  uint16_t audioFormat = 1; // PCM
-  uint16_t numChannels; // mono
+  uint16_t numChannels;     // mono
  uint32_t sampleRate;      // Hertz
  uint32_t bytesPerSec;     // sampleRate * sampleWidth
  uint16_t blockAlign = 2;  // 16-bit mono
@@ -39,6 +37,4 @@ void writeWavHeader(int sampleRate, int sampleWidth, int channels,

 } /* writeWavHeader */

-} // namespace piper
-
 #endif // WAVFILE_H_
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+import json
+import sys
+import unicodedata
+from collections import Counter
+
+from .phonemize import DEFAULT_PHONEME_ID_MAP
+
+
+def _describe_phonemes(counts: "Counter[str]") -> dict:
+    """Build the JSON-ready report for one phoneme counter.
+
+    Keys are the phonemes, ordered from most to least common. Each value holds
+    the occurrence count, the codepoint as a ``\\uXXXX`` escape, the Unicode
+    character name (empty string if the character has none) and the Unicode
+    general category.
+    """
+    return {
+        phoneme: {
+            "count": count,
+            "hex": f"\\u{ord(phoneme):04X}",
+            "name": unicodedata.name(phoneme, ""),
+            "category": unicodedata.category(phoneme),
+        }
+        for phoneme, count in counts.most_common()
+    }
+
+
+def main() -> None:
+    """Report which phonemes a dataset uses and which have no default id.
+
+    Reads JSON lines from stdin (one utterance per line, each with a
+    "phonemes" list), counts every phoneme, and writes a JSON object with
+    "used" and "missing" sections to stdout. A phoneme is "missing" when it is
+    not a key of DEFAULT_PHONEME_ID_MAP. A summary of the number of missing
+    phonemes is printed to stderr.
+    """
+    used_phonemes: "Counter[str]" = Counter()
+    missing_phonemes: "Counter[str]" = Counter()
+
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            # Skip blank lines
+            continue
+
+        utt = json.loads(line)
+        for phoneme in utt["phonemes"]:
+            used_phonemes[phoneme] += 1
+
+            if phoneme not in DEFAULT_PHONEME_ID_MAP:
+                missing_phonemes[phoneme] += 1
+
+    if missing_phonemes:
+        print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)
+
+    json.dump(
+        {
+            "used": _describe_phonemes(used_phonemes),
+            "missing": _describe_phonemes(missing_phonemes),
+        },
+        sys.stdout,
+    )
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
@@ -2,7 +2,6 @@
 import argparse
 import logging
 from pathlib import Path
-from typing import Optional

 import torch

@@ -41,7 +40,6 @@ def main():
    model_g = model.model_g

    num_symbols = model_g.n_vocab
-    num_speakers = model_g.n_speakers

    # Inference only
    model_g.eval()
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import json
+import re
+import shutil
+import statistics
+import subprocess
+import sys
+import threading
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import asdict, dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+
+from .norm_audio import make_silence_detector, trim_silence
+
+_DIR = Path(__file__).parent
+
+# Punctuation removed from the text before the speaking rate is calculated.
+# This must be a character class: without the brackets the pattern is one
+# literal sequence (with "." and "?" acting as metacharacters) that never
+# matches real text. The ASCII hyphen is last so it is not read as a range.
+_PUNCTUATION = re.compile("[.。,，?¿？؟!！;；:：—-]")
+
+
+class ExcludeReason(str, Enum):
+    """Why an utterance was left out of the filtered output."""
+
+    # The audio file does not exist (set by ProcessUtterance)
+    MISSING = "file_missing"
+    # The audio file exists but is zero bytes long (set by ProcessUtterance)
+    EMPTY = "file_empty"
+    # Speaking rate is below the speaker's lower bound (set in main)
+    LOW = "rate_low"
+    # Speaking rate is above the speaker's upper bound (set in main)
+    HIGH = "rate_high"
+
+
+@dataclass
+class Utterance:
+    """One metadata row together with its measured speaking rate."""
+
+    id: str
+    text: str
+    duration_sec: float
+    speaker: str
+    exclude_reason: Optional[ExcludeReason] = None
+    rate: float = 0.0
+
+    def __post_init__(self):
+        """Derive ``rate`` (characters per second) from text and duration."""
+        if not self.duration_sec > 0:
+            # No usable audio: leave the rate at its default
+            return
+
+        # Punctuation is not counted in the speaking rate because the
+        # silence it stands for has been removed from the duration.
+        spoken_chars = len(_PUNCTUATION.sub("", self.text))
+        self.rate = spoken_chars / self.duration_sec
+
+
+def main():
+    """Drop utterances whose speaking rate is an outlier for their speaker.
+
+    Reads "|"-delimited metadata rows from stdin (first column: audio file
+    name, last column: text, optional second column: speaker). The speaking
+    rate of every utterance is measured and compared against per-speaker
+    bounds derived from the 25%/75% quantiles. Rows that pass are written to
+    stdout as ``id|text``, or ``id|speaker|text`` when more than one speaker
+    is present. Utterances whose audio file is missing or empty are always
+    excluded and do not take part in the statistics.
+
+    With --write-json, per-speaker statistics and every excluded utterance
+    (with its reason) are written to the given path.
+
+    Raises:
+        RuntimeError: if ffmpeg is not on the PATH.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--write-json", help="Path to write information about excluded utterances"
+    )
+    parser.add_argument(
+        "--dataset-dir", default=Path.cwd(), help="Path to dataset directory"
+    )
+    parser.add_argument("--scale-lower", type=float, default=2.0)
+    parser.add_argument("--scale-upper", type=float, default=2.0)
+    args = parser.parse_args()
+
+    # ProcessUtterance.get_duration() shells out to ffmpeg, so that is the
+    # executable that must be present.
+    if not shutil.which("ffmpeg"):
+        raise RuntimeError("ffmpeg not found (is ffmpeg installed?)")
+
+    dataset_dir = Path(args.dataset_dir)
+    wav_dir = dataset_dir / "wav"
+    if not wav_dir.is_dir():
+        wav_dir = dataset_dir / "wavs"
+
+    text_and_audio = []
+    for row in csv.reader(sys.stdin, delimiter="|"):
+        if not row:
+            # Blank line in the metadata
+            continue
+
+        filename, text = row[0], row[-1]
+        speaker = row[1] if len(row) > 2 else "default"
+
+        # Search order: relative to the dataset directory, then inside
+        # wav/ (or wavs/), each time with and without a ".wav" suffix. When
+        # nothing exists the last candidate is kept so the utterance gets
+        # reported as missing.
+        candidates = (
+            dataset_dir / filename,
+            dataset_dir / f"{filename}.wav",
+            wav_dir / filename,
+            wav_dir / f"{filename}.wav",
+        )
+        wav_path = next(
+            (path for path in candidates if path.exists()), candidates[-1]
+        )
+
+        text_and_audio.append((filename, text, wav_path, speaker))
+
+    # speaker -> [Utterance]
+    utts_by_speaker = defaultdict(list)
+    process_utterance = ProcessUtterance()
+    with ThreadPoolExecutor() as executor:
+        for utt in executor.map(
+            lambda item: process_utterance(*item), text_and_audio
+        ):
+            utts_by_speaker[utt.speaker].append(utt)
+
+    is_multispeaker = len(utts_by_speaker) > 1
+    writer = csv.writer(sys.stdout, delimiter="|")
+
+    speaker_details = {}
+    for speaker, utts in utts_by_speaker.items():
+        # Utterances already excluded (missing/empty file) have a rate of 0.0
+        # that would skew the quantiles, and must keep their original reason.
+        measured = [utt for utt in utts if utt.exclude_reason is None]
+        rates = [utt.rate for utt in measured]
+        if not rates:
+            speaker_details[speaker] = {}
+            continue
+
+        # Exclude rates well outside the 25%/75% quantiles.
+        # statistics.quantiles() needs at least two data points; a lone
+        # utterance is its own quantile and is therefore always kept.
+        if len(rates) > 1:
+            rate_qs = statistics.quantiles(rates, n=4)
+        else:
+            rate_qs = [rates[0]] * 3
+
+        q1 = rate_qs[0]  # 25%
+        q3 = rate_qs[-1]  # 75%
+        iqr = q3 - q1
+        lower = q1 - (args.scale_lower * iqr)
+        upper = q3 + (args.scale_upper * iqr)
+        speaker_details[speaker] = {
+            "min": min(rates),
+            "max": max(rates),
+            # NOTE: misspelled key kept as-is; it is part of the JSON output
+            "quanties": rate_qs,
+            "lower": lower,
+            "upper": upper,
+        }
+
+        for utt in measured:
+            if utt.rate < lower:
+                utt.exclude_reason = ExcludeReason.LOW
+            elif utt.rate > upper:
+                utt.exclude_reason = ExcludeReason.HIGH
+            elif is_multispeaker:
+                writer.writerow((utt.id, utt.speaker, utt.text))
+            else:
+                writer.writerow((utt.id, utt.text))
+
+    if args.write_json:
+        speaker_excluded = {
+            speaker: [
+                asdict(utt)
+                for utt in utts_by_speaker[speaker]
+                if utt.exclude_reason is not None
+            ]
+            for speaker in speaker_details
+        }
+
+        with open(args.write_json, "w") as json_file:
+            json.dump(
+                {
+                    speaker: {
+                        "details": speaker_details[speaker],
+                        "num_utterances": len(utts_by_speaker[speaker]),
+                        "num_excluded": len(speaker_excluded[speaker]),
+                        "excluded": speaker_excluded[speaker],
+                    }
+                    for speaker in speaker_details
+                },
+                json_file,
+                indent=4,
+                ensure_ascii=False,
+            )
+
+
+class ProcessUtterance:
+    """Callable that turns one metadata row into an Utterance.
+
+    A single instance is shared by the worker threads started in main();
+    the silence detector is kept per thread in a ``threading.local``.
+    """
+
+    def __init__(self):
+        # Holds the lazily created "detector" attribute of each thread
+        self.thread_data = threading.local()
+
+    def __call__(
+        self, utt_id: str, text: str, wav_path: Path, speaker: str
+    ) -> Utterance:
+        """Build the Utterance for one row.
+
+        A missing or zero-byte audio file yields an utterance with a
+        duration of 0.0 and the matching exclude reason; otherwise the
+        speech duration is measured with get_duration().
+        """
+        if not wav_path.exists():
+            return Utterance(
+                utt_id,
+                text,
+                0.0,
+                speaker,
+                exclude_reason=ExcludeReason.MISSING,
+            )
+
+        if wav_path.stat().st_size == 0:
+            return Utterance(
+                utt_id,
+                text,
+                0.0,
+                speaker,
+                exclude_reason=ExcludeReason.EMPTY,
+            )
+
+        return Utterance(utt_id, text, self.get_duration(wav_path), speaker)
+
+    def get_duration(self, audio_path: Path) -> float:
+        """Return the duration in seconds of the speech in an audio file.
+
+        The file is decoded with ffmpeg to 16 kHz mono 16-bit PCM, scaled by
+        its peak amplitude, and passed to trim_silence(); the duration that
+        trim_silence() reports is returned. Audio that decodes to zero
+        samples has a duration of 0.0.
+
+        Raises:
+            subprocess.CalledProcessError: if ffmpeg exits with an error.
+        """
+        if not hasattr(self.thread_data, "detector"):
+            self.thread_data.detector = make_silence_detector()
+
+        vad_sample_rate = 16000
+        audio_16khz_bytes = subprocess.check_output(
+            [
+                "ffmpeg",
+                "-i",
+                str(audio_path),
+                "-f",
+                "s16le",
+                "-acodec",
+                "pcm_s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(vad_sample_rate),
+                "pipe:",
+            ],
+            stderr=subprocess.DEVNULL,
+        )
+
+        audio_16khz = np.frombuffer(audio_16khz_bytes, dtype=np.int16).astype(
+            np.float32
+        )
+        if len(audio_16khz) == 0:
+            # Nothing was decoded; np.max() would raise on an empty array
+            return 0.0
+
+        # Normalize by the peak magnitude. The absolute value has to be taken
+        # before the maximum, otherwise a signal whose largest excursion is
+        # negative is scaled by the wrong value. All-zero audio is left alone
+        # to avoid dividing by zero.
+        peak = np.max(np.abs(audio_16khz))
+        if peak > 0:
+            audio_16khz /= peak
+
+        # Get speaking duration
+        offset_sec, duration_sec = trim_silence(
+            audio_16khz,
+            self.thread_data.detector,
+            threshold=0.8,
+            samples_per_chunk=480,
+            sample_rate=vad_sample_rate,
+            keep_chunks_before=2,
+            keep_chunks_after=2,
+        )
+
+        if duration_sec is None:
+            # Speech goes to end of audio
+            duration_sec = (len(audio_16khz) / float(vad_sample_rate)) - offset_sec
+
+        return duration_sec
+
+
+if __name__ == "__main__":
+    main()
@@ -1,9 +1,23 @@
+import argparse
+import json
+import sys
 import unicodedata
 from collections import Counter
+from enum import Enum
 from typing import Dict, Iterable, List, Mapping, Optional

 from espeak_phonemizer import Phonemizer

+
+class PhonemeType(str, Enum):
+    """Source of the phonemes used for a voice."""
+
+    # Phonemes come from espeak-ng
+    ESPEAK = "espeak"
+
+    # Phonemes come from text itself
+    TEXT = "text"
+
+
+MAX_PHONEMES = 256
 DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
    "_": [0],
    "^": [1],
@@ -135,14 +149,115 @@ DEFAULT_PHONEME_ID_MAP: Dict[str, List[int]] = {
    "χ": [127],
    "ᵻ": [128],
    "ⱱ": [129],
+    "0": [130],  # tones
+    "1": [131],
+    "2": [132],
+    "3": [133],
+    "4": [134],
+    "5": [135],
+    "6": [136],
+    "7": [137],
+    "8": [138],
+    "9": [139],
+    "\u0327": [140],  # combining cedilla
+    "\u0303": [141],  # combining tilde
+    "\u032a": [142],  # combining bridge below
+    "\u032f": [143],  # combining inverted breve below
+    "\u0329": [144],  # combining vertical line below
+    "ʰ": [145],
+    "ˤ": [146],
+    "ε": [147],
+    "↓": [148],
+    "#": [149],  # Icelandic
+    '"': [150],  # Russian
+    "↑": [151],
+    "\u033a": [152],  # Basque
+    "\u033b": [153],
+}
+
+PHONEME_MAPS = {
+    # Brazilian Portuguese
+    "pt-br": {"c": ["k"]}
+}
+
+ALPHABETS = {
+    # Ukrainian
+    "uk": {
+        "_": [0],
+        "^": [1],
+        "$": [2],
+        " ": [3],
+        "!": [4],
+        "'": [5],
+        ",": [6],
+        "-": [7],
+        ".": [8],
+        ":": [9],
+        ";": [10],
+        "?": [11],
+        "а": [12],
+        "б": [13],
+        "в": [14],
+        "г": [15],
+        "ґ": [16],
+        "д": [17],
+        "е": [18],
+        "є": [19],
+        "ж": [20],
+        "з": [21],
+        "и": [22],
+        "і": [23],
+        "ї": [24],
+        "й": [25],
+        "к": [26],
+        "л": [27],
+        "м": [28],
+        "н": [29],
+        "о": [30],
+        "п": [31],
+        "р": [32],
+        "с": [33],
+        "т": [34],
+        "у": [35],
+        "ф": [36],
+        "х": [37],
+        "ц": [38],
+        "ч": [39],
+        "ш": [40],
+        "щ": [41],
+        "ь": [42],
+        "ю": [43],
+        "я": [44],
+        "\u0301": [45],  # combining acute accent
+        "\u0306": [46],  # combining breve
+        "\u0308": [47],  # combining diaeresis
+        "—": [48],  # em dash
+    }
 }


-def phonemize(text: str, phonemizer: Phonemizer) -> List[str]:
+def phonemize(
+    text: str,
+    phonemizer: Phonemizer,
+    phoneme_map: Optional[Dict[str, List[str]]] = None,
+) -> List[str]:
+    """Phonemize text into a list of single-codepoint phonemes.
+
+    Args:
+        text: Text passed to ``phonemizer.phonemize``.
+        phonemizer: Phonemizer that produces the phoneme string; clause
+            breakers are requested to be kept.
+        phoneme_map: Optional replacements. A phoneme with a non-empty entry
+            is replaced by the phonemes listed for it.
+
+    Returns:
+        The NFD-normalized phoneme string split into codepoints, with the
+        replacements from ``phoneme_map`` applied when one is given.
+    """
    phonemes_str = phonemizer.phonemize(text=text, keep_clause_breakers=True)

    # Phonemes are decomposed into unicode codepoints
-    return list(unicodedata.normalize("NFD", phonemes_str))
+    unmapped_phonemes = list(unicodedata.normalize("NFD", phonemes_str))
+    if not phoneme_map:
+        return unmapped_phonemes
+
+    # Phonemes can be mapped to lists of other phonemes
+    mapped_phonemes = []
+    for phoneme in unmapped_phonemes:
+        sub_phonemes = phoneme_map.get(phoneme)
+        # A missing or empty entry leaves the phoneme unchanged, so the map
+        # cannot be used to delete a phoneme.
+        if sub_phonemes:
+            mapped_phonemes.extend(sub_phonemes)
+        else:
+            mapped_phonemes.append(phoneme)
+
+    return mapped_phonemes


 def phonemes_to_ids(
@@ -179,3 +294,79 @@ def phonemes_to_ids(
        phoneme_ids.extend(phoneme_id_map[eos])

    return phoneme_ids
+
+
+# -----------------------------------------------------------------------------
+
+
+def main() -> None:
+    """Phonemize lines of text from stdin and print one JSON object per line.
+
+    Each non-blank stdin line produces a JSON object with the original
+    "text", its "phonemes" and the matching "phoneme_ids". With
+    ``--phoneme-type text`` the phonemes are the NFD-decomposed characters of
+    the line and the ids come from ALPHABETS[language]; otherwise eSpeak is
+    used together with DEFAULT_PHONEME_ID_MAP. Phonemes without an id are
+    counted and listed on stderr at the end.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("language")
+    parser.add_argument(
+        "--phoneme-type",
+        choices=list(PhonemeType),
+        default=PhonemeType.ESPEAK,
+        help="Type of phonemes to use (default: espeak)",
+    )
+    parser.add_argument(
+        "--text-casing",
+        choices=("ignore", "lower", "upper", "casefold"),
+        default="ignore",
+        help="Casing applied to utterance text",
+    )
+    args = parser.parse_args()
+
+    phonemizer: Optional[Phonemizer] = None
+
+    # Every value offered by --text-casing needs a transform; "casefold" used
+    # to fall through to "ignore". Anything else leaves the text untouched.
+    casing = {
+        "lower": str.lower,
+        "upper": str.upper,
+        "casefold": str.casefold,
+    }.get(args.text_casing, lambda s: s)
+
+    if args.phoneme_type == PhonemeType.TEXT:
+        # Use text directly
+        if args.language not in ALPHABETS:
+            # Report a usage error instead of a KeyError traceback
+            parser.error(
+                f"No alphabet for language '{args.language}' "
+                f"(available: {', '.join(sorted(ALPHABETS))})"
+            )
+
+        phoneme_id_map = ALPHABETS[args.language]
+    else:
+        # Use eSpeak
+        phonemizer = Phonemizer(args.language)
+        phoneme_id_map = DEFAULT_PHONEME_ID_MAP
+
+    phoneme_map = PHONEME_MAPS.get(args.language)
+    missing_phonemes: "Counter[str]" = Counter()
+
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+
+        # Casing is applied for both phoneme types, as the preprocessing
+        # workers do; with the default ("ignore") the text is unchanged.
+        cased_line = casing(line)
+        if args.phoneme_type == PhonemeType.TEXT:
+            phonemes = list(unicodedata.normalize("NFD", cased_line))
+        else:
+            assert phonemizer is not None
+            phonemes = phonemize(cased_line, phonemizer, phoneme_map=phoneme_map)
+
+        phoneme_ids = phonemes_to_ids(
+            phonemes, phoneme_id_map=phoneme_id_map, missing_phonemes=missing_phonemes
+        )
+        json.dump(
+            {
+                "text": line,
+                "phonemes": phonemes,
+                "phoneme_ids": phoneme_ids,
+            },
+            sys.stdout,
+            ensure_ascii=False,
+        )
+        print("")
+
+    if missing_phonemes:
+        print("Missing", len(missing_phonemes), "phonemes", file=sys.stderr)
+        for phoneme, count in missing_phonemes.most_common():
+            print(phoneme, count, file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
@@ -6,9 +6,9 @@ import itertools
 import json
 import logging
 import os
+import unicodedata
 from collections import Counter
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from multiprocessing import JoinableQueue, Process, Queue
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional
@@ -16,7 +16,15 @@ from typing import Dict, Iterable, List, Optional
 from espeak_phonemizer import Phonemizer

 from .norm_audio import cache_norm_audio, make_silence_detector
-from .phonemize import DEFAULT_PHONEME_ID_MAP, phonemes_to_ids, phonemize
+from .phonemize import (
+    ALPHABETS,
+    DEFAULT_PHONEME_ID_MAP,
+    MAX_PHONEMES,
+    PHONEME_MAPS,
+    PhonemeType,
+    phonemes_to_ids,
+    phonemize,
+)

 _LOGGER = logging.getLogger("preprocess")

@@ -49,6 +57,23 @@ def main() -> None:
    parser.add_argument(
        "--speaker-id", type=int, help="Add speaker id to single speaker dataset"
    )
+    #
+    parser.add_argument(
+        "--phoneme-type",
+        choices=list(PhonemeType),
+        default=PhonemeType.ESPEAK,
+        help="Type of phonemes to use (default: espeak)",
+    )
+    parser.add_argument(
+        "--text-casing",
+        choices=("ignore", "lower", "upper", "casefold"),
+        default="ignore",
+        help="Casing applied to utterance text",
+    )
+    #
+    parser.add_argument(
+        "--skip-audio", action="store_true", help="Don't preprocess audio"
+    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
@@ -84,9 +109,9 @@ def main() -> None:

    # Count speakers
    _LOGGER.debug("Counting number of speakers/utterances in the dataset")
-    speaker_counts: Counter[str] = Counter()
+    speaker_counts: "Counter[str]" = Counter()
    num_utterances = 0
-    for utt in make_dataset(args.input_dir, args.single_speaker, args.speaker_id):
+    for utt in make_dataset(args):
        speaker = utt.speaker or ""
        speaker_counts[speaker] += 1
        num_utterances += 1
@@ -118,11 +143,12 @@ def main() -> None:
                    "voice": args.language,
                },
                "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
+                "phoneme_type": str(args.phoneme_type),
                "phoneme_map": {},
-                "phoneme_id_map": DEFAULT_PHONEME_ID_MAP,
-                "num_symbols": len(
-                    set(itertools.chain.from_iterable(DEFAULT_PHONEME_ID_MAP.values()))
-                ),
+                "phoneme_id_map": ALPHABETS[args.language]
+                if args.phoneme_type == PhonemeType.TEXT
+                else DEFAULT_PHONEME_ID_MAP,
+                "num_symbols": MAX_PHONEMES,
                "num_speakers": len(speaker_counts),
                "speaker_id_map": speaker_ids,
            },
@@ -142,8 +168,13 @@ def main() -> None:
    queue_out: "Queue[Optional[Utterance]]" = Queue()

    # Start workers
+    if args.phoneme_type == PhonemeType.TEXT:
+        target = phonemize_batch_text
+    else:
+        target = phonemize_batch_espeak
+
    processes = [
-        Process(target=process_batch, args=(args, queue_in, queue_out))
+        Process(target=target, args=(args, queue_in, queue_out))
        for _ in range(args.max_workers)
    ]
    for proc in processes:
@@ -154,27 +185,39 @@ def main() -> None:
    )
    with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
        for utt_batch in batched(
-            make_dataset(args.input_dir, args.single_speaker, args.speaker_id),
+            make_dataset(args),
            batch_size,
        ):
            queue_in.put(utt_batch)

        _LOGGER.debug("Waiting for jobs to finish")
+        missing_phonemes: "Counter[str]" = Counter()
        for _ in range(num_utterances):
            utt = queue_out.get()
            if utt is not None:
                if utt.speaker is not None:
                    utt.speaker_id = speaker_ids[utt.speaker]

+                utt_dict = dataclasses.asdict(utt)
+                utt_dict.pop("missing_phonemes")
+
                # JSONL
                json.dump(
-                    dataclasses.asdict(utt),
+                    utt_dict,
                    dataset_file,
                    ensure_ascii=False,
                    cls=PathEncoder,
                )
                print("", file=dataset_file)

+                missing_phonemes.update(utt.missing_phonemes)
+
+        if missing_phonemes:
+            for phoneme, count in missing_phonemes.most_common():
+                _LOGGER.warning("Missing %s (%s)", phoneme, count)
+
+            _LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))
+
    # Signal workers to stop
    for proc in processes:
        queue_in.put(None)
@@ -187,10 +230,27 @@ def main() -> None:
 # -----------------------------------------------------------------------------


-def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue):
+def get_text_casing(casing: str):
+    """Return the ``str -> str`` transform selected by a casing name.
+
+    "lower", "upper" and "casefold" select the ``str`` method of the same
+    name; every other value returns an identity function.
+    """
+    known_transforms = (
+        ("lower", str.lower),
+        ("upper", str.upper),
+        ("casefold", str.casefold),
+    )
+    for name, transform in known_transforms:
+        if casing == name:
+            return transform
+
+    return lambda s: s
+
+
+def phonemize_batch_espeak(
+    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
+):
    try:
+        casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()
        phonemizer = Phonemizer(default_voice=args.language)
+        phoneme_map = PHONEME_MAPS.get(args.language)

        while True:
            utt_batch = queue_in.get()
@@ -200,14 +260,20 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:
            for utt in utt_batch:
                try:
                    _LOGGER.debug(utt)
-                    utt.phonemes = phonemize(utt.text, phonemizer)
-                    utt.phoneme_ids = phonemes_to_ids(utt.phonemes)
-                    utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
-                        utt.audio_path,
-                        args.cache_dir,
-                        silence_detector,
-                        args.sample_rate,
+                    utt.phonemes = phonemize(
+                        casing(utt.text), phonemizer, phoneme_map=phoneme_map
                    )
+                    utt.phoneme_ids = phonemes_to_ids(
+                        utt.phonemes,
+                        missing_phonemes=utt.missing_phonemes,
+                    )
+                    if not args.skip_audio:
+                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
+                            utt.audio_path,
+                            args.cache_dir,
+                            silence_detector,
+                            args.sample_rate,
+                        )
                    queue_out.put(utt)
                except TimeoutError:
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
@@ -217,7 +283,48 @@ def process_batch(args: argparse.Namespace, queue_in: JoinableQueue, queue_out:

            queue_in.task_done()
    except Exception:
-        _LOGGER.exception("process_batch")
+        _LOGGER.exception("phonemize_batch_espeak")
+
+
+def phonemize_batch_text(
+    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
+):
+    """Worker loop that uses the utterance text itself as the phonemes.
+
+    Takes batches of utterances from ``queue_in`` until a ``None`` sentinel
+    arrives. For every utterance the cased, NFD-decomposed text becomes the
+    phoneme list, ids are looked up in ``ALPHABETS[args.language]``, and the
+    audio is normalized and cached unless ``args.skip_audio`` is set.
+
+    Exactly one item is put on ``queue_out`` per utterance: the processed
+    utterance, or ``None`` when it could not be processed. The parent waits
+    for one result per utterance, so a skipped utterance must still be
+    answered or the parent blocks forever.
+    """
+    try:
+        casing = get_text_casing(args.text_casing)
+        silence_detector = make_silence_detector()
+        alphabet = ALPHABETS[args.language]
+
+        while True:
+            utt_batch = queue_in.get()
+            if utt_batch is None:
+                break
+
+            for utt in utt_batch:
+                try:
+                    _LOGGER.debug(utt)
+                    utt.phonemes = list(unicodedata.normalize("NFD", casing(utt.text)))
+                    utt.phoneme_ids = phonemes_to_ids(
+                        utt.phonemes,
+                        phoneme_id_map=alphabet,
+                        missing_phonemes=utt.missing_phonemes,
+                    )
+                    if not args.skip_audio:
+                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
+                            utt.audio_path,
+                            args.cache_dir,
+                            silence_detector,
+                            args.sample_rate,
+                        )
+                    queue_out.put(utt)
+                except TimeoutError:
+                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
+                    # Still report a result so the parent's count adds up
+                    queue_out.put(None)
+                except Exception:
+                    _LOGGER.exception("Failed to process utterance: %s", utt)
+                    queue_out.put(None)
+
+            queue_in.task_done()
+    except Exception:
+        _LOGGER.exception("phonemize_batch_text")


 # -----------------------------------------------------------------------------
@@ -233,6 +340,7 @@ class Utterance:
    phoneme_ids: Optional[List[int]] = None
    audio_norm_path: Optional[Path] = None
    audio_spec_path: Optional[Path] = None
+    missing_phonemes: "Counter[str]" = field(default_factory=Counter)


 class PathEncoder(json.JSONEncoder):
@@ -242,9 +350,12 @@ class PathEncoder(json.JSONEncoder):
        return super().default(o)


-def ljspeech_dataset(
-    dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
-) -> Iterable[Utterance]:
+def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
+    dataset_dir = args.input_dir
+    is_single_speaker = args.single_speaker
+    speaker_id = args.speaker_id
+    skip_audio = args.skip_audio
+
    # filename|speaker|text
    # speaker is optional
    metadata_path = dataset_dir / "metadata.csv"
@@ -257,7 +368,7 @@ def ljspeech_dataset(
    with open(metadata_path, "r", encoding="utf-8") as csv_file:
        reader = csv.reader(csv_file, delimiter="|")
        for row in reader:
-            assert len(row) >= 2, "Not enough colums"
+            assert len(row) >= 2, "Not enough columns"

            speaker: Optional[str] = None
            if is_single_speaker or (len(row) == 2):
@@ -280,18 +391,25 @@ def ljspeech_dataset(
                # Try with .wav
                wav_path = wav_dir / f"{filename}.wav"

-            if not wav_path.exists():
-                _LOGGER.warning("Missing %s", filename)
-                continue
+            if not skip_audio:
+                if not wav_path.exists():
+                    _LOGGER.warning("Missing %s", filename)
+                    continue
+
+                if wav_path.stat().st_size == 0:
+                    _LOGGER.warning("Empty file: %s", wav_path)
+                    continue

            yield Utterance(
                text=text, audio_path=wav_path, speaker=speaker, speaker_id=speaker_id
            )


-def mycroft_dataset(
-    dataset_dir: Path, is_single_speaker: bool, speaker_id: Optional[int] = None
-) -> Iterable[Utterance]:
+def mycroft_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
+    dataset_dir = args.input_dir
+    is_single_speaker = args.single_speaker
+    skip_audio = args.skip_audio
+
    speaker_id = 0
    for metadata_path in dataset_dir.glob("**/*-metadata.txt"):
        speaker = metadata_path.parent.name if not is_single_speaker else None
@@ -301,15 +419,15 @@ def mycroft_dataset(
            for row in reader:
                filename, text = row[0], row[1]
                wav_path = metadata_path.parent / filename
-                yield Utterance(
-                    text=text,
-                    audio_path=wav_path,
-                    speaker=speaker,
-                    speaker_id=speaker_id if not is_single_speaker else None,
-                )
+                if skip_audio or (wav_path.exists() and (wav_path.stat().st_size > 0)):
+                    yield Utterance(
+                        text=text,
+                        audio_path=wav_path,
+                        speaker=speaker,
+                        speaker_id=speaker_id if not is_single_speaker else None,
+                    )
        speaker_id += 1

-
 # -----------------------------------------------------------------------------


@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import sys
+from collections import Counter, defaultdict
+
+
+def main():
+    """Extract one speaker's utterances from multi-speaker metadata.
+
+    Reads "|"-delimited rows (audio, speaker, ..., text) from stdin and
+    writes ``audio|text`` rows for a single speaker to stdout. The speaker
+    is chosen by name with --speaker-name, or with --speaker-number by its
+    0-based rank when speakers are ordered from most to fewest utterances;
+    in that case the chosen speaker's name is printed to stderr. When both
+    options are given, --speaker-name wins.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--speaker-number", type=int)
+    parser.add_argument("--speaker-name")
+    args = parser.parse_args()
+
+    # A usage error rather than an assert: asserts vanish under "python -O"
+    if (args.speaker_number is None) and (args.speaker_name is None):
+        parser.error("--speaker-number or --speaker-name is required")
+
+    reader = csv.reader(sys.stdin, delimiter="|")
+    writer = csv.writer(sys.stdout, delimiter="|")
+
+    if args.speaker_name is not None:
+        for row in reader:
+            if not row:
+                # Blank line in the metadata
+                continue
+
+            audio, speaker_id, text = row[0], row[1], row[-1]
+            if args.speaker_name == speaker_id:
+                writer.writerow((audio, text))
+
+        return
+
+    utterances = defaultdict(list)
+    counts = Counter()
+    for row in reader:
+        if not row:
+            continue
+
+        audio, speaker_id, text = row[0], row[1], row[-1]
+        utterances[speaker_id].append((audio, text))
+        counts[speaker_id] += 1
+
+    ranked_speakers = [speaker_id for speaker_id, _count in counts.most_common()]
+    if not 0 <= args.speaker_number < len(ranked_speakers):
+        # Previously this silently produced empty output
+        print(
+            f"Speaker number {args.speaker_number} is out of range "
+            f"({len(ranked_speakers)} speaker(s) found)",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    speaker_id = ranked_speakers[args.speaker_number]
+    for row in utterances[speaker_id]:
+        writer.writerow(row)
+
+    print(speaker_id, file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
@@ -8,7 +8,8 @@ docker run \
  --user "$(id -u):$(id -g)" \
  --ipc=host \
  -v "${HOME}:${HOME}" \
+  -v /media/cache:/media/cache:ro \
  -v /etc/hostname:/etc/hostname:ro \
  -v /etc/localtime:/etc/localtime:ro \
-  piper-train \
+  larynx2-train \
  "$@"
@@ -1,5 +1,6 @@
 import io
 import json
+import logging
 import wave
 from dataclasses import dataclass
 from pathlib import Path
@@ -9,6 +10,8 @@ import numpy as np
 import onnxruntime
 from espeak_phonemizer import Phonemizer

+_LOGGER = logging.getLogger(__name__)
+
 _BOS = "^"
 _EOS = "$"
 _PAD = "_"
@@ -69,8 +72,11 @@ class Piper:
        phoneme_ids: List[int] = []

        for phoneme in phonemes:
-            phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
-            phoneme_ids.extend(self.config.phoneme_id_map[_PAD])
+            if phoneme in self.config.phoneme_id_map:
+                phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
+                phoneme_ids.extend(self.config.phoneme_id_map[_PAD])
+            else:
+                _LOGGER.warning("No id for phoneme: %s", phoneme)

        phoneme_ids.extend(self.config.phoneme_id_map[_EOS])