Initial commit

2026-04-23 00:04:49 +00:00 · 2022-10-21 16:56:34 -05:00
commit 14696f2960
18 changed files with 26583 additions and 0 deletions
--- a/src/cpp/config.hpp
+++ b/src/cpp/config.hpp
@@ -0,0 +1,139 @@
+#ifndef CONFIG_H_
+#define CONFIG_H_
+
+#include <filesystem>
+#include <map>
+#include <optional>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "json.hpp"
+#include "utf8.h"
+
+// NOTE(review): a using-directive at header scope becomes visible in every
+// file that includes this header. The declarations below rely on it for their
+// unqualified standard-library names (string, vector, map, optional, ...).
+using namespace std;
+// Shorthand for the JSON value type accepted by the parse* functions below.
+using json = nlohmann::json;
+
+namespace larynx {
+
+// One phoneme is stored as a single 32-bit character value.
+using Phoneme = char32_t;
+
+// Integer id that phonemes are mapped to (see PhonemizeConfig::phonemeIdMap).
+using PhonemeId = int64_t;
+
+// Initial value of eSpeakConfig::voice.
+const string DefaultVoice = "en-gb-x-rp";
+
+// Mode selector stored in eSpeakConfig::mode. The enum is unscoped, so its
+// enumerators are usable directly inside namespace larynx.
+enum eSpeakMode { Text, TextWithPhonemes, SSML };
+
+// eSpeak-related settings; held by PhonemizeConfig::eSpeak. Every member has
+// a default, so a default-constructed instance is usable as-is.
+struct eSpeakConfig {
+  // Voice name; starts out as DefaultVoice.
+  string voice{DefaultVoice};
+
+  // Mode selector; starts out as Text.
+  eSpeakMode mode{Text};
+
+  // Default set of clause-breaking characters: . ? ! , ; :
+  set<Phoneme> clauseBreakers{U'.', U'?', U'!', U',', U';', U':'};
+};
+
+// Inputs and lookup tables for phonemization. parsePhonemizeConfig fills
+// phonemeMap, phonemeIdMap and eSpeak from a JSON configuration.
+struct PhonemizeConfig {
+  string text;
+
+  // Unset until a phoneme sequence is supplied.
+  optional<vector<Phoneme>> phonemes;
+
+  // Optional phoneme -> [phoneme] table (JSON key "phoneme_map").
+  optional<map<Phoneme, vector<Phoneme>>> phonemeMap;
+
+  // Phoneme -> [id] table (JSON key "phoneme_id_map").
+  map<Phoneme, vector<PhonemeId>> phonemeIdMap;
+
+  PhonemeId idPad{0}; // padding (optionally interspersed)
+  PhonemeId idBos{1}; // beginning of sentence
+  PhonemeId idEos{2}; // end of sentence
+  bool interspersePad{true};
+
+  // Unset unless eSpeak settings were provided (JSON key "espeak").
+  optional<eSpeakConfig> eSpeak;
+};
+
+// Synthesis settings together with the phoneme ids to synthesize and the
+// output location. parseSynthesisConfig can override sampleRate from JSON.
+struct SynthesisConfig {
+  vector<PhonemeId> phonemeIds;
+
+  float noiseScale{0.667f};
+  float lengthScale{1.0f};
+  float noiseW{0.8f};
+
+  // Audio format defaults.
+  int sampleRate{22050}; // Hz
+  int sampleWidth{2};    // 16-bit
+  int channels{1};       // mono
+
+  filesystem::path outputPath;
+};
+
+/**
+ * Reports whether a UTF-8 string consists of exactly one codepoint.
+ *
+ * @param s UTF-8 encoded text; taken by const reference to avoid a copy.
+ * @return true when utf8::distance over the whole string equals 1.
+ *
+ * Marked inline because the definition lives in a header: without it, each
+ * translation unit that includes this file would emit its own external
+ * definition and the program would fail to link.
+ */
+inline bool isSingleCodepoint(const string &s) {
+  return utf8::distance(s.begin(), s.end()) == 1;
+}
+
+/**
+ * Decodes the first codepoint of a UTF-8 string.
+ *
+ * @param s UTF-8 encoded text; taken by const reference to avoid a copy.
+ * @return the value produced by dereferencing a utf8::iterator positioned at
+ *         the start of s, converted to Phoneme.
+ *
+ * Marked inline because the definition lives in a header: without it, each
+ * translation unit that includes this file would emit its own external
+ * definition and the program would fail to link.
+ */
+inline Phoneme getCodepoint(const string &s) {
+  // Arguments: current position, start of range, end of range.
+  utf8::iterator character_iter(s.begin(), s.begin(), s.end());
+  return *character_iter;
+}
+
+/**
+ * Copies phonemization settings from a parsed JSON document into
+ * phonemizeConfig.
+ *
+ * Recognized top-level keys, all optional:
+ *   - "espeak": when present, phonemizeConfig.eSpeak is created with defaults
+ *     if it was unset; a nested "voice" string replaces eSpeak->voice.
+ *   - "phoneme_map": maps a one-codepoint key to a list of one-codepoint
+ *     strings. phonemizeConfig.phonemeMap is created if it was unset.
+ *   - "phoneme_id_map": maps a one-codepoint key to a list of integer ids.
+ *
+ * Map entries are appended to whatever the target maps already hold; a key
+ * whose list is empty creates no entry.
+ *
+ * @param configRoot      parsed JSON configuration (only read here).
+ * @param phonemizeConfig configuration object updated in place.
+ * @throws runtime_error when a phoneme key or mapped phoneme is not exactly
+ *         one codepoint. Exceptions raised by the JSON or UTF-8 libraries
+ *         are not caught here.
+ *
+ * Marked inline because the definition lives in a header: without it, each
+ * translation unit that includes this file would emit its own external
+ * definition and the program would fail to link.
+ */
+inline void parsePhonemizeConfig(json &configRoot,
+                                 PhonemizeConfig &phonemizeConfig) {
+
+  // find() does a single lookup and hands back the stored value, so the
+  // subtree is neither searched twice nor deep-copied.
+  const auto espeakIt = configRoot.find("espeak");
+  if (espeakIt != configRoot.end()) {
+    if (!phonemizeConfig.eSpeak) {
+      phonemizeConfig.eSpeak.emplace();
+    }
+
+    const json &espeakValue = *espeakIt;
+    const auto voiceIt = espeakValue.find("voice");
+    if (voiceIt != espeakValue.end()) {
+      phonemizeConfig.eSpeak->voice = voiceIt->get<string>();
+    }
+  }
+
+  // phoneme to [phoneme] map
+  const auto phonemeMapIt = configRoot.find("phoneme_map");
+  if (phonemeMapIt != configRoot.end()) {
+    if (!phonemizeConfig.phonemeMap) {
+      phonemizeConfig.phonemeMap.emplace();
+    }
+
+    auto &phonemeMap = *phonemizeConfig.phonemeMap;
+    for (auto& fromPhonemeItem : phonemeMapIt->items()) {
+      const string &fromPhoneme = fromPhonemeItem.key();
+      if (!isSingleCodepoint(fromPhoneme)) {
+        throw runtime_error("Phonemes must be one codepoint (phoneme map)");
+      }
+
+      const auto fromCodepoint = getCodepoint(fromPhoneme);
+      for (auto& toPhonemeValue : fromPhonemeItem.value()) {
+        const string toPhoneme = toPhonemeValue.get<string>();
+        if (!isSingleCodepoint(toPhoneme)) {
+          throw runtime_error("Phonemes must be one codepoint (phoneme map)");
+        }
+
+        // Index inside the loop so that an empty list adds no map entry.
+        phonemeMap[fromCodepoint].push_back(getCodepoint(toPhoneme));
+      }
+    }
+  }
+
+  // phoneme to [id] map
+  const auto phonemeIdMapIt = configRoot.find("phoneme_id_map");
+  if (phonemeIdMapIt != configRoot.end()) {
+    for (auto& fromPhonemeItem : phonemeIdMapIt->items()) {
+      const string &fromPhoneme = fromPhonemeItem.key();
+      if (!isSingleCodepoint(fromPhoneme)) {
+        throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
+      }
+
+      const auto fromCodepoint = getCodepoint(fromPhoneme);
+      for (auto& toIdValue : fromPhonemeItem.value()) {
+        // Index inside the loop so that an empty list adds no map entry.
+        phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(
+            toIdValue.get<PhonemeId>());
+      }
+    }
+  }
+
+} /* parsePhonemizeConfig */
+
+/**
+ * Copies synthesis settings from a parsed JSON document into synthesisConfig.
+ *
+ * Only "audio" / "sample_rate" is read. When either key is absent,
+ * synthesisConfig.sampleRate keeps its current value (SynthesisConfig
+ * initializes it to 22050 Hz).
+ *
+ * @param configRoot      parsed JSON configuration (only read here).
+ * @param synthesisConfig configuration object updated in place.
+ *
+ * Exceptions raised by the JSON library (for example when "sample_rate" is
+ * not a number) are not caught here.
+ *
+ * Marked inline because the definition lives in a header: without it, each
+ * translation unit that includes this file would emit its own external
+ * definition and the program would fail to link.
+ */
+inline void parseSynthesisConfig(json &configRoot,
+                                 SynthesisConfig &synthesisConfig) {
+
+  // find() does a single lookup and avoids deep-copying the "audio" subtree.
+  const auto audioIt = configRoot.find("audio");
+  if (audioIt == configRoot.end()) {
+    return;
+  }
+
+  const auto sampleRateIt = audioIt->find("sample_rate");
+  if (sampleRateIt != audioIt->end()) {
+    synthesisConfig.sampleRate = sampleRateIt->get<int>();
+  }
+
+} /* parseSynthesisConfig */
+
+} // namespace larynx
+
+#endif // CONFIG_H_