Add more config options

2026-06-02 09:57:02 +00:00 · 2023-06-06 16:31:41 -05:00
parent 810fad44cf
commit ff88957218
2 changed files with 85 additions and 27 deletions
@@ -36,6 +36,7 @@ struct RunConfig {
  optional<float> noiseScale;
  optional<float> lengthScale;
  optional<float> noiseW;
+  optional<float> sentenceSilenceSeconds;
 };

 void parseArgs(int argc, char *argv[], RunConfig &runConfig);
@@ -94,6 +95,11 @@ int main(int argc, char *argv[]) {
    voice.synthesisConfig.noiseW = runConfig.noiseW.value();
  }

+  if (runConfig.sentenceSilenceSeconds) {
+    voice.synthesisConfig.sentenceSilenceSeconds =
+        runConfig.sentenceSilenceSeconds.value();
+  }
+
  if (runConfig.outputType == OUTPUT_DIRECTORY) {
    runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
    cerr << "Output directory: " << runConfig.outputPath.value() << endl;
@@ -234,11 +240,14 @@ void printUsage(char *argv[]) {
          "becomes available"
       << endl;
  cerr << "   -s  NUM   --speaker     NUM   id of speaker (default: 0)" << endl;
-  cerr << "   --noise-scale           NUM   generator noise (default: 0.667)"
+  cerr << "   --noise_scale           NUM   generator noise (default: 0.667)"
       << endl;
-  cerr << "   --length-scale          NUM   phoneme length (default: 1.0)"
+  cerr << "   --length_scale          NUM   phoneme length (default: 1.0)"
       << endl;
-  cerr << "   --noise-w               NUM   phonene width noise (default: 0.8)"
+  cerr << "   --noise_w               NUM   phoneme width noise (default: 0.8)"
+       << endl;
+  cerr << "   --sentence_silence      NUM   seconds of silence after each "
+          "sentence (default: 0.2)"
       << endl;
  cerr << endl;
 }
@@ -263,7 +272,8 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
    } else if (arg == "-c" || arg == "--config") {
      ensureArg(argc, argv, i);
      modelConfigPath = filesystem::path(argv[++i]);
-    } else if (arg == "-f" || arg == "--output_file") {
+    } else if (arg == "-f" || arg == "--output_file" ||
+               arg == "--output-file") {
      ensureArg(argc, argv, i);
      std::string filePath = argv[++i];
      if (filePath == "-") {
@@ -273,24 +283,27 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
        runConfig.outputType = OUTPUT_FILE;
        runConfig.outputPath = filesystem::path(filePath);
      }
-    } else if (arg == "-d" || arg == "--output_dir") {
+    } else if (arg == "-d" || arg == "--output_dir" || arg == "--output-dir") {
      ensureArg(argc, argv, i);
      runConfig.outputType = OUTPUT_DIRECTORY;
      runConfig.outputPath = filesystem::path(argv[++i]);
-    } else if (arg == "--output_raw") {
+    } else if (arg == "--output_raw" || arg == "--output-raw") {
      runConfig.outputType = OUTPUT_RAW;
    } else if (arg == "-s" || arg == "--speaker") {
      ensureArg(argc, argv, i);
      runConfig.speakerId = (piper::SpeakerId)stol(argv[++i]);
-    } else if (arg == "--noise-scale") {
+    } else if (arg == "--noise_scale" || arg == "--noise-scale") {
      ensureArg(argc, argv, i);
      runConfig.noiseScale = stof(argv[++i]);
-    } else if (arg == "--length-scale") {
+    } else if (arg == "--length_scale" || arg == "--length-scale") {
      ensureArg(argc, argv, i);
      runConfig.lengthScale = stof(argv[++i]);
-    } else if (arg == "--noise-w") {
+    } else if (arg == "--noise_w" || arg == "--noise-w") {
      ensureArg(argc, argv, i);
      runConfig.noiseW = stof(argv[++i]);
+    } else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
+      ensureArg(argc, argv, i);
+      runConfig.sentenceSilenceSeconds = stof(argv[++i]);
    } else if (arg == "-h" || arg == "--help") {
      printUsage(argv);
      exit(0);
@@ -18,16 +18,31 @@ const float MAX_WAV_VALUE = 32767.0f;

 const std::string instanceName{"piper"};

+// True if the string is a single UTF-8 codepoint
 bool isSingleCodepoint(std::string s) {
  return utf8::distance(s.begin(), s.end()) == 1;
 }

+// Get the first UTF-8 codepoint of a string
 Phoneme getCodepoint(std::string s) {
  utf8::iterator character_iter(s.begin(), s.begin(), s.end());
  return *character_iter;
 }

+// Load JSON config information for phonemization
 void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
+  // {
+  //     "espeak": {
+  //         "voice": "<language code>"
+  //     },
+  //     "phoneme_type": "<espeak or text>",
+  //     "phoneme_map": {
+  //         "<from phoneme>": ["<to phoneme 1>", "<to phoneme 2>", ...]
+  //     },
+  //     "phoneme_id_map": {
+  //         "<phoneme>": [<id1>, <id2>, ...]
+  //     }
+  // }

  if (configRoot.contains("espeak")) {
    if (!phonemizeConfig.eSpeak) {
@@ -47,7 +62,27 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
    }
  }

+  // phoneme to [id] map
+  // Maps phonemes to one or more phoneme ids (required).
+  if (configRoot.contains("phoneme_id_map")) {
+    auto phonemeIdMapValue = configRoot["phoneme_id_map"];
+    for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
+      std::string fromPhoneme = fromPhonemeItem.key();
+      if (!isSingleCodepoint(fromPhoneme)) {
+        throw std::runtime_error(
+            "Phonemes must be one codepoint (phoneme id map)");
+      }
+
+      auto fromCodepoint = getCodepoint(fromPhoneme);
+      for (auto &toIdValue : fromPhonemeItem.value()) {
+        PhonemeId toId = toIdValue.get<PhonemeId>();
+        phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
+      }
+    }
+  }
+
  // phoneme to [phoneme] map
+  // Maps phonemes to one or more other phonemes (not normally used).
  if (configRoot.contains("phoneme_map")) {
    if (!phonemizeConfig.phonemeMap) {
      phonemizeConfig.phonemeMap.emplace();
@@ -75,28 +110,22 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
    }
  }

-  // phoneme to [id] map
-  if (configRoot.contains("phoneme_id_map")) {
-    auto phonemeIdMapValue = configRoot["phoneme_id_map"];
-    for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
-      std::string fromPhoneme = fromPhonemeItem.key();
-      if (!isSingleCodepoint(fromPhoneme)) {
-        throw std::runtime_error(
-            "Phonemes must be one codepoint (phoneme id map)");
-      }
-
-      auto fromCodepoint = getCodepoint(fromPhoneme);
-      for (auto &toIdValue : fromPhonemeItem.value()) {
-        PhonemeId toId = toIdValue.get<PhonemeId>();
-        phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
-      }
-    }
-  }
-
 } /* parsePhonemizeConfig */

+// Load JSON config for audio synthesis
 void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {

+  // {
+  //     "audio": {
+  //         "sample_rate": 22050
+  //     },
+  //     "inference": {
+  //         "noise_scale": 0.667,
+  //         "length_scale": 1,
+  //         "noise_w": 0.8
+  //     }
+  // }
+
  if (configRoot.contains("audio")) {
    auto audioValue = configRoot["audio"];
    if (audioValue.contains("sample_rate")) {
@@ -105,6 +134,22 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
    }
  }

+  if (configRoot.contains("inference")) {
+    // Overrides default inference settings
+    auto inferenceValue = configRoot["inference"];
+    if (inferenceValue.contains("noise_scale")) {
+      synthesisConfig.noiseScale = inferenceValue.value("noise_scale", 0.667f);
+    }
+
+    if (inferenceValue.contains("length_scale")) {
+      synthesisConfig.lengthScale = inferenceValue.value("length_scale", 1.0f);
+    }
+
+    if (inferenceValue.contains("noise_w")) {
+      synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f);
+    }
+  }
+
 } /* parseSynthesisConfig */

 void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {