diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp
index 1242b87..aad42af 100644
--- a/src/cpp/main.cpp
+++ b/src/cpp/main.cpp
@@ -4,6 +4,7 @@
 #include <fstream>
 #include <functional>
 #include <iostream>
+#include <map>
 #include <mutex>
 #include <optional>
 #include <sstream>
@@ -76,6 +77,9 @@ struct RunConfig {
   // "output_file": str, (optional)
   // }
   bool jsonInput = false;
+
+  // Seconds of extra silence to insert after a single phoneme
+  optional<map<piper::Phoneme, float>> phonemeSilenceSeconds;
 };
 
 void parseArgs(int argc, char *argv[], RunConfig &runConfig);
@@ -185,6 +189,8 @@ int main(int argc, char *argv[]) {
         runConfig.sentenceSilenceSeconds.value();
   }
 
+  voice.synthesisConfig.phonemeSilenceSeconds = runConfig.phonemeSilenceSeconds;
+
   if (runConfig.outputType == OUTPUT_DIRECTORY) {
     runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
     spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
@@ -453,6 +459,23 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
     } else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
       ensureArg(argc, argv, i);
       runConfig.sentenceSilenceSeconds = stof(argv[++i]);
+    } else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") {
+      ensureArg(argc, argv, i);
+      ensureArg(argc, argv, i + 1);
+      auto phonemeStr = std::string(argv[++i]);
+      if (!piper::isSingleCodepoint(phonemeStr)) {
+        std::cerr << "Phoneme '" << phonemeStr
+                  << "' is not a single codepoint (--phoneme_silence)"
+                  << std::endl;
+        exit(1);
+      }
+
+      if (!runConfig.phonemeSilenceSeconds) {
+        runConfig.phonemeSilenceSeconds.emplace();
+      }
+
+      auto phoneme = piper::getCodepoint(phonemeStr);
+      (*runConfig.phonemeSilenceSeconds)[phoneme] = stof(argv[++i]);
     } else if (arg == "--espeak_data" || arg == "--espeak-data") {
       ensureArg(argc, argv, i);
       runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
diff --git a/src/cpp/piper.cpp b/src/cpp/piper.cpp
index d83dd3f..6da95cd 100644
--- a/src/cpp/piper.cpp
+++ b/src/cpp/piper.cpp
@@ -30,9 +30,7 @@
 const float MAX_WAV_VALUE = 32767.0f;
 
 const std::string instanceName{"piper"};
-std::string getVersion() {
-  return VERSION;
-}
+std::string getVersion() { return VERSION; }
 
 // True if the string is a single UTF-8 codepoint
 bool isSingleCodepoint(std::string s) {
@@ -458,30 +456,90 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
                     sentencePhonemes.size(), phonemesStr);
     }
 
-    SynthesisResult sentenceResult;
+    std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
+    std::vector<SynthesisResult> phraseResults;
+    std::vector<size_t> phraseSilenceSamples;
 
     // Use phoneme/id map from config
     PhonemeIdConfig idConfig;
     idConfig.phonemeIdMap =
         std::make_shared<PhonemeIdMap>(voice.phonemizeConfig.phonemeIdMap);
 
-    // phonemes -> ids
-    phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
-    if (spdlog::should_log(spdlog::level::debug)) {
-      // DEBUG log for phoneme ids
-      std::stringstream phonemeIdsStr;
-      for (auto phonemeId : phonemeIds) {
-        phonemeIdsStr << phonemeId << ", ";
-      }
+    if (voice.synthesisConfig.phonemeSilenceSeconds) {
+      // Split into phrases
+      std::map<Phoneme, float> &phonemeSilenceSeconds =
+          *voice.synthesisConfig.phonemeSilenceSeconds;
 
-      spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
-                    sentencePhonemes.size(), phonemeIds.size(),
-                    phonemeIdsStr.str());
+      auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
+      phrasePhonemes.push_back(currentPhrasePhonemes);
+
+      for (auto sentencePhonemesIter = sentencePhonemes.begin();
+           sentencePhonemesIter != sentencePhonemes.end();
+           sentencePhonemesIter++) {
+        Phoneme &currentPhoneme = *sentencePhonemesIter;
+        currentPhrasePhonemes->push_back(currentPhoneme);
+
+        if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
+          // Split at phrase boundary
+          phraseSilenceSamples.push_back(
+              (std::size_t)(phonemeSilenceSeconds[currentPhoneme] *
+                            voice.synthesisConfig.sampleRate *
+                            voice.synthesisConfig.channels));
+
+          currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
+          phrasePhonemes.push_back(currentPhrasePhonemes);
+        }
+      }
+    } else {
+      // Use all phonemes
+      phrasePhonemes.push_back(
+          std::make_shared<std::vector<Phoneme>>(sentencePhonemes));
     }
 
-    // ids -> audio
-    synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
-               sentenceResult);
+    // Ensure results/samples are the same size
+    while (phraseResults.size() < phrasePhonemes.size()) {
+      phraseResults.emplace_back();
+    }
+
+    while (phraseSilenceSamples.size() < phrasePhonemes.size()) {
+      phraseSilenceSamples.push_back(0);
+    }
+
+    // phonemes -> ids -> audio
+    for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) {
+      if (phrasePhonemes[phraseIdx]->size() <= 0) {
+        continue;
+      }
+
+      // phonemes -> ids
+      phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds,
+                      missingPhonemes);
+      if (spdlog::should_log(spdlog::level::debug)) {
+        // DEBUG log for phoneme ids
+        std::stringstream phonemeIdsStr;
+        for (auto phonemeId : phonemeIds) {
+          phonemeIdsStr << phonemeId << ", ";
+        }
+
+        spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
+                      phrasePhonemes[phraseIdx]->size(), phonemeIds.size(),
+                      phonemeIdsStr.str());
+      }
+
+      // ids -> audio
+      synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
+                 phraseResults[phraseIdx]);
+
+      // Add end of phrase silence
+      for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) {
+        audioBuffer.push_back(0);
+      }
+
+      result.audioSeconds += phraseResults[phraseIdx].audioSeconds;
+      result.inferSeconds += phraseResults[phraseIdx].inferSeconds;
+
+      phonemeIds.clear();
+    }
 
     // Add end of sentence silence
     if (sentenceSilenceSamples > 0) {
@@ -496,9 +554,6 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
       audioBuffer.clear();
     }
 
-    result.audioSeconds += sentenceResult.audioSeconds;
-    result.inferSeconds += sentenceResult.inferSeconds;
-
     phonemeIds.clear();
   }
 
diff --git a/src/cpp/piper.hpp b/src/cpp/piper.hpp
index 9e7c222..332a619 100644
--- a/src/cpp/piper.hpp
+++ b/src/cpp/piper.hpp
@@ -3,6 +3,7 @@
 
 #include <fstream>
 #include <functional>
+#include <map>
 #include <memory>
 #include <optional>
 #include <string>
@@ -49,14 +50,22 @@ struct PhonemizeConfig {
 };
 
 struct SynthesisConfig {
+  // VITS inference settings
   float noiseScale = 0.667f;
   float lengthScale = 1.0f;
   float noiseW = 0.8f;
+
+  // Audio settings
   int sampleRate = 22050;
   int sampleWidth = 2; // 16-bit
   int channels = 1;    // mono
+
+  // Speaker id from 0 to numSpeakers - 1
   std::optional<SpeakerId> speakerId;
+
+  // Extra silence
   float sentenceSilenceSeconds = 0.2f;
+  std::optional<std::map<Phoneme, float>> phonemeSilenceSeconds;
 };
 
 struct ModelConfig {
@@ -89,6 +98,12 @@ struct Voice {
   ModelSession session;
 };
 
+// True if the string is a single UTF-8 codepoint
+bool isSingleCodepoint(std::string s);
+
+// Get the first UTF-8 codepoint of a string
+Phoneme getCodepoint(std::string s);
+
 // Get version of Piper
 std::string getVersion();