diff --git a/.gitignore b/.gitignore
index 8e72809..e8bcba1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ htmlcov
 /data/
 /build/
 /local/
+/dist/
 *.so
 
 .venv/
diff --git a/Makefile b/Makefile
index a2c40c6..7a68fc0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,19 @@
-.PHONY: release debug clean
+.PHONY: release no-pcaudio debug clean docker
 
 release:
 	mkdir -p build
 	cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release && make
 
+no-pcaudio:
+	mkdir -p build
+	cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Release -DUSE_PCAUDIO=OFF && make
+
 debug:
 	mkdir -p build
 	cd build && cmake ../src/cpp -DCMAKE_BUILD_TYPE=Debug && make
 
 clean:
 	rm -rf build/ dist/
+
+docker:
+	docker buildx build . --platform 'linux/amd64,linux/arm64' --output 'type=local,dest=dist'
diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
index 7c9ec65..1ebadd9 100644
--- a/src/cpp/CMakeLists.txt
+++ b/src/cpp/CMakeLists.txt
@@ -17,9 +17,13 @@ pkg_check_modules(ESPEAK_NG REQUIRED espeak-ng<2)
 
 # https://github.com/espeak-ng/pcaudiolib
 check_include_file_cxx("pcaudiolib/audio.h" PCAUDIO_INCLUDE_FOUND)
+
 if(PCAUDIO_INCLUDE_FOUND)
-target_compile_definitions(larynx PUBLIC HAVE_PCAUDIO)
-set(PCAUDIO_LIBRARIES "pcaudio")
+  option(USE_PCAUDIO "Build with pcaudiolib" ON)
+  if(USE_PCAUDIO)
+    target_compile_definitions(larynx PUBLIC HAVE_PCAUDIO)
+    set(PCAUDIO_LIBRARIES "pcaudio")
+  endif()
 endif()
 
 set(ONNXRUNTIME_ROOTDIR "/usr/local/include/onnxruntime")
diff --git a/src/cpp/config.hpp b/src/cpp/config.hpp
index 4244879..a50f16c 100644
--- a/src/cpp/config.hpp
+++ b/src/cpp/config.hpp
@@ -19,6 +19,7 @@ namespace larynx {
 
 typedef char32_t Phoneme;
 typedef int64_t PhonemeId;
+typedef int64_t SpeakerId;
 
 const string DefaultVoice = "en-gb-x-rp";
 
@@ -52,7 +53,11 @@ struct SynthesisConfig {
   int sampleRate = 22050;
   int sampleWidth = 2; // 16-bit
   int channels = 1;    // mono
-  filesystem::path outputPath;
+  optional<SpeakerId> speakerId;
+};
+
+struct ModelConfig {
+  int numSpeakers = 1; // only values > 1 are treated as multispeaker
 };
 
 bool isSingleCodepoint(string s) {
@@ -84,14 +89,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
     }
 
     auto phonemeMapValue = configRoot["phoneme_map"];
-    for (auto& fromPhonemeItem : phonemeMapValue.items()) {
+    for (auto &fromPhonemeItem : phonemeMapValue.items()) {
       string fromPhoneme = fromPhonemeItem.key();
       if (!isSingleCodepoint(fromPhoneme)) {
         throw runtime_error("Phonemes must be one codepoint (phoneme map)");
       }
 
       auto fromCodepoint = getCodepoint(fromPhoneme);
-      for (auto& toPhonemeValue : fromPhonemeItem.value()) {
+      for (auto &toPhonemeValue : fromPhonemeItem.value()) {
         string toPhoneme = toPhonemeValue.get<string>();
         if (!isSingleCodepoint(toPhoneme)) {
           throw runtime_error("Phonemes must be one codepoint (phoneme map)");
@@ -106,14 +111,14 @@ void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
   // phoneme to [id] map
   if (configRoot.contains("phoneme_id_map")) {
     auto phonemeIdMapValue = configRoot["phoneme_id_map"];
-    for (auto& fromPhonemeItem : phonemeIdMapValue.items()) {
+    for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
       string fromPhoneme = fromPhonemeItem.key();
       if (!isSingleCodepoint(fromPhoneme)) {
         throw runtime_error("Phonemes must be one codepoint (phoneme id map)");
       }
 
       auto fromCodepoint = getCodepoint(fromPhoneme);
-      for (auto& toIdValue : fromPhonemeItem.value()) {
+      for (auto &toIdValue : fromPhonemeItem.value()) {
         PhonemeId toId = toIdValue.get<PhonemeId>();
         phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
       }
@@ -134,6 +139,12 @@ void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
 
 } /* parseSynthesisConfig */
 
+// Reads model-level settings from the voice config. A config that has no
+// "num_speakers" key is treated as single-speaker (1) instead of throwing.
+void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
+  modelConfig.numSpeakers = configRoot.value("num_speakers", 1);
+} /* parseModelConfig */
+
 } // namespace larynx
 
 #endif // CONFIG_H_
diff --git a/src/cpp/api.hpp b/src/cpp/larynx.hpp
similarity index 83%
rename from src/cpp/api.hpp
rename to src/cpp/larynx.hpp
index 122bb80..32abb16 100644
--- a/src/cpp/api.hpp
+++ b/src/cpp/larynx.hpp
@@ -1,5 +1,5 @@
-#ifndef API_H_
-#define API_H_
+#ifndef LARYNX_H_
+#define LARYNX_H_
 
 #include <iostream>
 #include <string>
@@ -22,6 +22,7 @@ struct Voice {
   json configRoot;
   PhonemizeConfig phonemizeConfig;
   SynthesisConfig synthesisConfig;
+  ModelConfig modelConfig;
   ModelSession session;
 };
 
@@ -42,12 +43,24 @@ void terminate() {
 }
 
 // Load Onnx model and JSON config file
-void loadVoice(string modelPath, string modelConfigPath, Voice &voice) {
+void loadVoice(string modelPath, string modelConfigPath, Voice &voice,
+               optional<SpeakerId> &speakerId) {
   ifstream modelConfigFile(modelConfigPath.c_str());
   voice.configRoot = json::parse(modelConfigFile);
 
   parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
   parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
+  parseModelConfig(voice.configRoot, voice.modelConfig);
+
+  if (voice.modelConfig.numSpeakers > 1) {
+    // Multispeaker model
+    if (speakerId) {
+      voice.synthesisConfig.speakerId = speakerId;
+    } else {
+      // Default speaker
+      voice.synthesisConfig.speakerId = 0;
+    }
+  }
 
   loadModel(modelPath, voice.session);
 
@@ -83,8 +96,8 @@ void textToWavFile(Voice &voice, string text, ostream &audioFile,
   audioFile.write((const char *)audioBuffer.data(),
                   sizeof(int16_t) * audioBuffer.size());
 
-} /* textToAudio */
+} /* textToWavFile */
 
 } // namespace larynx
 
-#endif // API_H_
+#endif // LARYNX_H_
diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp
index ffb266f..3204b2f 100644
--- a/src/cpp/main.cpp
+++ b/src/cpp/main.cpp
@@ -12,7 +12,7 @@
 #include <pcaudiolib/audio.h>
 #endif
 
-#include "api.hpp"
+#include "larynx.hpp"
 
 using namespace std;
 
@@ -23,6 +23,7 @@ struct RunConfig {
   filesystem::path modelConfigPath;
   OutputType outputType = OUTPUT_PLAY;
   optional<filesystem::path> outputPath;
+  optional<larynx::SpeakerId> speakerId;
 };
 
 void parseArgs(int argc, char *argv[], RunConfig &runConfig);
@@ -36,7 +37,7 @@ int main(int argc, char *argv[]) {
   larynx::Voice voice;
   auto startTime = chrono::steady_clock::now();
   loadVoice(runConfig.modelPath.string(), runConfig.modelConfigPath.string(),
-            voice);
+            voice, runConfig.speakerId);
   auto endTime = chrono::steady_clock::now();
   auto loadSeconds = chrono::duration<double>(endTime - startTime).count();
   cerr << "Load time: " << loadSeconds << " sec" << endl;
@@ -122,9 +123,11 @@ int main(int argc, char *argv[]) {
 
   larynx::terminate();
 
+#ifdef HAVE_PCAUDIO
   audio_object_close(my_audio);
   audio_object_destroy(my_audio);
   my_audio = nullptr;
+#endif
 
   return EXIT_SUCCESS;
 }
@@ -145,6 +148,7 @@ void printUsage(char *argv[]) {
   cerr << "   -d  DIR   --output_dir  DIR   path to output directory (default: "
           "cwd)"
        << endl;
+  cerr << "   -s  NUM   --speaker     NUM   id of speaker (default: 0)" << endl;
   cerr << endl;
 }
 
@@ -182,6 +186,9 @@ void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
       ensureArg(argc, argv, i);
       runConfig.outputType = OUTPUT_DIRECTORY;
       runConfig.outputPath = filesystem::path(argv[++i]);
+    } else if (arg == "-s" || arg == "--speaker") {
+      ensureArg(argc, argv, i);
+      runConfig.speakerId = (larynx::SpeakerId)stoi(argv[++i]);
     } else if (arg == "-h" || arg == "--help") {
       printUsage(argv);
       exit(0);
diff --git a/src/cpp/phonemize.hpp b/src/cpp/phonemize.hpp
index 80c62c6..1c89b53 100644
--- a/src/cpp/phonemize.hpp
+++ b/src/cpp/phonemize.hpp
@@ -10,9 +10,9 @@
 #include <vector>
 
 #include <espeak-ng/speak_lib.h>
-#include <utf8.h>
 
 #include "config.hpp"
+#include "utf8.h"
 
 using namespace std;
 
diff --git a/src/cpp/synthesize.hpp b/src/cpp/synthesize.hpp
index f75d77b..71070e2 100644
--- a/src/cpp/synthesize.hpp
+++ b/src/cpp/synthesize.hpp
@@ -53,6 +53,15 @@ void synthesize(SynthesisConfig &synthesisConfig, ModelSession &session,
       Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
                                       scalesShape.data(), scalesShape.size()));
 
+  // Not inside the "if": the tensor borrows this buffer during inference below
+  vector<int64_t> speakerId{(int64_t)synthesisConfig.speakerId.value_or(0)};
+  vector<int64_t> speakerIdShape{1};
+  if (synthesisConfig.speakerId) {
+    inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
+        memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
+        speakerIdShape.size()));
+  }
+
   // Infer
   auto startTime = chrono::steady_clock::now();
   auto outputTensors =