#ifndef SYNTHESIZE_H_ #define SYNTHESIZE_H_ #include #include #include #include #include #include #include "config.hpp" #include "model.hpp" using namespace std; namespace piper { // Maximum value for 16-bit signed WAV sample const float MAX_WAV_VALUE = 32767.0f; struct SynthesisResult { double inferSeconds; double audioSeconds; double realTimeFactor; }; // Phoneme ids to WAV audio void synthesize(vector &phonemeIds, SynthesisConfig &synthesisConfig, ModelSession &session, vector &audioBuffer, SynthesisResult &result) { auto memoryInfo = Ort::MemoryInfo::CreateCpu( OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); // Allocate vector phonemeIdLengths{(int64_t)phonemeIds.size()}; vector scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale, synthesisConfig.noiseW}; vector inputTensors; vector phonemeIdsShape{1, (int64_t)phonemeIds.size()}; inputTensors.push_back(Ort::Value::CreateTensor( memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(), phonemeIdsShape.size())); vector phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()}; inputTensors.push_back(Ort::Value::CreateTensor( memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(), phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size())); vector scalesShape{(int64_t)scales.size()}; inputTensors.push_back( Ort::Value::CreateTensor(memoryInfo, scales.data(), scales.size(), scalesShape.data(), scalesShape.size())); // Add speaker id. // NOTE: These must be kept outside the "if" below to avoid being deallocated. vector speakerId{(int64_t)synthesisConfig.speakerId.value_or(0)}; vector speakerIdShape{(int64_t)speakerId.size()}; if (synthesisConfig.speakerId) { inputTensors.push_back(Ort::Value::CreateTensor( memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(), speakerIdShape.size())); } // From export_onnx.py array inputNames = {"input", "input_lengths", "scales", "sid"}; array outputNames = {"output"}; // Infer auto startTime = chrono::steady_clock::now(); auto outputTensors = session.onnx.Run( Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(), inputTensors.size(), outputNames.data(), outputNames.size()); auto endTime = chrono::steady_clock::now(); if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) { throw runtime_error("Invalid output tensors"); } auto inferDuration = chrono::duration(endTime - startTime); result.inferSeconds = inferDuration.count(); const float *audio = outputTensors.front().GetTensorData(); auto audioShape = outputTensors.front().GetTensorTypeAndShapeInfo().GetShape(); int64_t audioCount = audioShape[audioShape.size() - 1]; result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate; result.realTimeFactor = 0.0; if (result.audioSeconds > 0) { result.realTimeFactor = result.inferSeconds / result.audioSeconds; } // Get max audio value for scaling float maxAudioValue = 0.01f; for (int64_t i = 0; i < audioCount; i++) { float audioValue = abs(audio[i]); if (audioValue > maxAudioValue) { maxAudioValue = audioValue; } } // We know the size up front audioBuffer.reserve(audioCount); // Scale audio to fill range and convert to int16 float audioScale = (MAX_WAV_VALUE / max(0.01f, maxAudioValue)); for (int64_t i = 0; i < audioCount; i++) { int16_t intAudioValue = static_cast( clamp(audio[i] * audioScale, static_cast(numeric_limits::min()), static_cast(numeric_limits::max()))); audioBuffer.push_back(intAudioValue); } // Clean up for (size_t i = 0; i < outputTensors.size(); i++) { Ort::detail::OrtRelease(outputTensors[i].release()); } for (size_t i = 0; i < inputTensors.size(); i++) { Ort::detail::OrtRelease(inputTensors[i].release()); } } } // namespace piper #endif // SYNTHESIZE_H_