Initial commit

This commit is contained in:
Michael Hansen
2022-10-21 16:56:34 -05:00
commit 14696f2960
18 changed files with 26583 additions and 0 deletions

115
src/cpp/synthesize.hpp Normal file
View File

@@ -0,0 +1,115 @@
#ifndef SYNTHESIZE_H_
#define SYNTHESIZE_H_
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <limits>
#include <memory>
#include <stdexcept>
#include <vector>
#include <onnxruntime_cxx_api.h>
#include "config.hpp"
#include "model.hpp"
using namespace std;
namespace larynx {
// Maximum value for 16-bit signed WAV sample
const float MAX_WAV_VALUE = 32767.0f;
struct SynthesisResult {
double inferSeconds;
double audioSeconds;
double realTimeFactor;
};
// Phoneme ids to WAV audio
void synthesize(SynthesisConfig &synthesisConfig, ModelSession &session,
vector<int16_t> &audioBuffer, SynthesisResult &result) {
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
// Allocate
vector<int64_t> phonemeIdLengths{(int64_t)synthesisConfig.phonemeIds.size()};
vector<float> scales{synthesisConfig.noiseScale, synthesisConfig.lengthScale,
synthesisConfig.noiseW};
vector<Ort::Value> inputTensors;
vector<int64_t> phonemeIdsShape{1,
(int64_t)synthesisConfig.phonemeIds.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, synthesisConfig.phonemeIds.data(),
synthesisConfig.phonemeIds.size(), phonemeIdsShape.data(),
phonemeIdsShape.size()));
vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
vector<int64_t> scalesShape{(int64_t)scales.size()};
inputTensors.push_back(
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
scalesShape.data(), scalesShape.size()));
// Infer
auto startTime = chrono::steady_clock::now();
auto outputTensors =
session.onnx.Run(Ort::RunOptions{nullptr}, session.inputNames.data(),
inputTensors.data(), inputTensors.size(),
session.outputNames.data(), session.outputNames.size());
auto endTime = chrono::steady_clock::now();
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
throw runtime_error("Invalid output tensors");
}
auto inferDuration = chrono::duration<double>(endTime - startTime);
result.inferSeconds = inferDuration.count();
const float *audio = outputTensors.front().GetTensorData<float>();
auto audioShape =
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
int64_t audioCount = audioShape[audioShape.size() - 1];
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
result.realTimeFactor = 0.0;
if (result.audioSeconds > 0) {
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
}
// Get max audio value for scaling
float maxAudioValue = 0.01f;
for (int64_t i = 0; i < audioCount; i++) {
float audioValue = abs(audio[i]);
if (audioValue > maxAudioValue) {
maxAudioValue = audioValue;
}
}
// We know the size up front
audioBuffer.reserve(audioCount);
// Scale audio to fill range and convert to int16
float audioScale = (MAX_WAV_VALUE / max(0.01f, maxAudioValue));
for (int64_t i = 0; i < audioCount; i++) {
int16_t intAudioValue = static_cast<int16_t>(
clamp(audio[i] * audioScale,
static_cast<float>(numeric_limits<int16_t>::min()),
static_cast<float>(numeric_limits<int16_t>::max())));
audioBuffer.push_back(intAudioValue);
}
// Clean up
for (size_t i = 0; i < outputTensors.size(); i++) {
Ort::OrtRelease(outputTensors[i].release());
}
for (size_t i = 0; i < inputTensors.size(); i++) {
Ort::OrtRelease(inputTensors[i].release());
}
}
} // namespace larynx
#endif // SYNTHESIZE_H_