// piper.hpp (forked from rhasspy/piper)
#ifndef PIPER_H_
#define PIPER_H_
#include <cstdint>
#include <fstream>
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <vector>
#include <onnxruntime_cxx_api.h>
#include <piper-phonemize/phoneme_ids.hpp>
#include <piper-phonemize/phonemize.hpp>
#include <piper-phonemize/tashkeel.hpp>
#include "json.hpp"
// Shorthand for the nlohmann JSON type. Declared at global scope (outside
// namespace piper), so it is visible to every file that includes this header.
using json = nlohmann::json;
namespace piper {
// Integer type used to identify a speaker (64-bit signed).
using SpeakerId = std::int64_t;
// eSpeak-specific settings.
struct eSpeakConfig {
  // Name of the eSpeak voice; defaults to "en-us".
  std::string voice{"en-us"};
};
// Library-wide configuration. An instance is passed by reference to
// initialize(), terminate(), loadVoice() and the textTo* functions.
struct PiperConfig {
  // Path to the eSpeak data; empty by default.
  std::string eSpeakDataPath;

  // Whether eSpeak is used (on by default).
  bool useESpeak{true};

  // Whether tashkeel is used (off by default).
  bool useTashkeel{false};

  // Optional path to the tashkeel model; unset by default.
  std::optional<std::string> tashkeelModelPath;

  // Owned tashkeel state; null by default.
  // NOTE(review): presumably populated by initialize() when useTashkeel is
  // set - confirm against the implementation.
  std::unique_ptr<tashkeel::State> tashkeelState;
};
// Kind of phonemes a voice works with.
enum PhonemeType {
  eSpeakPhonemes = 0,
  TextPhonemes = 1
};
// Settings that control how text is turned into phonemes and phoneme ids.
// Phoneme and PhonemeId are not declared in this header.
struct PhonemizeConfig {
// Which kind of phonemes is used; defaults to eSpeakPhonemes.
PhonemeType phonemeType = eSpeakPhonemes;
// Optional map from one phoneme to a sequence of phonemes; unset by default.
std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;
// Map from a phoneme to the sequence of ids that represents it.
std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;
PhonemeId idPad = 0; // padding (optionally interspersed)
PhonemeId idBos = 1; // beginning of sentence
PhonemeId idEos = 2; // end of sentence
// Whether padding is interspersed (see idPad); on by default.
bool interspersePad = true;
// eSpeak-specific settings.
eSpeakConfig eSpeak;
};
// Per-voice synthesis parameters: inference settings, output audio format,
// speaker selection and extra silence.
struct SynthesisConfig {
  // --- VITS inference settings ---
  float noiseScale{0.667f};
  float lengthScale{1.0f};
  float noiseW{0.8f};

  // --- Audio settings ---
  int sampleRate{22050};
  int sampleWidth{2};  // 16-bit
  int channels{1};     // mono

  // Speaker id, in the range 0 .. numSpeakers - 1; unset by default.
  std::optional<SpeakerId> speakerId;

  // --- Extra silence ---
  // Seconds of silence associated with a sentence.
  float sentenceSilenceSeconds{0.2f};
  // Optional per-phoneme silence, in seconds; unset by default.
  std::optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
};
// Model-level information about the speakers of a voice.
struct ModelConfig {
  // Number of speakers. Defaults to 0 so that a default-constructed
  // ModelConfig never holds an indeterminate value.
  int numSpeakers = 0;

  // speaker name -> id; unset by default.
  std::optional<std::map<std::string, SpeakerId>> speakerIdMap;
};
// Groups the ONNX Runtime objects kept for one model: the session, an
// allocator, the session options and the environment.
//
// Members are destroyed in reverse declaration order, i.e. env first and
// onnx last.
// NOTE(review): confirm that the ONNX Runtime version in use tolerates the
// environment being released before the session.
struct ModelSession {
  Ort::Session onnx;
  Ort::AllocatorWithDefaultOptions allocator;
  Ort::SessionOptions options;
  Ort::Env env;

  // Constructs onnx from nullptr; the other members are default-constructed.
  ModelSession() : onnx(nullptr) {}
};
// Timing figures reported for a synthesis run (filled in through the
// SynthesisResult& parameter of the textTo* functions).
//
// Every field defaults to 0.0 so that a default-constructed result never
// exposes indeterminate values. The struct remains an aggregate.
struct SynthesisResult {
  // NOTE(review): presumably the time spent in model inference, in seconds -
  // confirm against the implementation.
  double inferSeconds = 0.0;

  // NOTE(review): presumably the duration of the produced audio, in seconds -
  // confirm against the implementation.
  double audioSeconds = 0.0;

  // NOTE(review): presumably inferSeconds / audioSeconds - confirm against
  // the implementation.
  double realTimeFactor = 0.0;
};
// Everything held for one voice: the JSON configuration root, the
// phonemization, synthesis and model configurations, and the model session.
// Passed by reference to loadVoice() and the textTo* functions.
struct Voice {
// Root of the voice's JSON configuration.
json configRoot;
// Phonemization settings.
PhonemizeConfig phonemizeConfig;
// Synthesis settings.
SynthesisConfig synthesisConfig;
// Speaker information of the model.
ModelConfig modelConfig;
// ONNX Runtime objects for the model.
ModelSession session;
};
// Report whether a string is a single UTF-8 codepoint.
//   s: the string to examine (passed by value).
// Returns true if the string is a single UTF-8 codepoint.
bool isSingleCodepoint(std::string s);
// Get the first UTF-8 codepoint of a string.
//   s: the string to read from (passed by value).
// Returns the codepoint as a Phoneme (a type not declared in this header).
// NOTE(review): the result for an empty string is not visible from this
// declaration - confirm in the implementation.
Phoneme getCodepoint(std::string s);
// Get the version of Piper, returned as a string.
std::string getVersion();
// Initialize the library. Must be called before using the textTo* functions.
//   config: library-wide configuration, passed by non-const reference.
// NOTE(review): presumably sets up eSpeak and/or tashkeel according to the
// flags in config - confirm in the implementation.
void initialize(PiperConfig &config);
// Clean up.
//   config: library-wide configuration, passed by non-const reference.
// NOTE(review): presumably the counterpart of initialize() - confirm in the
// implementation.
void terminate(PiperConfig &config);
// Load an Onnx model and its JSON config file.
//   config:          library-wide configuration (non-const reference).
//   modelPath:       path to the Onnx model (passed by value).
//   modelConfigPath: path to the JSON config file (passed by value).
//   voice:           voice object passed by non-const reference.
//                    NOTE(review): presumably filled in with the loaded
//                    model and configuration - confirm.
//   speakerId:       optional speaker id, passed by non-const reference.
//                    NOTE(review): whether it is only read or also written
//                    is not visible from this declaration - confirm.
void loadVoice(PiperConfig &config, std::string modelPath,
std::string modelConfigPath, Voice &voice,
std::optional<SpeakerId> &speakerId);
// Phonemize text and synthesize audio.
// initialize() must have been called first.
//   config:        library-wide configuration (non-const reference).
//   voice:         the voice to use (non-const reference).
//   text:          the input text (passed by value).
//   audioBuffer:   buffer of 16-bit signed samples, passed by non-const
//                  reference.
//                  NOTE(review): presumably receives the synthesized audio;
//                  whether it is appended to or overwritten is not visible
//                  here - confirm.
//   result:        timing figures, passed by non-const reference.
//   audioCallback: callable taking no arguments and returning nothing.
//                  NOTE(review): presumably invoked when audio is available
//                  in audioBuffer; whether an empty std::function is
//                  accepted is not visible here - confirm.
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
const std::function<void()> &audioCallback);
// Phonemize text and synthesize audio to a WAV file.
// initialize() must have been called first.
//   config:    library-wide configuration (non-const reference).
//   voice:     the voice to use (non-const reference).
//   text:      the input text (passed by value).
//   audioFile: output stream for the WAV data, passed by non-const reference.
//   result:    timing figures, passed by non-const reference.
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
std::ostream &audioFile, SynthesisResult &result);
} // namespace piper
#endif // PIPER_H_