Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[project-s] ハミング用APIを追加 #1008

Merged
merged 20 commits into from
Jan 22, 2024
Merged
1 change: 1 addition & 0 deletions engine_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"port": 50021,
"icon": "engine_manifest_assets/icon.png",
"default_sampling_rate": 24000,
"frame_rate": 93.75,
"terms_of_service": "engine_manifest_assets/terms_of_service.md",
"update_infos": "engine_manifest_assets/update_infos.json",
"dependency_licenses": "engine_manifest_assets/dependency_licenses.json",
Expand Down
73 changes: 73 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,12 @@
AudioQuery,
BaseLibraryInfo,
DownloadableLibraryInfo,
FrameAudioQuery,
InstalledLibraryInfo,
MorphableTargetInfo,
ParseKanaBadRequest,
ParseKanaError,
Score,
Speaker,
SpeakerInfo,
StyleIdNotFoundError,
Expand Down Expand Up @@ -704,6 +706,77 @@ def _synthesis_morphing(
background=BackgroundTask(delete_file, f.name),
)

@app.post(
"/sing_frame_audio_query",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

判断メモです。
ここをsingにすべきかsongにすべきかですごく迷うのですが、これは「歌うためのクエリ」であり、「歌のクエリ」ではないので、singが合っているのかなと思いました。

もし仮に名詞にするならsongではなくsong_synthesisとかかなと。長いのでsingで良さそう。

response_model=FrameAudioQuery,
tags=["クエリ作成"],
summary="歌唱音声合成用のクエリを作成する",
)
def sing_frame_audio_query(
    score: Score,
    style_id: StyleId | None = Query(default=None),  # noqa: B008
    speaker: StyleId | None = Query(default=None, deprecated=True),  # noqa: B008
    core_version: str | None = None,
) -> FrameAudioQuery:
    """
    歌唱音声合成用のクエリの初期値を得ます。ここで得られたクエリはそのまま歌唱音声合成に利用できます。各値の意味は`Schemas`を参照してください。
    """
    # English gloss of the docstring above: "Returns the initial values of a
    # query for singing voice synthesis. The query obtained here can be used
    # for singing voice synthesis as-is. See `Schemas` for the meaning of each
    # value."
    # NOTE(review): the docstring is left in Japanese on purpose; presumably
    # the web framework publishes it as the endpoint description, which would
    # make it user-facing text. Confirm before translating it.

    # Collapse the current `style_id` and the deprecated `speaker` query
    # parameter into the single id used below.
    resolved_style_id = get_style_id_from_deprecated(
        style_id=style_id, deprecated_speaker=speaker
    )

    # Both lookups are keyed by the same optional `core_version`.
    engine = get_engine(core_version)
    core = get_core(core_version)

    # The engine returns a (phonemes, f0, volume) triple for the score.
    sing_features = engine.get_sing_phoneme_and_f0_and_volume(
        score, resolved_style_id
    )
    phonemes, f0, volume = sing_features

    # The remaining fields are fixed initial values: unit volume scale, the
    # core's default sampling rate, and mono output.
    initial_query = FrameAudioQuery(
        phonemes=phonemes,
        f0=f0,
        volume=volume,
        volumeScale=1,
        outputSamplingRate=core.default_sampling_rate,
        outputStereo=False,
    )
    return initial_query

@app.post(
    "/frame_synthesis",
    response_class=FileResponse,
    responses={
        200: {
            "content": {
                "audio/wav": {"schema": {"type": "string", "format": "binary"}}
            },
        }
    },
    tags=["音声合成"],
)
def frame_synthesis(
    query: FrameAudioQuery,
    style_id: StyleId | None = Query(default=None),  # noqa: B008
    speaker: StyleId | None = Query(default=None, deprecated=True),  # noqa: B008
    core_version: str | None = None,
) -> FileResponse:
    """
    歌唱音声合成を行います。
    """
    # English gloss of the docstring above: "Performs singing voice synthesis."
    # NOTE(review): the docstring is left in Japanese on purpose; presumably
    # the web framework publishes it as the endpoint description, so it must
    # contain only the description text. Confirm before translating it.

    # Collapse the current `style_id` and the deprecated `speaker` query
    # parameter into the single id used below.
    style_id = get_style_id_from_deprecated(
        style_id=style_id, deprecated_speaker=speaker
    )
    engine = get_engine(core_version)
    wave = engine.frame_synthsize_wave(query, style_id)

    # delete=False keeps the file on disk after the `with` block closes it,
    # so the response below can still read it by name.
    with NamedTemporaryFile(delete=False) as f:
        soundfile.write(
            file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV"
        )

    # The temporary file is handed to `delete_file` through a background task
    # attached to the response (presumably run once the response has been
    # sent -- confirm against the framework documentation).
    return FileResponse(
        f.name,
        media_type="audio/wav",
        background=BackgroundTask(delete_file, f.name),
    )

@app.post(
"/connect_waves",
response_class=FileResponse,
Expand Down
81 changes: 81 additions & 0 deletions voicevox_engine/core_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,84 @@ def safe_decode_forward(
)
sr_wave = self.default_sampling_rate
return wave, sr_wave

def safe_predict_sing_consonant_length_forward(
    self,
    consonant: NDArray[np.int64],
    vowel: NDArray[np.int64],
    note_duration: NDArray[np.int64],
    style_id: StyleId,
) -> NDArray[np.int64]:
    """Call the core's sing consonant-length predictor through the adapter.

    This wrapper makes sure the style is initialized, serializes the core
    call with ``self.mutex``, and adapts the arguments to the form passed
    to the core: an explicit ``length``, a leading axis on every sequence,
    and ``style_id`` as a flattened int64 array.

    Args:
        consonant: Consonant sequence; its first-axis size is sent as
            ``length``.
        vowel: Vowel sequence.
        note_duration: Note-duration sequence.
        style_id: Style to initialize and to predict with.

    Returns:
        The value returned by the core call, unmodified.
    """
    # skip_reinit=True is forwarded as-is; the initializer decides what to do
    # with a style that is already loaded.
    self.initialize_style_id_synthesis(style_id, skip_reinit=True)

    sequence_length = consonant.shape[0]
    style_id_array = np.array(style_id, dtype=np.int64).reshape(-1)

    # Only one thread at a time may drive the core.
    with self.mutex:
        return self.core.predict_sing_consonant_length_forward(
            length=sequence_length,
            consonant=consonant[np.newaxis],
            vowel=vowel[np.newaxis],
            note_duration=note_duration[np.newaxis],
            style_id=style_id_array,
        )

def safe_predict_sing_f0_forward(
    self,
    phoneme: NDArray[np.int64],
    note: NDArray[np.int64],
    style_id: StyleId,
) -> NDArray[np.float32]:
    """Call the core's sing f0 predictor through the adapter.

    This wrapper makes sure the style is initialized, serializes the core
    call with ``self.mutex``, and adapts the arguments to the form passed
    to the core: an explicit ``length``, a leading axis on every sequence,
    and ``style_id`` as a flattened int64 array.

    Args:
        phoneme: Phoneme sequence; its first-axis size is sent as
            ``length``.
        note: Note sequence.
        style_id: Style to initialize and to predict with.

    Returns:
        The value returned by the core call, unmodified.
    """
    # skip_reinit=True is forwarded as-is; the initializer decides what to do
    # with a style that is already loaded.
    self.initialize_style_id_synthesis(style_id, skip_reinit=True)

    sequence_length = phoneme.shape[0]
    style_id_array = np.array(style_id, dtype=np.int64).reshape(-1)

    # Only one thread at a time may drive the core.
    with self.mutex:
        return self.core.predict_sing_f0_forward(
            length=sequence_length,
            phoneme=phoneme[np.newaxis],
            note=note[np.newaxis],
            style_id=style_id_array,
        )

def safe_predict_sing_volume_forward(
    self,
    phoneme: NDArray[np.int64],
    note: NDArray[np.int64],
    f0: NDArray[np.float32],
    style_id: StyleId,
) -> NDArray[np.float32]:
    """Call the core's sing volume predictor through the adapter.

    This wrapper makes sure the style is initialized, serializes the core
    call with ``self.mutex``, and adapts the arguments to the form passed
    to the core: an explicit ``length``, a leading axis on every sequence,
    and ``style_id`` as a flattened int64 array.

    Args:
        phoneme: Phoneme sequence; its first-axis size is sent as
            ``length``.
        note: Note sequence.
        f0: f0 sequence.
        style_id: Style to initialize and to predict with.

    Returns:
        The value returned by the core call, unmodified.
    """
    # skip_reinit=True is forwarded as-is; the initializer decides what to do
    # with a style that is already loaded.
    self.initialize_style_id_synthesis(style_id, skip_reinit=True)

    sequence_length = phoneme.shape[0]
    style_id_array = np.array(style_id, dtype=np.int64).reshape(-1)

    # Only one thread at a time may drive the core.
    with self.mutex:
        return self.core.predict_sing_volume_forward(
            length=sequence_length,
            phoneme=phoneme[np.newaxis],
            note=note[np.newaxis],
            f0=f0[np.newaxis],
            style_id=style_id_array,
        )

def safe_sf_decode_forward(
    self,
    phoneme: NDArray[np.int64],
    f0: NDArray[np.float32],
    volume: NDArray[np.float32],
    style_id: StyleId,
) -> tuple[NDArray[np.float32], int]:
    """Call the core's ``sf_decode_forward`` through the adapter.

    This wrapper makes sure the style is initialized, serializes the core
    call with ``self.mutex``, and adapts the arguments to the form passed
    to the core: an explicit ``length``, a leading axis on every sequence,
    and ``style_id`` as a flattened int64 array.

    Args:
        phoneme: Phoneme sequence; its first-axis size is sent as
            ``length``.
        f0: f0 sequence.
        volume: Volume sequence.
        style_id: Style to initialize and to decode with.

    Returns:
        A ``(wave, sampling_rate)`` pair: the value returned by the core
        call, unmodified, and ``self.default_sampling_rate``.
    """
    # skip_reinit=True is forwarded as-is; the initializer decides what to do
    # with a style that is already loaded.
    self.initialize_style_id_synthesis(style_id, skip_reinit=True)

    sequence_length = phoneme.shape[0]
    style_id_array = np.array(style_id, dtype=np.int64).reshape(-1)

    # Only one thread at a time may drive the core.
    with self.mutex:
        decoded_wave = self.core.sf_decode_forward(
            length=sequence_length,
            phoneme=phoneme[np.newaxis],
            f0=f0[np.newaxis],
            volume=volume[np.newaxis],
            style_id=style_id_array,
        )

    # The adapter's default sampling rate is reported alongside the wave.
    return decoded_wave, self.default_sampling_rate
Loading
Loading