[project-s] Add humming API (#1008)
* update metas (add style type)

* update engine manifest (add frame rate)

* add sing api to core wrapper

* add sing api to core adapter

* add models for sing api

* add sing process to tts engine

* add sing api

* fix miss

* add fixme comment

Co-authored-by: Hiroshiba <[email protected]>

* remove sing type

* fix typo

* remove optional

* translate error detail

* get -> create

* fix docs

* Revert "remove optional"

This reverts commit 12b8fc6.

* fix pytest

* add comment

* add fixme comment

Co-authored-by: Hiroshiba <[email protected]>

* improve models

---------

Co-authored-by: Hiroshiba <[email protected]>
y-chan and Hiroshiba authored Jan 22, 2024
1 parent 8d23bd3 commit 7bc1b21
Showing 10 changed files with 627 additions and 17 deletions.
1 change: 1 addition & 0 deletions engine_manifest.json
@@ -9,6 +9,7 @@
"port": 50021,
"icon": "engine_manifest_assets/icon.png",
"default_sampling_rate": 24000,
"frame_rate": 93.75,
"terms_of_service": "engine_manifest_assets/terms_of_service.md",
"update_infos": "engine_manifest_assets/update_infos.json",
"dependency_licenses": "engine_manifest_assets/dependency_licenses.json",
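
The added frame_rate of 93.75 is consistent with the manifest's default_sampling_rate of 24000 Hz if the singing models use a 256-sample hop. The hop size is an assumption not stated in this diff, so the check below is only a plausibility sketch.

# Plausibility check for frame_rate = 93.75 (the 256-sample hop is an assumption).
default_sampling_rate = 24000
assumed_hop_length = 256
assert default_sampling_rate / assumed_hop_length == 93.75
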
73 changes: 73 additions & 0 deletions run.py
@@ -42,10 +42,12 @@
AudioQuery,
BaseLibraryInfo,
DownloadableLibraryInfo,
FrameAudioQuery,
InstalledLibraryInfo,
MorphableTargetInfo,
ParseKanaBadRequest,
ParseKanaError,
Score,
Speaker,
SpeakerInfo,
StyleIdNotFoundError,
@@ -704,6 +706,77 @@ def _synthesis_morphing(
background=BackgroundTask(delete_file, f.name),
)

@app.post(
"/sing_frame_audio_query",
response_model=FrameAudioQuery,
tags=["クエリ作成"],
summary="歌唱音声合成用のクエリを作成する",
)
def sing_frame_audio_query(
score: Score,
style_id: StyleId | None = Query(default=None), # noqa: B008
speaker: StyleId | None = Query(default=None, deprecated=True), # noqa: B008
core_version: str | None = None,
) -> FrameAudioQuery:
"""
    Returns the initial values of a query for singing synthesis. The query obtained here can be used directly for singing synthesis. See `Schemas` for the meaning of each value.
"""
style_id = get_style_id_from_deprecated(
style_id=style_id, deprecated_speaker=speaker
)
engine = get_engine(core_version)
core = get_core(core_version)
phonemes, f0, volume = engine.create_sing_phoneme_and_f0_and_volume(
score, style_id
)

return FrameAudioQuery(
f0=f0,
volume=volume,
phonemes=phonemes,
volumeScale=1,
outputSamplingRate=core.default_sampling_rate,
outputStereo=False,
)
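
For reference, below is a Pydantic-style sketch of the response model as it can be inferred from the FrameAudioQuery(...) constructor call above. The fields of FramePhoneme are an assumption, since models.py itself is not part of this excerpt.

# Sketch of the response model, inferred from the constructor call above.
# FramePhoneme's fields are an assumption (models.py is not shown in this diff).
from pydantic import BaseModel

class FramePhoneme(BaseModel):
    phoneme: str       # phoneme label
    frame_length: int  # duration in frames (93.75 frames per second per the manifest)

class FrameAudioQuery(BaseModel):
    f0: list[float]               # per-frame fundamental frequency
    volume: list[float]           # per-frame volume
    phonemes: list[FramePhoneme]  # frame-aligned phoneme sequence
    volumeScale: float            # overall gain; initialized to 1 above
    outputSamplingRate: int       # taken from the core's default sampling rate
    outputStereo: bool            # False for the singing path
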

@app.post(
"/frame_synthesis",
response_class=FileResponse,
responses={
200: {
"content": {
"audio/wav": {"schema": {"type": "string", "format": "binary"}}
},
}
},
tags=["音声合成"],
)
def frame_synthesis(
query: FrameAudioQuery,
style_id: StyleId | None = Query(default=None), # noqa: B008
speaker: StyleId | None = Query(default=None, deprecated=True), # noqa: B008
core_version: str | None = None,
) -> FileResponse:
"""
    Performs singing synthesis.
"""
style_id = get_style_id_from_deprecated(
style_id=style_id, deprecated_speaker=speaker
)
engine = get_engine(core_version)
wave = engine.frame_synthsize_wave(query, style_id)

with NamedTemporaryFile(delete=False) as f:
soundfile.write(
file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV"
)

return FileResponse(
f.name,
media_type="audio/wav",
background=BackgroundTask(delete_file, f.name),
)

@app.post(
"/connect_waves",
response_class=FileResponse,
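
For orientation, here is a minimal client sketch exercising the two endpoints added to run.py above. The port comes from engine_manifest.json; the style id and the shape of the Score body (notes with key, frame_length, and lyric, padded with rests) are assumptions not confirmed by this excerpt.

# Minimal client sketch for the new singing endpoints. The port is taken from
# engine_manifest.json above; the style id and the Score layout are assumptions.
import requests

BASE = "http://127.0.0.1:50021"
STYLE_ID = 3001  # hypothetical singing style id

score = {
    "notes": [
        {"key": None, "frame_length": 15, "lyric": ""},  # leading rest
        {"key": 60, "frame_length": 45, "lyric": "ど"},
        {"key": 62, "frame_length": 45, "lyric": "れ"},
        {"key": None, "frame_length": 15, "lyric": ""},  # trailing rest
    ]
}

# 1. Build a FrameAudioQuery from the score.
query = requests.post(
    f"{BASE}/sing_frame_audio_query", json=score, params={"style_id": STYLE_ID}
).json()

# 2. Render the query to a WAV file.
wav = requests.post(
    f"{BASE}/frame_synthesis", json=query, params={"style_id": STYLE_ID}
).content
with open("sing.wav", "wb") as f:
    f.write(wav)
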
@@ -5,19 +5,23 @@
"styles": [
{
"id": 0,
"name": "style0"
"name": "style0",
"type": null
},
{
"id": 2,
"name": "style1"
"name": "style1",
"type": null
},
{
"id": 4,
"name": "style2"
"name": "style2",
"type": null
},
{
"id": 6,
"name": "style3"
"name": "style3",
"type": null
}
],
"supported_features": {
@@ -31,19 +35,23 @@
"styles": [
{
"id": 1,
"name": "style0"
"name": "style0",
"type": null
},
{
"id": 3,
"name": "style1"
"name": "style1",
"type": null
},
{
"id": 5,
"name": "style2"
"name": "style2",
"type": null
},
{
"id": 7,
"name": "style3"
"name": "style3",
"type": null
}
],
"supported_features": {
@@ -57,7 +65,8 @@
"styles": [
{
"id": 8,
"name": "style0"
"name": "style0",
"type": null
}
],
"supported_features": {
@@ -71,7 +80,8 @@
"styles": [
{
"id": 9,
"name": "style0"
"name": "style0",
"type": null
}
],
"supported_features": {
81 changes: 81 additions & 0 deletions voicevox_engine/core_adapter.py
@@ -143,3 +143,84 @@ def safe_decode_forward(
)
sr_wave = self.default_sampling_rate
return wave, sr_wave

def safe_predict_sing_consonant_length_forward(
self,
consonant: NDArray[np.int64],
vowel: NDArray[np.int64],
note_duration: NDArray[np.int64],
style_id: StyleId,
) -> NDArray[np.int64]:
        # Provides: initialization of the specified style, mutex-based safety, silence padding per the core specification, and adaptation of sequence lengths and dtypes
self.initialize_style_id_synthesis(style_id, skip_reinit=True)

with self.mutex:
consonant_length = self.core.predict_sing_consonant_length_forward(
length=consonant.shape[0],
consonant=consonant[np.newaxis],
vowel=vowel[np.newaxis],
note_duration=note_duration[np.newaxis],
style_id=np.array(style_id, dtype=np.int64).reshape(-1),
)

return consonant_length

def safe_predict_sing_f0_forward(
self,
phoneme: NDArray[np.int64],
note: NDArray[np.int64],
style_id: StyleId,
) -> NDArray[np.float32]:
        # Provides: initialization of the specified style, mutex-based safety, silence padding per the core specification, and adaptation of sequence lengths and dtypes
self.initialize_style_id_synthesis(style_id, skip_reinit=True)

with self.mutex:
f0 = self.core.predict_sing_f0_forward(
length=phoneme.shape[0],
phoneme=phoneme[np.newaxis],
note=note[np.newaxis],
style_id=np.array(style_id, dtype=np.int64).reshape(-1),
)

return f0

def safe_predict_sing_volume_forward(
self,
phoneme: NDArray[np.int64],
note: NDArray[np.int64],
f0: NDArray[np.float32],
style_id: StyleId,
) -> NDArray[np.float32]:
        # Provides: initialization of the specified style, mutex-based safety, silence padding per the core specification, and adaptation of sequence lengths and dtypes
self.initialize_style_id_synthesis(style_id, skip_reinit=True)

with self.mutex:
volume = self.core.predict_sing_volume_forward(
length=phoneme.shape[0],
phoneme=phoneme[np.newaxis],
note=note[np.newaxis],
f0=f0[np.newaxis],
style_id=np.array(style_id, dtype=np.int64).reshape(-1),
)

return volume

def safe_sf_decode_forward(
self,
phoneme: NDArray[np.int64],
f0: NDArray[np.float32],
volume: NDArray[np.float32],
style_id: StyleId,
) -> tuple[NDArray[np.float32], int]:
        # Provides: initialization of the specified style, mutex-based safety, and adaptation of sequence lengths and dtypes
self.initialize_style_id_synthesis(style_id, skip_reinit=True)
with self.mutex:
wave = self.core.sf_decode_forward(
length=phoneme.shape[0],
phoneme=phoneme[np.newaxis],
f0=f0[np.newaxis],
volume=volume[np.newaxis],
style_id=np.array(style_id, dtype=np.int64).reshape(-1),
)
sr_wave = self.default_sampling_rate
return wave, sr_wave
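
The signatures above already suggest how the engine layer would chain these methods: f0 is predicted from frame-level phoneme and note ids, volume is conditioned on that f0, and sf_decode turns the three frame-level arrays into a waveform. The sketch below spells that order out; the surrounding engine layer, and the step that expands note-level phonemes into frame-level arrays using the predicted consonant lengths, are assumptions not shown in this file.

# Sketch of how the new adapter methods might be chained by the engine layer.
# The expansion of note-level phonemes into frame-level arrays (driven by the
# predicted consonant lengths) is omitted here and is an assumption.
import numpy as np
from numpy.typing import NDArray

def sketch_sing_wave(
    adapter,                     # a CoreAdapter instance
    phoneme: NDArray[np.int64],  # frame-level phoneme ids
    note: NDArray[np.int64],     # frame-level note (MIDI-like) ids
    style_id: int,
) -> tuple[NDArray[np.float32], int]:
    # Frame-level pitch from phoneme and note ids.
    f0 = adapter.safe_predict_sing_f0_forward(
        phoneme=phoneme, note=note, style_id=style_id
    )
    # Frame-level volume, conditioned on the predicted pitch.
    volume = adapter.safe_predict_sing_volume_forward(
        phoneme=phoneme, note=note, f0=f0, style_id=style_id
    )
    # Vocode to a waveform at the core's default sampling rate.
    return adapter.safe_sf_decode_forward(
        phoneme=phoneme, f0=f0, volume=volume, style_id=style_id
    )
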
