[project-s] Add humming API (#1008)
* update metas (add style type)

* update engine manifest (add frame rate)

* add sing api to core wrapper

* add sing api to core adapter

* add models for sing api

* add sing process to tts engine

* add sing api

* fix miss

* add fixme comment

Co-authored-by: Hiroshiba <[email protected]>

* remove sing type

* fix typo

* remove optional

* translate error detail

* get -> create

* fix docs

* Revert "remove optional"

This reverts commit 12b8fc6.

* fix pytest

* add comment

* add fixme comment

Co-authored-by: Hiroshiba <[email protected]>

* improve models

---------

Co-authored-by: Hiroshiba <[email protected]>
y-chan and Hiroshiba authored Jan 22, 2024
1 parent 8d23bd3 commit 7bc1b21
Showing 10 changed files with 627 additions and 17 deletions.
1 change: 1 addition & 0 deletions engine_manifest.json
@@ -9,6 +9,7 @@
"port": 50021,
"icon": "engine_manifest_assets/icon.png",
"default_sampling_rate": 24000,
"frame_rate": 93.75,
"terms_of_service": "engine_manifest_assets/terms_of_service.md",
"update_infos": "engine_manifest_assets/update_infos.json",
"dependency_licenses": "engine_manifest_assets/dependency_licenses.json",
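
The added frame_rate of 93.75 is consistent with the manifest's default_sampling_rate of 24000 Hz if the singing models use a 256-sample hop. The hop size is an assumption not stated in this diff, so the check below is only a plausibility sketch.

# Plausibility check for frame_rate = 93.75 (the 256-sample hop is an assumption).
default_sampling_rate = 24000
assumed_hop_length = 256
assert default_sampling_rate / assumed_hop_length == 93.75
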
73 changes: 73 additions & 0 deletions run.py
@@ -42,10 +42,12 @@
AudioQuery,
BaseLibraryInfo,
DownloadableLibraryInfo,
FrameAudioQuery,
InstalledLibraryInfo,
MorphableTargetInfo,
ParseKanaBadRequest,
ParseKanaError,
Score,
Speaker,
SpeakerInfo,
StyleIdNotFoundError,
@@ -704,6 +706,77 @@ def _synthesis_morphing(
background=BackgroundTask(delete_file, f.name),
)

@app.post(
"/sing_frame_audio_query",
response_model=FrameAudioQuery,
tags=["クエリ作成"],
summary="歌唱音声合成用のクエリを作成する",
)
def sing_frame_audio_query(
score: Score,
style_id: StyleId | None = Query(default=None), # noqa: B008
speaker: StyleId | None = Query(default=None, deprecated=True), # noqa: B008
core_version: str | None = None,
) -> FrameAudioQuery:
"""
    Returns the initial values of a query for singing synthesis. The query obtained here can be used directly for singing synthesis. See `Schemas` for the meaning of each value.
"""
style_id = get_style_id_from_deprecated(
style_id=style_id, deprecated_speaker=speaker
)
engine = get_engine(core_version)
core = get_core(core_version)
phonemes, f0, volume = engine.create_sing_phoneme_and_f0_and_volume(
score, style_id
)

return FrameAudioQuery(
f0=f0,
volume=volume,
phonemes=phonemes,
volumeScale=1,
outputSamplingRate=core.default_sampling_rate,
outputStereo=False,
)
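
For reference, below is a Pydantic-style sketch of the response model as it can be inferred from the FrameAudioQuery(...) constructor call above. The fields of FramePhoneme are an assumption, since models.py itself is not part of this excerpt.

# Sketch of the response model, inferred from the constructor call above.
# FramePhoneme's fields are an assumption (models.py is not shown in this diff).
from pydantic import BaseModel

class FramePhoneme(BaseModel):
    phoneme: str       # phoneme label
    frame_length: int  # duration in frames (93.75 frames per second per the manifest)

class FrameAudioQuery(BaseModel):
    f0: list[float]               # per-frame fundamental frequency
    volume: list[float]           # per-frame volume
    phonemes: list[FramePhoneme]  # frame-aligned phoneme sequence
    volumeScale: float            # overall gain; initialized to 1 above
    outputSamplingRate: int       # taken from the core's default sampling rate
    outputStereo: bool            # False for the singing path
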

@app.post(
"/frame_synthesis",
response_class=FileResponse,
responses={
200: {
"content": {
"audio/wav": {"schema": {"type": "string", "format": "binary"}}
},
}
},
tags=["音声合成"],
)
def frame_synthesis(
query: FrameAudioQuery,
style_id: StyleId | None = Query(default=None), # noqa: B008
speaker: StyleId | None = Query(default=None, deprecated=True), # noqa: B008
core_version: str | None = None,
) -> FileResponse:
"""
    Performs singing synthesis.
"""
style_id = get_style_id_from_deprecated(
style_id=style_id, deprecated_speaker=speaker
)
engine = get_engine(core_version)
wave = engine.frame_synthsize_wave(query, style_id)

with NamedTemporaryFile(delete=False) as f:
soundfile.write(
file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV"
)

return FileResponse(
f.name,
media_type="audio/wav",
background=BackgroundTask(delete_file, f.name),
)

@app.post(
"/connect_waves",
response_class=FileResponse,
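
For orientation, here is a minimal client sketch exercising the two endpoints added to run.py above. The port comes from engine_manifest.json; the style id and the shape of the Score body (notes with key, frame_length, and lyric, padded with rests) are assumptions not confirmed by this excerpt.

# Minimal client sketch for the new singing endpoints. The port is taken from
# engine_manifest.json above; the style id and the Score layout are assumptions.
import requests

BASE = "http://127.0.0.1:50021"
STYLE_ID = 3001  # hypothetical singing style id

score = {
    "notes": [
        {"key": None, "frame_length": 15, "lyric": ""},  # leading rest
        {"key": 60, "frame_length": 45, "lyric": "ど"},
        {"key": 62, "frame_length": 45, "lyric": "れ"},
        {"key": None, "frame_length": 15, "lyric": ""},  # trailing rest
    ]
}

# 1. Build a FrameAudioQuery from the score.
query = requests.post(
    f"{BASE}/sing_frame_audio_query", json=score, params={"style_id": STYLE_ID}
).json()

# 2. Render the query to a WAV file.
wav = requests.post(
    f"{BASE}/frame_synthesis", json=query, params={"style_id": STYLE_ID}
).content
with open("sing.wav", "wb") as f:
    f.write(wav)
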
@@ -5,19 +5,23 @@
"styles": [
{
"id": 0,
"name": "style0"
"name": "style0",
"type": null
},
{
"id": 2,
"name": "style1"
"name": "style1",
"type": null
},
{
"id": 4,
"name": "style2"
"name": "style2",
"type": null
},
{
"id": 6,
"name": "style3"
"name": "style3",
"type": null
}
],
"supported_features": {
@@ -31,19 +35,23 @@
"styles": [
{
"id": 1,
"name": "style0"
"name": "style0",
"type": null
},
{
"id": 3,
"name": "style1"
"name": "style1",
"type": null
},
{
"id": 5,
"name": "style2"
"name": "style2",
"type": null
},
{
"id": 7,
"name": "style3"
"name": "style3",
"type": null
}
],
"supported_features": {
@@ -57,7 +65,8 @@
"styles": [
{
"id": 8,
"name": "style0"
"name": "style0",
"type": null
}
],
"supported_features": {
@@ -71,7 +80,8 @@
"styles": [
{
"id": 9,
"name": "style0"
"name": "style0",
"type": null
}
],
"supported_features": {
81 changes: 81 additions & 0 deletions voicevox_engine/core_adapter.py
@@ -143,3 +143,84 @@ def safe_decode_forward(
)
sr_wave = self.default_sampling_rate
return wave, sr_wave

def safe_predict_sing_consonant_length_forward(
self,
consonant: NDArray[np.int64],
vowel: NDArray[np.int64],
note_duration: NDArray[np.int64],
style_id: StyleId,
) -> NDArray[np.int64]:
        # Provides: initialization of the specified style, mutex-based safety, silence padding per the core specification, and adaptation of sequence lengths and dtypes
self.initialize_style_id_synthesis(style_id, skip_reinit=True)

with self.mutex:
consonant_length = self.core.predict_sing_consonant_length_forward(
length=consonant.shape[0],
consonant=consonant[np.newaxis],
vowel=vowel[np.newaxis],
note_duration=note_duration[np.newaxis],
style_id=np.array(style_id, dtype=np.int64).reshape(-1),
)

return consonant_length

def safe_predict_sing_f0_forward(
self,
phoneme: NDArray[np.int64],
note: NDArray[np.int64],
style_id: StyleId,
) -> NDArray[np.float32]:
        # Provides: initialization of the specified style, mutex-based safety, silence padding per the core specification, and adaptation of sequence lengths and dtypes
self.initialize_style_id_synthesis(style_id, skip_reinit=True)

with self.mutex:
f0 = self.core.predict_sing_f0_forward(
length=phoneme.shape[0],
phoneme=phoneme[np.newaxis],
note=note[np.newaxis],
style_id=np.array(style_id, dtype=np.int64).reshape(-1),
)

return f0

def safe_predict_sing_volume_forward(
self,
phoneme: NDArray[np.int64],
note: NDArray[np.int64],
f0: NDArray[np.float32],
style_id: StyleId,
) -> NDArray[np.float32]:
        # Provides: initialization of the specified style, mutex-based safety, silence padding per the core specification, and adaptation of sequence lengths and dtypes
self.initialize_style_id_synthesis(style_id, skip_reinit=True)

with self.mutex:
volume = self.core.predict_sing_volume_forward(
length=phoneme.shape[0],
phoneme=phoneme[np.newaxis],
note=note[np.newaxis],
f0=f0[np.newaxis],
style_id=np.array(style_id, dtype=np.int64).reshape(-1),
)

return volume

def safe_sf_decode_forward(
self,
phoneme: NDArray[np.int64],
f0: NDArray[np.float32],
volume: NDArray[np.float32],
style_id: StyleId,
) -> tuple[NDArray[np.float32], int]:
        # Provides: initialization of the specified style, mutex-based safety, and adaptation of sequence lengths and dtypes
self.initialize_style_id_synthesis(style_id, skip_reinit=True)
with self.mutex:
wave = self.core.sf_decode_forward(
length=phoneme.shape[0],
phoneme=phoneme[np.newaxis],
f0=f0[np.newaxis],
volume=volume[np.newaxis],
style_id=np.array(style_id, dtype=np.int64).reshape(-1),
)
sr_wave = self.default_sampling_rate
return wave, sr_wave
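
The signatures above already suggest how the engine layer would chain these methods: f0 is predicted from frame-level phoneme and note ids, volume is conditioned on that f0, and sf_decode turns the three frame-level arrays into a waveform. The sketch below spells that order out; the surrounding engine layer, and the step that expands note-level phonemes into frame-level arrays using the predicted consonant lengths, are assumptions not shown in this file.

# Sketch of how the new adapter methods might be chained by the engine layer.
# The expansion of note-level phonemes into frame-level arrays (driven by the
# predicted consonant lengths) is omitted here and is an assumption.
import numpy as np
from numpy.typing import NDArray

def sketch_sing_wave(
    adapter,                     # a CoreAdapter instance
    phoneme: NDArray[np.int64],  # frame-level phoneme ids
    note: NDArray[np.int64],     # frame-level note (MIDI-like) ids
    style_id: int,
) -> tuple[NDArray[np.float32], int]:
    # Frame-level pitch from phoneme and note ids.
    f0 = adapter.safe_predict_sing_f0_forward(
        phoneme=phoneme, note=note, style_id=style_id
    )
    # Frame-level volume, conditioned on the predicted pitch.
    volume = adapter.safe_predict_sing_volume_forward(
        phoneme=phoneme, note=note, f0=f0, style_id=style_id
    )
    # Vocode to a waveform at the core's default sampling rate.
    return adapter.safe_sf_decode_forward(
        phoneme=phoneme, f0=f0, volume=volume, style_id=style_id
    )
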
