VOICEVOX · tarepan · Jun 3, 2024 · Jun 3, 2024
@@ -46,19 +46,20 @@ def _gen_accent_phrases() -> list[AccentPhrase]:
 def test_update_length() -> None:
     """`.update_length()` がエラー無く生成をおこなう"""
     engine = MockTTSEngine()
-    engine.update_length(_gen_accent_phrases(), StyleId(0))
+    engine.update_length(engine._core, _gen_accent_phrases(), StyleId(0))
 
 
 def test_update_pitch() -> None:
     """`.update_pitch()` がエラー無く生成をおこなう"""
     engine = MockTTSEngine()
-    engine.update_pitch(_gen_accent_phrases(), StyleId(0))
+    engine.update_pitch(engine._core, _gen_accent_phrases(), StyleId(0))
 
 
 def test_synthesize_wave() -> None:
     """`.synthesize_wave()` がエラー無く生成をおこなう"""
     engine = MockTTSEngine()
     engine.synthesize_wave(
+        engine._core,
         AudioQuery(
             accent_phrases=_gen_accent_phrases(),
             speedScale=1,

@@ -222,7 +222,7 @@ def test_update_length() -> None:
     # Inputs
     hello_hiho = _gen_hello_hiho_accent_phrases()
     # Indirect Outputs（yukarin_sに渡される値）
-    tts_engine.update_length(hello_hiho, StyleId(1))
+    tts_engine.update_length(tts_engine._core, hello_hiho, StyleId(1))
     yukarin_s_args = _yukarin_s_mock.call_args[1]
     list_length = yukarin_s_args["length"]
     phoneme_list = yukarin_s_args["phoneme_list"]
@@ -252,7 +252,7 @@ def test_update_pitch() -> None:
     # Inputs
     phrases: list = []
     # Outputs
-    result = tts_engine.update_pitch(phrases, StyleId(1))
+    result = tts_engine.update_pitch(tts_engine._core, phrases, StyleId(1))
     # Expects
     true_result: list = []
     # Tests
@@ -261,7 +261,7 @@ def test_update_pitch() -> None:
     # Inputs
     hello_hiho = _gen_hello_hiho_accent_phrases()
     # Indirect Outputs（yukarin_saに渡される値）
-    tts_engine.update_pitch(hello_hiho, StyleId(1))
+    tts_engine.update_pitch(tts_engine._core, hello_hiho, StyleId(1))
     yukarin_sa_args = _yukarin_sa_mock.call_args[1]
     list_length = yukarin_sa_args["length"]
     vowel_phoneme_list = yukarin_sa_args["vowel_phoneme_list"][0]
@@ -305,7 +305,9 @@ def test_create_accent_phrases_toward_unknown() -> None:
         "dummy", text_to_features=stub_unknown_features_koxx
     )
     with pytest.raises(ValueError) as e:
-        accent_phrases = engine.update_length_and_pitch(accent_phrases, StyleId(0))
+        accent_phrases = engine.update_length_and_pitch(
+            engine._core, accent_phrases, StyleId(0)
+        )
     assert str(e.value) == "tuple.index(x): x not in tuple"
 
 
@@ -315,7 +317,7 @@ def test_mocked_update_length_output(snapshot_json: SnapshotAssertion) -> None:
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_accent_phrases()
     # Outputs
-    result = tts_engine.update_length(hello_hiho, StyleId(1))
+    result = tts_engine.update_length(tts_engine._core, hello_hiho, StyleId(1))
     # Tests
     assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
 
@@ -326,7 +328,7 @@ def test_mocked_update_pitch_output(snapshot_json: SnapshotAssertion) -> None:
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_accent_phrases()
     # Outputs
-    result = tts_engine.update_pitch(hello_hiho, StyleId(1))
+    result = tts_engine.update_pitch(tts_engine._core, hello_hiho, StyleId(1))
     # Tests
     assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
 
@@ -339,7 +341,9 @@ def test_mocked_update_length_and_pitch_output(
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_accent_phrases()
     # Outputs
-    result = tts_engine.update_length_and_pitch(hello_hiho, StyleId(1))
+    result = tts_engine.update_length_and_pitch(
+        tts_engine._core, hello_hiho, StyleId(1)
+    )
     # Tests
     assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
 
@@ -352,7 +356,7 @@ def test_mocked_create_accent_phrases_output(
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_text()
     # Outputs
-    result = tts_engine.create_accent_phrases(hello_hiho, StyleId(1))
+    result = tts_engine.create_accent_phrases(tts_engine._core, hello_hiho, StyleId(1))
     # Tests
     assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
 
@@ -365,7 +369,9 @@ def test_mocked_create_accent_phrases_from_kana_output(
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_kana()
     # Outputs
-    result = tts_engine.create_accent_phrases_from_kana(hello_hiho, StyleId(1))
+    result = tts_engine.create_accent_phrases_from_kana(
+        tts_engine._core, hello_hiho, StyleId(1)
+    )
     # Tests
     assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
 
@@ -376,7 +382,7 @@ def test_mocked_synthesize_wave_output(snapshot_json: SnapshotAssertion) -> None
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_query()
     # Outputs
-    result = tts_engine.synthesize_wave(hello_hiho, StyleId(1))
+    result = tts_engine.synthesize_wave(tts_engine._core, hello_hiho, StyleId(1))
     # Tests
     assert snapshot_json == round_floats(result.tolist(), round_value=2)
 
@@ -392,11 +398,11 @@ def test_mocked_create_sing_volume_from_phoneme_and_f0_output(
     tts_engine = TTSEngine(MockCoreWrapper())
     doremi_srore = _gen_doremi_score()
     phonemes, f0s, _ = tts_engine.create_sing_phoneme_and_f0_and_volume(
-        doremi_srore, StyleId(1)
+        tts_engine._core, doremi_srore, StyleId(1)
     )
     # Outputs
     result = tts_engine.create_sing_volume_from_phoneme_and_f0(
-        doremi_srore, phonemes, f0s, StyleId(1)
+        tts_engine._core, doremi_srore, phonemes, f0s, StyleId(1)
     )
     # Tests
     assert snapshot_json == round_floats(result, round_value=2)
@@ -413,7 +419,9 @@ def test_mocked_synthesize_wave_from_score_output(
     tts_engine = TTSEngine(MockCoreWrapper())
     doremi_srore = _gen_doremi_score()
     # Outputs
-    result = tts_engine.create_sing_phoneme_and_f0_and_volume(doremi_srore, StyleId(1))
+    result = tts_engine.create_sing_phoneme_and_f0_and_volume(
+        tts_engine._core, doremi_srore, StyleId(1)
+    )
     # Tests
     assert snapshot_json(name="query") == round_floats(
         pydantic_to_native_type(result), round_value=2
@@ -430,7 +438,9 @@ def test_mocked_synthesize_wave_from_score_output(
         outputStereo=False,
     )
     # Outputs
-    result_wave = tts_engine.frame_synthsize_wave(doremi_query, StyleId(1))
+    result_wave = tts_engine.frame_synthsize_wave(
+        tts_engine._core, doremi_query, StyleId(1)
+    )
     # Tests
     assert snapshot_json(name="wave") == round_floats(
         result_wave.tolist(), round_value=2
@@ -527,7 +537,7 @@ def create_synthesis_test_base(
     (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866)
     """
     tts_engine = TTSEngine(core=MockCoreWrapper())
-    inputs = tts_engine.create_accent_phrases(text, StyleId(1))
+    inputs = tts_engine.create_accent_phrases(tts_engine._core, text, StyleId(1))
     outputs = apply_interrogative_upspeak(inputs, enable_interrogative_upspeak)
     assert expected == outputs, f"case(text:{text})"
 
@@ -540,7 +550,7 @@ def test_create_accent_phrases() -> None:
     text = "これはありますか？"
     expected = koreha_arimasuka_base_expected()
     expected[-1].is_interrogative = True
-    actual = tts_engine.create_accent_phrases(text, StyleId(1))
+    actual = tts_engine.create_accent_phrases(tts_engine._core, text, StyleId(1))
     assert expected == actual, f"case(text:{text})"
 
 

@@ -86,7 +86,7 @@ def audio_query(
         """
         engine = tts_engines.get_engine(core_version)
         core = core_manager.get_core(core_version)
-        accent_phrases = engine.create_accent_phrases(text, style_id)
+        accent_phrases = engine.create_accent_phrases(engine._core, text, style_id)
         return AudioQuery(
             accent_phrases=accent_phrases,
             speedScale=1,
@@ -130,7 +130,9 @@ def audio_query_from_preset(
                 status_code=422, detail="該当するプリセットIDが見つかりません"
             )
 
-        accent_phrases = engine.create_accent_phrases(text, selected_preset.style_id)
+        accent_phrases = engine.create_accent_phrases(
+            engine._core, text, selected_preset.style_id
+        )
         return AudioQuery(
             accent_phrases=accent_phrases,
             speedScale=selected_preset.speedScale,
@@ -173,13 +175,15 @@ def accent_phrases(
         engine = tts_engines.get_engine(core_version)
         if is_kana:
             try:
-                return engine.create_accent_phrases_from_kana(text, style_id)
+                return engine.create_accent_phrases_from_kana(
+                    engine._core, text, style_id
+                )
             except ParseKanaError as err:
                 raise HTTPException(
                     status_code=400, detail=ParseKanaBadRequest(err).dict()
                 )
         else:
-            return engine.create_accent_phrases(text, style_id)
+            return engine.create_accent_phrases(engine._core, text, style_id)
 
     @router.post(
         "/mora_data",
@@ -192,7 +196,7 @@ def mora_data(
         core_version: str | None = None,
     ) -> list[AccentPhrase]:
         engine = tts_engines.get_engine(core_version)
-        return engine.update_length_and_pitch(accent_phrases, style_id)
+        return engine.update_length_and_pitch(engine._core, accent_phrases, style_id)
 
     @router.post(
         "/mora_length",
@@ -205,7 +209,7 @@ def mora_length(
         core_version: str | None = None,
     ) -> list[AccentPhrase]:
         engine = tts_engines.get_engine(core_version)
-        return engine.update_length(accent_phrases, style_id)
+        return engine.update_length(engine._core, accent_phrases, style_id)
 
     @router.post(
         "/mora_pitch",
@@ -218,7 +222,7 @@ def mora_pitch(
         core_version: str | None = None,
     ) -> list[AccentPhrase]:
         engine = tts_engines.get_engine(core_version)
-        return engine.update_pitch(accent_phrases, style_id)
+        return engine.update_pitch(engine._core, accent_phrases, style_id)
 
     @router.post(
         "/synthesis",
@@ -246,7 +250,10 @@ def synthesis(
     ) -> FileResponse:
         engine = tts_engines.get_engine(core_version)
         wave = engine.synthesize_wave(
-            query, style_id, enable_interrogative_upspeak=enable_interrogative_upspeak
+            engine._core,
+            query,
+            style_id,
+            enable_interrogative_upspeak=enable_interrogative_upspeak,
         )
 
         with NamedTemporaryFile(delete=False) as f:
@@ -333,7 +340,9 @@ def multi_synthesis(
                         )
 
                     with TemporaryFile() as wav_file:
-                        wave = engine.synthesize_wave(queries[i], style_id)
+                        wave = engine.synthesize_wave(
+                            engine._core, queries[i], style_id
+                        )
                         soundfile.write(
                             file=wav_file,
                             data=wave,
@@ -366,7 +375,7 @@ def sing_frame_audio_query(
         core = core_manager.get_core(core_version)
         try:
             phonemes, f0, volume = engine.create_sing_phoneme_and_f0_and_volume(
-                score, style_id
+                engine._core, score, style_id
             )
         except TalkSingInvalidInputError as e:
             raise HTTPException(status_code=400, detail=str(e))
@@ -394,7 +403,11 @@ def sing_frame_volume(
         engine = tts_engines.get_engine(core_version)
         try:
             return engine.create_sing_volume_from_phoneme_and_f0(
-                score, frame_audio_query.phonemes, frame_audio_query.f0, style_id
+                engine._core,
+                score,
+                frame_audio_query.phonemes,
+                frame_audio_query.f0,
+                style_id,
             )
         except TalkSingInvalidInputError as e:
             raise HTTPException(status_code=400, detail=str(e))
@@ -421,7 +434,7 @@ def frame_synthesis(
         """
         engine = tts_engines.get_engine(core_version)
         try:
-            wave = engine.frame_synthsize_wave(query, style_id)
+            wave = engine.frame_synthsize_wave(engine._core, query, style_id)
         except TalkSingInvalidInputError as e:
             raise HTTPException(status_code=400, detail=str(e))
 

@@ -255,7 +255,7 @@ def start_synthesis_subprocess(
                 continue
             # FIXME: enable_interrogative_upspeakフラグをWebAPIから受け渡してくる
             wave = _engine.synthesize_wave(
-                query, style_id, enable_interrogative_upspeak=False
+                _engine._core, query, style_id, enable_interrogative_upspeak=False
             )
             with NamedTemporaryFile(delete=False) as f:
                 soundfile.write(

@@ -8,6 +8,8 @@
 from pyopenjtalk import tts
 from soxr import resample
 
+from voicevox_engine.core.core_adapter import CoreAdapter
+
 from ...metas.Metas import StyleId
 from ...model import AudioQuery
 from ...tts_pipeline.tts_engine import TTSEngine, to_flatten_moras
@@ -20,8 +22,9 @@ class MockTTSEngine(TTSEngine):
     def __init__(self) -> None:
         super().__init__(MockCoreWrapper())
 
+    @staticmethod
     def synthesize_wave(
-        self,
+        core: CoreAdapter,
         query: AudioQuery,
         style_id: StyleId,
         enable_interrogative_upspeak: bool = True,
@@ -34,14 +37,15 @@ def synthesize_wave(
         flatten_moras = to_flatten_moras(query.accent_phrases)
         kana_text = "".join([mora.text for mora in flatten_moras])
 
-        wave = self.forward(kana_text)
+        wave = MockTTSEngine.forward(kana_text)
 
         # volume
         wave *= query.volumeScale
 
         return wave
 
-    def forward(self, text: str, **kwargs: dict[str, Any]) -> NDArray[np.float32]:
+    @staticmethod
+    def forward(text: str, **kwargs: dict[str, Any]) -> NDArray[np.float32]:
         """
         forward tts via pyopenjtalk.tts()
         参照→TTSEngine のdocstring [Mock]

@@ -114,8 +114,12 @@ def synthesis_morphing_parameter(
     # WORLDに掛けるため合成はモノラルで行う
     query.outputStereo = False
 
-    base_wave = engine.synthesize_wave(query, base_style_id).astype(np.double)
-    target_wave = engine.synthesize_wave(query, target_style_id).astype(np.double)
+    base_wave = engine.synthesize_wave(engine._core, query, base_style_id).astype(
+        np.double
+    )
+    target_wave = engine.synthesize_wave(engine._core, query, target_style_id).astype(
+        np.double
+    )
 
     fs = query.outputSamplingRate
     frame_period = 1.0