From 5570f1508cb7a2277cd32549f3677281ed0e5fa2 Mon Sep 17 00:00:00 2001
From: tarepan <tarepan5884@gmail.com>
Date: Mon, 3 Jun 2024 15:00:40 +0000
Subject: [PATCH 1/2] =?UTF-8?q?refactor:=20`TTSEngine`=20=E3=83=A1?=
 =?UTF-8?q?=E3=82=BD=E3=83=83=E3=83=89=E5=BC=95=E6=95=B0=E3=81=AB=20`CoreA?=
 =?UTF-8?q?dapter`=20=E3=82=92=E8=BF=BD=E5=8A=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

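Add a leading `core: CoreAdapter` parameter to the `TTSEngine` methods
that call into the core (`update_length`, `update_pitch`,
`update_length_and_pitch`, `create_accent_phrases`,
`create_accent_phrases_from_kana`, `synthesize_wave`,
`create_sing_phoneme_and_f0_and_volume`,
`create_sing_volume_from_phoneme_and_f0`, `frame_synthsize_wave`) and
use that argument instead of `self._core` inside them.
`MockTTSEngine.synthesize_wave` gains the same parameter.

Callers (routers, cancellable engine, morphing, unit tests) pass
`engine._core` for now. This external access to the private field is
temporary; `self._core` is planned to be removed later.
---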
 test/unit/test_mock_tts_engine.py           |  5 ++-
 test/unit/tts_pipeline/test_tts_engine.py   | 42 ++++++++++++--------
 voicevox_engine/app/routers/tts_pipeline.py | 37 ++++++++++++------
 voicevox_engine/cancellable_engine.py       |  2 +-
 voicevox_engine/dev/tts_engine/mock.py      |  3 ++
 voicevox_engine/morphing/morphing.py        |  8 +++-
 voicevox_engine/tts_pipeline/tts_engine.py  | 43 ++++++++++++---------
 7 files changed, 88 insertions(+), 52 deletions(-)

diff --git a/test/unit/test_mock_tts_engine.py b/test/unit/test_mock_tts_engine.py
index f7db77918..83386fbb8 100644
--- a/test/unit/test_mock_tts_engine.py
+++ b/test/unit/test_mock_tts_engine.py
@@ -46,19 +46,20 @@ def _gen_accent_phrases() -> list[AccentPhrase]:
 def test_update_length() -> None:
     """`.update_length()` がエラー無く生成をおこなう"""
     engine = MockTTSEngine()
-    engine.update_length(_gen_accent_phrases(), StyleId(0))
+    engine.update_length(engine._core, _gen_accent_phrases(), StyleId(0))
 
 
 def test_update_pitch() -> None:
     """`.update_pitch()` がエラー無く生成をおこなう"""
     engine = MockTTSEngine()
-    engine.update_pitch(_gen_accent_phrases(), StyleId(0))
+    engine.update_pitch(engine._core, _gen_accent_phrases(), StyleId(0))
 
 
 def test_synthesize_wave() -> None:
     """`.synthesize_wave()` がエラー無く生成をおこなう"""
     engine = MockTTSEngine()
     engine.synthesize_wave(
+        engine._core,
         AudioQuery(
             accent_phrases=_gen_accent_phrases(),
             speedScale=1,
diff --git a/test/unit/tts_pipeline/test_tts_engine.py b/test/unit/tts_pipeline/test_tts_engine.py
index 760265e51..900c1e996 100644
--- a/test/unit/tts_pipeline/test_tts_engine.py
+++ b/test/unit/tts_pipeline/test_tts_engine.py
@@ -222,7 +222,7 @@ def test_update_length() -> None:
     # Inputs
     hello_hiho = _gen_hello_hiho_accent_phrases()
     # Indirect Outputs（yukarin_sに渡される値）
-    tts_engine.update_length(hello_hiho, StyleId(1))
+    tts_engine.update_length(tts_engine._core, hello_hiho, StyleId(1))
     yukarin_s_args = _yukarin_s_mock.call_args[1]
     list_length = yukarin_s_args["length"]
     phoneme_list = yukarin_s_args["phoneme_list"]
@@ -252,7 +252,7 @@ def test_update_pitch() -> None:
     # Inputs
     phrases: list = []
     # Outputs
-    result = tts_engine.update_pitch(phrases, StyleId(1))
+    result = tts_engine.update_pitch(tts_engine._core, phrases, StyleId(1))
     # Expects
     true_result: list = []
     # Tests
@@ -261,7 +261,7 @@ def test_update_pitch() -> None:
     # Inputs
     hello_hiho = _gen_hello_hiho_accent_phrases()
     # Indirect Outputs（yukarin_saに渡される値）
-    tts_engine.update_pitch(hello_hiho, StyleId(1))
+    tts_engine.update_pitch(tts_engine._core, hello_hiho, StyleId(1))
     yukarin_sa_args = _yukarin_sa_mock.call_args[1]
     list_length = yukarin_sa_args["length"]
     vowel_phoneme_list = yukarin_sa_args["vowel_phoneme_list"][0]
@@ -305,7 +305,9 @@ def test_create_accent_phrases_toward_unknown() -> None:
         "dummy", text_to_features=stub_unknown_features_koxx
     )
     with pytest.raises(ValueError) as e:
-        accent_phrases = engine.update_length_and_pitch(accent_phrases, StyleId(0))
+        accent_phrases = engine.update_length_and_pitch(
+            engine._core, accent_phrases, StyleId(0)
+        )
     assert str(e.value) == "tuple.index(x): x not in tuple"
 
 
@@ -315,7 +317,7 @@ def test_mocked_update_length_output(snapshot_json: SnapshotAssertion) -> None:
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_accent_phrases()
     # Outputs
-    result = tts_engine.update_length(hello_hiho, StyleId(1))
+    result = tts_engine.update_length(tts_engine._core, hello_hiho, StyleId(1))
     # Tests
     assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
 
@@ -326,7 +328,7 @@ def test_mocked_update_pitch_output(snapshot_json: SnapshotAssertion) -> None:
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_accent_phrases()
     # Outputs
-    result = tts_engine.update_pitch(hello_hiho, StyleId(1))
+    result = tts_engine.update_pitch(tts_engine._core, hello_hiho, StyleId(1))
     # Tests
     assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
 
@@ -339,7 +341,9 @@ def test_mocked_update_length_and_pitch_output(
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_accent_phrases()
     # Outputs
-    result = tts_engine.update_length_and_pitch(hello_hiho, StyleId(1))
+    result = tts_engine.update_length_and_pitch(
+        tts_engine._core, hello_hiho, StyleId(1)
+    )
     # Tests
     assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
 
@@ -352,7 +356,7 @@ def test_mocked_create_accent_phrases_output(
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_text()
     # Outputs
-    result = tts_engine.create_accent_phrases(hello_hiho, StyleId(1))
+    result = tts_engine.create_accent_phrases(tts_engine._core, hello_hiho, StyleId(1))
     # Tests
     assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
 
@@ -365,7 +369,9 @@ def test_mocked_create_accent_phrases_from_kana_output(
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_kana()
     # Outputs
-    result = tts_engine.create_accent_phrases_from_kana(hello_hiho, StyleId(1))
+    result = tts_engine.create_accent_phrases_from_kana(
+        tts_engine._core, hello_hiho, StyleId(1)
+    )
     # Tests
     assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
 
@@ -376,7 +382,7 @@ def test_mocked_synthesize_wave_output(snapshot_json: SnapshotAssertion) -> None
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_query()
     # Outputs
-    result = tts_engine.synthesize_wave(hello_hiho, StyleId(1))
+    result = tts_engine.synthesize_wave(tts_engine._core, hello_hiho, StyleId(1))
     # Tests
     assert snapshot_json == round_floats(result.tolist(), round_value=2)
 
@@ -392,11 +398,11 @@ def test_mocked_create_sing_volume_from_phoneme_and_f0_output(
     tts_engine = TTSEngine(MockCoreWrapper())
     doremi_srore = _gen_doremi_score()
     phonemes, f0s, _ = tts_engine.create_sing_phoneme_and_f0_and_volume(
-        doremi_srore, StyleId(1)
+        tts_engine._core, doremi_srore, StyleId(1)
     )
     # Outputs
     result = tts_engine.create_sing_volume_from_phoneme_and_f0(
-        doremi_srore, phonemes, f0s, StyleId(1)
+        tts_engine._core, doremi_srore, phonemes, f0s, StyleId(1)
     )
     # Tests
     assert snapshot_json == round_floats(result, round_value=2)
@@ -413,7 +419,9 @@ def test_mocked_synthesize_wave_from_score_output(
     tts_engine = TTSEngine(MockCoreWrapper())
     doremi_srore = _gen_doremi_score()
     # Outputs
-    result = tts_engine.create_sing_phoneme_and_f0_and_volume(doremi_srore, StyleId(1))
+    result = tts_engine.create_sing_phoneme_and_f0_and_volume(
+        tts_engine._core, doremi_srore, StyleId(1)
+    )
     # Tests
     assert snapshot_json(name="query") == round_floats(
         pydantic_to_native_type(result), round_value=2
@@ -430,7 +438,9 @@ def test_mocked_synthesize_wave_from_score_output(
         outputStereo=False,
     )
     # Outputs
-    result_wave = tts_engine.frame_synthsize_wave(doremi_query, StyleId(1))
+    result_wave = tts_engine.frame_synthsize_wave(
+        tts_engine._core, doremi_query, StyleId(1)
+    )
     # Tests
     assert snapshot_json(name="wave") == round_floats(
         result_wave.tolist(), round_value=2
@@ -527,7 +537,7 @@ def create_synthesis_test_base(
     (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866)
     """
     tts_engine = TTSEngine(core=MockCoreWrapper())
-    inputs = tts_engine.create_accent_phrases(text, StyleId(1))
+    inputs = tts_engine.create_accent_phrases(tts_engine._core, text, StyleId(1))
     outputs = apply_interrogative_upspeak(inputs, enable_interrogative_upspeak)
     assert expected == outputs, f"case(text:{text})"
 
@@ -540,7 +550,7 @@ def test_create_accent_phrases() -> None:
     text = "これはありますか？"
     expected = koreha_arimasuka_base_expected()
     expected[-1].is_interrogative = True
-    actual = tts_engine.create_accent_phrases(text, StyleId(1))
+    actual = tts_engine.create_accent_phrases(tts_engine._core, text, StyleId(1))
     assert expected == actual, f"case(text:{text})"
 
 
diff --git a/voicevox_engine/app/routers/tts_pipeline.py b/voicevox_engine/app/routers/tts_pipeline.py
index 935cad4cd..eb1e198d0 100644
--- a/voicevox_engine/app/routers/tts_pipeline.py
+++ b/voicevox_engine/app/routers/tts_pipeline.py
@@ -86,7 +86,7 @@ def audio_query(
         """
         engine = tts_engines.get_engine(core_version)
         core = core_manager.get_core(core_version)
-        accent_phrases = engine.create_accent_phrases(text, style_id)
+        accent_phrases = engine.create_accent_phrases(engine._core, text, style_id)
         return AudioQuery(
             accent_phrases=accent_phrases,
             speedScale=1,
@@ -130,7 +130,9 @@ def audio_query_from_preset(
                 status_code=422, detail="該当するプリセットIDが見つかりません"
             )
 
-        accent_phrases = engine.create_accent_phrases(text, selected_preset.style_id)
+        accent_phrases = engine.create_accent_phrases(
+            engine._core, text, selected_preset.style_id
+        )
         return AudioQuery(
             accent_phrases=accent_phrases,
             speedScale=selected_preset.speedScale,
@@ -173,13 +175,15 @@ def accent_phrases(
         engine = tts_engines.get_engine(core_version)
         if is_kana:
             try:
-                return engine.create_accent_phrases_from_kana(text, style_id)
+                return engine.create_accent_phrases_from_kana(
+                    engine._core, text, style_id
+                )
             except ParseKanaError as err:
                 raise HTTPException(
                     status_code=400, detail=ParseKanaBadRequest(err).dict()
                 )
         else:
-            return engine.create_accent_phrases(text, style_id)
+            return engine.create_accent_phrases(engine._core, text, style_id)
 
     @router.post(
         "/mora_data",
@@ -192,7 +196,7 @@ def mora_data(
         core_version: str | None = None,
     ) -> list[AccentPhrase]:
         engine = tts_engines.get_engine(core_version)
-        return engine.update_length_and_pitch(accent_phrases, style_id)
+        return engine.update_length_and_pitch(engine._core, accent_phrases, style_id)
 
     @router.post(
         "/mora_length",
@@ -205,7 +209,7 @@ def mora_length(
         core_version: str | None = None,
     ) -> list[AccentPhrase]:
         engine = tts_engines.get_engine(core_version)
-        return engine.update_length(accent_phrases, style_id)
+        return engine.update_length(engine._core, accent_phrases, style_id)
 
     @router.post(
         "/mora_pitch",
@@ -218,7 +222,7 @@ def mora_pitch(
         core_version: str | None = None,
     ) -> list[AccentPhrase]:
         engine = tts_engines.get_engine(core_version)
-        return engine.update_pitch(accent_phrases, style_id)
+        return engine.update_pitch(engine._core, accent_phrases, style_id)
 
     @router.post(
         "/synthesis",
@@ -246,7 +250,10 @@ def synthesis(
     ) -> FileResponse:
         engine = tts_engines.get_engine(core_version)
         wave = engine.synthesize_wave(
-            query, style_id, enable_interrogative_upspeak=enable_interrogative_upspeak
+            engine._core,
+            query,
+            style_id,
+            enable_interrogative_upspeak=enable_interrogative_upspeak,
         )
 
         with NamedTemporaryFile(delete=False) as f:
@@ -333,7 +340,9 @@ def multi_synthesis(
                         )
 
                     with TemporaryFile() as wav_file:
-                        wave = engine.synthesize_wave(queries[i], style_id)
+                        wave = engine.synthesize_wave(
+                            engine._core, queries[i], style_id
+                        )
                         soundfile.write(
                             file=wav_file,
                             data=wave,
@@ -366,7 +375,7 @@ def sing_frame_audio_query(
         core = core_manager.get_core(core_version)
         try:
             phonemes, f0, volume = engine.create_sing_phoneme_and_f0_and_volume(
-                score, style_id
+                engine._core, score, style_id
             )
         except TalkSingInvalidInputError as e:
             raise HTTPException(status_code=400, detail=str(e))
@@ -394,7 +403,11 @@ def sing_frame_volume(
         engine = tts_engines.get_engine(core_version)
         try:
             return engine.create_sing_volume_from_phoneme_and_f0(
-                score, frame_audio_query.phonemes, frame_audio_query.f0, style_id
+                engine._core,
+                score,
+                frame_audio_query.phonemes,
+                frame_audio_query.f0,
+                style_id,
             )
         except TalkSingInvalidInputError as e:
             raise HTTPException(status_code=400, detail=str(e))
@@ -421,7 +434,7 @@ def frame_synthesis(
         """
         engine = tts_engines.get_engine(core_version)
         try:
-            wave = engine.frame_synthsize_wave(query, style_id)
+            wave = engine.frame_synthsize_wave(engine._core, query, style_id)
         except TalkSingInvalidInputError as e:
             raise HTTPException(status_code=400, detail=str(e))
 
diff --git a/voicevox_engine/cancellable_engine.py b/voicevox_engine/cancellable_engine.py
index 70da62c5f..a4aa9bf6b 100644
--- a/voicevox_engine/cancellable_engine.py
+++ b/voicevox_engine/cancellable_engine.py
@@ -255,7 +255,7 @@ def start_synthesis_subprocess(
                 continue
             # FIXME: enable_interrogative_upspeakフラグをWebAPIから受け渡してくる
             wave = _engine.synthesize_wave(
-                query, style_id, enable_interrogative_upspeak=False
+                _engine._core, query, style_id, enable_interrogative_upspeak=False
             )
             with NamedTemporaryFile(delete=False) as f:
                 soundfile.write(
diff --git a/voicevox_engine/dev/tts_engine/mock.py b/voicevox_engine/dev/tts_engine/mock.py
index c99c9469f..d630981f1 100644
--- a/voicevox_engine/dev/tts_engine/mock.py
+++ b/voicevox_engine/dev/tts_engine/mock.py
@@ -8,6 +8,8 @@
 from pyopenjtalk import tts
 from soxr import resample
 
+from voicevox_engine.core.core_adapter import CoreAdapter
+
 from ...metas.Metas import StyleId
 from ...model import AudioQuery
 from ...tts_pipeline.tts_engine import TTSEngine, to_flatten_moras
@@ -22,6 +24,7 @@ def __init__(self) -> None:
 
     def synthesize_wave(
         self,
+        core: CoreAdapter,
         query: AudioQuery,
         style_id: StyleId,
         enable_interrogative_upspeak: bool = True,
diff --git a/voicevox_engine/morphing/morphing.py b/voicevox_engine/morphing/morphing.py
index b5ad1ca72..797bee8fa 100644
--- a/voicevox_engine/morphing/morphing.py
+++ b/voicevox_engine/morphing/morphing.py
@@ -114,8 +114,12 @@ def synthesis_morphing_parameter(
     # WORLDに掛けるため合成はモノラルで行う
     query.outputStereo = False
 
-    base_wave = engine.synthesize_wave(query, base_style_id).astype(np.double)
-    target_wave = engine.synthesize_wave(query, target_style_id).astype(np.double)
+    base_wave = engine.synthesize_wave(engine._core, query, base_style_id).astype(
+        np.double
+    )
+    target_wave = engine.synthesize_wave(engine._core, query, target_style_id).astype(
+        np.double
+    )
 
     fs = query.outputSamplingRate
     frame_period = 1.0
diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py
index 18e9d17b1..487b1f06e 100644
--- a/voicevox_engine/tts_pipeline/tts_engine.py
+++ b/voicevox_engine/tts_pipeline/tts_engine.py
@@ -419,11 +419,12 @@ class TTSEngine:
 
     def __init__(self, core: CoreWrapper):
         super().__init__()
+        # NOTE: 一時的にこの private field へ外部からアクセスしている。逆に内部からのアクセスは無い。
         self._core = CoreAdapter(core)
         # NOTE: self._coreは将来的に消す予定
 
     def update_length(
-        self, accent_phrases: list[AccentPhrase], style_id: StyleId
+        self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId
     ) -> list[AccentPhrase]:
         """アクセント句系列に含まれるモーラの音素長属性をスタイルに合わせて更新する"""
         # モーラ系列を抽出する
@@ -436,7 +437,7 @@ def update_length(
         phoneme_ids = np.array([p.id for p in phonemes], dtype=np.int64)
 
         # コアを用いて音素長を生成する
-        phoneme_lengths = self._core.safe_yukarin_s_forward(phoneme_ids, style_id)
+        phoneme_lengths = core.safe_yukarin_s_forward(phoneme_ids, style_id)
 
         # 生成結果でモーラ内の音素長属性を置換する
         vowel_indexes = [i for i, p in enumerate(phonemes) if p.is_mora_tail()]
@@ -450,7 +451,7 @@ def update_length(
         return accent_phrases
 
     def update_pitch(
-        self, accent_phrases: list[AccentPhrase], style_id: StyleId
+        self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId
     ) -> list[AccentPhrase]:
         """アクセント句系列に含まれるモーラの音高属性をスタイルに合わせて更新する"""
         # 後続のnumpy.concatenateが空リストだとエラーになるので別処理
@@ -495,7 +496,7 @@ def update_pitch(
         vowel_ids = np.array([p.id for p in vowels], dtype=np.int64)
 
         # コアを用いてモーラ音高を生成する
-        f0 = self._core.safe_yukarin_sa_forward(
+        f0 = core.safe_yukarin_sa_forward(
             vowel_ids,
             consonant_ids,
             start_accent_list,
@@ -517,29 +518,32 @@ def update_pitch(
         return accent_phrases
 
     def update_length_and_pitch(
-        self, accent_phrases: list[AccentPhrase], style_id: StyleId
+        self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId
     ) -> list[AccentPhrase]:
         """アクセント句系列の音素長・モーラ音高をスタイルIDに基づいて更新する"""
-        accent_phrases = self.update_length(accent_phrases, style_id)
-        accent_phrases = self.update_pitch(accent_phrases, style_id)
+        accent_phrases = self.update_length(core, accent_phrases, style_id)
+        accent_phrases = self.update_pitch(core, accent_phrases, style_id)
         return accent_phrases
 
-    def create_accent_phrases(self, text: str, style_id: StyleId) -> list[AccentPhrase]:
+    def create_accent_phrases(
+        self, core: CoreAdapter, text: str, style_id: StyleId
+    ) -> list[AccentPhrase]:
         """テキストからアクセント句系列を生成し、スタイルIDに基づいてその音素長・モーラ音高を更新する"""
         accent_phrases = text_to_accent_phrases(text)
-        accent_phrases = self.update_length_and_pitch(accent_phrases, style_id)
+        accent_phrases = self.update_length_and_pitch(core, accent_phrases, style_id)
         return accent_phrases
 
     def create_accent_phrases_from_kana(
-        self, kana: str, style_id: StyleId
+        self, core: CoreAdapter, kana: str, style_id: StyleId
     ) -> list[AccentPhrase]:
         """AquesTalk 風記法テキストからアクセント句系列を生成し、スタイルIDに基づいてその音素長・モーラ音高を更新する"""
         accent_phrases = parse_kana(kana)
-        accent_phrases = self.update_length_and_pitch(accent_phrases, style_id)
+        accent_phrases = self.update_length_and_pitch(core, accent_phrases, style_id)
         return accent_phrases
 
     def synthesize_wave(
         self,
+        core: CoreAdapter,
         query: AudioQuery,
         style_id: StyleId,
         enable_interrogative_upspeak: bool = True,
@@ -552,7 +556,7 @@ def synthesize_wave(
         )
 
         phoneme, f0 = query_to_decoder_feature(query)
-        raw_wave, sr_raw_wave = self._core.safe_decode_forward(phoneme, f0, style_id)
+        raw_wave, sr_raw_wave = core.safe_decode_forward(phoneme, f0, style_id)
         wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave)
         return wave
 
@@ -560,6 +564,7 @@ def synthesize_wave(
     # 返す値の総称を考え、関数名を変更する
     def create_sing_phoneme_and_f0_and_volume(
         self,
+        core: CoreAdapter,
         score: Score,
         style_id: StyleId,
     ) -> tuple[list[FramePhoneme], list[float], list[float]]:
@@ -575,7 +580,7 @@ def create_sing_phoneme_and_f0_and_volume(
         ) = notes_to_keys_and_phonemes(notes)
 
         # コアを用いて子音長を生成する
-        consonant_lengths = self._core.safe_predict_sing_consonant_length_forward(
+        consonant_lengths = core.safe_predict_sing_consonant_length_forward(
             note_consonants_array, note_vowels_array, note_lengths_array, style_id
         )
 
@@ -587,13 +592,11 @@ def create_sing_phoneme_and_f0_and_volume(
         frame_keys = np.repeat(phoneme_keys_array, phoneme_lengths)
 
         # コアを用いて音高を生成する
-        f0s = self._core.safe_predict_sing_f0_forward(
-            frame_phonemes, frame_keys, style_id
-        )
+        f0s = core.safe_predict_sing_f0_forward(frame_phonemes, frame_keys, style_id)
 
         # コアを用いて音量を生成する
         # FIXME: 変数名のsいらない？
-        volumes = self._core.safe_predict_sing_volume_forward(
+        volumes = core.safe_predict_sing_volume_forward(
             frame_phonemes, frame_keys, f0s, style_id
         )
 
@@ -609,6 +612,7 @@ def create_sing_phoneme_and_f0_and_volume(
 
     def create_sing_volume_from_phoneme_and_f0(
         self,
+        core: CoreAdapter,
         score: Score,
         phonemes: list[FramePhoneme],
         f0s: list[float],
@@ -649,7 +653,7 @@ def create_sing_volume_from_phoneme_and_f0(
         frame_keys = np.repeat(phoneme_keys_array, phoneme_lengths)
 
         # コアを用いて音量を生成する
-        volumes = self._core.safe_predict_sing_volume_forward(
+        volumes = core.safe_predict_sing_volume_forward(
             frame_phonemes, frame_keys, f0_array, style_id
         )
 
@@ -660,13 +664,14 @@ def create_sing_volume_from_phoneme_and_f0(
 
     def frame_synthsize_wave(
         self,
+        core: CoreAdapter,
         query: FrameAudioQuery,
         style_id: StyleId,
     ) -> NDArray[np.float32]:
         """歌声合成用のクエリ・スタイルIDに基づいて音声波形を生成する"""
 
         phoneme, f0, volume = frame_query_to_sf_decoder_feature(query)
-        raw_wave, sr_raw_wave = self._core.safe_sf_decode_forward(
+        raw_wave, sr_raw_wave = core.safe_sf_decode_forward(
             phoneme, f0, volume, style_id
         )
         wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave)

From 75f6487c94c5f72e5e588f01d4153c5dafb46084 Mon Sep 17 00:00:00 2001
From: tarepan <tarepan5884@gmail.com>
Date: Mon, 3 Jun 2024 16:10:15 +0000
Subject: [PATCH 2/2] =?UTF-8?q?refactor:=20`TTSEngine`=20=E3=83=A1?=
 =?UTF-8?q?=E3=82=BD=E3=83=83=E3=83=89=E3=82=92=20staticmethod=20=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

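Mark the `TTSEngine` methods that take the core as an explicit argument
as `@staticmethod` and drop their `self` parameter. Calls between these
methods now go through the class name (for example
`TTSEngine.update_length_and_pitch(...)`). `MockTTSEngine.synthesize_wave`
and `MockTTSEngine.forward` are converted the same way.
---
Review note (not part of the commit message): because sibling calls now
name `TTSEngine` explicitly instead of going through `self`, a subclass
that overrides `update_length`, `update_pitch` or
`update_length_and_pitch` is no longer picked up by
`create_accent_phrases` / `create_accent_phrases_from_kana`. Confirm that
no subclass relies on that dispatch.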
 voicevox_engine/dev/tts_engine/mock.py     |  7 +++--
 voicevox_engine/tts_pipeline/tts_engine.py | 35 ++++++++++++++--------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/voicevox_engine/dev/tts_engine/mock.py b/voicevox_engine/dev/tts_engine/mock.py
index d630981f1..0e22ecf2b 100644
--- a/voicevox_engine/dev/tts_engine/mock.py
+++ b/voicevox_engine/dev/tts_engine/mock.py
@@ -22,8 +22,8 @@ class MockTTSEngine(TTSEngine):
     def __init__(self) -> None:
         super().__init__(MockCoreWrapper())
 
+    @staticmethod
     def synthesize_wave(
-        self,
         core: CoreAdapter,
         query: AudioQuery,
         style_id: StyleId,
@@ -37,14 +37,15 @@ def synthesize_wave(
         flatten_moras = to_flatten_moras(query.accent_phrases)
         kana_text = "".join([mora.text for mora in flatten_moras])
 
-        wave = self.forward(kana_text)
+        wave = MockTTSEngine.forward(kana_text)
 
         # volume
         wave *= query.volumeScale
 
         return wave
 
-    def forward(self, text: str, **kwargs: dict[str, Any]) -> NDArray[np.float32]:
+    @staticmethod
+    def forward(text: str, **kwargs: dict[str, Any]) -> NDArray[np.float32]:
         """
         forward tts via pyopenjtalk.tts()
         参照→TTSEngine のdocstring [Mock]
diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py
index 487b1f06e..b9cb7c5db 100644
--- a/voicevox_engine/tts_pipeline/tts_engine.py
+++ b/voicevox_engine/tts_pipeline/tts_engine.py
@@ -423,8 +423,9 @@ def __init__(self, core: CoreWrapper):
         self._core = CoreAdapter(core)
         # NOTE: self._coreは将来的に消す予定
 
+    @staticmethod
     def update_length(
-        self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId
+        core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId
     ) -> list[AccentPhrase]:
         """アクセント句系列に含まれるモーラの音素長属性をスタイルに合わせて更新する"""
         # モーラ系列を抽出する
@@ -450,8 +451,9 @@ def update_length(
 
         return accent_phrases
 
+    @staticmethod
     def update_pitch(
-        self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId
+        core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId
     ) -> list[AccentPhrase]:
         """アクセント句系列に含まれるモーラの音高属性をスタイルに合わせて更新する"""
         # 後続のnumpy.concatenateが空リストだとエラーになるので別処理
@@ -517,32 +519,39 @@ def update_pitch(
 
         return accent_phrases
 
+    @staticmethod
     def update_length_and_pitch(
-        self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId
+        core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId
     ) -> list[AccentPhrase]:
         """アクセント句系列の音素長・モーラ音高をスタイルIDに基づいて更新する"""
-        accent_phrases = self.update_length(core, accent_phrases, style_id)
-        accent_phrases = self.update_pitch(core, accent_phrases, style_id)
+        accent_phrases = TTSEngine.update_length(core, accent_phrases, style_id)
+        accent_phrases = TTSEngine.update_pitch(core, accent_phrases, style_id)
         return accent_phrases
 
+    @staticmethod
     def create_accent_phrases(
-        self, core: CoreAdapter, text: str, style_id: StyleId
+        core: CoreAdapter, text: str, style_id: StyleId
     ) -> list[AccentPhrase]:
         """テキストからアクセント句系列を生成し、スタイルIDに基づいてその音素長・モーラ音高を更新する"""
         accent_phrases = text_to_accent_phrases(text)
-        accent_phrases = self.update_length_and_pitch(core, accent_phrases, style_id)
+        accent_phrases = TTSEngine.update_length_and_pitch(
+            core, accent_phrases, style_id
+        )
         return accent_phrases
 
+    @staticmethod
     def create_accent_phrases_from_kana(
-        self, core: CoreAdapter, kana: str, style_id: StyleId
+        core: CoreAdapter, kana: str, style_id: StyleId
     ) -> list[AccentPhrase]:
         """AquesTalk 風記法テキストからアクセント句系列を生成し、スタイルIDに基づいてその音素長・モーラ音高を更新する"""
         accent_phrases = parse_kana(kana)
-        accent_phrases = self.update_length_and_pitch(core, accent_phrases, style_id)
+        accent_phrases = TTSEngine.update_length_and_pitch(
+            core, accent_phrases, style_id
+        )
         return accent_phrases
 
+    @staticmethod
     def synthesize_wave(
-        self,
         core: CoreAdapter,
         query: AudioQuery,
         style_id: StyleId,
@@ -562,8 +571,8 @@ def synthesize_wave(
 
     # FIXME: sing用のエンジンに移すかクラス名変える
     # 返す値の総称を考え、関数名を変更する
+    @staticmethod
     def create_sing_phoneme_and_f0_and_volume(
-        self,
         core: CoreAdapter,
         score: Score,
         style_id: StyleId,
@@ -610,8 +619,8 @@ def create_sing_phoneme_and_f0_and_volume(
 
         return phoneme_data_list, f0s.tolist(), volumes.tolist()
 
+    @staticmethod
     def create_sing_volume_from_phoneme_and_f0(
-        self,
         core: CoreAdapter,
         score: Score,
         phonemes: list[FramePhoneme],
@@ -662,8 +671,8 @@ def create_sing_volume_from_phoneme_and_f0(
 
         return volume_list
 
+    @staticmethod
     def frame_synthsize_wave(
-        self,
         core: CoreAdapter,
         query: FrameAudioQuery,
         style_id: StyleId,