From 5570f1508cb7a2277cd32549f3677281ed0e5fa2 Mon Sep 17 00:00:00 2001 From: tarepan Date: Mon, 3 Jun 2024 15:00:40 +0000 Subject: [PATCH 1/2] =?UTF-8?q?refactor:=20`TTSEngine`=20=E3=83=A1?= =?UTF-8?q?=E3=82=BD=E3=83=83=E3=83=89=E5=BC=95=E6=95=B0=E3=81=AB=20`CoreA?= =?UTF-8?q?dapter`=20=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/unit/test_mock_tts_engine.py | 5 ++- test/unit/tts_pipeline/test_tts_engine.py | 42 ++++++++++++-------- voicevox_engine/app/routers/tts_pipeline.py | 37 ++++++++++++------ voicevox_engine/cancellable_engine.py | 2 +- voicevox_engine/dev/tts_engine/mock.py | 3 ++ voicevox_engine/morphing/morphing.py | 8 +++- voicevox_engine/tts_pipeline/tts_engine.py | 43 ++++++++++++--------- 7 files changed, 88 insertions(+), 52 deletions(-) diff --git a/test/unit/test_mock_tts_engine.py b/test/unit/test_mock_tts_engine.py index f7db77918..83386fbb8 100644 --- a/test/unit/test_mock_tts_engine.py +++ b/test/unit/test_mock_tts_engine.py @@ -46,19 +46,20 @@ def _gen_accent_phrases() -> list[AccentPhrase]: def test_update_length() -> None: """`.update_length()` がエラー無く生成をおこなう""" engine = MockTTSEngine() - engine.update_length(_gen_accent_phrases(), StyleId(0)) + engine.update_length(engine._core, _gen_accent_phrases(), StyleId(0)) def test_update_pitch() -> None: """`.update_pitch()` がエラー無く生成をおこなう""" engine = MockTTSEngine() - engine.update_pitch(_gen_accent_phrases(), StyleId(0)) + engine.update_pitch(engine._core, _gen_accent_phrases(), StyleId(0)) def test_synthesize_wave() -> None: """`.synthesize_wave()` がエラー無く生成をおこなう""" engine = MockTTSEngine() engine.synthesize_wave( + engine._core, AudioQuery( accent_phrases=_gen_accent_phrases(), speedScale=1, diff --git a/test/unit/tts_pipeline/test_tts_engine.py b/test/unit/tts_pipeline/test_tts_engine.py index 760265e51..900c1e996 100644 --- a/test/unit/tts_pipeline/test_tts_engine.py +++ b/test/unit/tts_pipeline/test_tts_engine.py @@ -222,7 +222,7 @@ def test_update_length() -> None: # Inputs hello_hiho = _gen_hello_hiho_accent_phrases() # Indirect Outputs(yukarin_sに渡される値) - tts_engine.update_length(hello_hiho, StyleId(1)) + tts_engine.update_length(tts_engine._core, hello_hiho, StyleId(1)) yukarin_s_args = _yukarin_s_mock.call_args[1] list_length = yukarin_s_args["length"] phoneme_list = yukarin_s_args["phoneme_list"] @@ -252,7 +252,7 @@ def test_update_pitch() -> None: # Inputs phrases: list = [] # Outputs - result = tts_engine.update_pitch(phrases, StyleId(1)) + result = tts_engine.update_pitch(tts_engine._core, phrases, StyleId(1)) # Expects true_result: list = [] # Tests @@ -261,7 +261,7 @@ def test_update_pitch() -> None: # Inputs hello_hiho = _gen_hello_hiho_accent_phrases() # Indirect Outputs(yukarin_saに渡される値) - tts_engine.update_pitch(hello_hiho, StyleId(1)) + tts_engine.update_pitch(tts_engine._core, hello_hiho, StyleId(1)) yukarin_sa_args = _yukarin_sa_mock.call_args[1] list_length = yukarin_sa_args["length"] vowel_phoneme_list = yukarin_sa_args["vowel_phoneme_list"][0] @@ -305,7 +305,9 @@ def test_create_accent_phrases_toward_unknown() -> None: "dummy", text_to_features=stub_unknown_features_koxx ) with pytest.raises(ValueError) as e: - accent_phrases = engine.update_length_and_pitch(accent_phrases, StyleId(0)) + accent_phrases = engine.update_length_and_pitch( + engine._core, accent_phrases, StyleId(0) + ) assert str(e.value) == "tuple.index(x): x not in tuple" @@ -315,7 +317,7 @@ def test_mocked_update_length_output(snapshot_json: SnapshotAssertion) -> None: tts_engine = TTSEngine(MockCoreWrapper()) hello_hiho = _gen_hello_hiho_accent_phrases() # Outputs - result = tts_engine.update_length(hello_hiho, StyleId(1)) + result = tts_engine.update_length(tts_engine._core, hello_hiho, StyleId(1)) # Tests assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2) @@ -326,7 +328,7 @@ def test_mocked_update_pitch_output(snapshot_json: SnapshotAssertion) -> None: tts_engine = TTSEngine(MockCoreWrapper()) hello_hiho = _gen_hello_hiho_accent_phrases() # Outputs - result = tts_engine.update_pitch(hello_hiho, StyleId(1)) + result = tts_engine.update_pitch(tts_engine._core, hello_hiho, StyleId(1)) # Tests assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2) @@ -339,7 +341,9 @@ def test_mocked_update_length_and_pitch_output( tts_engine = TTSEngine(MockCoreWrapper()) hello_hiho = _gen_hello_hiho_accent_phrases() # Outputs - result = tts_engine.update_length_and_pitch(hello_hiho, StyleId(1)) + result = tts_engine.update_length_and_pitch( + tts_engine._core, hello_hiho, StyleId(1) + ) # Tests assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2) @@ -352,7 +356,7 @@ def test_mocked_create_accent_phrases_output( tts_engine = TTSEngine(MockCoreWrapper()) hello_hiho = _gen_hello_hiho_text() # Outputs - result = tts_engine.create_accent_phrases(hello_hiho, StyleId(1)) + result = tts_engine.create_accent_phrases(tts_engine._core, hello_hiho, StyleId(1)) # Tests assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2) @@ -365,7 +369,9 @@ def test_mocked_create_accent_phrases_from_kana_output( tts_engine = TTSEngine(MockCoreWrapper()) hello_hiho = _gen_hello_hiho_kana() # Outputs - result = tts_engine.create_accent_phrases_from_kana(hello_hiho, StyleId(1)) + result = tts_engine.create_accent_phrases_from_kana( + tts_engine._core, hello_hiho, StyleId(1) + ) # Tests assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2) @@ -376,7 +382,7 @@ def test_mocked_synthesize_wave_output(snapshot_json: SnapshotAssertion) -> None tts_engine = TTSEngine(MockCoreWrapper()) hello_hiho = _gen_hello_hiho_query() # Outputs - result = tts_engine.synthesize_wave(hello_hiho, StyleId(1)) + result = tts_engine.synthesize_wave(tts_engine._core, hello_hiho, StyleId(1)) # Tests assert snapshot_json == round_floats(result.tolist(), round_value=2) @@ -392,11 +398,11 @@ def test_mocked_create_sing_volume_from_phoneme_and_f0_output( tts_engine = TTSEngine(MockCoreWrapper()) doremi_srore = _gen_doremi_score() phonemes, f0s, _ = tts_engine.create_sing_phoneme_and_f0_and_volume( - doremi_srore, StyleId(1) + tts_engine._core, doremi_srore, StyleId(1) ) # Outputs result = tts_engine.create_sing_volume_from_phoneme_and_f0( - doremi_srore, phonemes, f0s, StyleId(1) + tts_engine._core, doremi_srore, phonemes, f0s, StyleId(1) ) # Tests assert snapshot_json == round_floats(result, round_value=2) @@ -413,7 +419,9 @@ def test_mocked_synthesize_wave_from_score_output( tts_engine = TTSEngine(MockCoreWrapper()) doremi_srore = _gen_doremi_score() # Outputs - result = tts_engine.create_sing_phoneme_and_f0_and_volume(doremi_srore, StyleId(1)) + result = tts_engine.create_sing_phoneme_and_f0_and_volume( + tts_engine._core, doremi_srore, StyleId(1) + ) # Tests assert snapshot_json(name="query") == round_floats( pydantic_to_native_type(result), round_value=2 @@ -430,7 +438,9 @@ def test_mocked_synthesize_wave_from_score_output( outputStereo=False, ) # Outputs - result_wave = tts_engine.frame_synthsize_wave(doremi_query, StyleId(1)) + result_wave = tts_engine.frame_synthsize_wave( + tts_engine._core, doremi_query, StyleId(1) + ) # Tests assert snapshot_json(name="wave") == round_floats( result_wave.tolist(), round_value=2 @@ -527,7 +537,7 @@ def create_synthesis_test_base( (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866) """ tts_engine = TTSEngine(core=MockCoreWrapper()) - inputs = tts_engine.create_accent_phrases(text, StyleId(1)) + inputs = tts_engine.create_accent_phrases(tts_engine._core, text, StyleId(1)) outputs = apply_interrogative_upspeak(inputs, enable_interrogative_upspeak) assert expected == outputs, f"case(text:{text})" @@ -540,7 +550,7 @@ def test_create_accent_phrases() -> None: text = "これはありますか?" expected = koreha_arimasuka_base_expected() expected[-1].is_interrogative = True - actual = tts_engine.create_accent_phrases(text, StyleId(1)) + actual = tts_engine.create_accent_phrases(tts_engine._core, text, StyleId(1)) assert expected == actual, f"case(text:{text})" diff --git a/voicevox_engine/app/routers/tts_pipeline.py b/voicevox_engine/app/routers/tts_pipeline.py index 935cad4cd..eb1e198d0 100644 --- a/voicevox_engine/app/routers/tts_pipeline.py +++ b/voicevox_engine/app/routers/tts_pipeline.py @@ -86,7 +86,7 @@ def audio_query( """ engine = tts_engines.get_engine(core_version) core = core_manager.get_core(core_version) - accent_phrases = engine.create_accent_phrases(text, style_id) + accent_phrases = engine.create_accent_phrases(engine._core, text, style_id) return AudioQuery( accent_phrases=accent_phrases, speedScale=1, @@ -130,7 +130,9 @@ def audio_query_from_preset( status_code=422, detail="該当するプリセットIDが見つかりません" ) - accent_phrases = engine.create_accent_phrases(text, selected_preset.style_id) + accent_phrases = engine.create_accent_phrases( + engine._core, text, selected_preset.style_id + ) return AudioQuery( accent_phrases=accent_phrases, speedScale=selected_preset.speedScale, @@ -173,13 +175,15 @@ def accent_phrases( engine = tts_engines.get_engine(core_version) if is_kana: try: - return engine.create_accent_phrases_from_kana(text, style_id) + return engine.create_accent_phrases_from_kana( + engine._core, text, style_id + ) except ParseKanaError as err: raise HTTPException( status_code=400, detail=ParseKanaBadRequest(err).dict() ) else: - return engine.create_accent_phrases(text, style_id) + return engine.create_accent_phrases(engine._core, text, style_id) @router.post( "/mora_data", @@ -192,7 +196,7 @@ def mora_data( core_version: str | None = None, ) -> list[AccentPhrase]: engine = tts_engines.get_engine(core_version) - return engine.update_length_and_pitch(accent_phrases, style_id) + return engine.update_length_and_pitch(engine._core, accent_phrases, style_id) @router.post( "/mora_length", @@ -205,7 +209,7 @@ def mora_length( core_version: str | None = None, ) -> list[AccentPhrase]: engine = tts_engines.get_engine(core_version) - return engine.update_length(accent_phrases, style_id) + return engine.update_length(engine._core, accent_phrases, style_id) @router.post( "/mora_pitch", @@ -218,7 +222,7 @@ def mora_pitch( core_version: str | None = None, ) -> list[AccentPhrase]: engine = tts_engines.get_engine(core_version) - return engine.update_pitch(accent_phrases, style_id) + return engine.update_pitch(engine._core, accent_phrases, style_id) @router.post( "/synthesis", @@ -246,7 +250,10 @@ def synthesis( ) -> FileResponse: engine = tts_engines.get_engine(core_version) wave = engine.synthesize_wave( - query, style_id, enable_interrogative_upspeak=enable_interrogative_upspeak + engine._core, + query, + style_id, + enable_interrogative_upspeak=enable_interrogative_upspeak, ) with NamedTemporaryFile(delete=False) as f: @@ -333,7 +340,9 @@ def multi_synthesis( ) with TemporaryFile() as wav_file: - wave = engine.synthesize_wave(queries[i], style_id) + wave = engine.synthesize_wave( + engine._core, queries[i], style_id + ) soundfile.write( file=wav_file, data=wave, @@ -366,7 +375,7 @@ def sing_frame_audio_query( core = core_manager.get_core(core_version) try: phonemes, f0, volume = engine.create_sing_phoneme_and_f0_and_volume( - score, style_id + engine._core, score, style_id ) except TalkSingInvalidInputError as e: raise HTTPException(status_code=400, detail=str(e)) @@ -394,7 +403,11 @@ def sing_frame_volume( engine = tts_engines.get_engine(core_version) try: return engine.create_sing_volume_from_phoneme_and_f0( - score, frame_audio_query.phonemes, frame_audio_query.f0, style_id + engine._core, + score, + frame_audio_query.phonemes, + frame_audio_query.f0, + style_id, ) except TalkSingInvalidInputError as e: raise HTTPException(status_code=400, detail=str(e)) @@ -421,7 +434,7 @@ def frame_synthesis( """ engine = tts_engines.get_engine(core_version) try: - wave = engine.frame_synthsize_wave(query, style_id) + wave = engine.frame_synthsize_wave(engine._core, query, style_id) except TalkSingInvalidInputError as e: raise HTTPException(status_code=400, detail=str(e)) diff --git a/voicevox_engine/cancellable_engine.py b/voicevox_engine/cancellable_engine.py index 70da62c5f..a4aa9bf6b 100644 --- a/voicevox_engine/cancellable_engine.py +++ b/voicevox_engine/cancellable_engine.py @@ -255,7 +255,7 @@ def start_synthesis_subprocess( continue # FIXME: enable_interrogative_upspeakフラグをWebAPIから受け渡してくる wave = _engine.synthesize_wave( - query, style_id, enable_interrogative_upspeak=False + _engine._core, query, style_id, enable_interrogative_upspeak=False ) with NamedTemporaryFile(delete=False) as f: soundfile.write( diff --git a/voicevox_engine/dev/tts_engine/mock.py b/voicevox_engine/dev/tts_engine/mock.py index c99c9469f..d630981f1 100644 --- a/voicevox_engine/dev/tts_engine/mock.py +++ b/voicevox_engine/dev/tts_engine/mock.py @@ -8,6 +8,8 @@ from pyopenjtalk import tts from soxr import resample +from voicevox_engine.core.core_adapter import CoreAdapter + from ...metas.Metas import StyleId from ...model import AudioQuery from ...tts_pipeline.tts_engine import TTSEngine, to_flatten_moras @@ -22,6 +24,7 @@ def __init__(self) -> None: def synthesize_wave( self, + core: CoreAdapter, query: AudioQuery, style_id: StyleId, enable_interrogative_upspeak: bool = True, diff --git a/voicevox_engine/morphing/morphing.py b/voicevox_engine/morphing/morphing.py index b5ad1ca72..797bee8fa 100644 --- a/voicevox_engine/morphing/morphing.py +++ b/voicevox_engine/morphing/morphing.py @@ -114,8 +114,12 @@ def synthesis_morphing_parameter( # WORLDに掛けるため合成はモノラルで行う query.outputStereo = False - base_wave = engine.synthesize_wave(query, base_style_id).astype(np.double) - target_wave = engine.synthesize_wave(query, target_style_id).astype(np.double) + base_wave = engine.synthesize_wave(engine._core, query, base_style_id).astype( + np.double + ) + target_wave = engine.synthesize_wave(engine._core, query, target_style_id).astype( + np.double + ) fs = query.outputSamplingRate frame_period = 1.0 diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 18e9d17b1..487b1f06e 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -419,11 +419,12 @@ class TTSEngine: def __init__(self, core: CoreWrapper): super().__init__() + # NOTE: 一時的にこの private field へ外部からアクセスしている。逆に内部からのアクセスは無い。 self._core = CoreAdapter(core) # NOTE: self._coreは将来的に消す予定 def update_length( - self, accent_phrases: list[AccentPhrase], style_id: StyleId + self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId ) -> list[AccentPhrase]: """アクセント句系列に含まれるモーラの音素長属性をスタイルに合わせて更新する""" # モーラ系列を抽出する @@ -436,7 +437,7 @@ def update_length( phoneme_ids = np.array([p.id for p in phonemes], dtype=np.int64) # コアを用いて音素長を生成する - phoneme_lengths = self._core.safe_yukarin_s_forward(phoneme_ids, style_id) + phoneme_lengths = core.safe_yukarin_s_forward(phoneme_ids, style_id) # 生成結果でモーラ内の音素長属性を置換する vowel_indexes = [i for i, p in enumerate(phonemes) if p.is_mora_tail()] @@ -450,7 +451,7 @@ def update_length( return accent_phrases def update_pitch( - self, accent_phrases: list[AccentPhrase], style_id: StyleId + self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId ) -> list[AccentPhrase]: """アクセント句系列に含まれるモーラの音高属性をスタイルに合わせて更新する""" # 後続のnumpy.concatenateが空リストだとエラーになるので別処理 @@ -495,7 +496,7 @@ def update_pitch( vowel_ids = np.array([p.id for p in vowels], dtype=np.int64) # コアを用いてモーラ音高を生成する - f0 = self._core.safe_yukarin_sa_forward( + f0 = core.safe_yukarin_sa_forward( vowel_ids, consonant_ids, start_accent_list, @@ -517,29 +518,32 @@ def update_pitch( return accent_phrases def update_length_and_pitch( - self, accent_phrases: list[AccentPhrase], style_id: StyleId + self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId ) -> list[AccentPhrase]: """アクセント句系列の音素長・モーラ音高をスタイルIDに基づいて更新する""" - accent_phrases = self.update_length(accent_phrases, style_id) - accent_phrases = self.update_pitch(accent_phrases, style_id) + accent_phrases = self.update_length(core, accent_phrases, style_id) + accent_phrases = self.update_pitch(core, accent_phrases, style_id) return accent_phrases - def create_accent_phrases(self, text: str, style_id: StyleId) -> list[AccentPhrase]: + def create_accent_phrases( + self, core: CoreAdapter, text: str, style_id: StyleId + ) -> list[AccentPhrase]: """テキストからアクセント句系列を生成し、スタイルIDに基づいてその音素長・モーラ音高を更新する""" accent_phrases = text_to_accent_phrases(text) - accent_phrases = self.update_length_and_pitch(accent_phrases, style_id) + accent_phrases = self.update_length_and_pitch(core, accent_phrases, style_id) return accent_phrases def create_accent_phrases_from_kana( - self, kana: str, style_id: StyleId + self, core: CoreAdapter, kana: str, style_id: StyleId ) -> list[AccentPhrase]: """AquesTalk 風記法テキストからアクセント句系列を生成し、スタイルIDに基づいてその音素長・モーラ音高を更新する""" accent_phrases = parse_kana(kana) - accent_phrases = self.update_length_and_pitch(accent_phrases, style_id) + accent_phrases = self.update_length_and_pitch(core, accent_phrases, style_id) return accent_phrases def synthesize_wave( self, + core: CoreAdapter, query: AudioQuery, style_id: StyleId, enable_interrogative_upspeak: bool = True, @@ -552,7 +556,7 @@ def synthesize_wave( ) phoneme, f0 = query_to_decoder_feature(query) - raw_wave, sr_raw_wave = self._core.safe_decode_forward(phoneme, f0, style_id) + raw_wave, sr_raw_wave = core.safe_decode_forward(phoneme, f0, style_id) wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave) return wave @@ -560,6 +564,7 @@ def synthesize_wave( # 返す値の総称を考え、関数名を変更する def create_sing_phoneme_and_f0_and_volume( self, + core: CoreAdapter, score: Score, style_id: StyleId, ) -> tuple[list[FramePhoneme], list[float], list[float]]: @@ -575,7 +580,7 @@ def create_sing_phoneme_and_f0_and_volume( ) = notes_to_keys_and_phonemes(notes) # コアを用いて子音長を生成する - consonant_lengths = self._core.safe_predict_sing_consonant_length_forward( + consonant_lengths = core.safe_predict_sing_consonant_length_forward( note_consonants_array, note_vowels_array, note_lengths_array, style_id ) @@ -587,13 +592,11 @@ def create_sing_phoneme_and_f0_and_volume( frame_keys = np.repeat(phoneme_keys_array, phoneme_lengths) # コアを用いて音高を生成する - f0s = self._core.safe_predict_sing_f0_forward( - frame_phonemes, frame_keys, style_id - ) + f0s = core.safe_predict_sing_f0_forward(frame_phonemes, frame_keys, style_id) # コアを用いて音量を生成する # FIXME: 変数名のsいらない? - volumes = self._core.safe_predict_sing_volume_forward( + volumes = core.safe_predict_sing_volume_forward( frame_phonemes, frame_keys, f0s, style_id ) @@ -609,6 +612,7 @@ def create_sing_phoneme_and_f0_and_volume( def create_sing_volume_from_phoneme_and_f0( self, + core: CoreAdapter, score: Score, phonemes: list[FramePhoneme], f0s: list[float], @@ -649,7 +653,7 @@ def create_sing_volume_from_phoneme_and_f0( frame_keys = np.repeat(phoneme_keys_array, phoneme_lengths) # コアを用いて音量を生成する - volumes = self._core.safe_predict_sing_volume_forward( + volumes = core.safe_predict_sing_volume_forward( frame_phonemes, frame_keys, f0_array, style_id ) @@ -660,13 +664,14 @@ def create_sing_volume_from_phoneme_and_f0( def frame_synthsize_wave( self, + core: CoreAdapter, query: FrameAudioQuery, style_id: StyleId, ) -> NDArray[np.float32]: """歌声合成用のクエリ・スタイルIDに基づいて音声波形を生成する""" phoneme, f0, volume = frame_query_to_sf_decoder_feature(query) - raw_wave, sr_raw_wave = self._core.safe_sf_decode_forward( + raw_wave, sr_raw_wave = core.safe_sf_decode_forward( phoneme, f0, volume, style_id ) wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave) From 75f6487c94c5f72e5e588f01d4153c5dafb46084 Mon Sep 17 00:00:00 2001 From: tarepan Date: Mon, 3 Jun 2024 16:10:15 +0000 Subject: [PATCH 2/2] =?UTF-8?q?refactor:=20`TTSEngine`=20=E3=83=A1?= =?UTF-8?q?=E3=82=BD=E3=83=83=E3=83=89=E3=82=92=20staticmethod=20=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- voicevox_engine/dev/tts_engine/mock.py | 7 +++-- voicevox_engine/tts_pipeline/tts_engine.py | 35 ++++++++++++++-------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/voicevox_engine/dev/tts_engine/mock.py b/voicevox_engine/dev/tts_engine/mock.py index d630981f1..0e22ecf2b 100644 --- a/voicevox_engine/dev/tts_engine/mock.py +++ b/voicevox_engine/dev/tts_engine/mock.py @@ -22,8 +22,8 @@ class MockTTSEngine(TTSEngine): def __init__(self) -> None: super().__init__(MockCoreWrapper()) + @staticmethod def synthesize_wave( - self, core: CoreAdapter, query: AudioQuery, style_id: StyleId, @@ -37,14 +37,15 @@ def synthesize_wave( flatten_moras = to_flatten_moras(query.accent_phrases) kana_text = "".join([mora.text for mora in flatten_moras]) - wave = self.forward(kana_text) + wave = MockTTSEngine.forward(kana_text) # volume wave *= query.volumeScale return wave - def forward(self, text: str, **kwargs: dict[str, Any]) -> NDArray[np.float32]: + @staticmethod + def forward(text: str, **kwargs: dict[str, Any]) -> NDArray[np.float32]: """ forward tts via pyopenjtalk.tts() 参照→TTSEngine のdocstring [Mock] diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 487b1f06e..b9cb7c5db 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -423,8 +423,9 @@ def __init__(self, core: CoreWrapper): self._core = CoreAdapter(core) # NOTE: self._coreは将来的に消す予定 + @staticmethod def update_length( - self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId + core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId ) -> list[AccentPhrase]: """アクセント句系列に含まれるモーラの音素長属性をスタイルに合わせて更新する""" # モーラ系列を抽出する @@ -450,8 +451,9 @@ def update_length( return accent_phrases + @staticmethod def update_pitch( - self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId + core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId ) -> list[AccentPhrase]: """アクセント句系列に含まれるモーラの音高属性をスタイルに合わせて更新する""" # 後続のnumpy.concatenateが空リストだとエラーになるので別処理 @@ -517,32 +519,39 @@ def update_pitch( return accent_phrases + @staticmethod def update_length_and_pitch( - self, core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId + core: CoreAdapter, accent_phrases: list[AccentPhrase], style_id: StyleId ) -> list[AccentPhrase]: """アクセント句系列の音素長・モーラ音高をスタイルIDに基づいて更新する""" - accent_phrases = self.update_length(core, accent_phrases, style_id) - accent_phrases = self.update_pitch(core, accent_phrases, style_id) + accent_phrases = TTSEngine.update_length(core, accent_phrases, style_id) + accent_phrases = TTSEngine.update_pitch(core, accent_phrases, style_id) return accent_phrases + @staticmethod def create_accent_phrases( - self, core: CoreAdapter, text: str, style_id: StyleId + core: CoreAdapter, text: str, style_id: StyleId ) -> list[AccentPhrase]: """テキストからアクセント句系列を生成し、スタイルIDに基づいてその音素長・モーラ音高を更新する""" accent_phrases = text_to_accent_phrases(text) - accent_phrases = self.update_length_and_pitch(core, accent_phrases, style_id) + accent_phrases = TTSEngine.update_length_and_pitch( + core, accent_phrases, style_id + ) return accent_phrases + @staticmethod def create_accent_phrases_from_kana( - self, core: CoreAdapter, kana: str, style_id: StyleId + core: CoreAdapter, kana: str, style_id: StyleId ) -> list[AccentPhrase]: """AquesTalk 風記法テキストからアクセント句系列を生成し、スタイルIDに基づいてその音素長・モーラ音高を更新する""" accent_phrases = parse_kana(kana) - accent_phrases = self.update_length_and_pitch(core, accent_phrases, style_id) + accent_phrases = TTSEngine.update_length_and_pitch( + core, accent_phrases, style_id + ) return accent_phrases + @staticmethod def synthesize_wave( - self, core: CoreAdapter, query: AudioQuery, style_id: StyleId, @@ -562,8 +571,8 @@ def synthesize_wave( # FIXME: sing用のエンジンに移すかクラス名変える # 返す値の総称を考え、関数名を変更する + @staticmethod def create_sing_phoneme_and_f0_and_volume( - self, core: CoreAdapter, score: Score, style_id: StyleId, @@ -610,8 +619,8 @@ def create_sing_phoneme_and_f0_and_volume( return phoneme_data_list, f0s.tolist(), volumes.tolist() + @staticmethod def create_sing_volume_from_phoneme_and_f0( - self, core: CoreAdapter, score: Score, phonemes: list[FramePhoneme], @@ -662,8 +671,8 @@ def create_sing_volume_from_phoneme_and_f0( return volume_list + @staticmethod def frame_synthsize_wave( - self, core: CoreAdapter, query: FrameAudioQuery, style_id: StyleId,