diff --git a/Dockerfile b/Dockerfile index 22be5f3dd..d261f358a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -233,6 +233,7 @@ ADD ./run.py ./presets.yaml ./engine_manifest.json /opt/voicevox_engine/ ADD ./resources /opt/voicevox_engine/resources ADD ./tools/generate_licenses.py /opt/voicevox_engine/tools/ ADD ./tools/licenses /opt/voicevox_engine/tools/licenses +ADD ./tools/generate_filemap.py /opt/voicevox_engine/tools/ # Replace version ARG VOICEVOX_ENGINE_VERSION=latest @@ -259,8 +260,7 @@ RUN < None: disable_mutable_api = envs.disable_mutable_api root_dir = select_first_not_none([args.voicevox_dir, engine_root()]) - speaker_info_dir = root_dir / "resources" / "character_info" + character_info_dir = root_dir / "resources" / "character_info" # NOTE: ENGINE v0.19 以前向けに後方互換性を確保する - if not speaker_info_dir.exists(): - speaker_info_dir = root_dir / "speaker_info" + if not character_info_dir.exists(): + character_info_dir = root_dir / "speaker_info" # ASGI に準拠した VOICEVOX ENGINE アプリケーションを生成する app = generate_app( @@ -400,7 +400,7 @@ def main() -> None: engine_manifest, library_manager, cancellable_engine, - speaker_info_dir, + character_info_dir, cors_policy_mode, allow_origin, disable_mutable_api=disable_mutable_api, diff --git a/test/benchmark/engine_preparation.py b/test/benchmark/engine_preparation.py index 578b54e40..21140ffd5 100644 --- a/test/benchmark/engine_preparation.py +++ b/test/benchmark/engine_preparation.py @@ -39,7 +39,7 @@ def _generate_engine_fake_server(root_dir: Path) -> TestClient: core_manager=core_manager, setting_loader=setting_loader, preset_manager=preset_manager, - speaker_info_dir=root_dir / "resources" / "character_info", + character_info_dir=root_dir / "resources" / "character_info", user_dict=user_dict, engine_manifest=engine_manifest, library_manager=library_manager, diff --git a/test/benchmark/speed/speaker.py b/test/benchmark/speed/speaker.py index 7b8c39ecc..058dc5211 100644 --- a/test/benchmark/speed/speaker.py +++ b/test/benchmark/speed/speaker.py @@ -29,19 +29,19 @@ def benchmark_get_speaker_info_all( # speaker_uuid 一覧を準備 response = client.get("/speakers", params={}) assert response.status_code == 200 - speakers = response.json() - speaker_uuids = list(map(lambda speaker: speaker["speaker_uuid"], speakers)) + talk_characters = response.json() + uuids = list(map(lambda c: c["speaker_uuid"], talk_characters)) def execute() -> None: """計測対象となる処理を実行する""" - for speaker_uuid in speaker_uuids: - client.get("/speaker_info", params={"speaker_uuid": speaker_uuid}) + for uuid in uuids: + client.get("/speaker_info", params={"speaker_uuid": uuid}) average_time = benchmark_time(execute, n_repeat=10) return average_time -def benchmark_request_time_for_all_speakers( +def benchmark_request_time_for_all_talk_characters( server: ServerType, root_dir: Path | None = None ) -> float: """ @@ -54,12 +54,12 @@ def benchmark_request_time_for_all_speakers( # speaker_uuid 一覧を準備 response = client.get("/speakers", params={}) assert response.status_code == 200 - speakers = response.json() - speaker_uuids = list(map(lambda speaker: speaker["speaker_uuid"], speakers)) + talk_characters = response.json() + uuids = list(map(lambda c: c["speaker_uuid"], talk_characters)) def execute() -> None: """計測対象となる処理を実行する""" - for _ in speaker_uuids: + for _ in uuids: client.get("/", params={}) average_time = benchmark_time(execute, n_repeat=10) @@ -89,7 +89,9 @@ def execute() -> None: print(f"全話者 `GET /speaker_info` fakeserve: {result_spk_infos_fakeserve} sec") print(f"全話者 `GET /speaker_info` 
localhost: {result_spk_infos_localhost} sec") - req_time_all_fake = benchmark_request_time_for_all_speakers("fake", root_dir) - req_time_all_local = benchmark_request_time_for_all_speakers("localhost", root_dir) + req_time_all_fake = benchmark_request_time_for_all_talk_characters("fake", root_dir) + req_time_all_local = benchmark_request_time_for_all_talk_characters( + "localhost", root_dir + ) print("全話者 `GET /` fakeserve: {:.3f} sec".format(req_time_all_fake)) print("全話者 `GET /` localhost: {:.3f} sec".format(req_time_all_local)) diff --git "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" index f78bf0f1b..65c84de4c 100644 --- "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" +++ "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" @@ -67,6 +67,7 @@ "type": "number" }, "pauseLengthScale": { + "default": 1, "title": "句読点などの無音時間(倍率)", "type": "number" }, @@ -99,7 +100,6 @@ "volumeScale", "prePhonemeLength", "postPhonemeLength", - "pauseLengthScale", "outputSamplingRate", "outputStereo" ], @@ -615,6 +615,7 @@ "type": "number" }, "pauseLengthScale": { + "default": 1, "title": "句読点などの無音時間(倍率)", "type": "number" }, @@ -657,8 +658,7 @@ "intonationScale", "volumeScale", "prePhonemeLength", - "postPhonemeLength", - "pauseLengthScale" + "postPhonemeLength" ], "title": "Preset", "type": "object" @@ -861,6 +861,10 @@ "title": "モーラごとの音高の調整", "type": "boolean" }, + "adjust_pause_length": { + "title": "句読点などの無音時間の調整", + "type": "boolean" + }, "adjust_phoneme_length": { "title": "音素ごとの長さの調整", "type": "boolean" diff --git a/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json b/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json index 774fdaa43..d24ff2283 100644 --- a/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json +++ b/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json @@ -16,6 +16,7 @@ "supported_features": { "adjust_intonation_scale": true, "adjust_mora_pitch": true, + "adjust_pause_length": true, "adjust_phoneme_length": true, "adjust_pitch_scale": true, "adjust_speed_scale": true, diff --git a/test/e2e/test_characters.py b/test/e2e/test_characters.py index 161cc3e05..7c9bdd34d 100644 --- a/test/e2e/test_characters.py +++ b/test/e2e/test_characters.py @@ -1,7 +1,4 @@ -""" -話者・歌手のテスト。 -TODO: 話者と歌手の両ドメイン共通のドメイン用語を定め、このテストファイル名を変更する。 -""" +"""話者・歌手のテスト""" import hashlib from test.utility import hash_long_string @@ -42,13 +39,13 @@ def test_話者一覧が取得できる( def test_話者の情報を取得できる( client: TestClient, snapshot_json: SnapshotAssertion ) -> None: - speakers = _speaker_list_adapter.validate_python(client.get("/speakers").json()) - for 
speaker in speakers: + talkers = _speaker_list_adapter.validate_python(client.get("/speakers").json()) + for talker in talkers: response = client.get( - "/speaker_info", params={"speaker_uuid": speaker.speaker_uuid} + "/speaker_info", params={"speaker_uuid": talker.speaker_uuid} ) assert snapshot_json( - name=speaker.speaker_uuid, + name=talker.speaker_uuid, ) == hash_long_string(response.json()) diff --git a/test/e2e/test_missing_core.py b/test/e2e/test_missing_core.py index 39e241fa2..2365cecd4 100644 --- a/test/e2e/test_missing_core.py +++ b/test/e2e/test_missing_core.py @@ -6,6 +6,6 @@ def test_missing_core_422(client: TestClient, snapshot_json: SnapshotAssertion) -> None: """存在しないコアを指定するとエラーを返す。""" - response = client.get("/supported_devices", params={"core_version": "4.0.4"}) + response = client.get("/speakers", params={"core_version": "4.0.4"}) assert response.status_code == 422 assert snapshot_json == response.json() diff --git a/test/unit/library/test_library_manager.py b/test/unit/library/test_library_manager.py index d3ea405a3..061d47787 100644 --- a/test/unit/library/test_library_manager.py +++ b/test/unit/library/test_library_manager.py @@ -65,8 +65,8 @@ def setUp(self) -> None: self.vvlib_manifest = json.loads(f.read()) self.library_uuid = self.vvlib_manifest["uuid"] with ZipFile(self.library_filename, "w") as zf: - speaker_infos = glob.glob("resources/character_info/**", recursive=True) - for info in speaker_infos: + character_infos = glob.glob("resources/character_info/**", recursive=True) + for info in character_infos: zf.write(info) zf.writestr(VVLIB_MANIFEST_NAME, json.dumps(self.vvlib_manifest)) self.library_file = open(self.library_filename, "br") diff --git a/test/unit/test_metas_store.py b/test/unit/test_metas_store.py index 194cc20d8..9461e13ca 100644 --- a/test/unit/test_metas_store.py +++ b/test/unit/test_metas_store.py @@ -1,130 +1,96 @@ import uuid -from voicevox_engine.metas.Metas import Speaker, SpeakerStyle, StyleId, StyleType +from voicevox_engine.metas.Metas import ( + SpeakerStyle, + SpeakerSupportedFeatures, + StyleId, + StyleType, +) from voicevox_engine.metas.MetasStore import ( - SING_STYLE_TYPES, - TALK_STYLE_TYPES, + _SING_STYLE_TYPES, + _TALK_STYLE_TYPES, Character, filter_characters_and_styles, ) -def _speakers_to_characters(speakers: list[Speaker]) -> list[Character]: - """Speaker 配列をキャラクター配列へキャストする。""" - characters: list[Character] = [] - for speaker in speakers: - styles = speaker.styles - talk_styles = filter(lambda style: style.type in TALK_STYLE_TYPES, styles) - sing_styles = filter(lambda style: style.type in SING_STYLE_TYPES, styles) - characters.append( - Character( - name=speaker.name, - uuid=speaker.speaker_uuid, - talk_styles=list(talk_styles), - sing_styles=list(sing_styles), - version=speaker.version, - supported_features=speaker.supported_features, - ) - ) - return characters - - -def _gen_speaker(style_types: list[StyleType]) -> Speaker: - return Speaker( - speaker_uuid=str(uuid.uuid4()), +def _gen_character(style_types: list[StyleType]) -> Character: + talk_styles = list(filter(lambda s: s in _TALK_STYLE_TYPES, style_types)) + sing_styles = list(filter(lambda s: s in _SING_STYLE_TYPES, style_types)) + return Character( name="", - styles=[ - SpeakerStyle( - name="", - id=StyleId(0), - type=style_type, - ) - for style_type in style_types + uuid=str(uuid.uuid4()), + talk_styles=[ + SpeakerStyle(name="", id=StyleId(0), type=style_type) + for style_type in talk_styles + ], + sing_styles=[ + SpeakerStyle(name="", id=StyleId(0), 
type=style_type) + for style_type in sing_styles ], version="", + supported_features=SpeakerSupportedFeatures(), ) -def _equal_speakers(a: list[Speaker], b: list[Speaker]) -> bool: +def _equal_characters(a: list[Character], b: list[Character]) -> bool: if len(a) != len(b): return False for i in range(len(a)): - if a[i].speaker_uuid != b[i].speaker_uuid: + if a[i].uuid != b[i].uuid: return False return True -def test_filter_speakers_and_styles_with_speaker() -> None: +def test_filter_characters_and_styles_with_talk() -> None: # Inputs - speaker_talk_only = _gen_speaker(["talk"]) - speaker_singing_teacher_only = _gen_speaker(["singing_teacher"]) - speaker_frame_decode_only = _gen_speaker(["frame_decode"]) - speaker_sing_only = _gen_speaker(["sing"]) - speaker_allstyle = _gen_speaker(["talk", "singing_teacher", "frame_decode", "sing"]) + talk_only = _gen_character(["talk"]) + singing_teacher_only = _gen_character(["singing_teacher"]) + frame_decode_only = _gen_character(["frame_decode"]) + sing_only = _gen_character(["sing"]) + allstyle = _gen_character(["talk", "singing_teacher", "frame_decode", "sing"]) # Outputs result = filter_characters_and_styles( - _speakers_to_characters( - [ - speaker_talk_only, - speaker_singing_teacher_only, - speaker_frame_decode_only, - speaker_sing_only, - speaker_allstyle, - ] - ), - "speaker", + [talk_only, singing_teacher_only, frame_decode_only, sing_only, allstyle], + "talk", ) # Tests assert len(result) == 2 # 話者だけになっている - assert _equal_speakers(result, [speaker_talk_only, speaker_allstyle]) + assert _equal_characters(result, [talk_only, allstyle]) # スタイルがフィルタリングされている - for speaker in result: - for style in speaker.styles: + for characters in result: + for style in characters.talk_styles + characters.sing_styles: assert style.type == "talk" -def test_filter_speakers_and_styles_with_singer() -> None: +def test_filter_characters_and_styles_with_sing() -> None: # Inputs - speaker_talk_only = _gen_speaker(["talk"]) - speaker_singing_teacher_only = _gen_speaker(["singing_teacher"]) - speaker_frame_decode_only = _gen_speaker(["frame_decode"]) - speaker_sing_only = _gen_speaker(["sing"]) - speaker_allstyle = _gen_speaker(["talk", "singing_teacher", "frame_decode", "sing"]) + talk_only = _gen_character(["talk"]) + singing_teacher_only = _gen_character(["singing_teacher"]) + frame_decode_only = _gen_character(["frame_decode"]) + sing_only = _gen_character(["sing"]) + allstyle = _gen_character(["talk", "singing_teacher", "frame_decode", "sing"]) # Outputs result = filter_characters_and_styles( - _speakers_to_characters( - [ - speaker_talk_only, - speaker_singing_teacher_only, - speaker_frame_decode_only, - speaker_sing_only, - speaker_allstyle, - ] - ), - "singer", + [talk_only, singing_teacher_only, frame_decode_only, sing_only, allstyle], + "sing", ) # Tests assert len(result) == 4 # 歌手だけになっている - assert _equal_speakers( - result, - [ - speaker_singing_teacher_only, - speaker_frame_decode_only, - speaker_sing_only, - speaker_allstyle, - ], + assert _equal_characters( + result, [singing_teacher_only, frame_decode_only, sing_only, allstyle] ) # スタイルがフィルタリングされている - for speaker in result: - for style in speaker.styles: + for character in result: + for style in character.talk_styles + character.sing_styles: assert style.type in ["singing_teacher", "frame_decode", "sing"] diff --git a/test/unit/tts_pipeline/test_tts_engine.py b/test/unit/tts_pipeline/test_tts_engine.py index 4e71eee1e..1bbae2092 100644 --- a/test/unit/tts_pipeline/test_tts_engine.py +++ 
b/test/unit/tts_pipeline/test_tts_engine.py @@ -26,6 +26,7 @@ ) from .test_text_analyzer import stub_unknown_features_koxx +from .tts_utils import gen_mora def yukarin_s_mock( @@ -103,32 +104,13 @@ def is_model_loaded(self, style_id: str) -> bool: return True -def _gen_mora( - text: str, - consonant: str | None, - consonant_length: float | None, - vowel: str, - vowel_length: float, - pitch: float, -) -> Mora: - """Generate Mora with positional arguments for test simplicity.""" - return Mora( - text=text, - consonant=consonant, - consonant_length=consonant_length, - vowel=vowel, - vowel_length=vowel_length, - pitch=pitch, - ) - - def test_to_flatten_phonemes() -> None: """Test `to_flatten_phonemes`.""" # Inputs moras = [ - _gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), - _gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), + gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0), + gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), ] # Expects @@ -140,33 +122,25 @@ def test_to_flatten_phonemes() -> None: assert true_phonemes == phonemes -def _gen_hello_hiho_text() -> str: - return "こんにちは、ヒホです" - - -def _gen_hello_hiho_kana() -> str: - return "コンニチワ'、ヒ'ホデ_ス" - - def _gen_hello_hiho_accent_phrases() -> list[AccentPhrase]: return [ AccentPhrase( moras=[ - _gen_mora("コ", "k", 0.0, "o", 0.0, 0.0), - _gen_mora("ン", None, None, "N", 0.0, 0.0), - _gen_mora("ニ", "n", 0.0, "i", 0.0, 0.0), - _gen_mora("チ", "ch", 0.0, "i", 0.0, 0.0), - _gen_mora("ワ", "w", 0.0, "a", 0.0, 0.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 0.0), + gen_mora("ン", None, None, "N", 0.0, 0.0), + gen_mora("ニ", "n", 0.0, "i", 0.0, 0.0), + gen_mora("チ", "ch", 0.0, "i", 0.0, 0.0), + gen_mora("ワ", "w", 0.0, "a", 0.0, 0.0), ], accent=5, - pause_mora=_gen_mora("、", None, None, "pau", 0.0, 0.0), + pause_mora=gen_mora("、", None, None, "pau", 0.0, 0.0), ), AccentPhrase( moras=[ - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 0.0), - _gen_mora("ホ", "h", 0.0, "o", 0.0, 0.0), - _gen_mora("デ", "d", 0.0, "e", 0.0, 0.0), - _gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 0.0), + gen_mora("ホ", "h", 0.0, "o", 0.0, 0.0), + gen_mora("デ", "d", 0.0, "e", 0.0, 0.0), + gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), ], accent=1, pause_mora=None, @@ -187,7 +161,7 @@ def _gen_hello_hiho_query() -> AudioQuery: pauseLengthScale=0.8, outputSamplingRate=12000, outputStereo=True, - kana=_gen_hello_hiho_kana(), + kana="コンニチワ'、ヒ'ホデ_ス", ) @@ -352,7 +326,7 @@ def test_mocked_create_accent_phrases_output( """モックされた `TTSEngine.create_accent_phrases()` の出力スナップショットが一定である""" # Inputs tts_engine = TTSEngine(MockCoreWrapper()) - hello_hiho = _gen_hello_hiho_text() + hello_hiho = "こんにちは、ヒホです" # Outputs result = tts_engine.create_accent_phrases(hello_hiho, StyleId(1)) # Tests @@ -365,7 +339,7 @@ def test_mocked_create_accent_phrases_from_kana_output( """モックされた `TTSEngine.create_accent_phrases_from_kana()` の出力スナップショットが一定である""" # Inputs tts_engine = TTSEngine(MockCoreWrapper()) - hello_hiho = _gen_hello_hiho_kana() + hello_hiho = "コンニチワ'、ヒ'ホデ_ス" # Outputs result = tts_engine.create_accent_phrases_from_kana(hello_hiho, StyleId(1)) # Tests @@ -522,16 +496,9 @@ def koreha_arimasuka_base_expected() -> list[AccentPhrase]: ] -def create_synthesis_test_base( - text: str, expected: list[AccentPhrase], enable_interrogative_upspeak: bool -) -> None: - """音声合成時に疑問文モーラ処理を行っているかどうかを検証 - 
(https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866) - """ +def create_synthesis_test_base(text: str) -> list[AccentPhrase]: tts_engine = TTSEngine(core=MockCoreWrapper()) - inputs = tts_engine.create_accent_phrases(text, StyleId(1)) - outputs = apply_interrogative_upspeak(inputs, enable_interrogative_upspeak) - assert expected == outputs, f"case(text:{text})" + return tts_engine.create_accent_phrases(text, StyleId(1)) def test_create_accent_phrases() -> None: @@ -548,6 +515,9 @@ def test_create_accent_phrases() -> None: def test_upspeak_voiced_last_mora() -> None: # voiced + "?" + flagON -> upspeak + # Inputs + inputs = create_synthesis_test_base(text="これはありますか?") + # Expects expected = koreha_arimasuka_base_expected() expected[-1].is_interrogative = True expected[-1].moras += [ @@ -560,28 +530,31 @@ def test_upspeak_voiced_last_mora() -> None: pitch=np.float32(expected[-1].moras[-1].pitch) + 0.3, ) ] - create_synthesis_test_base( - text="これはありますか?", - expected=expected, - enable_interrogative_upspeak=True, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, True) + # Test + assert expected == outputs # voiced + "?" + flagOFF -> non-upspeak + # Inputs + inputs = create_synthesis_test_base(text="これはありますか?") + # Expects expected = koreha_arimasuka_base_expected() expected[-1].is_interrogative = True - create_synthesis_test_base( - text="これはありますか?", - expected=expected, - enable_interrogative_upspeak=False, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, False) + # Test + assert expected == outputs # voiced + "" + flagON -> non-upspeak + # Inputs + inputs = create_synthesis_test_base(text="これはありますか") + # Expects expected = koreha_arimasuka_base_expected() - create_synthesis_test_base( - text="これはありますか", - expected=expected, - enable_interrogative_upspeak=True, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, True) + # Test + assert expected == outputs def test_upspeak_voiced_N_last_mora() -> None: @@ -605,14 +578,19 @@ def nn_base_expected() -> list[AccentPhrase]: ] # voiced + "" + flagON -> upspeak + # Inputs + inputs = create_synthesis_test_base(text="ん") + # Expects expected = nn_base_expected() - create_synthesis_test_base( - text="ん", - expected=expected, - enable_interrogative_upspeak=True, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, True) + # Test + assert expected == outputs # voiced + "?" + flagON -> upspeak + # Inputs + inputs = create_synthesis_test_base(text="ん?") + # Expects expected = nn_base_expected() expected[-1].is_interrogative = True expected[-1].moras += [ @@ -625,20 +603,21 @@ def nn_base_expected() -> list[AccentPhrase]: pitch=np.float32(expected[-1].moras[-1].pitch) + 0.3, ) ] - create_synthesis_test_base( - text="ん?", - expected=expected, - enable_interrogative_upspeak=True, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, True) + # Test + assert expected == outputs # voiced + "?" 
+ flagOFF -> non-upspeak + # Inputs + inputs = create_synthesis_test_base(text="ん?") + # Expects expected = nn_base_expected() expected[-1].is_interrogative = True - create_synthesis_test_base( - text="ん?", - expected=expected, - enable_interrogative_upspeak=False, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, False) + # Test + assert expected == outputs def test_upspeak_unvoiced_last_mora() -> None: @@ -662,30 +641,36 @@ def ltu_base_expected() -> list[AccentPhrase]: ] # unvoiced + "" + flagON -> non-upspeak + # Inputs + inputs = create_synthesis_test_base(text="っ") + # Expects expected = ltu_base_expected() - create_synthesis_test_base( - text="っ", - expected=expected, - enable_interrogative_upspeak=True, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, True) + # Test + assert expected == outputs # unvoiced + "?" + flagON -> non-upspeak + # Inputs + inputs = create_synthesis_test_base(text="っ?") + # Expects expected = ltu_base_expected() expected[-1].is_interrogative = True - create_synthesis_test_base( - text="っ?", - expected=expected, - enable_interrogative_upspeak=True, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, True) + # Test + assert expected == outputs # unvoiced + "?" + flagOFF -> non-upspeak + # Inputs + inputs = create_synthesis_test_base(text="っ?") + # Expects expected = ltu_base_expected() expected[-1].is_interrogative = True - create_synthesis_test_base( - text="っ?", - expected=expected, - enable_interrogative_upspeak=False, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, False) + # Test + assert expected == outputs def test_upspeak_voiced_u_last_mora() -> None: @@ -709,14 +694,19 @@ def su_base_expected() -> list[AccentPhrase]: ] # voiced + "" + flagON -> non-upspeak + # Inputs + inputs = create_synthesis_test_base(text="す") + # Expects expected = su_base_expected() - create_synthesis_test_base( - text="す", - expected=expected, - enable_interrogative_upspeak=True, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, True) + # Test + assert expected == outputs # voiced + "?" + flagON -> upspeak + # Inputs + inputs = create_synthesis_test_base(text="す?") + # Expects expected = su_base_expected() expected[-1].is_interrogative = True expected[-1].moras += [ @@ -729,17 +719,18 @@ def su_base_expected() -> list[AccentPhrase]: pitch=expected[-1].moras[-1].pitch + 0.3, ) ] - create_synthesis_test_base( - text="す?", - expected=expected, - enable_interrogative_upspeak=True, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, True) + # Test + assert expected == outputs # voiced + "?" 
+ flagOFF -> non-upspeak + # Inputs + inputs = create_synthesis_test_base(text="す?") + # Expects expected = su_base_expected() expected[-1].is_interrogative = True - create_synthesis_test_base( - text="す?", - expected=expected, - enable_interrogative_upspeak=False, - ) + # Outputs + outputs = apply_interrogative_upspeak(inputs, False) + # Test + assert expected == outputs diff --git a/test/unit/tts_pipeline/test_wave_synthesizer.py b/test/unit/tts_pipeline/test_wave_synthesizer.py index 0d872dc37..ce213c3c1 100644 --- a/test/unit/tts_pipeline/test_wave_synthesizer.py +++ b/test/unit/tts_pipeline/test_wave_synthesizer.py @@ -3,7 +3,7 @@ import numpy as np from voicevox_engine.model import AudioQuery -from voicevox_engine.tts_pipeline.model import AccentPhrase, Mora +from voicevox_engine.tts_pipeline.model import AccentPhrase from voicevox_engine.tts_pipeline.tts_engine import ( apply_intonation_scale, apply_output_sampling_rate, @@ -17,6 +17,8 @@ raw_wave_to_output_wave, ) +from .tts_utils import gen_mora + TRUE_NUM_PHONEME = 45 @@ -50,38 +52,19 @@ def _gen_query( ) -def _gen_mora( - text: str, - consonant: str | None, - consonant_length: float | None, - vowel: str, - vowel_length: float, - pitch: float, -) -> Mora: - """Generate Mora with positional arguments for test simplicity.""" - return Mora( - text=text, - consonant=consonant, - consonant_length=consonant_length, - vowel=vowel, - vowel_length=vowel_length, - pitch=pitch, - ) - - def test_apply_prepost_silence() -> None: """Test `apply_prepost_silence`.""" # Inputs query = _gen_query(prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067) moras = [ - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0), ] # Expects true_moras_with_silence = [ - _gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), - _gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), + gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0), + gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), ] # Outputs @@ -95,20 +78,20 @@ def test_apply_speed_scale() -> None: # Inputs query = _gen_query(speedScale=2.0) input_moras = [ - _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), - _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), - _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), - _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 5.0), + gen_mora("ン", None, None, "N", 4 * 0.01067, 5.0), + gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 6.0), + gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), ] # Expects - x2 fast true_moras = [ - _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0), - _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0), - _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0), - _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), + gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 5.0), + gen_mora("ン", None, None, "N", 2 * 0.01067, 5.0), + gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), + gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 6.0), + gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), ] # Outputs @@ -122,20 +105,20 @@ def test_apply_pitch_scale() -> None: # Inputs query = _gen_query(pitchScale=2.0) 
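# NOTE (annotation, not part of the patch): this diff rescales the pitch
# fixtures from Hz-like values (50.0, 125.0) to log-F0-like values (5.0, 6.0).
# A minimal sketch of the scaling the expected values below imply, assuming
# the engine multiplies each mora pitch by 2 ** pitchScale — consistent with
# the fixtures (5.0 * 2**2 == 20.0, 6.0 * 2**2 == 24.0) and the "x4 value
# scaled" comment, but illustrative only, not the engine's implementation:

def apply_pitch_scale_sketch(pitches: list[float], pitch_scale: float) -> list[float]:
    # Voiced moras scale by 2**pitchScale; unvoiced pitches of 0.0 stay 0.0.
    return [p * 2**pitch_scale for p in pitches]

assert apply_pitch_scale_sketch([5.0, 6.0, 0.0], 2.0) == [20.0, 24.0, 0.0]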
input_moras = [ - _gen_mora("コ", "k", 0.0, "o", 0.0, 50.0), - _gen_mora("ン", None, None, "N", 0.0, 50.0), - _gen_mora("、", None, None, "pau", 0.0, 0.0), - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0), - _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 5.0), + gen_mora("ン", None, None, "N", 0.0, 5.0), + gen_mora("、", None, None, "pau", 0.0, 0.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 6.0), + gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] # Expects - x4 value scaled true_moras = [ - _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0), - _gen_mora("ン", None, None, "N", 0.0, 200.0), - _gen_mora("、", None, None, "pau", 0.0, 0.0), - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0), - _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 20.0), + gen_mora("ン", None, None, "N", 0.0, 20.0), + gen_mora("、", None, None, "pau", 0.0, 0.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 24.0), + gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] # Outputs @@ -149,20 +132,20 @@ def test_apply_intonation_scale() -> None: # Inputs query = _gen_query(intonationScale=0.5) input_moras = [ - _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0), - _gen_mora("ン", None, None, "N", 0.0, 200.0), - _gen_mora("、", None, None, "pau", 0.0, 0.0), - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0), - _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 5.0), + gen_mora("ン", None, None, "N", 0.0, 5.0), + gen_mora("、", None, None, "pau", 0.0, 0.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 8.0), + gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] - # Expects - mean=300 var x0.5 intonation scaling + # Expects - mean=6 var x0.5 intonation scaling true_moras = [ - _gen_mora("コ", "k", 0.0, "o", 0.0, 250.0), - _gen_mora("ン", None, None, "N", 0.0, 250.0), - _gen_mora("、", None, None, "pau", 0.0, 0.0), - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0), - _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 5.5), + gen_mora("ン", None, None, "N", 0.0, 5.5), + gen_mora("、", None, None, "pau", 0.0, 0.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 7.0), + gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] # Outputs @@ -222,13 +205,13 @@ def test_count_frame_per_unit() -> None: """Test `count_frame_per_unit`.""" # Inputs moras = [ - _gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame] - _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0), - _gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0), - _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0), - _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), - _gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0), + gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame] + gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0), + gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0), + gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0), + gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0), ] # Expects @@ -252,16 +235,16 @@ def test_query_to_decoder_feature() -> None: accent_phrases = [ AccentPhrase( moras=[ - _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), - _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), + gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 5.0), + gen_mora("ン", None, None, "N", 4 * 0.01067, 5.0), ], accent=1, - pause_mora=_gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + pause_mora=gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), ), AccentPhrase( moras=[ 
- _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), - _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 8.0), + gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), ], accent=1, pause_mora=None, @@ -292,10 +275,10 @@ def test_query_to_decoder_feature() -> None: # Pitch # paw ko N pau hi hO paw # frame_per_vowel = [1, 3, 2, 1, 3, 3, 3] - # pau ko ko ko N N - true1_f0 = [0.0, 250.0, 250.0, 250.0, 250.0, 250.0] - # pau pau hi hi hi - true2_f0 = [0.0, 0.0, 400.0, 400.0, 400.0] + # pau ko ko ko N N + true1_f0 = [0.0, 22.0, 22.0, 22.0, 22.0, 22.0] + # pau pau hi hi hi + true2_f0 = [0.0, 0.0, 28.0, 28.0, 28.0] # hO hO hO paw paw paw true3_f0 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] true_f0 = np.array(true1_f0 + true2_f0 + true3_f0, dtype=np.float32) diff --git a/test/unit/tts_pipeline/tts_utils.py b/test/unit/tts_pipeline/tts_utils.py new file mode 100644 index 000000000..947d496a6 --- /dev/null +++ b/test/unit/tts_pipeline/tts_utils.py @@ -0,0 +1,22 @@ +"""合成系テスト向けの utility""" + +from voicevox_engine.tts_pipeline.model import Mora + + +def gen_mora( + text: str, + consonant: str | None, + consonant_length: float | None, + vowel: str, + vowel_length: float, + pitch: float, +) -> Mora: + """Generate Mora with positional arguments for test simplicity.""" + return Mora( + text=text, + consonant=consonant, + consonant_length=consonant_length, + vowel=vowel, + vowel_length=vowel_length, + pitch=pitch, + ) diff --git a/voicevox_engine/app/application.py b/voicevox_engine/app/application.py index 1b4c3857e..98618af96 100644 --- a/voicevox_engine/app/application.py +++ b/voicevox_engine/app/application.py @@ -9,7 +9,7 @@ from voicevox_engine.app.global_exceptions import configure_global_exception_handlers from voicevox_engine.app.middlewares import configure_middlewares from voicevox_engine.app.openapi_schema import configure_openapi_schema -from voicevox_engine.app.routers.character import generate_speaker_router +from voicevox_engine.app.routers.character import generate_character_router from voicevox_engine.app.routers.engine_info import generate_engine_info_router from voicevox_engine.app.routers.library import generate_library_router from voicevox_engine.app.routers.morphing import generate_morphing_router @@ -56,14 +56,14 @@ def generate_app( engine_manifest: EngineManifest, library_manager: LibraryManager, cancellable_engine: CancellableEngine | None = None, - speaker_info_dir: Path | None = None, + character_info_dir: Path | None = None, cors_policy_mode: CorsPolicyMode = CorsPolicyMode.localapps, allow_origin: list[str] | None = None, disable_mutable_api: bool = False, ) -> FastAPI: """ASGI 'application' 仕様に準拠した VOICEVOX ENGINE アプリケーションインスタンスを生成する。""" - if speaker_info_dir is None: - speaker_info_dir = engine_root() / "resources" / "character_info" + if character_info_dir is None: + character_info_dir = engine_root() / "resources" / "character_info" verify_mutability_allowed = generate_mutability_allowed_verifier( disable_mutable_api @@ -79,9 +79,9 @@ def generate_app( app = configure_global_exception_handlers(app) resource_manager = ResourceManager(is_development()) - resource_manager.register_dir(speaker_info_dir) + resource_manager.register_dir(character_info_dir) metas_store = MetasStore( - speaker_info_dir, + character_info_dir, _generate_core_characters_getter(core_manager), resource_manager, ) @@ -95,13 +95,15 @@ def generate_app( app.include_router( generate_preset_router(preset_manager, verify_mutability_allowed) ) - 
app.include_router(generate_speaker_router(resource_manager, metas_store)) + app.include_router(generate_character_router(resource_manager, metas_store)) if engine_manifest.supported_features.manage_library: app.include_router( generate_library_router(library_manager, verify_mutability_allowed) ) app.include_router(generate_user_dict_router(user_dict, verify_mutability_allowed)) - app.include_router(generate_engine_info_router(core_manager, engine_manifest)) + app.include_router( + generate_engine_info_router(core_manager, tts_engines, engine_manifest) + ) app.include_router( generate_setting_router( setting_loader, engine_manifest.brand_name, verify_mutability_allowed diff --git a/voicevox_engine/app/routers/character.py b/voicevox_engine/app/routers/character.py index 238cd4abd..1b0c9fd51 100644 --- a/voicevox_engine/app/routers/character.py +++ b/voicevox_engine/app/routers/character.py @@ -7,7 +7,7 @@ from pydantic.json_schema import SkipJsonSchema from voicevox_engine.metas.Metas import Speaker, SpeakerInfo -from voicevox_engine.metas.MetasStore import MetasStore, ResourceFormat +from voicevox_engine.metas.MetasStore import Character, MetasStore, ResourceFormat from voicevox_engine.resource_manager import ResourceManager, ResourceManagerError RESOURCE_ENDPOINT = "_resources" @@ -17,9 +17,24 @@ async def _get_resource_baseurl(request: Request) -> str: return f"{request.url.scheme}://{request.url.netloc}/{RESOURCE_ENDPOINT}" -def generate_speaker_router( - resource_manager: ResourceManager, - metas_store: MetasStore, +def _characters_to_speakers(characters: list[Character]) -> list[Speaker]: + """キャラクターのリストを `Speaker` のリストへキャストする。""" + return list( + map( + lambda character: Speaker( + name=character.name, + speaker_uuid=character.uuid, + styles=character.talk_styles + character.sing_styles, + version=character.version, + supported_features=character.supported_features, + ), + characters, + ) + ) + + +def generate_character_router( + resource_manager: ResourceManager, metas_store: MetasStore ) -> APIRouter: """話者情報 API Router を生成する""" router = APIRouter(tags=["その他"]) @@ -27,7 +42,8 @@ def generate_speaker_router( @router.get("/speakers") def speakers(core_version: str | SkipJsonSchema[None] = None) -> list[Speaker]: """話者情報の一覧を取得します。""" - return metas_store.talk_characters(core_version) + characters = metas_store.talk_characters(core_version) + return _characters_to_speakers(characters) @router.get("/speaker_info") def speaker_info( @@ -40,9 +56,9 @@ def speaker_info( 指定されたspeaker_uuidの話者に関する情報をjson形式で返します。 画像や音声はresource_formatで指定した形式で返されます。 """ - return metas_store.speaker_info( - speaker_uuid=speaker_uuid, - speaker_or_singer="speaker", + return metas_store.character_info( + character_uuid=speaker_uuid, + talk_or_sing="talk", core_version=core_version, resource_baseurl=resource_baseurl, resource_format=resource_format, @@ -51,7 +67,8 @@ def speaker_info( @router.get("/singers") def singers(core_version: str | SkipJsonSchema[None] = None) -> list[Speaker]: """歌手情報の一覧を取得します""" - return metas_store.sing_characters(core_version) + characters = metas_store.sing_characters(core_version) + return _characters_to_speakers(characters) @router.get("/singer_info") def singer_info( @@ -64,9 +81,9 @@ def singer_info( 指定されたspeaker_uuidの歌手に関する情報をjson形式で返します。 画像や音声はresource_formatで指定した形式で返されます。 """ - return metas_store.speaker_info( - speaker_uuid=speaker_uuid, - speaker_or_singer="singer", + return metas_store.character_info( + character_uuid=speaker_uuid, + talk_or_sing="sing", 
core_version=core_version, resource_baseurl=resource_baseurl, resource_format=resource_format, diff --git a/voicevox_engine/app/routers/engine_info.py b/voicevox_engine/app/routers/engine_info.py index 96f22b4d0..5d4ca5b44 100644 --- a/voicevox_engine/app/routers/engine_info.py +++ b/voicevox_engine/app/routers/engine_info.py @@ -10,6 +10,7 @@ from voicevox_engine.core.core_adapter import DeviceSupport from voicevox_engine.core.core_initializer import CoreManager from voicevox_engine.engine_manifest import EngineManifest +from voicevox_engine.tts_pipeline.tts_engine import TTSEngineManager class SupportedDevicesInfo(BaseModel): @@ -32,7 +33,9 @@ def generate_from(cls, device_support: DeviceSupport) -> Self: def generate_engine_info_router( - core_manager: CoreManager, engine_manifest_data: EngineManifest + core_manager: CoreManager, + tts_engine_manager: TTSEngineManager, + engine_manifest_data: EngineManifest, ) -> APIRouter: """エンジン情報 API Router を生成する""" router = APIRouter(tags=["その他"]) @@ -53,7 +56,7 @@ def supported_devices( ) -> SupportedDevicesInfo: """対応デバイスの一覧を取得します。""" version = core_version or core_manager.latest_version() - supported_devices = core_manager.get_core(version).supported_devices + supported_devices = tts_engine_manager.get_engine(version).supported_devices if supported_devices is None: raise HTTPException(status_code=422, detail="非対応の機能です。") return SupportedDevicesInfo.generate_from(supported_devices) diff --git a/voicevox_engine/app/routers/morphing.py b/voicevox_engine/app/routers/morphing.py index f80e0442f..ae8147329 100644 --- a/voicevox_engine/app/routers/morphing.py +++ b/voicevox_engine/app/routers/morphing.py @@ -12,7 +12,7 @@ from voicevox_engine.core.core_initializer import CoreManager from voicevox_engine.metas.Metas import StyleId -from voicevox_engine.metas.MetasStore import MetasStore, characters_to_speakers +from voicevox_engine.metas.MetasStore import MetasStore from voicevox_engine.model import AudioQuery from voicevox_engine.morphing.model import MorphableTargetInfo from voicevox_engine.morphing.morphing import ( @@ -55,9 +55,8 @@ def morphable_targets( 返り値のスタイルIDはstring型なので注意。 """ characters = metas_store.characters(core_version) - speakers = characters_to_speakers(characters) try: - morphable_targets = get_morphable_targets(speakers, base_style_ids) + morphable_targets = get_morphable_targets(characters, base_style_ids) except StyleIdNotFoundError as e: msg = f"該当するスタイル(style_id={e.style_id})が見つかりません" raise HTTPException(status_code=404, detail=msg) @@ -92,13 +91,11 @@ def _synthesis_morphing( """ version = core_version or core_manager.latest_version() engine = tts_engines.get_engine(version) - core = core_manager.get_core(version) # モーフィングが許可されないキャラクターペアを拒否する characters = metas_store.characters(core_version) - speakers = characters_to_speakers(characters) try: - morphable = is_morphable(speakers, base_style_id, target_style_id) + morphable = is_morphable(characters, base_style_id, target_style_id) except StyleIdNotFoundError as e: msg = f"該当するスタイル(style_id={e.style_id})が見つかりません" raise HTTPException(status_code=404, detail=msg) @@ -109,7 +106,6 @@ def _synthesis_morphing( # 生成したパラメータはキャッシュされる morph_param = synthesis_morphing_parameter( engine=engine, - core=core, query=query, base_style_id=base_style_id, target_style_id=target_style_id, diff --git a/voicevox_engine/app/routers/tts_pipeline.py b/voicevox_engine/app/routers/tts_pipeline.py index 4a2159a09..b9b67617e 100644 --- a/voicevox_engine/app/routers/tts_pipeline.py +++ 
b/voicevox_engine/app/routers/tts_pipeline.py @@ -87,7 +87,6 @@ def audio_query( """ version = core_version or core_manager.latest_version() engine = tts_engines.get_engine(version) - core = core_manager.get_core(version) accent_phrases = engine.create_accent_phrases(text, style_id) return AudioQuery( accent_phrases=accent_phrases, @@ -99,7 +98,7 @@ def audio_query( postPhonemeLength=0.1, pauseLength=None, pauseLengthScale=1, - outputSamplingRate=core.default_sampling_rate, + outputSamplingRate=engine.default_sampling_rate, outputStereo=False, kana=create_kana(accent_phrases), ) @@ -119,7 +118,6 @@ def audio_query_from_preset( """ version = core_version or core_manager.latest_version() engine = tts_engines.get_engine(version) - core = core_manager.get_core(version) try: presets = preset_manager.load_presets() except PresetInputError as err: @@ -146,7 +144,7 @@ def audio_query_from_preset( postPhonemeLength=selected_preset.postPhonemeLength, pauseLength=selected_preset.pauseLength, pauseLengthScale=selected_preset.pauseLengthScale, - outputSamplingRate=core.default_sampling_rate, + outputSamplingRate=engine.default_sampling_rate, outputStereo=False, kana=create_kana(accent_phrases), ) @@ -378,7 +376,6 @@ def sing_frame_audio_query( """ version = core_version or core_manager.latest_version() engine = tts_engines.get_engine(version) - core = core_manager.get_core(version) try: phonemes, f0, volume = engine.create_sing_phoneme_and_f0_and_volume( score, style_id @@ -391,7 +388,7 @@ def sing_frame_audio_query( volume=volume, phonemes=phonemes, volumeScale=1, - outputSamplingRate=core.default_sampling_rate, + outputSamplingRate=engine.default_sampling_rate, outputStereo=False, ) @@ -532,8 +529,8 @@ def initialize_speaker( 実行しなくても他のAPIは使用できますが、初回実行時に時間がかかることがあります。 """ version = core_version or core_manager.latest_version() - core = core_manager.get_core(version) - core.initialize_style_id_synthesis(style_id, skip_reinit=skip_reinit) + engine = tts_engines.get_engine(version) + engine.initialize_synthesis(style_id, skip_reinit=skip_reinit) @router.get("/is_initialized_speaker", tags=["その他"]) def is_initialized_speaker( @@ -544,7 +541,7 @@ def is_initialized_speaker( 指定されたスタイルが初期化されているかどうかを返します。 """ version = core_version or core_manager.latest_version() - core = core_manager.get_core(version) - return core.is_initialized_style_id_synthesis(style_id) + engine = tts_engines.get_engine(version) + return engine.is_synthesis_initialized(style_id) return router diff --git a/voicevox_engine/engine_manifest.py b/voicevox_engine/engine_manifest.py index 30c5198f2..05677ed7a 100644 --- a/voicevox_engine/engine_manifest.py +++ b/voicevox_engine/engine_manifest.py @@ -34,6 +34,7 @@ class SupportedFeaturesJson: adjust_pitch_scale: FeatureSupportJson adjust_intonation_scale: FeatureSupportJson adjust_volume_scale: FeatureSupportJson + adjust_pause_length: FeatureSupportJson interrogative_upspeak: FeatureSupportJson synthesis_morphing: FeatureSupportJson sing: FeatureSupportJson @@ -103,6 +104,9 @@ class SupportedFeatures(BaseModel): adjust_pitch_scale: bool = Field(title="全体の音高の調整") adjust_intonation_scale: bool = Field(title="全体の抑揚の調整") adjust_volume_scale: bool = Field(title="全体の音量の調整") + adjust_pause_length: bool | SkipJsonSchema[None] = Field( + default=None, title="句読点などの無音時間の調整" + ) interrogative_upspeak: bool = Field(title="疑問文の自動調整") synthesis_morphing: bool = Field( title="2種類のスタイルでモーフィングした音声を合成" diff --git a/voicevox_engine/metas/MetasStore.py b/voicevox_engine/metas/MetasStore.py index 
96a137272..8a16ae62b 100644 --- a/voicevox_engine/metas/MetasStore.py +++ b/voicevox_engine/metas/MetasStore.py @@ -9,7 +9,6 @@ from voicevox_engine.core.core_adapter import CoreCharacter, CoreCharacterStyle from voicevox_engine.metas.Metas import ( - Speaker, SpeakerInfo, SpeakerStyle, SpeakerSupportedFeatures, @@ -40,27 +39,13 @@ class Character: supported_features: SpeakerSupportedFeatures -TALK_STYLE_TYPES: Final = ["talk"] -SING_STYLE_TYPES: Final = ["singing_teacher", "frame_decode", "sing"] +_TALK_STYLE_TYPES: Final = ["talk"] +_SING_STYLE_TYPES: Final = ["singing_teacher", "frame_decode", "sing"] -def characters_to_speakers(characters: list[Character]) -> list[Speaker]: - """キャラクター配列を Speaker 配列へキャストする。""" - return [ - Speaker( - name=character.name, - speaker_uuid=character.uuid, - styles=character.talk_styles + character.sing_styles, - version=character.version, - supported_features=character.supported_features, - ) - for character in characters - ] - - -class _EngineSpeaker(BaseModel): +class _EngineCharacter(BaseModel): """ - エンジンに含まれる話者情報 + エンジンに含まれるキャラクター情報 """ supported_features: SpeakerSupportedFeatures = Field( @@ -78,27 +63,27 @@ class MetasStore: def __init__( self, - engine_speakers_path: Path, + engine_characters_path: Path, get_core_characters: GetCoreCharacters, resource_manager: ResourceManager, ) -> None: """ Parameters ---------- - engine_speakers_path : Path + engine_characters_path : Path エンジンに含まれる話者メタ情報ディレクトリのパス。 get_core_characters: コアに含まれるキャラクター情報を返す関数 """ - self._speakers_path = engine_speakers_path + self._characters_path = engine_characters_path self._get_core_characters = get_core_characters self._resource_manager = resource_manager - # エンジンに含まれる各話者のメタ情報 - self._loaded_metas: dict[str, _EngineSpeaker] = { - folder.name: _EngineSpeaker.model_validate_json( + # エンジンに含まれる各キャラクターのメタ情報 + self._loaded_metas: dict[str, _EngineCharacter] = { + folder.name: _EngineCharacter.model_validate_json( (folder / "metas.json").read_text(encoding="utf-8") ) - for folder in engine_speakers_path.iterdir() + for folder in engine_characters_path.iterdir() if folder.is_dir() } @@ -112,10 +97,10 @@ def characters(self, core_version: str | None) -> list[Character]: engine_character = self._loaded_metas[character_uuid] styles = cast_styles(core_character.styles) talk_styles = list( - filter(lambda style: style.type in TALK_STYLE_TYPES, styles) + filter(lambda style: style.type in _TALK_STYLE_TYPES, styles) ) sing_styles = list( - filter(lambda style: style.type in SING_STYLE_TYPES, styles) + filter(lambda style: style.type in _SING_STYLE_TYPES, styles) ) characters.append( Character( @@ -129,17 +114,17 @@ def characters(self, core_version: str | None) -> list[Character]: ) return characters - def speaker_info( + def character_info( self, - speaker_uuid: str, - speaker_or_singer: Literal["speaker", "singer"], + character_uuid: str, + talk_or_sing: Literal["talk", "sing"], core_version: str | None, resource_baseurl: str, resource_format: ResourceFormat, ) -> SpeakerInfo: # キャラクター情報は以下のディレクトリ構造に従わなければならない。 - # {engine_speakers_path}/ - # {speaker_uuid_0}/ + # {engine_characters_path}/ + # {character_uuid_0}/ # policy.md # portrait.png # icons/ @@ -156,25 +141,25 @@ def speaker_info( # {id_0}_003.wav # {id_1}_001.wav # ... - # {speaker_uuid_1}/ + # {character_uuid_1}/ # ... 
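# NOTE (annotation, not part of the patch): the layout comment above fixes a
# naming rule of three voice samples per style, {style_id}_001.wav through
# {style_id}_003.wav. A hypothetical helper showing that rule, mirroring the
# `str(j + 1).zfill(3)` loop further down in this method; `_voice_sample_paths`
# is illustrative and not part of this module:

from pathlib import Path

def _voice_sample_paths(character_path: Path, style_id: int) -> list[Path]:
    # e.g. {character_path}/voice_samples/0_001.wav .. 0_003.wav for style 0
    return [
        character_path / "voice_samples" / f"{style_id}_{str(n).zfill(3)}.wav"
        for n in (1, 2, 3)
    ]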
# 該当話者を検索する characters = self.characters(core_version) - speakers = filter_characters_and_styles(characters, speaker_or_singer) - speaker = next( - filter(lambda spk: spk.speaker_uuid == speaker_uuid, speakers), None + characters = filter_characters_and_styles(characters, talk_or_sing) + character = next( + filter(lambda character: character.uuid == character_uuid, characters), None ) - if speaker is None: + if character is None: # FIXME: HTTPExceptionはこのファイルとドメインが合わないので辞める raise HTTPException(status_code=404, detail="該当する話者が見つかりません") # 話者情報を取得する try: - speaker_path = self._speakers_path / speaker_uuid + character_path = self._characters_path / character_uuid - # speaker policy - policy_path = speaker_path / "policy.md" + # character policy + policy_path = character_path / "policy.md" policy = policy_path.read_text("utf-8") def _resource_str(path: Path) -> str: @@ -185,21 +170,21 @@ def _resource_str(path: Path) -> str: return resource_str return f"{resource_baseurl}/{resource_str}" - # speaker portrait - portrait_path = speaker_path / "portrait.png" + # character portrait + portrait_path = character_path / "portrait.png" portrait = _resource_str(portrait_path) # スタイル情報を取得する style_infos = [] - for style in speaker.styles: + for style in character.talk_styles + character.sing_styles: id = style.id # style icon - style_icon_path = speaker_path / "icons" / f"{id}.png" + style_icon_path = character_path / "icons" / f"{id}.png" icon = _resource_str(style_icon_path) # style portrait - style_portrait_path = speaker_path / "portraits" / f"{id}.png" + style_portrait_path = character_path / "portraits" / f"{id}.png" style_portrait = None if style_portrait_path.exists(): style_portrait = _resource_str(style_portrait_path) @@ -208,7 +193,7 @@ def _resource_str(path: Path) -> str: voice_samples: list[str] = [] for j in range(3): num = str(j + 1).zfill(3) - voice_path = speaker_path / "voice_samples" / f"{id}_{num}.wav" + voice_path = character_path / "voice_samples" / f"{id}_{num}.wav" voice_samples.append(_resource_str(voice_path)) style_infos.append( @@ -224,58 +209,42 @@ def _resource_str(path: Path) -> str: msg = "追加情報が見つかりませんでした" raise HTTPException(status_code=500, detail=msg) - spk_info = SpeakerInfo( + character_info = SpeakerInfo( policy=policy, portrait=portrait, style_infos=style_infos ) - return spk_info + return character_info - def talk_characters(self, core_version: str | None) -> list[Speaker]: + def talk_characters(self, core_version: str | None) -> list[Character]: """話せるキャラクターの情報の一覧を取得する。""" - return filter_characters_and_styles(self.characters(core_version), "speaker") + return filter_characters_and_styles(self.characters(core_version), "talk") - def sing_characters(self, core_version: str | None) -> list[Speaker]: + def sing_characters(self, core_version: str | None) -> list[Character]: """歌えるキャラクターの情報の一覧を取得する。""" - return filter_characters_and_styles(self.characters(core_version), "singer") + return filter_characters_and_styles(self.characters(core_version), "sing") def filter_characters_and_styles( characters: list[Character], - speaker_or_singer: Literal["speaker", "singer"], -) -> list[Speaker]: + talk_or_sing: Literal["talk", "sing"], +) -> list[Character]: """キャラクター内のスタイルをtalk系・sing系のみにする。スタイル数が0になったキャラクターは除外する。""" - if speaker_or_singer == "speaker": + if talk_or_sing == "talk": # talk 系スタイルを持たないキャラクターを除外する - talk_characters = filter( - lambda character: len(character.talk_styles) > 0, characters - ) - # キャラクター内のスタイルを talk 系のみにしたうえでキャストする - talk_speakers = map( - lambda 
talker: Speaker( - name=talker.name, - speaker_uuid=talker.uuid, - styles=talker.talk_styles, - version=talker.version, - supported_features=talker.supported_features, - ), - talk_characters, + talk_characters = list( + filter(lambda character: len(character.talk_styles) > 0, characters) ) - return list(talk_speakers) - elif speaker_or_singer == "singer": + # sing 系スタイルを除外する + for talk_character in talk_characters: + talk_character.sing_styles = [] + return talk_characters + elif talk_or_sing == "sing": # sing 系スタイルを持たないキャラクターを除外する - sing_characters = filter( - lambda character: len(character.sing_styles) > 0, characters - ) - # キャラクター内のスタイルを sing 系のみにしたうえでキャストする - sing_speakers = map( - lambda singer: Speaker( - name=singer.name, - speaker_uuid=singer.uuid, - styles=singer.sing_styles, - version=singer.version, - supported_features=singer.supported_features, - ), - sing_characters, + sing_characters = list( + filter(lambda character: len(character.sing_styles) > 0, characters) ) - return list(sing_speakers) + # talk 系スタイルを除外する + for sing_character in sing_characters: + sing_character.talk_styles = [] + return sing_characters else: - raise Exception(f"'{speaker_or_singer}' は不正な style_type です") + raise Exception(f"'{talk_or_sing}' は不正な style_type です") diff --git a/voicevox_engine/model.py b/voicevox_engine/model.py index 7fdbe9716..ddff1a61b 100644 --- a/voicevox_engine/model.py +++ b/voicevox_engine/model.py @@ -28,7 +28,7 @@ class AudioQuery(BaseModel): pauseLength: float | SkipJsonSchema[None] = Field( default=None, title="句読点などの無音時間" ) - pauseLengthScale: float = Field(title="句読点などの無音時間(倍率)") + pauseLengthScale: float = Field(default=1, title="句読点などの無音時間(倍率)") outputSamplingRate: int = Field(title="音声データの出力サンプリングレート") outputStereo: bool = Field(title="音声データをステレオ出力するか否か") kana: str | SkipJsonSchema[None] = Field( diff --git a/voicevox_engine/morphing/morphing.py b/voicevox_engine/morphing/morphing.py index b426b26e0..81721254c 100644 --- a/voicevox_engine/morphing/morphing.py +++ b/voicevox_engine/morphing/morphing.py @@ -12,10 +12,10 @@ from numpy.typing import NDArray from soxr import resample +from voicevox_engine.metas.MetasStore import Character from voicevox_engine.morphing.model import MorphableTargetInfo -from ..core.core_adapter import CoreAdapter -from ..metas.Metas import Speaker, StyleId +from ..metas.Metas import StyleId from ..model import AudioQuery from ..tts_pipeline.tts_engine import TTSEngine @@ -37,7 +37,7 @@ class _MorphingParameter: def get_morphable_targets( - speakers: list[Speaker], + characters: list[Character], base_style_ids: list[StyleId], ) -> list[dict[StyleId, MorphableTargetInfo]]: """ @@ -47,9 +47,11 @@ def get_morphable_targets( morphable_targets_arr = [] for base_style_id in base_style_ids: morphable_targets: dict[StyleId, MorphableTargetInfo] = {} - for style in chain.from_iterable(speaker.styles for speaker in speakers): + for style in chain.from_iterable( + character.talk_styles + character.sing_styles for character in characters + ): morphable_targets[style.id] = MorphableTargetInfo( - is_morphable=is_morphable(speakers, base_style_id, style.id) + is_morphable=is_morphable(characters, base_style_id, style.id) ) morphable_targets_arr.append(morphable_targets) @@ -57,15 +59,15 @@ def get_morphable_targets( def is_morphable( - speakers: list[Speaker], style_id_1: StyleId, style_id_2: StyleId + characters: list[Character], style_id_1: StyleId, style_id_2: StyleId ) -> bool: """指定された2つのスタイル ID がモーフィング可能か判定する。""" # スタイル ID にキャラクターを紐付ける対応表を生成する。 - 
style_id_to_character: dict[StyleId, Speaker] = {} - for speaker in speakers: - for style in speaker.styles: - style_id_to_character[style.id] = speaker + style_id_to_character: dict[StyleId, Character] = {} + for character in characters: + for style in character.talk_styles + character.sing_styles: + style_id_to_character[style.id] = character try: character_1 = style_id_to_character[style_id_1] @@ -76,8 +78,8 @@ def is_morphable( except KeyError: raise StyleIdNotFoundError(style_id_2) - uuid_1 = character_1.speaker_uuid - uuid_2 = character_2.speaker_uuid + uuid_1 = character_1.uuid + uuid_2 = character_2.uuid morphable_1 = character_1.supported_features.permitted_synthesis_morphing morphable_2 = character_2.supported_features.permitted_synthesis_morphing @@ -98,7 +100,6 @@ def is_morphable( def synthesis_morphing_parameter( engine: TTSEngine, - core: CoreAdapter, query: AudioQuery, base_style_id: StyleId, target_style_id: StyleId, @@ -106,7 +107,7 @@ def synthesis_morphing_parameter( query = deepcopy(query) # 不具合回避のためデフォルトのサンプリングレートでWORLDに掛けた後に指定のサンプリングレートに変換する - query.outputSamplingRate = core.default_sampling_rate + query.outputSamplingRate = engine.default_sampling_rate # WORLDに掛けるため合成はモノラルで行う query.outputStereo = False diff --git a/voicevox_engine/preset/model.py b/voicevox_engine/preset/model.py index 1b6c77bd6..d9c2d4754 100644 --- a/voicevox_engine/preset/model.py +++ b/voicevox_engine/preset/model.py @@ -28,4 +28,4 @@ class Preset(BaseModel): pauseLength: float | SkipJsonSchema[None] = Field( default=None, title="句読点などの無音時間" ) - pauseLengthScale: float = Field(title="句読点などの無音時間(倍率)") + pauseLengthScale: float = Field(default=1, title="句読点などの無音時間(倍率)") diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 9d3c57248..373d65c11 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -8,7 +8,7 @@ from numpy.typing import NDArray from soxr import resample -from ..core.core_adapter import CoreAdapter +from ..core.core_adapter import CoreAdapter, DeviceSupport from ..core.core_initializer import CoreManager from ..core.core_wrapper import CoreWrapper from ..metas.Metas import StyleId @@ -438,7 +438,16 @@ class TTSEngine: def __init__(self, core: CoreWrapper): super().__init__() self._core = CoreAdapter(core) - # NOTE: self._coreは将来的に消す予定 + + @property + def default_sampling_rate(self) -> int: + """合成される音声波形のデフォルトサンプリングレートを取得する。""" + return self._core.default_sampling_rate + + @property + def supported_devices(self) -> DeviceSupport | None: + """合成時に各デバイスが利用可能か否かの一覧を取得する。""" + return self._core.supported_devices def update_length( self, accent_phrases: list[AccentPhrase], style_id: StyleId @@ -574,6 +583,14 @@ def synthesize_wave( wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave) return wave + def initialize_synthesis(self, style_id: StyleId, skip_reinit: bool) -> None: + """指定されたスタイル ID に関する合成機能を初期化する。既に初期化されていた場合は引数に応じて再初期化する。""" + self._core.initialize_style_id_synthesis(style_id, skip_reinit=skip_reinit) + + def is_synthesis_initialized(self, style_id: StyleId) -> bool: + """指定されたスタイル ID に関する合成機能が初期化済みか否かを取得する。""" + return self._core.is_initialized_style_id_synthesis(style_id) + # FIXME: sing用のエンジンに移すかクラス名変える # 返す値の総称を考え、関数名を変更する def create_sing_phoneme_and_f0_and_volume(
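# NOTE (annotation, not part of the patch): a condensed, self-contained sketch
# of the facade pattern this diff applies — routers now reach the sampling
# rate and style initialization through TTSEngine instead of
# core_manager.get_core(). `FakeCore` is a stand-in for CoreAdapter and
# `EngineFacade` mirrors the new TTSEngine members added above; everything
# except the delegated method names is hypothetical.

class FakeCore:
    """Stand-in for CoreAdapter, with only the members the facade delegates to."""

    default_sampling_rate = 24000

    def __init__(self) -> None:
        self._initialized: set[int] = set()

    def initialize_style_id_synthesis(self, style_id: int, skip_reinit: bool) -> None:
        # Skip the (expensive) re-initialization when requested and already done.
        if skip_reinit and style_id in self._initialized:
            return
        self._initialized.add(style_id)

    def is_initialized_style_id_synthesis(self, style_id: int) -> bool:
        return style_id in self._initialized


class EngineFacade:
    """Mirrors the TTSEngine members introduced in this diff."""

    def __init__(self, core: FakeCore) -> None:
        self._core = core

    @property
    def default_sampling_rate(self) -> int:
        # Routers read this instead of core.default_sampling_rate.
        return self._core.default_sampling_rate

    def initialize_synthesis(self, style_id: int, skip_reinit: bool) -> None:
        self._core.initialize_style_id_synthesis(style_id, skip_reinit=skip_reinit)

    def is_synthesis_initialized(self, style_id: int) -> bool:
        return self._core.is_initialized_style_id_synthesis(style_id)


# Call sites then depend only on the engine, as in the router changes above.
engine = EngineFacade(FakeCore())
engine.initialize_synthesis(0, skip_reinit=True)
assert engine.is_synthesis_initialized(0)
assert engine.default_sampling_rate == 24000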