From ca83fa56fdf05a46ca82e525fd0181455b6a98ed Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Wed, 26 Jun 2024 11:52:51 +0900 Subject: [PATCH] =?UTF-8?q?=E8=BF=BD=E5=8A=A0=EF=BC=9AAudioQuery.pauseLeng?= =?UTF-8?q?th=E3=81=AESkipJsonSchema=E3=82=92=E3=81=AA=E3=81=8F=E3=81=99?= =?UTF-8?q?=20(#1430)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * pauseLengthのSkipJsonSchemaをなくす * nullのときの処理を追加 * デフォルト値追記 * python -m pytest --snapshot-update --- ...\343\202\222\347\242\272\350\252\215.json" | 13 +++++-- .../__snapshots__/test_synthesis.ambr | 3 ++ .../single_api/tts_pipeline/test_synthesis.py | 34 +++++++++++++++++++ voicevox_engine/model.py | 9 +++-- 4 files changed, 53 insertions(+), 6 deletions(-) diff --git "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" index 65c84de4c..affde042e 100644 --- "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" +++ "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" @@ -63,12 +63,19 @@ "type": "boolean" }, "pauseLength": { - "title": "句読点などの無音時間", - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "句読点などの無音時間。nullのときは無視される。デフォルト値はnull" }, "pauseLengthScale": { "default": 1, - "title": "句読点などの無音時間(倍率)", + "title": "句読点などの無音時間(倍率)。デフォルト値は1", "type": "number" }, "pitchScale": { diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_synthesis.ambr b/test/e2e/single_api/tts_pipeline/__snapshots__/test_synthesis.ambr index 27c4fb9de..a8970cf02 100644 --- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_synthesis.ambr +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_synthesis.ambr @@ -2,3 +2,6 @@ # name: test_post_synthesis_200 'MD5:f7d42ce5787856549abc3d2d7561c06f' # --- +# name: test_post_synthesis_old_audio_query_200 + 'MD5:f7d42ce5787856549abc3d2d7561c06f' +# --- diff --git a/test/e2e/single_api/tts_pipeline/test_synthesis.py b/test/e2e/single_api/tts_pipeline/test_synthesis.py index 37636dc56..1827eba3e 100644 --- a/test/e2e/single_api/tts_pipeline/test_synthesis.py +++ b/test/e2e/single_api/tts_pipeline/test_synthesis.py @@ -41,3 +41,37 @@ def test_post_synthesis_200(client: TestClient, snapshot: SnapshotAssertion) -> # 音声波形が一致する assert response.headers["content-type"] == "audio/wav" assert snapshot == hash_wave_floats_from_wav_bytes(response.read()) + + +def test_post_synthesis_old_audio_query_200( + client: TestClient, snapshot: SnapshotAssertion +) -> None: + """古いバージョンの audio_query でもエラーなく合成できる""" + query = { + "accent_phrases": [ + { + "moras": [ + gen_mora("テ", "t", 2.3, "e", 0.8, 3.3), + gen_mora("ス", "s", 2.1, "U", 0.3, 0.0), + gen_mora("ト", "t", 2.3, "o", 1.8, 4.1), + ], + "accent": 1, + "pause_mora": None, + "is_interrogative": False, + } + ], + "speedScale": 1.0, + "pitchScale": 1.0, + "intonationScale": 1.0, + "volumeScale": 1.0, + "prePhonemeLength": 0.1, + "postPhonemeLength": 0.1, + "outputSamplingRate": 24000, + "outputStereo": False, + } + response = client.post("/synthesis", params={"speaker": 0}, json=query) + assert response.status_code == 200 + + # 音声波形が一致する + assert response.headers["content-type"] == "audio/wav" + assert snapshot == hash_wave_floats_from_wav_bytes(response.read()) diff --git a/voicevox_engine/model.py b/voicevox_engine/model.py index ddff1a61b..02574f5e1 100644 --- a/voicevox_engine/model.py +++ b/voicevox_engine/model.py @@ -25,10 +25,13 @@ class AudioQuery(BaseModel): volumeScale: float = Field(title="全体の音量") prePhonemeLength: float = Field(title="音声の前の無音時間") postPhonemeLength: float = Field(title="音声の後の無音時間") - pauseLength: float | SkipJsonSchema[None] = Field( - default=None, title="句読点などの無音時間" + pauseLength: float | None = Field( + default=None, + title="句読点などの無音時間。nullのときは無視される。デフォルト値はnull", + ) + pauseLengthScale: float = Field( + default=1, title="句読点などの無音時間(倍率)。デフォルト値は1" ) - pauseLengthScale: float = Field(default=1, title="句読点などの無音時間(倍率)") outputSamplingRate: int = Field(title="音声データの出力サンプリングレート") outputStereo: bool = Field(title="音声データをステレオ出力するか否か") kana: str | SkipJsonSchema[None] = Field(