From 620837924ce6f30a8eb5095b7a27fbc4a7e086b3 Mon Sep 17 00:00:00 2001 From: sabonerune <102559104+sabonerune@users.noreply.github.com> Date: Mon, 24 Jun 2024 21:00:04 +0900 Subject: [PATCH 1/6] =?UTF-8?q?FIX:=20`AudioQuery`=E3=81=AE=E4=BA=92?= =?UTF-8?q?=E6=8F=9B=E6=80=A7=E3=81=AE=E5=95=8F=E9=A1=8C=E3=82=92=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3=20(#1425)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...23\343\201\250\343\202\222\347\242\272\350\252\215.json" | 6 +++--- voicevox_engine/model.py | 2 +- voicevox_engine/preset/model.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" index f78bf0f1b..0d9e9f862 100644 --- "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" +++ "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" @@ -67,6 +67,7 @@ "type": "number" }, "pauseLengthScale": { + "default": 1, "title": "句読点などの無音時間(倍率)", "type": "number" }, @@ -99,7 +100,6 @@ "volumeScale", "prePhonemeLength", "postPhonemeLength", - "pauseLengthScale", "outputSamplingRate", "outputStereo" ], @@ -615,6 +615,7 @@ "type": "number" }, "pauseLengthScale": { + "default": 1, "title": "句読点などの無音時間(倍率)", "type": "number" }, @@ -657,8 +658,7 @@ "intonationScale", "volumeScale", "prePhonemeLength", - "postPhonemeLength", - "pauseLengthScale" + "postPhonemeLength" ], "title": "Preset", "type": "object" diff --git a/voicevox_engine/model.py b/voicevox_engine/model.py index 7fdbe9716..ddff1a61b 100644 --- a/voicevox_engine/model.py +++ b/voicevox_engine/model.py @@ -28,7 +28,7 @@ class AudioQuery(BaseModel): pauseLength: float | SkipJsonSchema[None] = Field( default=None, title="句読点などの無音時間" ) - pauseLengthScale: float = Field(title="句読点などの無音時間(倍率)") + pauseLengthScale: float = Field(default=1, title="句読点などの無音時間(倍率)") outputSamplingRate: int = Field(title="音声データの出力サンプリングレート") outputStereo: bool = Field(title="音声データをステレオ出力するか否か") kana: str | SkipJsonSchema[None] = Field( diff --git a/voicevox_engine/preset/model.py b/voicevox_engine/preset/model.py index 1b6c77bd6..d9c2d4754 100644 --- a/voicevox_engine/preset/model.py +++ b/voicevox_engine/preset/model.py @@ -28,4 +28,4 @@ class Preset(BaseModel): pauseLength: float | SkipJsonSchema[None] = Field( default=None, title="句読点などの無音時間" ) - pauseLengthScale: float = Field(title="句読点などの無音時間(倍率)") + pauseLengthScale: float = Field(default=1, title="句読点などの無音時間(倍率)") From 4a5541e5d05e62e8c6bdf3e9ddbaf4b095d77240 Mon Sep 17 00:00:00 2001 From: tarepan Date: Tue, 25 Jun 2024 00:44:35 +0900 Subject: [PATCH 2/6] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E9=9F=B3=E5=A3=B0?= =?UTF-8?q?=E5=90=88=E6=88=90=E7=B3=BB=E3=83=86=E3=82=B9=E3=83=88=E3=81=AE?= =?UTF-8?q?=20utils=20=E3=82=92=E7=B5=B1=E5=BB=83=E5=90=88=20(#1428)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: ローカルの util 関数を解体 * refactor: `_gen_mora()` を util へ切り出して `gen_mora()` へリネーム * fix: lint --- test/unit/tts_pipeline/test_tts_engine.py | 60 +++------ .../tts_pipeline/test_wave_synthesizer.py | 115 ++++++++---------- test/unit/tts_pipeline/tts_utils.py | 22 ++++ 3 files changed, 88 insertions(+), 109 deletions(-) create mode 100644 test/unit/tts_pipeline/tts_utils.py diff --git a/test/unit/tts_pipeline/test_tts_engine.py b/test/unit/tts_pipeline/test_tts_engine.py index 4e71eee1e..c41034663 100644 --- a/test/unit/tts_pipeline/test_tts_engine.py +++ b/test/unit/tts_pipeline/test_tts_engine.py @@ -26,6 +26,7 @@ ) from .test_text_analyzer import stub_unknown_features_koxx +from .tts_utils import gen_mora def yukarin_s_mock( @@ -103,32 +104,13 @@ def is_model_loaded(self, style_id: str) -> bool: return True -def _gen_mora( - text: str, - consonant: str | None, - consonant_length: float | None, - vowel: str, - vowel_length: float, - pitch: float, -) -> Mora: - """Generate Mora with positional arguments for test simplicity.""" - return Mora( - text=text, - consonant=consonant, - consonant_length=consonant_length, - vowel=vowel, - vowel_length=vowel_length, - pitch=pitch, - ) - - def test_to_flatten_phonemes() -> None: """Test `to_flatten_phonemes`.""" # Inputs moras = [ - _gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), - _gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), + gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), + gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), ] # Expects @@ -140,33 +122,25 @@ def test_to_flatten_phonemes() -> None: assert true_phonemes == phonemes -def _gen_hello_hiho_text() -> str: - return "こんにちは、ヒホです" - - -def _gen_hello_hiho_kana() -> str: - return "コンニチワ'、ヒ'ホデ_ス" - - def _gen_hello_hiho_accent_phrases() -> list[AccentPhrase]: return [ AccentPhrase( moras=[ - _gen_mora("コ", "k", 0.0, "o", 0.0, 0.0), - _gen_mora("ン", None, None, "N", 0.0, 0.0), - _gen_mora("ニ", "n", 0.0, "i", 0.0, 0.0), - _gen_mora("チ", "ch", 0.0, "i", 0.0, 0.0), - _gen_mora("ワ", "w", 0.0, "a", 0.0, 0.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 0.0), + gen_mora("ン", None, None, "N", 0.0, 0.0), + gen_mora("ニ", "n", 0.0, "i", 0.0, 0.0), + gen_mora("チ", "ch", 0.0, "i", 0.0, 0.0), + gen_mora("ワ", "w", 0.0, "a", 0.0, 0.0), ], accent=5, - pause_mora=_gen_mora("、", None, None, "pau", 0.0, 0.0), + pause_mora=gen_mora("、", None, None, "pau", 0.0, 0.0), ), AccentPhrase( moras=[ - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 0.0), - _gen_mora("ホ", "h", 0.0, "o", 0.0, 0.0), - _gen_mora("デ", "d", 0.0, "e", 0.0, 0.0), - _gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 0.0), + gen_mora("ホ", "h", 0.0, "o", 0.0, 0.0), + gen_mora("デ", "d", 0.0, "e", 0.0, 0.0), + gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), ], accent=1, pause_mora=None, @@ -187,7 +161,7 @@ def _gen_hello_hiho_query() -> AudioQuery: pauseLengthScale=0.8, outputSamplingRate=12000, outputStereo=True, - kana=_gen_hello_hiho_kana(), + kana="コンニチワ'、ヒ'ホデ_ス", ) @@ -352,7 +326,7 @@ def test_mocked_create_accent_phrases_output( """モックされた `TTSEngine.create_accent_phrases()` の出力スナップショットが一定である""" # Inputs tts_engine = TTSEngine(MockCoreWrapper()) - hello_hiho = _gen_hello_hiho_text() + hello_hiho = "こんにちは、ヒホです" # Outputs result = tts_engine.create_accent_phrases(hello_hiho, StyleId(1)) # Tests @@ -365,7 +339,7 @@ def test_mocked_create_accent_phrases_from_kana_output( """モックされた `TTSEngine.create_accent_phrases_from_kana()` の出力スナップショットが一定である""" # Inputs tts_engine = TTSEngine(MockCoreWrapper()) - hello_hiho = _gen_hello_hiho_kana() + hello_hiho = "コンニチワ'、ヒ'ホデ_ス" # Outputs result = tts_engine.create_accent_phrases_from_kana(hello_hiho, StyleId(1)) # Tests diff --git a/test/unit/tts_pipeline/test_wave_synthesizer.py b/test/unit/tts_pipeline/test_wave_synthesizer.py index 0d872dc37..5487bc8af 100644 --- a/test/unit/tts_pipeline/test_wave_synthesizer.py +++ b/test/unit/tts_pipeline/test_wave_synthesizer.py @@ -3,7 +3,7 @@ import numpy as np from voicevox_engine.model import AudioQuery -from voicevox_engine.tts_pipeline.model import AccentPhrase, Mora +from voicevox_engine.tts_pipeline.model import AccentPhrase from voicevox_engine.tts_pipeline.tts_engine import ( apply_intonation_scale, apply_output_sampling_rate, @@ -17,6 +17,8 @@ raw_wave_to_output_wave, ) +from .tts_utils import gen_mora + TRUE_NUM_PHONEME = 45 @@ -50,38 +52,19 @@ def _gen_query( ) -def _gen_mora( - text: str, - consonant: str | None, - consonant_length: float | None, - vowel: str, - vowel_length: float, - pitch: float, -) -> Mora: - """Generate Mora with positional arguments for test simplicity.""" - return Mora( - text=text, - consonant=consonant, - consonant_length=consonant_length, - vowel=vowel, - vowel_length=vowel_length, - pitch=pitch, - ) - - def test_apply_prepost_silence() -> None: """Test `apply_prepost_silence`.""" # Inputs query = _gen_query(prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067) moras = [ - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), ] # Expects true_moras_with_silence = [ - _gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), - _gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), + gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), + gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), ] # Outputs @@ -95,20 +78,20 @@ def test_apply_speed_scale() -> None: # Inputs query = _gen_query(speedScale=2.0) input_moras = [ - _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), - _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), - _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), - _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), + gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), + gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), + gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), ] # Expects - x2 fast true_moras = [ - _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0), - _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0), - _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0), - _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), + gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0), + gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0), + gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), + gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0), + gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), ] # Outputs @@ -122,20 +105,20 @@ def test_apply_pitch_scale() -> None: # Inputs query = _gen_query(pitchScale=2.0) input_moras = [ - _gen_mora("コ", "k", 0.0, "o", 0.0, 50.0), - _gen_mora("ン", None, None, "N", 0.0, 50.0), - _gen_mora("、", None, None, "pau", 0.0, 0.0), - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0), - _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 50.0), + gen_mora("ン", None, None, "N", 0.0, 50.0), + gen_mora("、", None, None, "pau", 0.0, 0.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0), + gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] # Expects - x4 value scaled true_moras = [ - _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0), - _gen_mora("ン", None, None, "N", 0.0, 200.0), - _gen_mora("、", None, None, "pau", 0.0, 0.0), - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0), - _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 200.0), + gen_mora("ン", None, None, "N", 0.0, 200.0), + gen_mora("、", None, None, "pau", 0.0, 0.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0), + gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] # Outputs @@ -149,20 +132,20 @@ def test_apply_intonation_scale() -> None: # Inputs query = _gen_query(intonationScale=0.5) input_moras = [ - _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0), - _gen_mora("ン", None, None, "N", 0.0, 200.0), - _gen_mora("、", None, None, "pau", 0.0, 0.0), - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0), - _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 200.0), + gen_mora("ン", None, None, "N", 0.0, 200.0), + gen_mora("、", None, None, "pau", 0.0, 0.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0), + gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] # Expects - mean=300 var x0.5 intonation scaling true_moras = [ - _gen_mora("コ", "k", 0.0, "o", 0.0, 250.0), - _gen_mora("ン", None, None, "N", 0.0, 250.0), - _gen_mora("、", None, None, "pau", 0.0, 0.0), - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0), - _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 250.0), + gen_mora("ン", None, None, "N", 0.0, 250.0), + gen_mora("、", None, None, "pau", 0.0, 0.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0), + gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] # Outputs @@ -222,13 +205,13 @@ def test_count_frame_per_unit() -> None: """Test `count_frame_per_unit`.""" # Inputs moras = [ - _gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame] - _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0), - _gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0), - _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0), - _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), - _gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0), + gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame] + gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0), + gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0), + gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0), + gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0), ] # Expects @@ -252,16 +235,16 @@ def test_query_to_decoder_feature() -> None: accent_phrases = [ AccentPhrase( moras=[ - _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), - _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), + gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), + gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), ], accent=1, - pause_mora=_gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + pause_mora=gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), ), AccentPhrase( moras=[ - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), - _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), + gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), ], accent=1, pause_mora=None, diff --git a/test/unit/tts_pipeline/tts_utils.py b/test/unit/tts_pipeline/tts_utils.py new file mode 100644 index 000000000..947d496a6 --- /dev/null +++ b/test/unit/tts_pipeline/tts_utils.py @@ -0,0 +1,22 @@ +"""合成系テスト向けの utility""" + +from voicevox_engine.tts_pipeline.model import Mora + + +def gen_mora( + text: str, + consonant: str | None, + consonant_length: float | None, + vowel: str, + vowel_length: float, + pitch: float, +) -> Mora: + """Generate Mora with positional arguments for test simplicity.""" + return Mora( + text=text, + consonant=consonant, + consonant_length=consonant_length, + vowel=vowel, + vowel_length=vowel_length, + pitch=pitch, + ) From 566a5fd860f3a043f26380ba0b52fb20e520e916 Mon Sep 17 00:00:00 2001 From: sabonerune <102559104+sabonerune@users.noreply.github.com> Date: Tue, 25 Jun 2024 01:12:33 +0900 Subject: [PATCH 3/6] =?UTF-8?q?FIX:=20Docker=E3=83=93=E3=83=AB=E3=83=89?= =?UTF-8?q?=E3=81=8C=E5=A4=B1=E6=95=97=E3=81=99=E3=82=8B=E5=95=8F=E9=A1=8C?= =?UTF-8?q?=E3=82=92=E4=BF=AE=E6=AD=A3=20(#1427)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 22be5f3dd..d261f358a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -233,6 +233,7 @@ ADD ./run.py ./presets.yaml ./engine_manifest.json /opt/voicevox_engine/ ADD ./resources /opt/voicevox_engine/resources ADD ./tools/generate_licenses.py /opt/voicevox_engine/tools/ ADD ./tools/licenses /opt/voicevox_engine/tools/licenses +ADD ./tools/generate_filemap.py /opt/voicevox_engine/tools/ # Replace version ARG VOICEVOX_ENGINE_VERSION=latest @@ -259,8 +260,7 @@ RUN < Date: Tue, 25 Jun 2024 15:23:53 +0900 Subject: [PATCH 4/6] =?UTF-8?q?=E8=BF=BD=E5=8A=A0=EF=BC=9AAudioQuery?= =?UTF-8?q?=E3=81=AF=E5=BE=8C=E6=96=B9=E4=BA=92=E6=8F=9B=E6=80=A7=E3=81=8C?= =?UTF-8?q?=E3=81=82=E3=82=8B=E3=81=93=E3=81=A8=E3=82=92=E3=83=89=E3=82=AD?= =?UTF-8?q?=E3=83=A5=E3=83=A1=E3=83=B3=E3=83=88=E3=81=A7=E6=A1=88=E5=86=85?= =?UTF-8?q?=20(#1433)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * AudioQueryは後方互換性があることをドキュメントで案内 * 詳細に --- ...3\203\263\343\201\250\343\201\256\351\200\243\346\220\272.md" | 1 + 1 file changed, 1 insertion(+) diff --git "a/docs/VOICEVOX\351\237\263\345\243\260\345\220\210\346\210\220\343\202\250\343\203\263\343\202\270\343\203\263\343\201\250\343\201\256\351\200\243\346\220\272.md" "b/docs/VOICEVOX\351\237\263\345\243\260\345\220\210\346\210\220\343\202\250\343\203\263\343\202\270\343\203\263\343\201\250\343\201\256\351\200\243\346\220\272.md" index 540173be1..38a95c6ad 100644 --- "a/docs/VOICEVOX\351\237\263\345\243\260\345\220\210\346\210\220\343\202\250\343\203\263\343\202\270\343\203\263\343\201\250\343\201\256\351\200\243\346\220\272.md" +++ "b/docs/VOICEVOX\351\237\263\345\243\260\345\220\210\346\210\220\343\202\250\343\203\263\343\202\270\343\203\263\343\201\250\343\201\256\351\200\243\346\220\272.md" @@ -2,6 +2,7 @@ - バージョンが上がっても、`/audio_query`で返ってくる値をそのまま`/synthesis`に POST すれば音声合成できるようにする予定です - `AudioQuery`のパラメータは増えますが、なるべくデフォルト値で以前と変わらない音声が生成されるようにします + - 以前のバージョンの`AudioQuery`を新しいバージョンの`/synthesis`にそのまま POST できるようにします(後方互換) - バージョン 0.7 から音声スタイルが実装されました。スタイルの情報は`/speakers`から取得できます - スタイルの情報にある`style_id`を`speaker`に指定することで、今まで通り音声合成ができます - style_id の指定先が speaker なのは互換性のためです From 6813b41c6c3a307fa7fdb912873d5f124a792ad3 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Tue, 25 Jun 2024 15:24:12 +0900 Subject: [PATCH 5/6] =?UTF-8?q?=E8=BF=BD=E5=8A=A0=EF=BC=9A=E3=82=A8?= =?UTF-8?q?=E3=83=B3=E3=82=B8=E3=83=B3=E3=83=9E=E3=83=8B=E3=83=95=E3=82=A7?= =?UTF-8?q?=E3=82=B9=E3=83=88=E3=81=AB=E7=84=A1=E9=9F=B3=E6=99=82=E9=96=93?= =?UTF-8?q?=E3=82=92=E8=AA=BF=E6=95=B4=E3=81=99=E3=82=8B=E8=83=BD=E5=8A=9B?= =?UTF-8?q?`adjust=5Fpause=5Flength`=E3=82=92=E8=BF=BD=E5=8A=A0=20(#1432)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit エンジンマニフェストに無音時間を調整する能力adjust_pause_lengthを追加 --- engine_manifest.json | 5 +++++ ...223\343\201\250\343\202\222\347\242\272\350\252\215.json" | 4 ++++ .../test_engine_manifest/test_get_engine_manifest_200.json | 1 + voicevox_engine/engine_manifest.py | 4 ++++ 4 files changed, 14 insertions(+) diff --git a/engine_manifest.json b/engine_manifest.json index 000edb6dc..b3a840501 100644 --- a/engine_manifest.json +++ b/engine_manifest.json @@ -44,6 +44,11 @@ "value": true, "name": "全体の音量の調整" }, + "adjust_pause_length": { + "type": "bool", + "value": true, + "name": "句読点などの無音時間の調整" + }, "interrogative_upspeak": { "type": "bool", "value": true, diff --git "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" index 0d9e9f862..65c84de4c 100644 --- "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" +++ "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" @@ -861,6 +861,10 @@ "title": "モーラごとの音高の調整", "type": "boolean" }, + "adjust_pause_length": { + "title": "句読点などの無音時間の調整", + "type": "boolean" + }, "adjust_phoneme_length": { "title": "音素ごとの長さの調整", "type": "boolean" diff --git a/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json b/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json index 774fdaa43..d24ff2283 100644 --- a/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json +++ b/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json @@ -16,6 +16,7 @@ "supported_features": { "adjust_intonation_scale": true, "adjust_mora_pitch": true, + "adjust_pause_length": true, "adjust_phoneme_length": true, "adjust_pitch_scale": true, "adjust_speed_scale": true, diff --git a/voicevox_engine/engine_manifest.py b/voicevox_engine/engine_manifest.py index 30c5198f2..05677ed7a 100644 --- a/voicevox_engine/engine_manifest.py +++ b/voicevox_engine/engine_manifest.py @@ -34,6 +34,7 @@ class SupportedFeaturesJson: adjust_pitch_scale: FeatureSupportJson adjust_intonation_scale: FeatureSupportJson adjust_volume_scale: FeatureSupportJson + adjust_pause_length: FeatureSupportJson interrogative_upspeak: FeatureSupportJson synthesis_morphing: FeatureSupportJson sing: FeatureSupportJson @@ -103,6 +104,9 @@ class SupportedFeatures(BaseModel): adjust_pitch_scale: bool = Field(title="全体の音高の調整") adjust_intonation_scale: bool = Field(title="全体の抑揚の調整") adjust_volume_scale: bool = Field(title="全体の音量の調整") + adjust_pause_length: bool | SkipJsonSchema[None] = Field( + default=None, title="句読点などの無音時間の調整" + ) interrogative_upspeak: bool = Field(title="疑問文の自動調整") synthesis_morphing: bool = Field( title="2種類のスタイルでモーフィングした音声を合成" From 4965657faa95bc65f4cebb6c31b842969da466e6 Mon Sep 17 00:00:00 2001 From: tarepan Date: Tue, 25 Jun 2024 15:36:22 +0900 Subject: [PATCH 6/6] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=83=86=E3=82=B9?= =?UTF-8?q?=E3=83=88=E7=94=A8=E3=81=AE=E3=83=94=E3=83=83=E3=83=81=E3=82=92?= =?UTF-8?q?=20log=20=E3=82=B9=E3=82=B1=E3=83=BC=E3=83=AB=E3=81=AB=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3=20(#1426)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: テスト用のピッチを log スケールに修正 --- test/unit/tts_pipeline/test_tts_engine.py | 2 +- .../tts_pipeline/test_wave_synthesizer.py | 56 +++++++++---------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/test/unit/tts_pipeline/test_tts_engine.py b/test/unit/tts_pipeline/test_tts_engine.py index c41034663..bd08189aa 100644 --- a/test/unit/tts_pipeline/test_tts_engine.py +++ b/test/unit/tts_pipeline/test_tts_engine.py @@ -109,7 +109,7 @@ def test_to_flatten_phonemes() -> None: # Inputs moras = [ gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0), gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), ] diff --git a/test/unit/tts_pipeline/test_wave_synthesizer.py b/test/unit/tts_pipeline/test_wave_synthesizer.py index 5487bc8af..ce213c3c1 100644 --- a/test/unit/tts_pipeline/test_wave_synthesizer.py +++ b/test/unit/tts_pipeline/test_wave_synthesizer.py @@ -57,13 +57,13 @@ def test_apply_prepost_silence() -> None: # Inputs query = _gen_query(prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067) moras = [ - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0), ] # Expects true_moras_with_silence = [ gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0), gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), ] @@ -78,19 +78,19 @@ def test_apply_speed_scale() -> None: # Inputs query = _gen_query(speedScale=2.0) input_moras = [ - gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), - gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), + gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 5.0), + gen_mora("ン", None, None, "N", 4 * 0.01067, 5.0), gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 6.0), gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), ] # Expects - x2 fast true_moras = [ - gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0), - gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0), + gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 5.0), + gen_mora("ン", None, None, "N", 2 * 0.01067, 5.0), gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), - gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0), + gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 6.0), gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), ] @@ -105,19 +105,19 @@ def test_apply_pitch_scale() -> None: # Inputs query = _gen_query(pitchScale=2.0) input_moras = [ - gen_mora("コ", "k", 0.0, "o", 0.0, 50.0), - gen_mora("ン", None, None, "N", 0.0, 50.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 5.0), + gen_mora("ン", None, None, "N", 0.0, 5.0), gen_mora("、", None, None, "pau", 0.0, 0.0), - gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 6.0), gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] # Expects - x4 value scaled true_moras = [ - gen_mora("コ", "k", 0.0, "o", 0.0, 200.0), - gen_mora("ン", None, None, "N", 0.0, 200.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 20.0), + gen_mora("ン", None, None, "N", 0.0, 20.0), gen_mora("、", None, None, "pau", 0.0, 0.0), - gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 24.0), gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] @@ -132,19 +132,19 @@ def test_apply_intonation_scale() -> None: # Inputs query = _gen_query(intonationScale=0.5) input_moras = [ - gen_mora("コ", "k", 0.0, "o", 0.0, 200.0), - gen_mora("ン", None, None, "N", 0.0, 200.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 5.0), + gen_mora("ン", None, None, "N", 0.0, 5.0), gen_mora("、", None, None, "pau", 0.0, 0.0), - gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 8.0), gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] - # Expects - mean=300 var x0.5 intonation scaling + # Expects - mean=6 var x0.5 intonation scaling true_moras = [ - gen_mora("コ", "k", 0.0, "o", 0.0, 250.0), - gen_mora("ン", None, None, "N", 0.0, 250.0), + gen_mora("コ", "k", 0.0, "o", 0.0, 5.5), + gen_mora("ン", None, None, "N", 0.0, 5.5), gen_mora("、", None, None, "pau", 0.0, 0.0), - gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0), + gen_mora("ヒ", "h", 0.0, "i", 0.0, 7.0), gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] @@ -235,15 +235,15 @@ def test_query_to_decoder_feature() -> None: accent_phrases = [ AccentPhrase( moras=[ - gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), - gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), + gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 5.0), + gen_mora("ン", None, None, "N", 4 * 0.01067, 5.0), ], accent=1, pause_mora=gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), ), AccentPhrase( moras=[ - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), + gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 8.0), gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), ], accent=1, @@ -275,10 +275,10 @@ def test_query_to_decoder_feature() -> None: # Pitch # paw ko N pau hi hO paw # frame_per_vowel = [1, 3, 2, 1, 3, 3, 3] - # pau ko ko ko N N - true1_f0 = [0.0, 250.0, 250.0, 250.0, 250.0, 250.0] - # pau pau hi hi hi - true2_f0 = [0.0, 0.0, 400.0, 400.0, 400.0] + # pau ko ko ko N N + true1_f0 = [0.0, 22.0, 22.0, 22.0, 22.0, 22.0] + # pau pau hi hi hi + true2_f0 = [0.0, 0.0, 28.0, 28.0, 28.0] # hO hO hO paw paw paw true3_f0 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] true_f0 = np.array(true1_f0 + true2_f0 + true3_f0, dtype=np.float32)