VOICEVOX · Hiroshiba · Jun 7, 2024 · May 23, 2024 · May 23, 2024 · May 23, 2024
diff --git a/build_util/check_release_build.py b/build_util/check_release_build.py
@@ -20,7 +20,7 @@ def test_release_build(dist_dir: Path, skip_run_process: bool) -> None:
     run_file = dist_dir / "run"
     if not run_file.exists():
         run_file = dist_dir / "run.exe"
-
+    print(f"run_file : {run_file}")
     # 起動
     process = None
     if not skip_run_process:

@@ -8,3 +8,5 @@
   volumeScale: 1
   prePhonemeLength: 0.1
   postPhonemeLength: 0.1
+  pauseLength: null
+  pauseLengthScale: 1
diff --git a/test/e2e/__snapshots__/test_openapi/test_OpenAPIの形が変わっていないことを確認.json b/test/e2e/__snapshots__/test_openapi/test_OpenAPIの形が変わっていないことを確認.json
@@ -27,6 +27,8 @@ def test_post_synthesis_morphing_200(client: TestClient) -> None:
         "volumeScale": 1.0,
         "prePhonemeLength": 0.1,
         "postPhonemeLength": 0.1,
+        "pauseLength": None,
+        "pauseLengthScale": 1.0,
         "outputSamplingRate": 24000,
         "outputStereo": False,
         "kana": "テ'_スト",

@@ -20,6 +20,8 @@ def test_post_add_preset_200(
         "volumeScale": 1,
         "prePhonemeLength": 10,
         "postPhonemeLength": 10,
+        "pauseLength": None,
+        "pauseLengthScale": 1,
     }
     response = client.post("/add_preset", params={}, json=preset)
     assert response.status_code == 200

diff --git a/test/e2e/single_api/preset/test_presets.py b/test/e2e/single_api/preset/test_presets.py
@@ -8,5 +8,7 @@
 
 def test_get_presets_200(client: TestClient, snapshot_json: SnapshotAssertion) -> None:
     response = client.get("/presets")
+    print("snapshot", snapshot_json)
+    print("response", response.json())
     assert response.status_code == 200
     assert snapshot_json == response.json()
@@ -20,6 +20,8 @@ def test_post_update_preset_200(
         "volumeScale": 1,
         "prePhonemeLength": 10,
         "postPhonemeLength": 10,
+        "pauseLength": None,
+        "pauseLengthScale": 1,
     }
     response = client.post("/update_preset", params={}, json=preset)
     assert response.status_code == 200
@@ -40,6 +42,8 @@ def test_post_update_preset_422(
         "volumeScale": 404,
         "prePhonemeLength": 404,
         "postPhonemeLength": 404,
+        "pauseLength": 404,
+        "pauseLengthScale": 404,
     }
     response = client.post("/update_preset", params={}, json=preset)
     assert response.status_code == 422

@@ -28,6 +28,8 @@ def test_post_multi_synthesis_200(client: TestClient) -> None:
             "volumeScale": 1.0,
             "prePhonemeLength": 0.1,
             "postPhonemeLength": 0.1,
+            "pauseLength": None,
+            "pauseLengthScale": 1.0,
             "outputSamplingRate": 24000,
             "outputStereo": False,
             "kana": "テ'_スト",
@@ -52,6 +54,8 @@ def test_post_multi_synthesis_200(client: TestClient) -> None:
             "volumeScale": 1.0,
             "prePhonemeLength": 0.2,
             "postPhonemeLength": 0.1,
+            "pauseLength": None,
+            "pauseLengthScale": 1.0,
             "outputSamplingRate": 24000,
             "outputStereo": False,
             "kana": "テ'_ストト",

@@ -29,6 +29,8 @@ def test_post_synthesis_200(client: TestClient, snapshot: SnapshotAssertion) ->
         "volumeScale": 1.0,
         "prePhonemeLength": 0.1,
         "postPhonemeLength": 0.1,
+        "pauseLength": None,
+        "pauseLengthScale": 1.0,
         "outputSamplingRate": 24000,
         "outputStereo": False,
         "kana": "テ'_スト",

diff --git a/test/preset/presets-test-1.yaml b/test/preset/presets-test-1.yaml
@@ -8,6 +8,8 @@
   volumeScale: 1
   prePhonemeLength: 0.1
   postPhonemeLength: 0.1
+  pauseLength: null
+  pauseLengthScale: 1.0
 
 - id: 2
   name: test2
@@ -19,3 +21,5 @@
   volumeScale: 0.7
   prePhonemeLength: 0.5
   postPhonemeLength: 0.5
+  pauseLength: null
+  pauseLengthScale: 1.0
diff --git a/test/preset/presets-test-2.yaml b/test/preset/presets-test-2.yaml
@@ -8,6 +8,8 @@
   volumeScale: 1
   prePhonemeLength: 0.1
   postPhonemeLength: 0.1
+  pauseLength: null
+  pauseLengthScale: 1.0
 
 - id: 2
   name: test2
@@ -19,3 +21,5 @@
   volumeScale: 0.7
   prePhonemeLength: 0.5
   postPhonemeLength: 0.5
+  pauseLength: null
+  pauseLengthScale: 1.0
diff --git a/test/preset/presets-test-3.yaml b/test/preset/presets-test-3.yaml
@@ -8,6 +8,8 @@
   volumeScale: 1
   prePhonemeLength: 0.1
   postPhonemeLength: 0.1
+  pauseLength: null
+  pauseLengthScale: 1.0
 
 - id: 1
   name: test2
@@ -19,3 +21,5 @@
   volumeScale: 0.7
   prePhonemeLength: 0.5
   postPhonemeLength: 0.5
+  pauseLength: null
+  pauseLengthScale: 1.0
diff --git a/test/preset/test_preset.py b/test/preset/test_preset.py
@@ -78,6 +78,8 @@ def test_add_preset(self) -> None:
                 "volumeScale": 1,
                 "prePhonemeLength": 0.1,
                 "postPhonemeLength": 0.1,
+                "pauseLength": None,
+                "pauseLengthScale": 1.0,
             }
         )
         id = preset_manager.add_preset(preset)
@@ -106,6 +108,8 @@ def test_add_preset_load_failure(self) -> None:
                         "volumeScale": 0,
                         "prePhonemeLength": 0,
                         "postPhonemeLength": 0,
+                        "pauseLength": 0,
+                        "pauseLengthScale": 0,
                     }
                 )
             )
@@ -126,6 +130,8 @@ def test_add_preset_conflict_id(self) -> None:
                 "volumeScale": 1,
                 "prePhonemeLength": 0.1,
                 "postPhonemeLength": 0.1,
+                "pauseLength": None,
+                "pauseLengthScale": 1.0,
             }
         )
         id = preset_manager.add_preset(preset)
@@ -152,6 +158,8 @@ def test_add_preset_conflict_id2(self) -> None:
                 "volumeScale": 1,
                 "prePhonemeLength": 0.1,
                 "postPhonemeLength": 0.1,
+                "pauseLength": None,
+                "pauseLengthScale": 1.0,
             }
         )
         id = preset_manager.add_preset(preset)
@@ -178,6 +186,8 @@ def test_add_preset_write_failure(self) -> None:
                 "volumeScale": 1,
                 "prePhonemeLength": 0.1,
                 "postPhonemeLength": 0.1,
+                "pauseLength": None,
+                "pauseLengthScale": 1.0,
             }
         )
         preset_manager.load_presets()
@@ -206,6 +216,8 @@ def test_update_preset(self) -> None:
                 "volumeScale": 1,
                 "prePhonemeLength": 0.1,
                 "postPhonemeLength": 0.1,
+                "pauseLength": None,
+                "pauseLengthScale": 1.0,
             }
         )
         id = preset_manager.update_preset(preset)
@@ -234,6 +246,8 @@ def test_update_preset_load_failure(self) -> None:
                         "volumeScale": 0,
                         "prePhonemeLength": 0,
                         "postPhonemeLength": 0,
+                        "pauseLength": 0,
+                        "pauseLengthScale": 0,
                     }
                 )
             )
@@ -254,6 +268,8 @@ def test_update_preset_not_found(self) -> None:
                 "volumeScale": 1,
                 "prePhonemeLength": 0.1,
                 "postPhonemeLength": 0.1,
+                "pauseLength": None,
+                "pauseLengthScale": 1.0,
             }
         )
         with self.assertRaises(
@@ -279,6 +295,8 @@ def test_update_preset_write_failure(self) -> None:
                 "volumeScale": 1,
                 "prePhonemeLength": 0.1,
                 "postPhonemeLength": 0.1,
+                "pauseLength": None,
+                "pauseLengthScale": 1.0,
             }
         )
         preset_manager.load_presets()

diff --git a/test/test_mock_tts_engine.py b/test/test_mock_tts_engine.py
@@ -66,6 +66,8 @@ def test_synthesize_wave(self) -> None:
                 volumeScale=1,
                 prePhonemeLength=0.1,
                 postPhonemeLength=0.1,
+                pauseLength=None,
+                pauseLengthScale=1.0,
                 outputSamplingRate=24000,
                 outputStereo=False,
                 kana=create_kana(self.accent_phrases_hello_hiho),

diff --git a/test/tts_pipeline/test_tts_engine.py b/test/tts_pipeline/test_tts_engine.py
@@ -184,6 +184,8 @@ def _gen_hello_hiho_query() -> AudioQuery:
         volumeScale=1.3,
         prePhonemeLength=0.1,
         postPhonemeLength=0.2,
+        pauseLength=None,
+        pauseLengthScale=1.0,
         outputSamplingRate=12000,
         outputStereo=True,
         kana=_gen_hello_hiho_kana(),
@@ -376,6 +378,7 @@ def test_mocked_synthesize_wave_output(snapshot_json: SnapshotAssertion) -> None
     # Inputs
     tts_engine = TTSEngine(MockCoreWrapper())
     hello_hiho = _gen_hello_hiho_query()
+    print(hello_hiho)
     # Outputs
     result = tts_engine.synthesize_wave(hello_hiho, StyleId(1))
     # Tests

diff --git a/test/tts_pipeline/test_wave_synthesizer.py b/test/tts_pipeline/test_wave_synthesizer.py
@@ -26,6 +26,8 @@ def _gen_query(
     intonationScale: float = 1.0,
     prePhonemeLength: float = 0.0,
     postPhonemeLength: float = 0.0,
+    pauseLength: float | None = -1,
+    pauseLengthScale: float = 1.0,
     volumeScale: float = 1.0,
     outputSamplingRate: int = 24000,
     outputStereo: bool = False,
@@ -39,6 +41,8 @@ def _gen_query(
         intonationScale=intonationScale,
         prePhonemeLength=prePhonemeLength,
         postPhonemeLength=postPhonemeLength,
+        pauseLength=pauseLength,
+        pauseLengthScale=pauseLengthScale,
         volumeScale=volumeScale,
         outputSamplingRate=outputSamplingRate,
         outputStereo=outputStereo,
@@ -269,6 +273,8 @@ def test_query_to_decoder_feature() -> None:
         intonationScale=0.5,
         prePhonemeLength=2 * 0.01067,
         postPhonemeLength=6 * 0.01067,
+        pauseLength=None,
+        pauseLengthScale=1.0,
     )
 
     # Expects
@@ -295,7 +301,6 @@ def test_query_to_decoder_feature() -> None:
 
     # Outputs
     phoneme, f0 = query_to_decoder_feature(query)
-
     assert np.array_equal(phoneme, true_phoneme)
     assert np.array_equal(f0, true_f0)
 

@@ -64,6 +64,8 @@ def audio_query(
             volumeScale=1,
             prePhonemeLength=0.1,
             postPhonemeLength=0.1,
+            pauseLength=None,
+            pauseLengthScale=1,
             outputSamplingRate=core.default_sampling_rate,
             outputStereo=False,
             kana=create_kana(accent_phrases),
@@ -108,6 +110,8 @@ def audio_query_from_preset(
             volumeScale=selected_preset.volumeScale,
             prePhonemeLength=selected_preset.prePhonemeLength,
             postPhonemeLength=selected_preset.postPhonemeLength,
+            pauseLength=selected_preset.pauseLength,
+            pauseLengthScale=selected_preset.pauseLengthScale,
             outputSamplingRate=core.default_sampling_rate,
             outputStereo=False,
             kana=create_kana(accent_phrases),
@@ -217,7 +221,6 @@ def synthesis(
         wave = engine.synthesize_wave(
             query, style_id, enable_interrogative_upspeak=enable_interrogative_upspeak
         )
-
         with NamedTemporaryFile(delete=False) as f:
             soundfile.write(
                 file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV"

@@ -62,6 +62,8 @@ class AudioQuery(BaseModel):
     volumeScale: float = Field(title="全体の音量")
     prePhonemeLength: float = Field(title="音声の前の無音時間")
     postPhonemeLength: float = Field(title="音声の後の無音時間")
+    pauseLength: float | None = Field(title="テキスト内の無音時間(絶対値)")
+    pauseLengthScale: float = Field(title="テキスト内の無音時間(倍率)")
     outputSamplingRate: int = Field(title="音声データの出力サンプリングレート")
     outputStereo: bool = Field(title="音声データをステレオ出力するか否か")
     kana: str | None = Field(

diff --git a/voicevox_engine/preset/Preset.py b/voicevox_engine/preset/Preset.py
@@ -18,3 +18,5 @@ class Preset(BaseModel):
     volumeScale: float = Field(title="全体の音量")
     prePhonemeLength: float = Field(title="音声の前の無音時間")
     postPhonemeLength: float = Field(title="音声の後の無音時間")
+    pauseLength: float | None = Field(title="テキスト内の無音時間")
+    pauseLengthScale: float = Field(title="テキスト内の無音時間(倍率)")