From 620837924ce6f30a8eb5095b7a27fbc4a7e086b3 Mon Sep 17 00:00:00 2001
From: sabonerune <102559104+sabonerune@users.noreply.github.com>
Date: Mon, 24 Jun 2024 21:00:04 +0900
Subject: [PATCH 1/6] =?UTF-8?q?FIX:=20`AudioQuery`=E3=81=AE=E4=BA=92?=
 =?UTF-8?q?=E6=8F=9B=E6=80=A7=E3=81=AE=E5=95=8F=E9=A1=8C=E3=82=92=E4=BF=AE?=
 =?UTF-8?q?=E6=AD=A3=20(#1425)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...23\343\201\250\343\202\222\347\242\272\350\252\215.json" | 6 +++---
 voicevox_engine/model.py                                    | 2 +-
 voicevox_engine/preset/model.py                             | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json"
index f78bf0f1b..0d9e9f862 100644
--- "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json"
+++ "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json"
@@ -67,6 +67,7 @@
             "type": "number"
           },
           "pauseLengthScale": {
+            "default": 1,
             "title": "句読点などの無音時間（倍率）",
             "type": "number"
           },
@@ -99,7 +100,6 @@
           "volumeScale",
           "prePhonemeLength",
           "postPhonemeLength",
-          "pauseLengthScale",
           "outputSamplingRate",
           "outputStereo"
         ],
@@ -615,6 +615,7 @@
             "type": "number"
           },
           "pauseLengthScale": {
+            "default": 1,
             "title": "句読点などの無音時間（倍率）",
             "type": "number"
           },
@@ -657,8 +658,7 @@
           "intonationScale",
           "volumeScale",
           "prePhonemeLength",
-          "postPhonemeLength",
-          "pauseLengthScale"
+          "postPhonemeLength"
         ],
         "title": "Preset",
         "type": "object"
diff --git a/voicevox_engine/model.py b/voicevox_engine/model.py
index 7fdbe9716..ddff1a61b 100644
--- a/voicevox_engine/model.py
+++ b/voicevox_engine/model.py
@@ -28,7 +28,7 @@ class AudioQuery(BaseModel):
     pauseLength: float | SkipJsonSchema[None] = Field(
         default=None, title="句読点などの無音時間"
     )
-    pauseLengthScale: float = Field(title="句読点などの無音時間（倍率）")
+    pauseLengthScale: float = Field(default=1, title="句読点などの無音時間（倍率）")
     outputSamplingRate: int = Field(title="音声データの出力サンプリングレート")
     outputStereo: bool = Field(title="音声データをステレオ出力するか否か")
     kana: str | SkipJsonSchema[None] = Field(
diff --git a/voicevox_engine/preset/model.py b/voicevox_engine/preset/model.py
index 1b6c77bd6..d9c2d4754 100644
--- a/voicevox_engine/preset/model.py
+++ b/voicevox_engine/preset/model.py
@@ -28,4 +28,4 @@ class Preset(BaseModel):
     pauseLength: float | SkipJsonSchema[None] = Field(
         default=None, title="句読点などの無音時間"
     )
-    pauseLengthScale: float = Field(title="句読点などの無音時間（倍率）")
+    pauseLengthScale: float = Field(default=1, title="句読点などの無音時間（倍率）")

From 4a5541e5d05e62e8c6bdf3e9ddbaf4b095d77240 Mon Sep 17 00:00:00 2001
From: tarepan <tarepan5884@gmail.com>
Date: Tue, 25 Jun 2024 00:44:35 +0900
Subject: [PATCH 2/6] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E9=9F=B3=E5=A3=B0?=
 =?UTF-8?q?=E5=90=88=E6=88=90=E7=B3=BB=E3=83=86=E3=82=B9=E3=83=88=E3=81=AE?=
 =?UTF-8?q?=20utils=20=E3=82=92=E7=B5=B1=E5=BB=83=E5=90=88=20(#1428)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* refactor: ローカルの util 関数を解体

* refactor: `_gen_mora()` を util へ切り出して `gen_mora()` へリネーム

* fix: lint
---
 test/unit/tts_pipeline/test_tts_engine.py     |  60 +++------
 .../tts_pipeline/test_wave_synthesizer.py     | 115 ++++++++----------
 test/unit/tts_pipeline/tts_utils.py           |  22 ++++
 3 files changed, 88 insertions(+), 109 deletions(-)
 create mode 100644 test/unit/tts_pipeline/tts_utils.py

diff --git a/test/unit/tts_pipeline/test_tts_engine.py b/test/unit/tts_pipeline/test_tts_engine.py
index 4e71eee1e..c41034663 100644
--- a/test/unit/tts_pipeline/test_tts_engine.py
+++ b/test/unit/tts_pipeline/test_tts_engine.py
@@ -26,6 +26,7 @@
 )
 
 from .test_text_analyzer import stub_unknown_features_koxx
+from .tts_utils import gen_mora
 
 
 def yukarin_s_mock(
@@ -103,32 +104,13 @@ def is_model_loaded(self, style_id: str) -> bool:
         return True
 
 
-def _gen_mora(
-    text: str,
-    consonant: str | None,
-    consonant_length: float | None,
-    vowel: str,
-    vowel_length: float,
-    pitch: float,
-) -> Mora:
-    """Generate Mora with positional arguments for test simplicity."""
-    return Mora(
-        text=text,
-        consonant=consonant,
-        consonant_length=consonant_length,
-        vowel=vowel,
-        vowel_length=vowel_length,
-        pitch=pitch,
-    )
-
-
 def test_to_flatten_phonemes() -> None:
     """Test `to_flatten_phonemes`."""
     # Inputs
     moras = [
-        _gen_mora("　", None, None, "sil", 2 * 0.01067, 0.0),
-        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
-        _gen_mora("　", None, None, "sil", 6 * 0.01067, 0.0),
+        gen_mora("　", None, None, "sil", 2 * 0.01067, 0.0),
+        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
+        gen_mora("　", None, None, "sil", 6 * 0.01067, 0.0),
     ]
 
     # Expects
@@ -140,33 +122,25 @@ def test_to_flatten_phonemes() -> None:
     assert true_phonemes == phonemes
 
 
-def _gen_hello_hiho_text() -> str:
-    return "こんにちは、ヒホです"
-
-
-def _gen_hello_hiho_kana() -> str:
-    return "コンニチワ'、ヒ'ホデ_ス"
-
-
 def _gen_hello_hiho_accent_phrases() -> list[AccentPhrase]:
     return [
         AccentPhrase(
             moras=[
-                _gen_mora("コ", "k", 0.0, "o", 0.0, 0.0),
-                _gen_mora("ン", None, None, "N", 0.0, 0.0),
-                _gen_mora("ニ", "n", 0.0, "i", 0.0, 0.0),
-                _gen_mora("チ", "ch", 0.0, "i", 0.0, 0.0),
-                _gen_mora("ワ", "w", 0.0, "a", 0.0, 0.0),
+                gen_mora("コ", "k", 0.0, "o", 0.0, 0.0),
+                gen_mora("ン", None, None, "N", 0.0, 0.0),
+                gen_mora("ニ", "n", 0.0, "i", 0.0, 0.0),
+                gen_mora("チ", "ch", 0.0, "i", 0.0, 0.0),
+                gen_mora("ワ", "w", 0.0, "a", 0.0, 0.0),
             ],
             accent=5,
-            pause_mora=_gen_mora("、", None, None, "pau", 0.0, 0.0),
+            pause_mora=gen_mora("、", None, None, "pau", 0.0, 0.0),
         ),
         AccentPhrase(
             moras=[
-                _gen_mora("ヒ", "h", 0.0, "i", 0.0, 0.0),
-                _gen_mora("ホ", "h", 0.0, "o", 0.0, 0.0),
-                _gen_mora("デ", "d", 0.0, "e", 0.0, 0.0),
-                _gen_mora("ス", "s", 0.0, "U", 0.0, 0.0),
+                gen_mora("ヒ", "h", 0.0, "i", 0.0, 0.0),
+                gen_mora("ホ", "h", 0.0, "o", 0.0, 0.0),
+                gen_mora("デ", "d", 0.0, "e", 0.0, 0.0),
+                gen_mora("ス", "s", 0.0, "U", 0.0, 0.0),
             ],
             accent=1,
             pause_mora=None,
@@ -187,7 +161,7 @@ def _gen_hello_hiho_query() -> AudioQuery:
         pauseLengthScale=0.8,
         outputSamplingRate=12000,
         outputStereo=True,
-        kana=_gen_hello_hiho_kana(),
+        kana="コンニチワ'、ヒ'ホデ_ス",
     )
 
 
@@ -352,7 +326,7 @@ def test_mocked_create_accent_phrases_output(
     """モックされた `TTSEngine.create_accent_phrases()` の出力スナップショットが一定である"""
     # Inputs
     tts_engine = TTSEngine(MockCoreWrapper())
-    hello_hiho = _gen_hello_hiho_text()
+    hello_hiho = "こんにちは、ヒホです"
     # Outputs
     result = tts_engine.create_accent_phrases(hello_hiho, StyleId(1))
     # Tests
@@ -365,7 +339,7 @@ def test_mocked_create_accent_phrases_from_kana_output(
     """モックされた `TTSEngine.create_accent_phrases_from_kana()` の出力スナップショットが一定である"""
     # Inputs
     tts_engine = TTSEngine(MockCoreWrapper())
-    hello_hiho = _gen_hello_hiho_kana()
+    hello_hiho = "コンニチワ'、ヒ'ホデ_ス"
     # Outputs
     result = tts_engine.create_accent_phrases_from_kana(hello_hiho, StyleId(1))
     # Tests
diff --git a/test/unit/tts_pipeline/test_wave_synthesizer.py b/test/unit/tts_pipeline/test_wave_synthesizer.py
index 0d872dc37..5487bc8af 100644
--- a/test/unit/tts_pipeline/test_wave_synthesizer.py
+++ b/test/unit/tts_pipeline/test_wave_synthesizer.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 from voicevox_engine.model import AudioQuery
-from voicevox_engine.tts_pipeline.model import AccentPhrase, Mora
+from voicevox_engine.tts_pipeline.model import AccentPhrase
 from voicevox_engine.tts_pipeline.tts_engine import (
     apply_intonation_scale,
     apply_output_sampling_rate,
@@ -17,6 +17,8 @@
     raw_wave_to_output_wave,
 )
 
+from .tts_utils import gen_mora
+
 TRUE_NUM_PHONEME = 45
 
 
@@ -50,38 +52,19 @@ def _gen_query(
     )
 
 
-def _gen_mora(
-    text: str,
-    consonant: str | None,
-    consonant_length: float | None,
-    vowel: str,
-    vowel_length: float,
-    pitch: float,
-) -> Mora:
-    """Generate Mora with positional arguments for test simplicity."""
-    return Mora(
-        text=text,
-        consonant=consonant,
-        consonant_length=consonant_length,
-        vowel=vowel,
-        vowel_length=vowel_length,
-        pitch=pitch,
-    )
-
-
 def test_apply_prepost_silence() -> None:
     """Test `apply_prepost_silence`."""
     # Inputs
     query = _gen_query(prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067)
     moras = [
-        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
+        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
     ]
 
     # Expects
     true_moras_with_silence = [
-        _gen_mora("　", None, None, "sil", 2 * 0.01067, 0.0),
-        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
-        _gen_mora("　", None, None, "sil", 6 * 0.01067, 0.0),
+        gen_mora("　", None, None, "sil", 2 * 0.01067, 0.0),
+        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
+        gen_mora("　", None, None, "sil", 6 * 0.01067, 0.0),
     ]
 
     # Outputs
@@ -95,20 +78,20 @@ def test_apply_speed_scale() -> None:
     # Inputs
     query = _gen_query(speedScale=2.0)
     input_moras = [
-        _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
-        _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
-        _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
-        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
-        _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
+        gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
+        gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
+        gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
+        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
+        gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
     ]
 
     # Expects - x2 fast
     true_moras = [
-        _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
-        _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
-        _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
-        _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
-        _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
+        gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
+        gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
+        gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
+        gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
+        gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
     ]
 
     # Outputs
@@ -122,20 +105,20 @@ def test_apply_pitch_scale() -> None:
     # Inputs
     query = _gen_query(pitchScale=2.0)
     input_moras = [
-        _gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
-        _gen_mora("ン", None, None, "N", 0.0, 50.0),
-        _gen_mora("、", None, None, "pau", 0.0, 0.0),
-        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
-        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+        gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
+        gen_mora("ン", None, None, "N", 0.0, 50.0),
+        gen_mora("、", None, None, "pau", 0.0, 0.0),
+        gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
+        gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
     ]
 
     # Expects - x4 value scaled
     true_moras = [
-        _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
-        _gen_mora("ン", None, None, "N", 0.0, 200.0),
-        _gen_mora("、", None, None, "pau", 0.0, 0.0),
-        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
-        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+        gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
+        gen_mora("ン", None, None, "N", 0.0, 200.0),
+        gen_mora("、", None, None, "pau", 0.0, 0.0),
+        gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
+        gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
     ]
 
     # Outputs
@@ -149,20 +132,20 @@ def test_apply_intonation_scale() -> None:
     # Inputs
     query = _gen_query(intonationScale=0.5)
     input_moras = [
-        _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
-        _gen_mora("ン", None, None, "N", 0.0, 200.0),
-        _gen_mora("、", None, None, "pau", 0.0, 0.0),
-        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
-        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+        gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
+        gen_mora("ン", None, None, "N", 0.0, 200.0),
+        gen_mora("、", None, None, "pau", 0.0, 0.0),
+        gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
+        gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
     ]
 
     # Expects - mean=300 var x0.5 intonation scaling
     true_moras = [
-        _gen_mora("コ", "k", 0.0, "o", 0.0, 250.0),
-        _gen_mora("ン", None, None, "N", 0.0, 250.0),
-        _gen_mora("、", None, None, "pau", 0.0, 0.0),
-        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0),
-        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+        gen_mora("コ", "k", 0.0, "o", 0.0, 250.0),
+        gen_mora("ン", None, None, "N", 0.0, 250.0),
+        gen_mora("、", None, None, "pau", 0.0, 0.0),
+        gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0),
+        gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
     ]
 
     # Outputs
@@ -222,13 +205,13 @@ def test_count_frame_per_unit() -> None:
     """Test `count_frame_per_unit`."""
     # Inputs
     moras = [
-        _gen_mora("　", None, None, "　", 2 * 0.01067, 0.0),  # 0.01067 [sec/frame]
-        _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0),
-        _gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0),
-        _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
-        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0),
-        _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
-        _gen_mora("　", None, None, "　", 6 * 0.01067, 0.0),
+        gen_mora("　", None, None, "　", 2 * 0.01067, 0.0),  # 0.01067 [sec/frame]
+        gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0),
+        gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0),
+        gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
+        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0),
+        gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
+        gen_mora("　", None, None, "　", 6 * 0.01067, 0.0),
     ]
 
     # Expects
@@ -252,16 +235,16 @@ def test_query_to_decoder_feature() -> None:
     accent_phrases = [
         AccentPhrase(
             moras=[
-                _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
-                _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
+                gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
+                gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
             ],
             accent=1,
-            pause_mora=_gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
+            pause_mora=gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
         ),
         AccentPhrase(
             moras=[
-                _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
-                _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
+                gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
+                gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
             ],
             accent=1,
             pause_mora=None,
diff --git a/test/unit/tts_pipeline/tts_utils.py b/test/unit/tts_pipeline/tts_utils.py
new file mode 100644
index 000000000..947d496a6
--- /dev/null
+++ b/test/unit/tts_pipeline/tts_utils.py
@@ -0,0 +1,22 @@
+"""合成系テスト向けの utility"""
+
+from voicevox_engine.tts_pipeline.model import Mora
+
+
+def gen_mora(
+    text: str,
+    consonant: str | None,  # None when the mora has no consonant
+    consonant_length: float | None,  # None whenever consonant is None
+    vowel: str,
+    vowel_length: float,
+    pitch: float,
+) -> Mora:
+    """Build a `Mora` from positional arguments so test fixtures stay compact."""
+    return Mora(
+        text=text,
+        consonant=consonant,
+        consonant_length=consonant_length,
+        vowel=vowel,
+        vowel_length=vowel_length,
+        pitch=pitch,
+    )

From 566a5fd860f3a043f26380ba0b52fb20e520e916 Mon Sep 17 00:00:00 2001
From: sabonerune <102559104+sabonerune@users.noreply.github.com>
Date: Tue, 25 Jun 2024 01:12:33 +0900
Subject: [PATCH 3/6] =?UTF-8?q?FIX:=20Docker=E3=83=93=E3=83=AB=E3=83=89?=
 =?UTF-8?q?=E3=81=8C=E5=A4=B1=E6=95=97=E3=81=99=E3=82=8B=E5=95=8F=E9=A1=8C?=
 =?UTF-8?q?=E3=82=92=E4=BF=AE=E6=AD=A3=20(#1427)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 22be5f3dd..d261f358a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -233,6 +233,7 @@ ADD ./run.py ./presets.yaml ./engine_manifest.json /opt/voicevox_engine/
 ADD ./resources /opt/voicevox_engine/resources
 ADD ./tools/generate_licenses.py /opt/voicevox_engine/tools/
 ADD ./tools/licenses /opt/voicevox_engine/tools/licenses
+ADD ./tools/generate_filemap.py /opt/voicevox_engine/tools/
 
 # Replace version
 ARG VOICEVOX_ENGINE_VERSION=latest
@@ -259,8 +260,7 @@ RUN <<EOF
 EOF
 
 # Generate filemap.json
-ADD ./tools/generate_filemap.py /tmp/
-RUN gosu user /opt/python/bin/python3 /tmp/generate_filemap.py --target_dir resources/character_info
+RUN /opt/python/bin/python3 /opt/voicevox_engine/tools/generate_filemap.py --target_dir /opt/voicevox_engine/resources/character_info
 
 # Keep this layer separated to use layer cache on download failed in local build
 RUN <<EOF

From b8a8e9e5a864686629a08e77d9a5291f2d6ec736 Mon Sep 17 00:00:00 2001
From: Hiroshiba <hihokaruta@gmail.com>
Date: Tue, 25 Jun 2024 15:23:53 +0900
Subject: [PATCH 4/6] =?UTF-8?q?=E8=BF=BD=E5=8A=A0=EF=BC=9AAudioQuery?=
 =?UTF-8?q?=E3=81=AF=E5=BE=8C=E6=96=B9=E4=BA=92=E6=8F=9B=E6=80=A7=E3=81=8C?=
 =?UTF-8?q?=E3=81=82=E3=82=8B=E3=81=93=E3=81=A8=E3=82=92=E3=83=89=E3=82=AD?=
 =?UTF-8?q?=E3=83=A5=E3=83=A1=E3=83=B3=E3=83=88=E3=81=A7=E6=A1=88=E5=86=85?=
 =?UTF-8?q?=20(#1433)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* AudioQueryは後方互換性があることをドキュメントで案内

* 詳細に
---
 ...3\203\263\343\201\250\343\201\256\351\200\243\346\220\272.md" | 1 +
 1 file changed, 1 insertion(+)

diff --git "a/docs/VOICEVOX\351\237\263\345\243\260\345\220\210\346\210\220\343\202\250\343\203\263\343\202\270\343\203\263\343\201\250\343\201\256\351\200\243\346\220\272.md" "b/docs/VOICEVOX\351\237\263\345\243\260\345\220\210\346\210\220\343\202\250\343\203\263\343\202\270\343\203\263\343\201\250\343\201\256\351\200\243\346\220\272.md"
index 540173be1..38a95c6ad 100644
--- "a/docs/VOICEVOX\351\237\263\345\243\260\345\220\210\346\210\220\343\202\250\343\203\263\343\202\270\343\203\263\343\201\250\343\201\256\351\200\243\346\220\272.md"
+++ "b/docs/VOICEVOX\351\237\263\345\243\260\345\220\210\346\210\220\343\202\250\343\203\263\343\202\270\343\203\263\343\201\250\343\201\256\351\200\243\346\220\272.md"
@@ -2,6 +2,7 @@
 
 - バージョンが上がっても、`/audio_query`で返ってくる値をそのまま`/synthesis`に POST すれば音声合成できるようにする予定です
   - `AudioQuery`のパラメータは増えますが、なるべくデフォルト値で以前と変わらない音声が生成されるようにします
+  - 以前のバージョンの`AudioQuery`を新しいバージョンの`/synthesis`にそのまま POST できるようにします（後方互換）
 - バージョン 0.7 から音声スタイルが実装されました。スタイルの情報は`/speakers`から取得できます
   - スタイルの情報にある`style_id`を`speaker`に指定することで、今まで通り音声合成ができます
     - style_id の指定先が speaker なのは互換性のためです

From 6813b41c6c3a307fa7fdb912873d5f124a792ad3 Mon Sep 17 00:00:00 2001
From: Hiroshiba <hihokaruta@gmail.com>
Date: Tue, 25 Jun 2024 15:24:12 +0900
Subject: [PATCH 5/6] =?UTF-8?q?=E8=BF=BD=E5=8A=A0=EF=BC=9A=E3=82=A8?=
 =?UTF-8?q?=E3=83=B3=E3=82=B8=E3=83=B3=E3=83=9E=E3=83=8B=E3=83=95=E3=82=A7?=
 =?UTF-8?q?=E3=82=B9=E3=83=88=E3=81=AB=E7=84=A1=E9=9F=B3=E6=99=82=E9=96=93?=
 =?UTF-8?q?=E3=82=92=E8=AA=BF=E6=95=B4=E3=81=99=E3=82=8B=E8=83=BD=E5=8A=9B?=
 =?UTF-8?q?`adjust=5Fpause=5Flength`=E3=82=92=E8=BF=BD=E5=8A=A0=20(#1432)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

エンジンマニフェストに無音時間を調整する能力adjust_pause_lengthを追加
---
 engine_manifest.json                                         | 5 +++++
 ...223\343\201\250\343\202\222\347\242\272\350\252\215.json" | 4 ++++
 .../test_engine_manifest/test_get_engine_manifest_200.json   | 1 +
 voicevox_engine/engine_manifest.py                           | 4 ++++
 4 files changed, 14 insertions(+)

diff --git a/engine_manifest.json b/engine_manifest.json
index 000edb6dc..b3a840501 100644
--- a/engine_manifest.json
+++ b/engine_manifest.json
@@ -44,6 +44,11 @@
             "value": true,
             "name": "全体の音量の調整"
         },
+        "adjust_pause_length": {
+            "type": "bool",
+            "value": true,
+            "name": "句読点などの無音時間の調整"
+        },
         "interrogative_upspeak": {
             "type": "bool",
             "value": true,
diff --git "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json"
index 0d9e9f862..65c84de4c 100644
--- "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json"
+++ "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json"
@@ -861,6 +861,10 @@
             "title": "モーラごとの音高の調整",
             "type": "boolean"
           },
+          "adjust_pause_length": {
+            "title": "句読点などの無音時間の調整",
+            "type": "boolean"
+          },
           "adjust_phoneme_length": {
             "title": "音素ごとの長さの調整",
             "type": "boolean"
diff --git a/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json b/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json
index 774fdaa43..d24ff2283 100644
--- a/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json
+++ b/test/e2e/single_api/engine_info/__snapshots__/test_engine_manifest/test_get_engine_manifest_200.json
@@ -16,6 +16,7 @@
   "supported_features": {
     "adjust_intonation_scale": true,
     "adjust_mora_pitch": true,
+    "adjust_pause_length": true,
     "adjust_phoneme_length": true,
     "adjust_pitch_scale": true,
     "adjust_speed_scale": true,
diff --git a/voicevox_engine/engine_manifest.py b/voicevox_engine/engine_manifest.py
index 30c5198f2..05677ed7a 100644
--- a/voicevox_engine/engine_manifest.py
+++ b/voicevox_engine/engine_manifest.py
@@ -34,6 +34,7 @@ class SupportedFeaturesJson:
     adjust_pitch_scale: FeatureSupportJson
     adjust_intonation_scale: FeatureSupportJson
     adjust_volume_scale: FeatureSupportJson
+    adjust_pause_length: FeatureSupportJson
     interrogative_upspeak: FeatureSupportJson
     synthesis_morphing: FeatureSupportJson
     sing: FeatureSupportJson
@@ -103,6 +104,9 @@ class SupportedFeatures(BaseModel):
     adjust_pitch_scale: bool = Field(title="全体の音高の調整")
     adjust_intonation_scale: bool = Field(title="全体の抑揚の調整")
     adjust_volume_scale: bool = Field(title="全体の音量の調整")
+    adjust_pause_length: bool | SkipJsonSchema[None] = Field(
+        default=None, title="句読点などの無音時間の調整"
+    )
     interrogative_upspeak: bool = Field(title="疑問文の自動調整")
     synthesis_morphing: bool = Field(
         title="2種類のスタイルでモーフィングした音声を合成"

From 4965657faa95bc65f4cebb6c31b842969da466e6 Mon Sep 17 00:00:00 2001
From: tarepan <tarepan5884@gmail.com>
Date: Tue, 25 Jun 2024 15:36:22 +0900
Subject: [PATCH 6/6] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=83=86=E3=82=B9?=
 =?UTF-8?q?=E3=83=88=E7=94=A8=E3=81=AE=E3=83=94=E3=83=83=E3=83=81=E3=82=92?=
 =?UTF-8?q?=20log=20=E3=82=B9=E3=82=B1=E3=83=BC=E3=83=AB=E3=81=AB=E4=BF=AE?=
 =?UTF-8?q?=E6=AD=A3=20(#1426)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix: テスト用のピッチを log スケールに修正
---
 test/unit/tts_pipeline/test_tts_engine.py     |  2 +-
 .../tts_pipeline/test_wave_synthesizer.py     | 56 +++++++++----------
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/test/unit/tts_pipeline/test_tts_engine.py b/test/unit/tts_pipeline/test_tts_engine.py
index c41034663..bd08189aa 100644
--- a/test/unit/tts_pipeline/test_tts_engine.py
+++ b/test/unit/tts_pipeline/test_tts_engine.py
@@ -109,7 +109,7 @@ def test_to_flatten_phonemes() -> None:
     # Inputs
     moras = [
         gen_mora("　", None, None, "sil", 2 * 0.01067, 0.0),
-        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
+        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0),
         gen_mora("　", None, None, "sil", 6 * 0.01067, 0.0),
     ]
 
diff --git a/test/unit/tts_pipeline/test_wave_synthesizer.py b/test/unit/tts_pipeline/test_wave_synthesizer.py
index 5487bc8af..ce213c3c1 100644
--- a/test/unit/tts_pipeline/test_wave_synthesizer.py
+++ b/test/unit/tts_pipeline/test_wave_synthesizer.py
@@ -57,13 +57,13 @@ def test_apply_prepost_silence() -> None:
     # Inputs
     query = _gen_query(prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067)
     moras = [
-        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
+        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0),
     ]
 
     # Expects
     true_moras_with_silence = [
         gen_mora("　", None, None, "sil", 2 * 0.01067, 0.0),
-        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
+        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0),
         gen_mora("　", None, None, "sil", 6 * 0.01067, 0.0),
     ]
 
@@ -78,19 +78,19 @@ def test_apply_speed_scale() -> None:
     # Inputs
     query = _gen_query(speedScale=2.0)
     input_moras = [
-        gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
-        gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
+        gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 5.0),
+        gen_mora("ン", None, None, "N", 4 * 0.01067, 5.0),
         gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
-        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
+        gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 6.0),
         gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
     ]
 
     # Expects - x2 fast
     true_moras = [
-        gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
-        gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
+        gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 5.0),
+        gen_mora("ン", None, None, "N", 2 * 0.01067, 5.0),
         gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
-        gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
+        gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 6.0),
         gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
     ]
 
@@ -105,19 +105,19 @@ def test_apply_pitch_scale() -> None:
     # Inputs
     query = _gen_query(pitchScale=2.0)
     input_moras = [
-        gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
-        gen_mora("ン", None, None, "N", 0.0, 50.0),
+        gen_mora("コ", "k", 0.0, "o", 0.0, 5.0),
+        gen_mora("ン", None, None, "N", 0.0, 5.0),
         gen_mora("、", None, None, "pau", 0.0, 0.0),
-        gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
+        gen_mora("ヒ", "h", 0.0, "i", 0.0, 6.0),
         gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
     ]
 
     # Expects - x4 value scaled
     true_moras = [
-        gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
-        gen_mora("ン", None, None, "N", 0.0, 200.0),
+        gen_mora("コ", "k", 0.0, "o", 0.0, 20.0),
+        gen_mora("ン", None, None, "N", 0.0, 20.0),
         gen_mora("、", None, None, "pau", 0.0, 0.0),
-        gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
+        gen_mora("ヒ", "h", 0.0, "i", 0.0, 24.0),
         gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
     ]
 
@@ -132,19 +132,19 @@ def test_apply_intonation_scale() -> None:
     # Inputs
     query = _gen_query(intonationScale=0.5)
     input_moras = [
-        gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
-        gen_mora("ン", None, None, "N", 0.0, 200.0),
+        gen_mora("コ", "k", 0.0, "o", 0.0, 5.0),
+        gen_mora("ン", None, None, "N", 0.0, 5.0),
         gen_mora("、", None, None, "pau", 0.0, 0.0),
-        gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
+        gen_mora("ヒ", "h", 0.0, "i", 0.0, 8.0),
         gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
     ]
 
-    # Expects - mean=300 var x0.5 intonation scaling
+    # Expects - mean of nonzero pitches=6, deviation from mean scaled x0.5
     true_moras = [
-        gen_mora("コ", "k", 0.0, "o", 0.0, 250.0),
-        gen_mora("ン", None, None, "N", 0.0, 250.0),
+        gen_mora("コ", "k", 0.0, "o", 0.0, 5.5),
+        gen_mora("ン", None, None, "N", 0.0, 5.5),
         gen_mora("、", None, None, "pau", 0.0, 0.0),
-        gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0),
+        gen_mora("ヒ", "h", 0.0, "i", 0.0, 7.0),
         gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
     ]
 
@@ -235,15 +235,15 @@ def test_query_to_decoder_feature() -> None:
     accent_phrases = [
         AccentPhrase(
             moras=[
-                gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
-                gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
+                gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 5.0),
+                gen_mora("ン", None, None, "N", 4 * 0.01067, 5.0),
             ],
             accent=1,
             pause_mora=gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
         ),
         AccentPhrase(
             moras=[
-                gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
+                gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 8.0),
                 gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
             ],
             accent=1,
@@ -275,10 +275,10 @@ def test_query_to_decoder_feature() -> None:
     # Pitch
     #                   paw ko  N pau hi hO paw
     # frame_per_vowel = [1, 3,  2, 1, 3, 3, 3]
-    #           pau   ko     ko     ko      N      N
-    true1_f0 = [0.0, 250.0, 250.0, 250.0, 250.0, 250.0]
-    #           pau  pau   hi     hi     hi
-    true2_f0 = [0.0, 0.0, 400.0, 400.0, 400.0]
+    #           pau   ko    ko    ko     N     N
+    true1_f0 = [0.0, 22.0, 22.0, 22.0, 22.0, 22.0]
+    #           pau  pau  hi    hi    hi
+    true2_f0 = [0.0, 0.0, 28.0, 28.0, 28.0]
     #           hO   hO   hO   paw  paw  paw
     true3_f0 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
     true_f0 = np.array(true1_f0 + true2_f0 + true3_f0, dtype=np.float32)