From 829b35e07f28f6f573ff4408a3a620d11bd4b3a7 Mon Sep 17 00:00:00 2001
From: terepan <tarepan5884@gmail.com>
Date: Tue, 9 Jan 2024 14:49:15 +0000
Subject: [PATCH] =?UTF-8?q?refactor:=20=E7=84=A1=E9=9F=B3=E4=BB=98?=
 =?UTF-8?q?=E5=8A=A0=E3=82=92=20`CoreAdapter`=20=E3=81=B8=E7=A7=BB?=
 =?UTF-8?q?=E6=A4=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 voicevox_engine/core_adapter.py            | 15 ++++++++++++++-
 voicevox_engine/tts_pipeline/tts_engine.py | 13 +++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/voicevox_engine/core_adapter.py b/voicevox_engine/core_adapter.py
index 9197e3876..56d9fab2c 100644
--- a/voicevox_engine/core_adapter.py
+++ b/voicevox_engine/core_adapter.py
@@ -89,8 +89,17 @@ def safe_yukarin_sa_forward(
         end_accent_phrase_list: NDArray[np.int64],
         style_id: StyleId,
     ) -> NDArray[np.float32]:
-        # 「指定スタイルを初期化」「mutexによる安全性」「系列長・データ型に関するアダプター」を提供する
+        # 「指定スタイルを初期化」「mutexによる安全性」「コア仕様に従う無音自動付加」「系列長・データ型に関するアダプター」を提供する
         self.initialize_style_id_synthesis(style_id, skip_reinit=True)
+
+        # 前後無音を付加する（詳細: voicevox_engine#924）
+        vowel_phoneme_list = np.r_[0, vowel_phoneme_list, 0]
+        consonant_phoneme_list = np.r_[-1, consonant_phoneme_list, -1]
+        start_accent_list = np.r_[0, start_accent_list, 0]
+        end_accent_list = np.r_[0, end_accent_list, 0]
+        start_accent_phrase_list = np.r_[0, start_accent_phrase_list, 0]
+        end_accent_phrase_list = np.r_[0, end_accent_phrase_list, 0]
+
         with self.mutex:
             f0_list = self.core.yukarin_sa_forward(
                 length=vowel_phoneme_list.shape[0],
@@ -102,6 +111,10 @@ def safe_yukarin_sa_forward(
                 end_accent_phrase_list=end_accent_phrase_list[np.newaxis],
                 style_id=np.array(style_id, dtype=np.int64).reshape(-1),
             )[0]
+
+        # 前後無音に相当する領域を破棄する
+        f0_list = f0_list[1:-1]
+
         return f0_list
 
     def safe_decode_forward(
diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py
index d2b9adc91..b1f4b7b1d 100644
--- a/voicevox_engine/tts_pipeline/tts_engine.py
+++ b/voicevox_engine/tts_pipeline/tts_engine.py
@@ -317,18 +317,11 @@ def update_pitch(
             [_create_one_hot(accent_phrase, -1) for accent_phrase in accent_phrases]
         )
 
-        # 前後無音を付加する
-        start_accent_list = np.r_[0, start_accent_list, 0]
-        end_accent_list = np.r_[0, end_accent_list, 0]
-        start_accent_phrase_list = np.r_[0, start_accent_phrase_list, 0]
-        end_accent_phrase_list = np.r_[0, end_accent_phrase_list, 0]
-
-        # アクセント句系列から（前後の無音含まない）モーラ系列と（前後の無音含む）音素系列を抽出する
+        # アクセント句系列からモーラ系列と音素系列を抽出する
         moras = to_flatten_moras(accent_phrases)
         phonemes = to_flatten_phonemes(moras)
-        phonemes = [Phoneme("pau")] + phonemes + [Phoneme("pau")]
 
-        # 前後無音付加済みの音素系列から子音ID系列・母音ID系列を抽出する
+        # 音素系列から子音ID系列・母音ID系列を抽出する
         consonants, vowels = split_mora(phonemes)
         vowel_ids = np.array([p.phoneme_id for p in vowels], dtype=np.int64)
         consonant_ids = np.array(
@@ -353,7 +346,7 @@ def update_pitch(
 
         # 更新する
         for i, mora in enumerate(moras):
-            mora.pitch = f0[i + 1]
+            mora.pitch = f0[i]
 
         return accent_phrases