From 94c261cb60196b511a3793b7cb2e4695a09a7ffa Mon Sep 17 00:00:00 2001 From: tarepan Date: Thu, 27 Jun 2024 12:44:45 +0900 Subject: [PATCH] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`tts=5Fengine`=20?= =?UTF-8?q?=E3=83=A2=E3=82=B8=E3=83=A5=E3=83=BC=E3=83=AB=E5=86=85=E3=81=AE?= =?UTF-8?q?=E3=83=97=E3=83=A9=E3=82=A4=E3=83=99=E3=83=BC=E3=83=88=E3=81=AA?= =?UTF-8?q?=E9=96=A2=E6=95=B0=E3=82=92=E3=83=AA=E3=83=8D=E3=83=BC=E3=83=A0?= =?UTF-8?q?=E3=80=81=E3=83=86=E3=82=B9=E3=83=88=E3=82=92=E6=95=B4=E7=90=86?= =?UTF-8?q?=20(#1435)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: `to_flatten_moras()` の docstring を簡略化 * refactor: `to_flatten_phonemes()` をプライベートへリネーム、テストを明確化 * refactor: `generate_silence_mora()` をプライベート化 * refactor: `_apply_interrogative_upspeak()` をプライベート化 * refactor: `apply_prepost_silence()` をプライベート化、テストを簡略化 * refactor: `apply_speed_scale()` をプライベート化、テストを簡略化 * refactor: `apply_pitch_scale()` をプライベート化、テストを整理 * refactor: `apply_intonation_scale()` をプライベート化、テストをフォーマット * refactor: `apply_pause_length()` と `apply_pause_length_scale()` をプライベート化 * refactor: `apply_volume_scale()` をプライベート化、テストをフォーマット * refactor: `apply_output_sampling_rate()` をプライベート化、テストをフォーマット * refactor: `apply_output_stereo()` をプライベート化、テストをフォーマット * refactor: `query_to_decoder_feature()` をプライベート化、テストを簡略化 * refactor: `count_frame_per_unit()` をプライベート化、テストを簡略化 * refactor: `calc_phoneme_lengths()` をプライベート化 * refactor: `notes_to_keys_and_phonemes()` をプライベート化 * refactor: `frame_query_to_sf_decoder_feature()` をプライベート化 * refactor: 定数 `T` を関数化 * fix: type * fix: lint --- test/unit/tts_pipeline/test_tts_engine.py | 48 +++--- .../tts_pipeline/test_wave_synthesizer.py | 139 +++++++++--------- test/unit/tts_pipeline/tts_utils.py | 5 + voicevox_engine/tts_pipeline/tts_engine.py | 87 +++++------ 4 files changed, 134 insertions(+), 145 deletions(-) diff --git a/test/unit/tts_pipeline/test_tts_engine.py b/test/unit/tts_pipeline/test_tts_engine.py index 4cb273e98..19fdea272 100644 --- a/test/unit/tts_pipeline/test_tts_engine.py +++ b/test/unit/tts_pipeline/test_tts_engine.py @@ -20,31 +20,31 @@ from voicevox_engine.tts_pipeline.text_analyzer import text_to_accent_phrases from voicevox_engine.tts_pipeline.tts_engine import ( TTSEngine, - apply_interrogative_upspeak, + _apply_interrogative_upspeak, + _to_flatten_phonemes, to_flatten_moras, - to_flatten_phonemes, ) from .test_text_analyzer import stub_unknown_features_koxx -from .tts_utils import gen_mora +from .tts_utils import gen_mora, sec def test_to_flatten_phonemes() -> None: - """Test `to_flatten_phonemes`.""" + """Test `_to_flatten_phonemes()`.""" # Inputs moras = [ - gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0), - gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), + gen_mora(" ", None, None, "sil", sec(2), 0.0), + gen_mora("ヒ", "h", sec(2), "i", sec(4), 5.0), + gen_mora(" ", None, None, "sil", sec(6), 0.0), ] - # Expects - true_phonemes = ["pau", "h", "i", "pau"] - + true_phoneme_strs = ["pau", "h", "i", "pau"] # Outputs - phonemes = list(map(lambda p: p._phoneme, to_flatten_phonemes(moras))) + phonemes = _to_flatten_phonemes(moras) + phoneme_strs = list(map(lambda p: p._phoneme, phonemes)) - assert true_phonemes == phonemes + # Test + assert true_phoneme_strs == phoneme_strs def _gen_hello_hiho_accent_phrases() -> list[AccentPhrase]: @@ -458,7 +458,7 @@ def test_upspeak_voiced_last_mora() -> None: ) ] # Outputs - outputs = apply_interrogative_upspeak(inputs, True) + outputs = _apply_interrogative_upspeak(inputs, True) # Test assert expected == outputs @@ -469,7 +469,7 @@ def test_upspeak_voiced_last_mora() -> None: expected = koreha_arimasuka_base_expected() expected[-1].is_interrogative = True # Outputs - outputs = apply_interrogative_upspeak(inputs, False) + outputs = _apply_interrogative_upspeak(inputs, False) # Test assert expected == outputs @@ -479,7 +479,7 @@ def test_upspeak_voiced_last_mora() -> None: # Expects expected = koreha_arimasuka_base_expected() # Outputs - outputs = apply_interrogative_upspeak(inputs, True) + outputs = _apply_interrogative_upspeak(inputs, True) # Test assert expected == outputs @@ -510,7 +510,7 @@ def nn_base_expected() -> list[AccentPhrase]: # Expects expected = nn_base_expected() # Outputs - outputs = apply_interrogative_upspeak(inputs, True) + outputs = _apply_interrogative_upspeak(inputs, True) # Test assert expected == outputs @@ -531,7 +531,7 @@ def nn_base_expected() -> list[AccentPhrase]: ) ] # Outputs - outputs = apply_interrogative_upspeak(inputs, True) + outputs = _apply_interrogative_upspeak(inputs, True) # Test assert expected == outputs @@ -542,7 +542,7 @@ def nn_base_expected() -> list[AccentPhrase]: expected = nn_base_expected() expected[-1].is_interrogative = True # Outputs - outputs = apply_interrogative_upspeak(inputs, False) + outputs = _apply_interrogative_upspeak(inputs, False) # Test assert expected == outputs @@ -573,7 +573,7 @@ def ltu_base_expected() -> list[AccentPhrase]: # Expects expected = ltu_base_expected() # Outputs - outputs = apply_interrogative_upspeak(inputs, True) + outputs = _apply_interrogative_upspeak(inputs, True) # Test assert expected == outputs @@ -584,7 +584,7 @@ def ltu_base_expected() -> list[AccentPhrase]: expected = ltu_base_expected() expected[-1].is_interrogative = True # Outputs - outputs = apply_interrogative_upspeak(inputs, True) + outputs = _apply_interrogative_upspeak(inputs, True) # Test assert expected == outputs @@ -595,7 +595,7 @@ def ltu_base_expected() -> list[AccentPhrase]: expected = ltu_base_expected() expected[-1].is_interrogative = True # Outputs - outputs = apply_interrogative_upspeak(inputs, False) + outputs = _apply_interrogative_upspeak(inputs, False) # Test assert expected == outputs @@ -626,7 +626,7 @@ def su_base_expected() -> list[AccentPhrase]: # Expects expected = su_base_expected() # Outputs - outputs = apply_interrogative_upspeak(inputs, True) + outputs = _apply_interrogative_upspeak(inputs, True) # Test assert expected == outputs @@ -647,7 +647,7 @@ def su_base_expected() -> list[AccentPhrase]: ) ] # Outputs - outputs = apply_interrogative_upspeak(inputs, True) + outputs = _apply_interrogative_upspeak(inputs, True) # Test assert expected == outputs @@ -658,6 +658,6 @@ def su_base_expected() -> list[AccentPhrase]: expected = su_base_expected() expected[-1].is_interrogative = True # Outputs - outputs = apply_interrogative_upspeak(inputs, False) + outputs = _apply_interrogative_upspeak(inputs, False) # Test assert expected == outputs diff --git a/test/unit/tts_pipeline/test_wave_synthesizer.py b/test/unit/tts_pipeline/test_wave_synthesizer.py index ce213c3c1..aca333936 100644 --- a/test/unit/tts_pipeline/test_wave_synthesizer.py +++ b/test/unit/tts_pipeline/test_wave_synthesizer.py @@ -5,19 +5,19 @@ from voicevox_engine.model import AudioQuery from voicevox_engine.tts_pipeline.model import AccentPhrase from voicevox_engine.tts_pipeline.tts_engine import ( - apply_intonation_scale, - apply_output_sampling_rate, - apply_output_stereo, - apply_pitch_scale, - apply_prepost_silence, - apply_speed_scale, - apply_volume_scale, - count_frame_per_unit, - query_to_decoder_feature, + _apply_intonation_scale, + _apply_output_sampling_rate, + _apply_output_stereo, + _apply_pitch_scale, + _apply_prepost_silence, + _apply_speed_scale, + _apply_volume_scale, + _count_frame_per_unit, + _query_to_decoder_feature, raw_wave_to_output_wave, ) -from .tts_utils import gen_mora +from .tts_utils import gen_mora, sec TRUE_NUM_PHONEME = 45 @@ -53,55 +53,53 @@ def _gen_query( def test_apply_prepost_silence() -> None: - """Test `apply_prepost_silence`.""" + """Test `_apply_prepost_silence()`.""" # Inputs - query = _gen_query(prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067) + query = _gen_query(prePhonemeLength=sec(2), postPhonemeLength=sec(6)) moras = [ - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0), + gen_mora("ヒ", "h", sec(2), "i", sec(4), 5.0), ] - # Expects true_moras_with_silence = [ - gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 5.0), - gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), + gen_mora(" ", None, None, "sil", sec(2), 0.0), + gen_mora("ヒ", "h", sec(2), "i", sec(4), 5.0), + gen_mora(" ", None, None, "sil", sec(6), 0.0), ] - # Outputs - moras_with_silence = apply_prepost_silence(moras, query) + moras_with_silence = _apply_prepost_silence(moras, query) + # Test assert moras_with_silence == true_moras_with_silence def test_apply_speed_scale() -> None: - """Test `apply_speed_scale`.""" + """Test `_apply_speed_scale()`.""" # Inputs query = _gen_query(speedScale=2.0) input_moras = [ - gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 5.0), - gen_mora("ン", None, None, "N", 4 * 0.01067, 5.0), - gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 6.0), - gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + gen_mora("コ", "k", sec(2), "o", sec(4), 5.0), + gen_mora("ン", None, None, "N", sec(4), 5.0), + gen_mora("、", None, None, "pau", sec(2), 0.0), + gen_mora("ヒ", "h", sec(2), "i", sec(4), 6.0), + gen_mora("ホ", "h", sec(4), "O", sec(2), 0.0), ] - # Expects - x2 fast true_moras = [ - gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 5.0), - gen_mora("ン", None, None, "N", 2 * 0.01067, 5.0), - gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), - gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 6.0), - gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), + gen_mora("コ", "k", sec(1), "o", sec(2), 5.0), + gen_mora("ン", None, None, "N", sec(2), 5.0), + gen_mora("、", None, None, "pau", sec(1), 0.0), + gen_mora("ヒ", "h", sec(1), "i", sec(2), 6.0), + gen_mora("ホ", "h", sec(2), "O", sec(1), 0.0), ] - # Outputs - moras = apply_speed_scale(input_moras, query) + moras = _apply_speed_scale(input_moras, query) + # Test assert moras == true_moras def test_apply_pitch_scale() -> None: - """Test `apply_pitch_scale`.""" + """Test `_apply_pitch_scale()`.""" # Inputs query = _gen_query(pitchScale=2.0) input_moras = [ @@ -111,7 +109,6 @@ def test_apply_pitch_scale() -> None: gen_mora("ヒ", "h", 0.0, "i", 0.0, 6.0), gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] - # Expects - x4 value scaled true_moras = [ gen_mora("コ", "k", 0.0, "o", 0.0, 20.0), @@ -120,15 +117,15 @@ def test_apply_pitch_scale() -> None: gen_mora("ヒ", "h", 0.0, "i", 0.0, 24.0), gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] - # Outputs - moras = apply_pitch_scale(input_moras, query) + moras = _apply_pitch_scale(input_moras, query) + # Test assert moras == true_moras def test_apply_intonation_scale() -> None: - """Test `apply_intonation_scale`.""" + """Test `_apply_intonation_scale()`.""" # Inputs query = _gen_query(intonationScale=0.5) input_moras = [ @@ -138,7 +135,6 @@ def test_apply_intonation_scale() -> None: gen_mora("ヒ", "h", 0.0, "i", 0.0, 8.0), gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] - # Expects - mean=6 var x0.5 intonation scaling true_moras = [ gen_mora("コ", "k", 0.0, "o", 0.0, 5.5), @@ -147,71 +143,68 @@ def test_apply_intonation_scale() -> None: gen_mora("ヒ", "h", 0.0, "i", 0.0, 7.0), gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), ] - # Outputs - moras = apply_intonation_scale(input_moras, query) + moras = _apply_intonation_scale(input_moras, query) + # Test assert moras == true_moras def test_apply_volume_scale() -> None: - """Test `apply_volume_scale`.""" + """Test `_apply_volume_scale()`.""" # Inputs query = _gen_query(volumeScale=3.0) input_wave = np.array([0.0, 1.0, 2.0]) - # Expects - x3 scale true_wave = np.array([0.0, 3.0, 6.0]) - # Outputs - wave = apply_volume_scale(input_wave, query) + wave = _apply_volume_scale(input_wave, query) + # Test assert np.allclose(wave, true_wave) def test_apply_output_sampling_rate() -> None: - """Test `apply_output_sampling_rate`.""" + """Test `_apply_output_sampling_rate()`.""" # Inputs query = _gen_query(outputSamplingRate=12000) input_wave = np.array([1.0 for _ in range(120)]) input_sr_wave = 24000 - # Expects - half sampling rate true_wave = np.array([1.0 for _ in range(60)]) assert true_wave.shape == (60,), "Prerequisites" - # Outputs - wave = apply_output_sampling_rate(input_wave, input_sr_wave, query) + wave = _apply_output_sampling_rate(input_wave, input_sr_wave, query) + # Test assert wave.shape[0] == true_wave.shape[0] def test_apply_output_stereo() -> None: - """Test `apply_output_stereo`.""" + """Test `_apply_output_stereo()`.""" # Inputs query = _gen_query(outputStereo=True) input_wave = np.array([1.0, 0.0, 2.0]) - # Expects - Stereo :: (Time, Channel) true_wave = np.array([[1.0, 1.0], [0.0, 0.0], [2.0, 2.0]]) - # Outputs - wave = apply_output_stereo(input_wave, query) + wave = _apply_output_stereo(input_wave, query) + # Test assert np.array_equal(wave, true_wave) def test_count_frame_per_unit() -> None: - """Test `count_frame_per_unit`.""" + """Test `_count_frame_per_unit()`.""" # Inputs moras = [ - gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame] - gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0), - gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0), - gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0), - gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), - gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0), + gen_mora(" ", None, None, " ", sec(2), 0.0), + gen_mora("コ", "k", sec(2), "o", sec(4), 0.0), + gen_mora("ン", None, None, "N", sec(4), 0.0), + gen_mora("、", None, None, "pau", sec(2), 0.0), + gen_mora("ヒ", "h", sec(2), "i", sec(4), 0.0), + gen_mora("ホ", "h", sec(4), "O", sec(2), 0.0), + gen_mora(" ", None, None, " ", sec(6), 0.0), ] # Expects @@ -223,28 +216,29 @@ def test_count_frame_per_unit() -> None: true_frame_per_mora = np.array(true_frame_per_mora_list, dtype=np.int32) # Outputs - frame_per_phoneme, frame_per_mora = count_frame_per_unit(moras) + frame_per_phoneme, frame_per_mora = _count_frame_per_unit(moras) + # Test assert np.array_equal(frame_per_phoneme, true_frame_per_phoneme) assert np.array_equal(frame_per_mora, true_frame_per_mora) def test_query_to_decoder_feature() -> None: - """Test `query_to_decoder_feature`.""" + """Test `_query_to_decoder_feature()`.""" # Inputs accent_phrases = [ AccentPhrase( moras=[ - gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 5.0), - gen_mora("ン", None, None, "N", 4 * 0.01067, 5.0), + gen_mora("コ", "k", sec(2), "o", sec(4), 5.0), + gen_mora("ン", None, None, "N", sec(4), 5.0), ], accent=1, - pause_mora=gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + pause_mora=gen_mora("、", None, None, "pau", sec(2), 0.0), ), AccentPhrase( moras=[ - gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 8.0), - gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + gen_mora("ヒ", "h", sec(2), "i", sec(4), 8.0), + gen_mora("ホ", "h", sec(4), "O", sec(2), 0.0), ], accent=1, pause_mora=None, @@ -255,9 +249,9 @@ def test_query_to_decoder_feature() -> None: speedScale=2.0, pitchScale=2.0, intonationScale=0.5, - prePhonemeLength=2 * 0.01067, - postPhonemeLength=6 * 0.01067, - pauseLength=16 * 0.01067, + prePhonemeLength=sec(2), + postPhonemeLength=sec(6), + pauseLength=sec(16), pauseLengthScale=0.25, ) @@ -284,8 +278,9 @@ def test_query_to_decoder_feature() -> None: true_f0 = np.array(true1_f0 + true2_f0 + true3_f0, dtype=np.float32) # Outputs - phoneme, f0 = query_to_decoder_feature(query) + phoneme, f0 = _query_to_decoder_feature(query) + # Test assert np.array_equal(phoneme, true_phoneme) assert np.array_equal(f0, true_f0) diff --git a/test/unit/tts_pipeline/tts_utils.py b/test/unit/tts_pipeline/tts_utils.py index 947d496a6..dc58ae004 100644 --- a/test/unit/tts_pipeline/tts_utils.py +++ b/test/unit/tts_pipeline/tts_utils.py @@ -3,6 +3,11 @@ from voicevox_engine.tts_pipeline.model import Mora +def sec(frame: int) -> float: + """フレーム数に相当する秒数を返す。""" + return 0.01067 * frame # 1 フレームが約 10.67 ミリ秒 + + def gen_mora( text: str, consonant: str | None, diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 373d65c11..91de3993a 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -31,19 +31,8 @@ class TalkSingInvalidInputError(Exception): pass -# TODO: move mora utility to mora module def to_flatten_moras(accent_phrases: list[AccentPhrase]) -> list[Mora]: - """ - アクセント句系列に含まれるモーラの抽出 - Parameters - ---------- - accent_phrases : list[AccentPhrase] - アクセント句系列 - Returns - ------- - moras : list[Mora] - モーラ系列。ポーズモーラを含む。 - """ + """アクセント句系列からモーラ系列を抽出する。""" moras: list[Mora] = [] for accent_phrase in accent_phrases: moras += accent_phrase.moras @@ -52,7 +41,7 @@ def to_flatten_moras(accent_phrases: list[AccentPhrase]) -> list[Mora]: return moras -def to_flatten_phonemes(moras: list[Mora]) -> list[Phoneme]: +def _to_flatten_phonemes(moras: list[Mora]) -> list[Phoneme]: """モーラ系列から音素系列を抽出する""" phonemes: list[Phoneme] = [] for mora in moras: @@ -73,12 +62,12 @@ def _create_one_hot(accent_phrase: AccentPhrase, index: int) -> NDArray[np.int64 return onehot.astype(np.int64) -def generate_silence_mora(length: float) -> Mora: +def _generate_silence_mora(length: float) -> Mora: """無音モーラの生成""" return Mora(text=" ", vowel="sil", vowel_length=length, pitch=0.0) -def apply_interrogative_upspeak( +def _apply_interrogative_upspeak( accent_phrases: list[AccentPhrase], enable_interrogative_upspeak: bool ) -> list[AccentPhrase]: """必要に応じて各アクセント句の末尾へ疑問形モーラ(同一母音・継続長 0.15秒・音高↑)を付与する""" @@ -105,15 +94,15 @@ def apply_interrogative_upspeak( return accent_phrases -def apply_prepost_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]: +def _apply_prepost_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]: """モーラ系列へ音声合成用のクエリがもつ前後無音(`prePhonemeLength` & `postPhonemeLength`)を付加する""" - pre_silence_moras = [generate_silence_mora(query.prePhonemeLength)] - post_silence_moras = [generate_silence_mora(query.postPhonemeLength)] + pre_silence_moras = [_generate_silence_mora(query.prePhonemeLength)] + post_silence_moras = [_generate_silence_mora(query.postPhonemeLength)] moras = pre_silence_moras + moras + post_silence_moras return moras -def apply_speed_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: +def _apply_speed_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: """モーラ系列へ音声合成用のクエリがもつ話速スケール(`speedScale`)を適用する""" for mora in moras: mora.vowel_length /= query.speedScale @@ -122,7 +111,7 @@ def apply_speed_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras -def count_frame_per_unit( +def _count_frame_per_unit( moras: list[Mora], ) -> tuple[NDArray[np.int64], NDArray[np.int64]]: """ @@ -167,14 +156,14 @@ def _to_frame(sec: float) -> int: return sec_rounded.astype(np.int32).item() -def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: +def _apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: """モーラ系列へ音声合成用のクエリがもつ音高スケール(`pitchScale`)を適用する""" for mora in moras: mora.pitch *= 2**query.pitchScale return moras -def apply_pause_length(moras: list[Mora], query: AudioQuery) -> list[Mora]: +def _apply_pause_length(moras: list[Mora], query: AudioQuery) -> list[Mora]: """モーラ系列へ音声合成用のクエリがもつ無音時間(`pauseLength`)を適用する""" if query.pauseLength is not None: for mora in moras: @@ -183,7 +172,7 @@ def apply_pause_length(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras -def apply_pause_length_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: +def _apply_pause_length_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: """モーラ系列へ音声合成用のクエリがもつ無音時間スケール(`pauseLengthScale`)を適用する""" for mora in moras: if mora.vowel == "pau": @@ -191,7 +180,7 @@ def apply_pause_length_scale(moras: list[Mora], query: AudioQuery) -> list[Mora] return moras -def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: +def _apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: """モーラ系列へ音声合成用のクエリがもつ抑揚スケール(`intonationScale`)を適用する""" # 有声音素 (f0>0) の平均値に対する乖離度をスケール voiced = list(filter(lambda mora: mora.pitch > 0, moras)) @@ -202,14 +191,14 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras -def apply_volume_scale( +def _apply_volume_scale( wave: NDArray[np.float32], query: AudioQuery | FrameAudioQuery ) -> NDArray[np.float32]: """音声波形へ音声合成用のクエリがもつ音量スケール(`volumeScale`)を適用する""" return wave * query.volumeScale -def apply_output_sampling_rate( +def _apply_output_sampling_rate( wave: NDArray[np.float32], sr_wave: float, query: AudioQuery | FrameAudioQuery ) -> NDArray[np.float32]: """音声波形へ音声合成用のクエリがもつ出力サンプリングレート(`outputSamplingRate`)を適用する""" @@ -220,7 +209,7 @@ def apply_output_sampling_rate( return wave -def apply_output_stereo( +def _apply_output_stereo( wave: NDArray[np.float32], query: AudioQuery | FrameAudioQuery ) -> NDArray[np.float32]: """音声波形へ音声合成用のクエリがもつステレオ出力設定(`outputStereo`)を適用する""" @@ -229,26 +218,26 @@ def apply_output_stereo( return wave -def query_to_decoder_feature( +def _query_to_decoder_feature( query: AudioQuery, ) -> tuple[NDArray[np.float32], NDArray[np.float32]]: """音声合成用のクエリからフレームごとの音素 (shape=(フレーム長, 音素数)) と音高 (shape=(フレーム長,)) を得る""" moras = to_flatten_moras(query.accent_phrases) # 設定を適用する - moras = apply_prepost_silence(moras, query) - moras = apply_pause_length(moras, query) - moras = apply_pause_length_scale(moras, query) - moras = apply_speed_scale(moras, query) - moras = apply_pitch_scale(moras, query) - moras = apply_intonation_scale(moras, query) + moras = _apply_prepost_silence(moras, query) + moras = _apply_pause_length(moras, query) + moras = _apply_pause_length_scale(moras, query) + moras = _apply_speed_scale(moras, query) + moras = _apply_pitch_scale(moras, query) + moras = _apply_intonation_scale(moras, query) # 表現を変更する(音素クラス → 音素 onehot ベクトル、モーラクラス → 音高スカラ) - phoneme = np.stack([p.onehot for p in to_flatten_phonemes(moras)]) + phoneme = np.stack([p.onehot for p in _to_flatten_phonemes(moras)]) f0 = np.array([mora.pitch for mora in moras], dtype=np.float32) # 時間スケールを変更する(音素・モーラ → フレーム) - frame_per_phoneme, frame_per_mora = count_frame_per_unit(moras) + frame_per_phoneme, frame_per_mora = _count_frame_per_unit(moras) phoneme = np.repeat(phoneme, frame_per_phoneme, axis=0) f0 = np.repeat(f0, frame_per_mora) @@ -259,9 +248,9 @@ def raw_wave_to_output_wave( query: AudioQuery | FrameAudioQuery, wave: NDArray[np.float32], sr_wave: int ) -> NDArray[np.float32]: """生音声波形に音声合成用のクエリを適用して出力音声波形を生成する""" - wave = apply_volume_scale(wave, query) - wave = apply_output_sampling_rate(wave, sr_wave, query) - wave = apply_output_stereo(wave, query) + wave = _apply_volume_scale(wave, query) + wave = _apply_output_sampling_rate(wave, sr_wave, query) + wave = _apply_output_stereo(wave, query) return wave @@ -270,7 +259,7 @@ def _hira_to_kana(text: str) -> str: return "".join(chr(ord(c) + 96) if "ぁ" <= c <= "ゔ" else c for c in text) -def calc_phoneme_lengths( +def _calc_phoneme_lengths( consonant_lengths: NDArray[np.int64], note_durations: NDArray[np.int64], ) -> NDArray[np.int64]: @@ -313,7 +302,7 @@ def calc_phoneme_lengths( return phoneme_durations_array -def notes_to_keys_and_phonemes( +def _notes_to_keys_and_phonemes( notes: list[Note], ) -> tuple[ NDArray[np.int64], @@ -405,7 +394,7 @@ def notes_to_keys_and_phonemes( ) -def frame_query_to_sf_decoder_feature( +def _frame_query_to_sf_decoder_feature( query: FrameAudioQuery, ) -> tuple[NDArray[np.int64], NDArray[np.float32], NDArray[np.float32]]: """歌声合成用のクエリからフレームごとの音素・音高・音量を得る""" @@ -457,7 +446,7 @@ def update_length( moras = to_flatten_moras(accent_phrases) # 音素系列を抽出する - phonemes = to_flatten_phonemes(moras) + phonemes = _to_flatten_phonemes(moras) # 音素クラスから音素IDスカラへ表現を変換する phoneme_ids = np.array([p.id for p in phonemes], dtype=np.int64) @@ -574,11 +563,11 @@ def synthesize_wave( """音声合成用のクエリ・スタイルID・疑問文語尾自動調整フラグに基づいて音声波形を生成する""" # モーフィング時などに同一参照のqueryで複数回呼ばれる可能性があるので、元の引数のqueryに破壊的変更を行わない query = copy.deepcopy(query) - query.accent_phrases = apply_interrogative_upspeak( + query.accent_phrases = _apply_interrogative_upspeak( query.accent_phrases, enable_interrogative_upspeak ) - phoneme, f0 = query_to_decoder_feature(query) + phoneme, f0 = _query_to_decoder_feature(query) raw_wave, sr_raw_wave = self._core.safe_decode_forward(phoneme, f0, style_id) wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave) return wave @@ -607,7 +596,7 @@ def create_sing_phoneme_and_f0_and_volume( note_vowels_array, phonemes_array, phoneme_keys_array, - ) = notes_to_keys_and_phonemes(notes) + ) = _notes_to_keys_and_phonemes(notes) # コアを用いて子音長を生成する consonant_lengths = self._core.safe_predict_sing_consonant_length_forward( @@ -615,7 +604,7 @@ def create_sing_phoneme_and_f0_and_volume( ) # 予測した子音長を元に、すべての音素長を計算する - phoneme_lengths = calc_phoneme_lengths(consonant_lengths, note_lengths_array) + phoneme_lengths = _calc_phoneme_lengths(consonant_lengths, note_lengths_array) # 時間スケールを変更する(音素 → フレーム) frame_phonemes = np.repeat(phonemes_array, phoneme_lengths) @@ -658,7 +647,7 @@ def create_sing_volume_from_phoneme_and_f0( _, phonemes_array_from_notes, phoneme_keys_array, - ) = notes_to_keys_and_phonemes(notes) + ) = _notes_to_keys_and_phonemes(notes) phonemes_array = np.array( [Phoneme(p.phoneme).id for p in phonemes], dtype=np.int64 @@ -700,7 +689,7 @@ def frame_synthsize_wave( ) -> NDArray[np.float32]: """歌声合成用のクエリ・スタイルIDに基づいて音声波形を生成する""" - phoneme, f0, volume = frame_query_to_sf_decoder_feature(query) + phoneme, f0, volume = _frame_query_to_sf_decoder_feature(query) raw_wave, sr_raw_wave = self._core.safe_sf_decode_forward( phoneme, f0, volume, style_id )