Skip to content

Commit

Permalink
整理: 音声合成系テストの utils を統廃合 (VOICEVOX#1428)
Browse files (browse the repository at this point in the history)
* refactor: ローカルの util 関数を解体

* refactor: `_gen_mora()` を util へ切り出して `gen_mora()` へリネーム

* fix: lint
  • Loading branch information
tarepan authored Jun 24, 2024
1 parent 6208379 commit 4a5541e
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 109 deletions.
60 changes: 17 additions & 43 deletions test/unit/tts_pipeline/test_tts_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
)

from .test_text_analyzer import stub_unknown_features_koxx
from .tts_utils import gen_mora


def yukarin_s_mock(
Expand Down Expand Up @@ -103,32 +104,13 @@ def is_model_loaded(self, style_id: str) -> bool:
return True


def _gen_mora(
    text: str,
    consonant: str | None,
    consonant_length: float | None,
    vowel: str,
    vowel_length: float,
    pitch: float,
) -> Mora:
    """Build a `Mora` from positional arguments to keep test fixtures terse.

    Each argument is forwarded unchanged to the `Mora` field of the same name.

    Args:
        text: Value for `Mora.text`.
        consonant: Value for `Mora.consonant`; may be `None`.
        consonant_length: Value for `Mora.consonant_length`; may be `None`.
        vowel: Value for `Mora.vowel`.
        vowel_length: Value for `Mora.vowel_length`.
        pitch: Value for `Mora.pitch`.

    Returns:
        The newly constructed `Mora`.
    """
    mora = Mora(
        text=text,
        consonant=consonant,
        consonant_length=consonant_length,
        vowel=vowel,
        vowel_length=vowel_length,
        pitch=pitch,
    )
    return mora


def test_to_flatten_phonemes() -> None:
"""Test `to_flatten_phonemes`."""
# Inputs
moras = [
_gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0),
_gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
_gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0),
gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0),
gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0),
]

# Expects
Expand All @@ -140,33 +122,25 @@ def test_to_flatten_phonemes() -> None:
assert true_phonemes == phonemes


def _gen_hello_hiho_text() -> str:
    """Return the fixed plain-text sample sentence shared by the tests."""
    hello_hiho_text = "こんにちは、ヒホです"
    return hello_hiho_text


def _gen_hello_hiho_kana() -> str:
    """Return the fixed kana-notation sample string shared by the tests."""
    hello_hiho_kana = "コンニチワ'、ヒ'ホデ_ス"
    return hello_hiho_kana


def _gen_hello_hiho_accent_phrases() -> list[AccentPhrase]:
return [
AccentPhrase(
moras=[
_gen_mora("コ", "k", 0.0, "o", 0.0, 0.0),
_gen_mora("ン", None, None, "N", 0.0, 0.0),
_gen_mora("ニ", "n", 0.0, "i", 0.0, 0.0),
_gen_mora("チ", "ch", 0.0, "i", 0.0, 0.0),
_gen_mora("ワ", "w", 0.0, "a", 0.0, 0.0),
gen_mora("コ", "k", 0.0, "o", 0.0, 0.0),
gen_mora("ン", None, None, "N", 0.0, 0.0),
gen_mora("ニ", "n", 0.0, "i", 0.0, 0.0),
gen_mora("チ", "ch", 0.0, "i", 0.0, 0.0),
gen_mora("ワ", "w", 0.0, "a", 0.0, 0.0),
],
accent=5,
pause_mora=_gen_mora("、", None, None, "pau", 0.0, 0.0),
pause_mora=gen_mora("、", None, None, "pau", 0.0, 0.0),
),
AccentPhrase(
moras=[
_gen_mora("ヒ", "h", 0.0, "i", 0.0, 0.0),
_gen_mora("ホ", "h", 0.0, "o", 0.0, 0.0),
_gen_mora("デ", "d", 0.0, "e", 0.0, 0.0),
_gen_mora("ス", "s", 0.0, "U", 0.0, 0.0),
gen_mora("ヒ", "h", 0.0, "i", 0.0, 0.0),
gen_mora("ホ", "h", 0.0, "o", 0.0, 0.0),
gen_mora("デ", "d", 0.0, "e", 0.0, 0.0),
gen_mora("ス", "s", 0.0, "U", 0.0, 0.0),
],
accent=1,
pause_mora=None,
Expand All @@ -187,7 +161,7 @@ def _gen_hello_hiho_query() -> AudioQuery:
pauseLengthScale=0.8,
outputSamplingRate=12000,
outputStereo=True,
kana=_gen_hello_hiho_kana(),
kana="コンニチワ'、ヒ'ホデ_ス",
)


Expand Down Expand Up @@ -352,7 +326,7 @@ def test_mocked_create_accent_phrases_output(
"""モックされた `TTSEngine.create_accent_phrases()` の出力スナップショットが一定である"""
# Inputs
tts_engine = TTSEngine(MockCoreWrapper())
hello_hiho = _gen_hello_hiho_text()
hello_hiho = "こんにちは、ヒホです"
# Outputs
result = tts_engine.create_accent_phrases(hello_hiho, StyleId(1))
# Tests
Expand All @@ -365,7 +339,7 @@ def test_mocked_create_accent_phrases_from_kana_output(
"""モックされた `TTSEngine.create_accent_phrases_from_kana()` の出力スナップショットが一定である"""
# Inputs
tts_engine = TTSEngine(MockCoreWrapper())
hello_hiho = _gen_hello_hiho_kana()
hello_hiho = "コンニチワ'、ヒ'ホデ_ス"
# Outputs
result = tts_engine.create_accent_phrases_from_kana(hello_hiho, StyleId(1))
# Tests
Expand Down
115 changes: 49 additions & 66 deletions test/unit/tts_pipeline/test_wave_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np

from voicevox_engine.model import AudioQuery
from voicevox_engine.tts_pipeline.model import AccentPhrase, Mora
from voicevox_engine.tts_pipeline.model import AccentPhrase
from voicevox_engine.tts_pipeline.tts_engine import (
apply_intonation_scale,
apply_output_sampling_rate,
Expand All @@ -17,6 +17,8 @@
raw_wave_to_output_wave,
)

from .tts_utils import gen_mora

TRUE_NUM_PHONEME = 45


Expand Down Expand Up @@ -50,38 +52,19 @@ def _gen_query(
)


def _gen_mora(
    text: str,
    consonant: str | None,
    consonant_length: float | None,
    vowel: str,
    vowel_length: float,
    pitch: float,
) -> Mora:
    """Create a `Mora` from positional arguments so test data stays compact.

    Every argument is passed straight through to the identically named
    `Mora` field; no value is transformed.

    Args:
        text: Value for `Mora.text`.
        consonant: Value for `Mora.consonant`; may be `None`.
        consonant_length: Value for `Mora.consonant_length`; may be `None`.
        vowel: Value for `Mora.vowel`.
        vowel_length: Value for `Mora.vowel_length`.
        pitch: Value for `Mora.pitch`.

    Returns:
        The newly constructed `Mora`.
    """
    generated = Mora(
        text=text,
        consonant=consonant,
        consonant_length=consonant_length,
        vowel=vowel,
        vowel_length=vowel_length,
        pitch=pitch,
    )
    return generated


def test_apply_prepost_silence() -> None:
"""Test `apply_prepost_silence`."""
# Inputs
query = _gen_query(prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067)
moras = [
_gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
]

# Expects
true_moras_with_silence = [
_gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0),
_gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
_gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0),
gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0),
gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0),
gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0),
]

# Outputs
Expand All @@ -95,20 +78,20 @@ def test_apply_speed_scale() -> None:
# Inputs
query = _gen_query(speedScale=2.0)
input_moras = [
_gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
_gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
_gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
_gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
_gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
]

# Expects - x2 fast
true_moras = [
_gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
_gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
_gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
_gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
_gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
]

# Outputs
Expand All @@ -122,20 +105,20 @@ def test_apply_pitch_scale() -> None:
# Inputs
query = _gen_query(pitchScale=2.0)
input_moras = [
_gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
_gen_mora("ン", None, None, "N", 0.0, 50.0),
_gen_mora("、", None, None, "pau", 0.0, 0.0),
_gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
_gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
gen_mora("ン", None, None, "N", 0.0, 50.0),
gen_mora("、", None, None, "pau", 0.0, 0.0),
gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
]

# Expects - x4 value scaled
true_moras = [
_gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
_gen_mora("ン", None, None, "N", 0.0, 200.0),
_gen_mora("、", None, None, "pau", 0.0, 0.0),
_gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
_gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
gen_mora("ン", None, None, "N", 0.0, 200.0),
gen_mora("、", None, None, "pau", 0.0, 0.0),
gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
]

# Outputs
Expand All @@ -149,20 +132,20 @@ def test_apply_intonation_scale() -> None:
# Inputs
query = _gen_query(intonationScale=0.5)
input_moras = [
_gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
_gen_mora("ン", None, None, "N", 0.0, 200.0),
_gen_mora("、", None, None, "pau", 0.0, 0.0),
_gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
_gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
gen_mora("ン", None, None, "N", 0.0, 200.0),
gen_mora("、", None, None, "pau", 0.0, 0.0),
gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
]

# Expects - mean=300 var x0.5 intonation scaling
true_moras = [
_gen_mora("コ", "k", 0.0, "o", 0.0, 250.0),
_gen_mora("ン", None, None, "N", 0.0, 250.0),
_gen_mora("、", None, None, "pau", 0.0, 0.0),
_gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0),
_gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
gen_mora("コ", "k", 0.0, "o", 0.0, 250.0),
gen_mora("ン", None, None, "N", 0.0, 250.0),
gen_mora("、", None, None, "pau", 0.0, 0.0),
gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0),
gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
]

# Outputs
Expand Down Expand Up @@ -222,13 +205,13 @@ def test_count_frame_per_unit() -> None:
"""Test `count_frame_per_unit`."""
# Inputs
moras = [
_gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame]
_gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0),
_gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0),
_gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
_gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0),
_gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
_gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0),
gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame]
gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0),
gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0),
gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0),
gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0),
]

# Expects
Expand All @@ -252,16 +235,16 @@ def test_query_to_decoder_feature() -> None:
accent_phrases = [
AccentPhrase(
moras=[
_gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
_gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
],
accent=1,
pause_mora=_gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
pause_mora=gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
),
AccentPhrase(
moras=[
_gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
_gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
],
accent=1,
pause_mora=None,
Expand Down
22 changes: 22 additions & 0 deletions test/unit/tts_pipeline/tts_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""合成系テスト向けの utility"""

from voicevox_engine.tts_pipeline.model import Mora


def gen_mora(
    text: str,
    consonant: str | None,
    consonant_length: float | None,
    vowel: str,
    vowel_length: float,
    pitch: float,
) -> Mora:
    """Construct a `Mora` from positional arguments for concise test fixtures.

    The arguments map one-to-one onto the `Mora` fields of the same name and
    are forwarded without modification.

    Args:
        text: Value for `Mora.text`.
        consonant: Value for `Mora.consonant`; may be `None`.
        consonant_length: Value for `Mora.consonant_length`; may be `None`.
        vowel: Value for `Mora.vowel`.
        vowel_length: Value for `Mora.vowel_length`.
        pitch: Value for `Mora.pitch`.

    Returns:
        The newly constructed `Mora`.
    """
    result = Mora(
        text=text,
        consonant=consonant,
        consonant_length=consonant_length,
        vowel=vowel,
        vowel_length=vowel_length,
        pitch=pitch,
    )
    return result

0 comments on commit 4a5541e

Please sign in to comment.