From 7d4eb987136ed7d6196fbe4a130d92e2a28e24be Mon Sep 17 00:00:00 2001 From: qwerty2501 <939468+qwerty2501@users.noreply.github.com> Date: Thu, 6 Jan 2022 23:40:57 +0900 Subject: [PATCH] =?UTF-8?q?=E7=96=91=E5=95=8F=E6=96=87=E3=81=AE=E4=BB=95?= =?UTF-8?q?=E6=A7=98=E5=A4=89=E6=9B=B4=E5=8F=8D=E6=98=A0=20#272?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - AccentPhraseにis_interrogativeをもたせる - 疑問符Mora追加を調整前ではなく調整後に行うようにした https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1006316072 refs #272 --- test/test_synthesis_engine_base.py | 167 +++++++++++++----- voicevox_engine/model.py | 1 + .../synthesis_engine/synthesis_engine_base.py | 74 +++----- 3 files changed, 143 insertions(+), 99 deletions(-) diff --git a/test/test_synthesis_engine_base.py b/test/test_synthesis_engine_base.py index 8cc7d0068..dd500caf2 100644 --- a/test/test_synthesis_engine_base.py +++ b/test/test_synthesis_engine_base.py @@ -1,14 +1,84 @@ -from typing import List +from typing import List, Union from unittest import TestCase +from unittest.mock import Mock + +import numpy -from voicevox_engine.dev.synthesis_engine.mock import MockSynthesisEngine from voicevox_engine.model import AccentPhrase, Mora +from voicevox_engine.synthesis_engine import SynthesisEngine + + +def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, speaker_id: numpy.ndarray): + result = [] + # mockとしての適当な処理、特に意味はない + for i in range(length): + result.append(round(float(phoneme_list[i] * 0.0625 + speaker_id), 2)) + return numpy.array(result) + + +def yukarin_sa_mock( + length: int, + vowel_phoneme_list: numpy.ndarray, + consonant_phoneme_list: numpy.ndarray, + start_accent_list: numpy.ndarray, + end_accent_list: numpy.ndarray, + start_accent_phrase_list: numpy.ndarray, + end_accent_phrase_list: numpy.ndarray, + speaker_id: numpy.ndarray, +): + result = [] + # mockとしての適当な処理、特に意味はない + for i in range(length): + result.append( + round( + float( + ( + vowel_phoneme_list[0][i] + + consonant_phoneme_list[0][i] + + start_accent_list[0][i] + + end_accent_list[0][i] + + start_accent_phrase_list[0][i] + + end_accent_phrase_list[0][i] + ) + * 0.0625 + + speaker_id + ), + 2, + ) + ) + return numpy.array(result)[numpy.newaxis] + + +def decode_mock( + length: int, + phoneme_size: int, + f0: numpy.ndarray, + phoneme: numpy.ndarray, + speaker_id: Union[numpy.ndarray, int], +): + result = [] + # mockとしての適当な処理、特に意味はない + for i in range(length): + # decode forwardはデータサイズがlengthの256倍になるのでとりあえず256回データをresultに入れる + for _ in range(256): + result.append( + float( + f0[i][0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size) + + speaker_id + ) + ) + return numpy.array(result) class TestSynthesisEngineBase(TestCase): def setUp(self): super().setUp() - self.synthesis_engine = MockSynthesisEngine(speakers="") + self.synthesis_engine = SynthesisEngine( + yukarin_s_forwarder=Mock(side_effect=yukarin_s_mock), + yukarin_sa_forwarder=Mock(side_effect=yukarin_sa_mock), + decode_forwarder=Mock(side_effect=decode_mock), + speakers="", + ) def create_accent_phrases_test_base( self, text: str, expected: List[AccentPhrase], enable_interrogative: bool @@ -16,6 +86,8 @@ def create_accent_phrases_test_base( actual = self.synthesis_engine.create_accent_phrases( text, 1, enable_interrogative ) + print(expected) + print(actual) self.assertEqual( expected, actual, @@ -34,30 +106,31 @@ def koreha_arimasuka_base_expected(): Mora( text="コ", consonant="k", - consonant_length=0, + consonant_length=2.44, vowel="o", - vowel_length=0, - pitch=0, + vowel_length=2.88, + pitch=4.38, ), Mora( text="レ", consonant="r", - consonant_length=0, + consonant_length=3.06, vowel="e", - vowel_length=0, - pitch=0, + vowel_length=1.88, + pitch=4.0, ), Mora( text="ワ", consonant="w", - consonant_length=0, + consonant_length=3.62, vowel="a", - vowel_length=0, - pitch=0, + vowel_length=1.44, + pitch=4.19, ), ], accent=3, pause_mora=None, + is_interrogative=False, ), AccentPhrase( moras=[ @@ -66,48 +139,50 @@ def koreha_arimasuka_base_expected(): consonant=None, consonant_length=None, vowel="a", - vowel_length=0, - pitch=0, + vowel_length=1.44, + pitch=1.44, ), Mora( text="リ", consonant="r", - consonant_length=0, + consonant_length=3.06, vowel="i", - vowel_length=0, - pitch=0, + vowel_length=2.31, + pitch=4.44, ), Mora( text="マ", consonant="m", - consonant_length=0, + consonant_length=2.62, vowel="a", - vowel_length=0, - pitch=0, + vowel_length=1.44, + pitch=3.12, ), Mora( text="ス", consonant="s", - consonant_length=0, + consonant_length=3.19, vowel="U", - vowel_length=0, - pitch=0, + vowel_length=1.38, + pitch=0.0, ), Mora( text="カ", consonant="k", - consonant_length=0, + consonant_length=2.44, vowel="a", - vowel_length=0, - pitch=0, + vowel_length=1.44, + pitch=2.94, ), ], accent=3, pause_mora=None, + is_interrogative=False, ), ] expected = koreha_arimasuka_base_expected() + expected[-1].is_interrogative = True expected[-1].moras += [ Mora( text="ア", @@ -115,7 +190,7 @@ def koreha_arimasuka_base_expected(): consonant_length=None, vowel="a", vowel_length=0.15, - pitch=0.3, + pitch=round(expected[-1].moras[-1].pitch + 0.3, 2), ) ] self.create_accent_phrases_test_base( @@ -147,12 +222,13 @@ def nn_base_expected(): consonant=None, consonant_length=None, vowel="N", - vowel_length=0, - pitch=0, - ), + vowel_length=1.25, + pitch=1.44, + ) ], accent=1, pause_mora=None, + is_interrogative=False, ) ] @@ -164,6 +240,7 @@ def nn_base_expected(): ) expected = nn_base_expected() + expected[-1].is_interrogative = True expected[-1].moras += [ Mora( text="ン", @@ -171,7 +248,7 @@ def nn_base_expected(): consonant_length=None, vowel="N", vowel_length=0.15, - pitch=0.3, + pitch=round(expected[-1].moras[-1].pitch + 0.3, 2), ) ] self.create_accent_phrases_test_base( @@ -196,12 +273,13 @@ def ltu_base_expected(): consonant=None, consonant_length=None, vowel="cl", - vowel_length=0, + vowel_length=1.69, pitch=0.0, - ), + ) ], accent=1, pause_mora=None, + is_interrogative=False, ) ] @@ -213,16 +291,7 @@ def ltu_base_expected(): ) expected = ltu_base_expected() - expected[-1].moras += [ - Mora( - text="ッ", - consonant=None, - consonant_length=None, - vowel="cl", - vowel_length=0.15, - pitch=0.3, - ) - ] + expected[-1].is_interrogative = True self.create_accent_phrases_test_base( text="っ?", expected=expected, @@ -243,14 +312,15 @@ def su_base_expected(): Mora( text="ス", consonant="s", - consonant_length=0, + consonant_length=3.19, vowel="u", - vowel_length=0, - pitch=0, - ), + vowel_length=3.5, + pitch=5.94, + ) ], accent=1, pause_mora=None, + is_interrogative=False, ) ] @@ -262,6 +332,7 @@ def su_base_expected(): ) expected = su_base_expected() + expected[-1].is_interrogative = True expected[-1].moras += [ Mora( text="ウ", @@ -269,7 +340,7 @@ def su_base_expected(): consonant_length=None, vowel="u", vowel_length=0.15, - pitch=0.3, + pitch=round(expected[-1].moras[-1].pitch + 0.3, 2), ) ] self.create_accent_phrases_test_base( diff --git a/voicevox_engine/model.py b/voicevox_engine/model.py index e67b7d902..3ef6a88ee 100644 --- a/voicevox_engine/model.py +++ b/voicevox_engine/model.py @@ -32,6 +32,7 @@ class AccentPhrase(BaseModel): moras: List[Mora] = Field(title="モーラのリスト") accent: int = Field(title="アクセント箇所") pause_mora: Optional[Mora] = Field(title="後ろに無音を付けるかどうか") + is_interrogative: bool = Field(default=False, title="疑問系かどうか") def __hash__(self): items = [ diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py index 64ac71959..5f55b566d 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py @@ -18,27 +18,8 @@ def mora_to_text(mora: str) -> str: return mora -def add_interrogative_mora_if_last_phoneme_is_interrogative( - full_context_accent_phrase: full_context_label.AccentPhrase, - enable_interrogative: bool, -) -> List[full_context_label.Mora]: - """ - enable_interrogativeが有効になっていて与えられたfull_context_accent_phraseが疑問系だった場合、 - accent_phraseのmoraに対して疑問系の発音を擬似的に行うMoraを末尾に一つ追加する - """ - last_mora = full_context_accent_phrase.moras[-1] - return ( - full_context_accent_phrase.moras - + [full_context_label.Mora(None, last_mora.vowel)] - if full_context_accent_phrase.is_interrogative and enable_interrogative - else full_context_accent_phrase.moras - ) - - def adjust_interrogative_accent_phrases( accent_phrases: List[AccentPhrase], - interrogative_accent_phrase_marks: List[bool], - enable_interrogative: bool, ) -> List[AccentPhrase]: """ enable_interrogativeが有効になっていて与えられたaccent_phrasesに疑問系のものがあった場合、 @@ -47,35 +28,37 @@ def adjust_interrogative_accent_phrases( """ return [ AccentPhrase( - moras=adjust_interrogative_moras(accent_phrase.moras) - if enable_interrogative and interrogative_accent_phrase_mark - else accent_phrase.moras, + moras=adjust_interrogative_moras(accent_phrase), accent=accent_phrase.accent, pause_mora=accent_phrase.pause_mora, + is_interrogative=accent_phrase.is_interrogative, ) - for accent_phrase, interrogative_accent_phrase_mark in zip( - accent_phrases, interrogative_accent_phrase_marks - ) + for accent_phrase in accent_phrases ] -def adjust_interrogative_moras(moras: List[Mora]) -> List[Mora]: - if len(moras) <= 1: +def adjust_interrogative_moras(accent_phrase: AccentPhrase) -> List[Mora]: + moras = copy.deepcopy(accent_phrase.moras) + if accent_phrase.is_interrogative and not (len(moras) == 0 or moras[-1].pitch == 0): + interrogative_mora = make_interrogative_mora(moras[-1]) + moras.append(interrogative_mora) + return moras + else: return moras - moras = copy.deepcopy(moras) - moras[-1] = adjust_interrogative_mora(moras[-1], moras[-2]) - return moras -def adjust_interrogative_mora(mora: Mora, before_mora: Mora) -> Mora: - mora = copy.deepcopy(mora) +def make_interrogative_mora(last_mora: Mora) -> Mora: fix_vowel_length = 0.15 - mora.vowel_length = fix_vowel_length - adjust_pitch = 0.3 max_pitch = 6.5 - mora.pitch = min(before_mora.pitch + adjust_pitch, max_pitch) - return mora + return Mora( + text=openjtalk_mora2text[last_mora.vowel], + consonant=None, + consonant_length=None, + vowel=last_mora.vowel, + vowel_length=fix_vowel_length, + pitch=min(round(last_mora.pitch + adjust_pitch, 2), max_pitch), + ) def full_context_label_moras_to_moras( @@ -156,21 +139,10 @@ def create_accent_phrases( if len(utterance.breath_groups) == 0: return [] - interrogative_accent_phrase_marks = [ - accent_phrase.is_interrogative - for breath_group in utterance.breath_groups - for accent_phrase in breath_group.accent_phrases - ] - accent_phrases = self.replace_mora_data( accent_phrases=[ AccentPhrase( - moras=full_context_label_moras_to_moras( - add_interrogative_mora_if_last_phoneme_is_interrogative( - accent_phrase, - enable_interrogative, - ), - ), + moras=full_context_label_moras_to_moras(accent_phrase.moras), accent=accent_phrase.accent, pause_mora=( Mora( @@ -187,6 +159,8 @@ def create_accent_phrases( ) else None ), + is_interrogative=accent_phrase.is_interrogative + and enable_interrogative, ) for i_breath_group, breath_group in enumerate(utterance.breath_groups) for i_accent_phrase, accent_phrase in enumerate( @@ -195,9 +169,7 @@ def create_accent_phrases( ], speaker_id=speaker_id, ) - return adjust_interrogative_accent_phrases( - accent_phrases, interrogative_accent_phrase_marks, enable_interrogative - ) + return adjust_interrogative_accent_phrases(accent_phrases) @abstractmethod def synthesis(self, query: AudioQuery, speaker_id: int):