From 17b891e1b24376daa02e852609f9dbaeab6705eb Mon Sep 17 00:00:00 2001 From: aoirint Date: Fri, 10 Dec 2021 05:05:53 +0900 Subject: [PATCH 1/7] add SynthesisEngineBase --- test/test_mora_to_text.py | 2 +- .../dev/synthesis_engine/__init__.py | 4 +- voicevox_engine/dev/synthesis_engine/mock.py | 3 +- voicevox_engine/synthesis_engine/__init__.py | 2 + .../synthesis_engine/make_synthesis_engine.py | 2 +- .../synthesis_engine/synthesis_engine.py | 81 +---------- .../synthesis_engine/synthesis_engine_base.py | 135 ++++++++++++++++++ 7 files changed, 145 insertions(+), 84 deletions(-) create mode 100644 voicevox_engine/synthesis_engine/synthesis_engine_base.py diff --git a/test/test_mora_to_text.py b/test/test_mora_to_text.py index ac1e6bf59..691681dd1 100644 --- a/test/test_mora_to_text.py +++ b/test/test_mora_to_text.py @@ -1,7 +1,7 @@ from unittest import TestCase # TODO: import from voicevox_engine.synthesis_engine.mora -from voicevox_engine.synthesis_engine.synthesis_engine import mora_to_text +from voicevox_engine.synthesis_engine.synthesis_engine_base import mora_to_text class TestMoraToText(TestCase): diff --git a/voicevox_engine/dev/synthesis_engine/__init__.py b/voicevox_engine/dev/synthesis_engine/__init__.py index 373ee86de..e7b2ac5b1 100644 --- a/voicevox_engine/dev/synthesis_engine/__init__.py +++ b/voicevox_engine/dev/synthesis_engine/__init__.py @@ -1,3 +1,3 @@ -from .mock import SynthesisEngine +from .mock import MockSynthesisEngine -__all__ = ["SynthesisEngine"] +__all__ = ["MockSynthesisEngine"] diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py index 3896a412e..39682ebb9 100644 --- a/voicevox_engine/dev/synthesis_engine/mock.py +++ b/voicevox_engine/dev/synthesis_engine/mock.py @@ -6,10 +6,11 @@ from scipy.signal import resample from ...model import AccentPhrase, AudioQuery +from ...synthesis_engine import SynthesisEngineBase from ...synthesis_engine.synthesis_engine import to_flatten_moras -class SynthesisEngine: +class MockSynthesisEngine(SynthesisEngineBase): """ SynthesisEngine [Mock] """ diff --git a/voicevox_engine/synthesis_engine/__init__.py b/voicevox_engine/synthesis_engine/__init__.py index a84a8810a..15ff7dc78 100644 --- a/voicevox_engine/synthesis_engine/__init__.py +++ b/voicevox_engine/synthesis_engine/__init__.py @@ -1,9 +1,11 @@ from .forwarder import Forwarder from .make_synthesis_engine import make_synthesis_engine from .synthesis_engine import SynthesisEngine +from .synthesis_engine_base import SynthesisEngineBase __all__ = [ "Forwarder", "make_synthesis_engine", "SynthesisEngine", + "SynthesisEngineBase", ] diff --git a/voicevox_engine/synthesis_engine/make_synthesis_engine.py b/voicevox_engine/synthesis_engine/make_synthesis_engine.py index 5d6e5942d..f983edc93 100644 --- a/voicevox_engine/synthesis_engine/make_synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/make_synthesis_engine.py @@ -57,7 +57,7 @@ def make_synthesis_engine( speakers=core.metas(), ) - from ..dev.synthesis_engine import SynthesisEngine as MockSynthesisEngine + from ..dev.synthesis_engine import MockSynthesisEngine # モックで置き換える return MockSynthesisEngine(speakers=core.metas()) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index d85d0cc3d..245757909 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -5,9 +5,8 @@ from scipy.signal import resample from ..acoustic_feature_extractor import OjtPhoneme, SamplingData -from ..full_context_label import extract_full_context_label from ..model import AccentPhrase, AudioQuery, Mora -from ..mora_list import openjtalk_mora2text +from .synthesis_engine_base import SynthesisEngineBase unvoiced_mora_phoneme_list = ["A", "I", "U", "E", "O", "cl", "pau"] mora_phoneme_list = ["a", "i", "u", "e", "o", "N"] + unvoiced_mora_phoneme_list @@ -126,17 +125,7 @@ def pre_process( return flatten_moras, phoneme_data_list -def mora_to_text(mora: str) -> str: - if mora[-1:] in ["A", "I", "U", "E", "O"]: - # 無声化母音を小文字に - mora = mora[:-1] + mora[-1].lower() - if mora in openjtalk_mora2text: - return openjtalk_mora2text[mora] - else: - return mora - - -class SynthesisEngine: +class SynthesisEngine(SynthesisEngineBase): def __init__( self, yukarin_s_forwarder, @@ -365,72 +354,6 @@ def _create_one_hot(accent_phrase: AccentPhrase, position: int): return accent_phrases - def replace_mora_data( - self, - accent_phrases: List[AccentPhrase], - speaker_id: int, - ) -> List[AccentPhrase]: - return self.replace_mora_pitch( - accent_phrases=self.replace_phoneme_length( - accent_phrases=accent_phrases, - speaker_id=speaker_id, - ), - speaker_id=speaker_id, - ) - - def create_accent_phrases(self, text: str, speaker_id: int) -> List[AccentPhrase]: - if len(text.strip()) == 0: - return [] - - utterance = extract_full_context_label(text) - if len(utterance.breath_groups) == 0: - return [] - - return self.replace_mora_data( - accent_phrases=[ - AccentPhrase( - moras=[ - Mora( - text=mora_to_text( - "".join([p.phoneme for p in mora.phonemes]) - ), - consonant=( - mora.consonant.phoneme - if mora.consonant is not None - else None - ), - consonant_length=0 if mora.consonant is not None else None, - vowel=mora.vowel.phoneme, - vowel_length=0, - pitch=0, - ) - for mora in accent_phrase.moras - ], - accent=accent_phrase.accent, - pause_mora=( - Mora( - text="、", - consonant=None, - consonant_length=None, - vowel="pau", - vowel_length=0, - pitch=0, - ) - if ( - i_accent_phrase == len(breath_group.accent_phrases) - 1 - and i_breath_group != len(utterance.breath_groups) - 1 - ) - else None - ), - ) - for i_breath_group, breath_group in enumerate(utterance.breath_groups) - for i_accent_phrase, accent_phrase in enumerate( - breath_group.accent_phrases - ) - ], - speaker_id=speaker_id, - ) - def synthesis(self, query: AudioQuery, speaker_id: int): """ 音声合成クエリから音声合成に必要な情報を構成し、実際に音声合成を行う diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py new file mode 100644 index 000000000..8be2d8fee --- /dev/null +++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py @@ -0,0 +1,135 @@ +from typing import List + +from ..full_context_label import extract_full_context_label +from ..model import AccentPhrase, AudioQuery, Mora +from ..mora_list import openjtalk_mora2text + + +def mora_to_text(mora: str) -> str: + if mora[-1:] in ["A", "I", "U", "E", "O"]: + # 無声化母音を小文字に + mora = mora[:-1] + mora[-1].lower() + if mora in openjtalk_mora2text: + return openjtalk_mora2text[mora] + else: + return mora + + +class SynthesisEngineBase: + def replace_phoneme_length( + self, accent_phrases: List[AccentPhrase], speaker_id: int + ) -> List[AccentPhrase]: + """ + accent_phrasesの母音・子音の長さを設定する + Parameters + ---------- + accent_phrases : List[AccentPhrase] + アクセント句モデルのリスト + speaker_id : int + 話者ID + Returns + ------- + accent_phrases : List[AccentPhrase] + 母音・子音の長さが設定されたアクセント句モデルのリスト + """ + raise Exception('Unimplemented') + + def replace_mora_pitch( + self, accent_phrases: List[AccentPhrase], speaker_id: int + ) -> List[AccentPhrase]: + """ + accent_phrasesの音高(ピッチ)を設定する + Parameters + ---------- + accent_phrases : List[AccentPhrase] + アクセント句モデルのリスト + speaker_id : int + 話者ID + Returns + ------- + accent_phrases : List[AccentPhrase] + 音高(ピッチ)が設定されたアクセント句モデルのリスト + """ + raise Exception('Unimplemented') + + def replace_mora_data( + self, + accent_phrases: List[AccentPhrase], + speaker_id: int, + ) -> List[AccentPhrase]: + return self.replace_mora_pitch( + accent_phrases=self.replace_phoneme_length( + accent_phrases=accent_phrases, + speaker_id=speaker_id, + ), + speaker_id=speaker_id, + ) + + def create_accent_phrases(self, text: str, speaker_id: int) -> List[AccentPhrase]: + if len(text.strip()) == 0: + return [] + + utterance = extract_full_context_label(text) + if len(utterance.breath_groups) == 0: + return [] + + return self.replace_mora_data( + accent_phrases=[ + AccentPhrase( + moras=[ + Mora( + text=mora_to_text( + "".join([p.phoneme for p in mora.phonemes]) + ), + consonant=( + mora.consonant.phoneme + if mora.consonant is not None + else None + ), + consonant_length=0 if mora.consonant is not None else None, + vowel=mora.vowel.phoneme, + vowel_length=0, + pitch=0, + ) + for mora in accent_phrase.moras + ], + accent=accent_phrase.accent, + pause_mora=( + Mora( + text="、", + consonant=None, + consonant_length=None, + vowel="pau", + vowel_length=0, + pitch=0, + ) + if ( + i_accent_phrase == len(breath_group.accent_phrases) - 1 + and i_breath_group != len(utterance.breath_groups) - 1 + ) + else None + ), + ) + for i_breath_group, breath_group in enumerate(utterance.breath_groups) + for i_accent_phrase, accent_phrase in enumerate( + breath_group.accent_phrases + ) + ], + speaker_id=speaker_id, + ) + + def synthesis(self, query: AudioQuery, speaker_id: int): + """ + 音声合成クエリから音声合成に必要な情報を構成し、実際に音声合成を行う + Parameters + ---------- + query : AudioQuery + 音声合成クエリ + speaker_id : int + 話者ID + Returns + ------- + wave : numpy.ndarray + 音声合成結果 + """ + raise Exception('Unimplemented') From 69fd2876f84bcb415e98c4a7421d0261b26d4a11 Mon Sep 17 00:00:00 2001 From: aoirint Date: Fri, 10 Dec 2021 05:19:35 +0900 Subject: [PATCH 2/7] use NotImplementedError --- voicevox_engine/synthesis_engine/synthesis_engine_base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py index 8be2d8fee..ff7dab1dd 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py @@ -32,7 +32,7 @@ def replace_phoneme_length( accent_phrases : List[AccentPhrase] 母音・子音の長さが設定されたアクセント句モデルのリスト """ - raise Exception('Unimplemented') + raise NotImplementedError() def replace_mora_pitch( self, accent_phrases: List[AccentPhrase], speaker_id: int @@ -50,7 +50,7 @@ def replace_mora_pitch( accent_phrases : List[AccentPhrase] 音高(ピッチ)が設定されたアクセント句モデルのリスト """ - raise Exception('Unimplemented') + raise NotImplementedError() def replace_mora_data( self, @@ -132,4 +132,4 @@ def synthesis(self, query: AudioQuery, speaker_id: int): wave : numpy.ndarray 音声合成結果 """ - raise Exception('Unimplemented') + raise NotImplementedError() From 46510f4e16cdc05ed65b45f2508c925796cced7c Mon Sep 17 00:00:00 2001 From: aoirint Date: Fri, 10 Dec 2021 05:20:30 +0900 Subject: [PATCH 3/7] add abstractmethod annotation --- voicevox_engine/synthesis_engine/synthesis_engine_base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py index ff7dab1dd..20d8b1620 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py @@ -1,3 +1,4 @@ +from abc import abstractmethod from typing import List from ..full_context_label import extract_full_context_label @@ -16,6 +17,7 @@ def mora_to_text(mora: str) -> str: class SynthesisEngineBase: + @abstractmethod def replace_phoneme_length( self, accent_phrases: List[AccentPhrase], speaker_id: int ) -> List[AccentPhrase]: @@ -34,6 +36,7 @@ def replace_phoneme_length( """ raise NotImplementedError() + @abstractmethod def replace_mora_pitch( self, accent_phrases: List[AccentPhrase], speaker_id: int ) -> List[AccentPhrase]: @@ -118,6 +121,7 @@ def create_accent_phrases(self, text: str, speaker_id: int) -> List[AccentPhrase speaker_id=speaker_id, ) + @abstractmethod def synthesis(self, query: AudioQuery, speaker_id: int): """ 音声合成クエリから音声合成に必要な情報を構成し、実際に音声合成を行う From 2384a43ff3f40529dbc0a11ab21d67a030d6cbd0 Mon Sep 17 00:00:00 2001 From: aoirint Date: Fri, 10 Dec 2021 05:21:24 +0900 Subject: [PATCH 4/7] add metaclass=ABCMeta --- voicevox_engine/synthesis_engine/synthesis_engine_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py index 20d8b1620..3ad9bb86d 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py @@ -1,4 +1,4 @@ -from abc import abstractmethod +from abc import ABCMeta, abstractmethod from typing import List from ..full_context_label import extract_full_context_label @@ -16,7 +16,7 @@ def mora_to_text(mora: str) -> str: return mora -class SynthesisEngineBase: +class SynthesisEngineBase(metaclass=ABCMeta): @abstractmethod def replace_phoneme_length( self, accent_phrases: List[AccentPhrase], speaker_id: int From 3fddad7892ea4564cd70be62c663e79b7ba8ffce Mon Sep 17 00:00:00 2001 From: aoirint Date: Fri, 10 Dec 2021 05:31:05 +0900 Subject: [PATCH 5/7] add test_mock_synthesis_engine --- test/test_mock_synthesis_engine.py | 107 +++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 test/test_mock_synthesis_engine.py diff --git a/test/test_mock_synthesis_engine.py b/test/test_mock_synthesis_engine.py new file mode 100644 index 000000000..ba082adde --- /dev/null +++ b/test/test_mock_synthesis_engine.py @@ -0,0 +1,107 @@ +from unittest import TestCase +from voicevox_engine.dev.synthesis_engine import MockSynthesisEngine +from voicevox_engine.model import AccentPhrase, AudioQuery, Mora + +class TestMockSynthesisEngine(TestCase): + def setUp(self): + super().setUp() + + self.accent_phrases_hello_hiho = [ + AccentPhrase( + moras=[ + Mora( + text="コ", + consonant="k", + consonant_length=0.0, + vowel="o", + vowel_length=0.0, + pitch=0.0, + ), + Mora( + text="ン", + consonant=None, + consonant_length=None, + vowel="N", + vowel_length=0.0, + pitch=0.0, + ), + Mora( + text="ニ", + consonant="n", + consonant_length=0.0, + vowel="i", + vowel_length=0.0, + pitch=0.0, + ), + Mora( + text="チ", + consonant="ch", + consonant_length=0.0, + vowel="i", + vowel_length=0.0, + pitch=0.0, + ), + Mora( + text="ワ", + consonant="w", + consonant_length=0.0, + vowel="a", + vowel_length=0.0, + pitch=0.0, + ), + ], + accent=5, + pause_mora=Mora( + text="、", + consonant=None, + consonant_length=None, + vowel="pau", + vowel_length=0.0, + pitch=0.0, + ), + ), + AccentPhrase( + moras=[ + Mora( + text="ヒ", + consonant="h", + consonant_length=0.0, + vowel="i", + vowel_length=0.0, + pitch=0.0, + ), + Mora( + text="ホ", + consonant="h", + consonant_length=0.0, + vowel="o", + vowel_length=0.0, + pitch=0.0, + ), + Mora( + text="デ", + consonant="d", + consonant_length=0.0, + vowel="e", + vowel_length=0.0, + pitch=0.0, + ), + Mora( + text="ス", + consonant="s", + consonant_length=0.0, + vowel="U", + vowel_length=0.0, + pitch=0.0, + ), + ], + accent=1, + pause_mora=None, + ), + ] + + def test_mock_synthesis_engine(self): + engine = MockSynthesisEngine(speakers='') + + self.assertEqual(engine.replace_phoneme_length(accent_phrases=self.accent_phrases_hello_hiho, speaker_id=0), self.accent_phrases_hello_hiho) + self.assertEqual(engine.replace_mora_pitch(accent_phrases=self.accent_phrases_hello_hiho, speaker_id=0), self.accent_phrases_hello_hiho) From f9f33d3b0a33502c3d379575a27095170b915584 Mon Sep 17 00:00:00 2001 From: aoirint Date: Fri, 10 Dec 2021 05:33:02 +0900 Subject: [PATCH 6/7] split test --- test/test_mock_synthesis_engine.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/test/test_mock_synthesis_engine.py b/test/test_mock_synthesis_engine.py index ba082adde..a7b3cfcaa 100644 --- a/test/test_mock_synthesis_engine.py +++ b/test/test_mock_synthesis_engine.py @@ -1,6 +1,8 @@ from unittest import TestCase + from voicevox_engine.dev.synthesis_engine import MockSynthesisEngine -from voicevox_engine.model import AccentPhrase, AudioQuery, Mora +from voicevox_engine.model import AccentPhrase, Mora + class TestMockSynthesisEngine(TestCase): def setUp(self): @@ -99,9 +101,20 @@ def setUp(self): pause_mora=None, ), ] + self.engine = MockSynthesisEngine(speakers="") - def test_mock_synthesis_engine(self): - engine = MockSynthesisEngine(speakers='') + def test_replace_phoneme_length(self): + self.assertEqual( + self.engine.replace_phoneme_length( + accent_phrases=self.accent_phrases_hello_hiho, speaker_id=0 + ), + self.accent_phrases_hello_hiho, + ) - self.assertEqual(engine.replace_phoneme_length(accent_phrases=self.accent_phrases_hello_hiho, speaker_id=0), self.accent_phrases_hello_hiho) - self.assertEqual(engine.replace_mora_pitch(accent_phrases=self.accent_phrases_hello_hiho, speaker_id=0), self.accent_phrases_hello_hiho) + def test_replace_mora_pitch(self): + self.assertEqual( + self.engine.replace_mora_pitch( + accent_phrases=self.accent_phrases_hello_hiho, speaker_id=0 + ), + self.accent_phrases_hello_hiho, + ) From 5870c9be4bc369110da018d3befb4ee8b9fb4f95 Mon Sep 17 00:00:00 2001 From: aoirint Date: Fri, 10 Dec 2021 05:40:12 +0900 Subject: [PATCH 7/7] synthesis test --- run.py | 4 +-- test/test_mock_synthesis_engine.py | 26 ++++++++++++++++--- .../synthesis_engine/make_synthesis_engine.py | 4 +-- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/run.py b/run.py index fa855cb3d..a8c2b2347 100644 --- a/run.py +++ b/run.py @@ -28,7 +28,7 @@ SpeakerInfo, ) from voicevox_engine.preset import Preset, PresetLoader -from voicevox_engine.synthesis_engine import SynthesisEngine, make_synthesis_engine +from voicevox_engine.synthesis_engine import SynthesisEngineBase, make_synthesis_engine from voicevox_engine.utility import ConnectBase64WavesException, connect_base64_waves @@ -36,7 +36,7 @@ def b64encode_str(s): return base64.b64encode(s).decode("utf-8") -def generate_app(engine: SynthesisEngine) -> FastAPI: +def generate_app(engine: SynthesisEngineBase) -> FastAPI: root_dir = Path(__file__).parent default_sampling_rate = engine.default_sampling_rate diff --git a/test/test_mock_synthesis_engine.py b/test/test_mock_synthesis_engine.py index a7b3cfcaa..27bf20bf5 100644 --- a/test/test_mock_synthesis_engine.py +++ b/test/test_mock_synthesis_engine.py @@ -1,7 +1,8 @@ from unittest import TestCase from voicevox_engine.dev.synthesis_engine import MockSynthesisEngine -from voicevox_engine.model import AccentPhrase, Mora +from voicevox_engine.kana_parser import create_kana +from voicevox_engine.model import AccentPhrase, AudioQuery, Mora class TestMockSynthesisEngine(TestCase): @@ -106,7 +107,8 @@ def setUp(self): def test_replace_phoneme_length(self): self.assertEqual( self.engine.replace_phoneme_length( - accent_phrases=self.accent_phrases_hello_hiho, speaker_id=0 + accent_phrases=self.accent_phrases_hello_hiho, + speaker_id=0, ), self.accent_phrases_hello_hiho, ) @@ -114,7 +116,25 @@ def test_replace_phoneme_length(self): def test_replace_mora_pitch(self): self.assertEqual( self.engine.replace_mora_pitch( - accent_phrases=self.accent_phrases_hello_hiho, speaker_id=0 + accent_phrases=self.accent_phrases_hello_hiho, + speaker_id=0, ), self.accent_phrases_hello_hiho, ) + + def test_synthesis(self): + self.engine.synthesis( + AudioQuery( + accent_phrases=self.accent_phrases_hello_hiho, + speedScale=1, + pitchScale=0, + intonationScale=1, + volumeScale=1, + prePhonemeLength=0.1, + postPhonemeLength=0.1, + outputSamplingRate=24000, + outputStereo=False, + kana=create_kana(self.accent_phrases_hello_hiho), + ), + speaker_id=0, + ) diff --git a/voicevox_engine/synthesis_engine/make_synthesis_engine.py b/voicevox_engine/synthesis_engine/make_synthesis_engine.py index f983edc93..070a69c5b 100644 --- a/voicevox_engine/synthesis_engine/make_synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/make_synthesis_engine.py @@ -2,14 +2,14 @@ from pathlib import Path from typing import Optional -from .synthesis_engine import SynthesisEngine +from .synthesis_engine import SynthesisEngine, SynthesisEngineBase def make_synthesis_engine( use_gpu: bool, voicelib_dir: Path, voicevox_dir: Optional[Path] = None, -) -> SynthesisEngine: +) -> SynthesisEngineBase: """ 音声ライブラリをロードして、音声合成エンジンを生成