From 927cb7853fcf07b67ac258aef40064875a6a28c3 Mon Sep 17 00:00:00 2001 From: buckw6eat Date: Wed, 8 Sep 2021 23:39:15 +0900 Subject: [PATCH 1/7] =?UTF-8?q?=E9=9F=B3=E7=B4=A0=E9=95=B7=E3=81=A8?= =?UTF-8?q?=E9=9F=B3=E9=AB=98=E8=A8=AD=E5=AE=9A=E3=81=A7=20API=20=E3=82=92?= =?UTF-8?q?=E5=88=86=E3=81=91=E3=82=8B=20(#71)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: divide mora API for length and pitch * revert: remove useless case --- run.py | 30 ++++++++++++++- voicevox_engine/synthesis_engine.py | 58 ++++++++++++++++++++--------- 2 files changed, 68 insertions(+), 20 deletions(-) diff --git a/run.py b/run.py index a73e6f267..7faf765eb 100644 --- a/run.py +++ b/run.py @@ -98,8 +98,12 @@ def generate_app(engine: SynthesisEngine) -> FastAPI: def replace_mora_data( accent_phrases: List[AccentPhrase], speaker_id: int ) -> List[AccentPhrase]: - return engine.replace_phoneme_data( - accent_phrases=accent_phrases, speaker_id=speaker_id + return engine.replace_mora_pitch( + accent_phrases=engine.replace_phoneme_length( + accent_phrases=accent_phrases, + speaker_id=speaker_id, + ), + speaker_id=speaker_id, ) def create_accent_phrases(text: str, speaker_id: int) -> List[AccentPhrase]: @@ -192,6 +196,28 @@ def accent_phrases(text: str, speaker: int): def mora_data(accent_phrases: List[AccentPhrase], speaker: int): return replace_mora_data(accent_phrases, speaker_id=speaker) + @app.post( + "/mora_length", + response_model=List[AccentPhrase], + tags=["クエリ編集"], + summary="アクセント句から音素長を得る", + ) + def mora_length(accent_phrases: List[AccentPhrase], speaker: int): + return engine.replace_phoneme_length( + accent_phrases=accent_phrases, speaker_id=speaker + ) + + @app.post( + "/mora_pitch", + response_model=List[AccentPhrase], + tags=["クエリ編集"], + summary="アクセント句から音高を得る", + ) + def mora_pitch(accent_phrases: List[AccentPhrase], speaker: int): + return engine.replace_mora_pitch( + accent_phrases=accent_phrases, speaker_id=speaker + ) + @app.post( "/synthesis", response_class=FileResponse, diff --git a/voicevox_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine.py index e41d8f842..ca2215162 100644 --- a/voicevox_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine.py @@ -97,7 +97,45 @@ def __init__( self.yukarin_s_phoneme_class = OjtPhoneme self.yukarin_soso_phoneme_class = OjtPhoneme - def replace_phoneme_data(self, accent_phrases: List[AccentPhrase], speaker_id: int): + def replace_phoneme_length( + self, accent_phrases: List[AccentPhrase], speaker_id: int + ) -> List[AccentPhrase]: + # phoneme + flatten_moras = to_flatten_moras(accent_phrases) + + phoneme_each_mora = [ + ([mora.consonant] if mora.consonant is not None else []) + [mora.vowel] + for mora in flatten_moras + ] + phoneme_str_list = list(chain.from_iterable(phoneme_each_mora)) + phoneme_str_list = ["pau"] + phoneme_str_list + ["pau"] + + phoneme_data_list = to_phoneme_data_list(phoneme_str_list) + _, _, vowel_indexes_data = split_mora(phoneme_data_list) + + # yukarin_s + phoneme_list_s = numpy.array( + [p.phoneme_id for p in phoneme_data_list], dtype=numpy.int64 + ) + phoneme_length = self.yukarin_s_forwarder( + length=len(phoneme_list_s), + phoneme_list=phoneme_list_s, + speaker_id=numpy.array(speaker_id, dtype=numpy.int64).reshape(-1), + ) + + for i, mora in enumerate(flatten_moras): + mora.consonant_length = ( + phoneme_length[vowel_indexes_data[i + 1] - 1] + if mora.consonant is not None + else None + ) + mora.vowel_length = phoneme_length[vowel_indexes_data[i + 1]] + + return accent_phrases + + def replace_mora_pitch( + self, accent_phrases: List[AccentPhrase], speaker_id: int + ) -> List[AccentPhrase]: # phoneme flatten_moras = to_flatten_moras(accent_phrases) @@ -193,16 +231,6 @@ def _repeat_with_mora(array, accent_phrase): vowel_indexes_data, ) = split_mora(phoneme_data_list) - # yukarin_s - phoneme_list_s = numpy.array( - [p.phoneme_id for p in phoneme_data_list], dtype=numpy.int64 - ) - phoneme_length = self.yukarin_s_forwarder( - length=len(phoneme_list_s), - phoneme_list=phoneme_list_s, - speaker_id=numpy.array(speaker_id, dtype=numpy.int64).reshape(-1), - ) - # yukarin_sa vowel_indexes = numpy.array(vowel_indexes_data, dtype=numpy.int64) @@ -236,12 +264,6 @@ def _repeat_with_mora(array, accent_phrase): for i, mora in enumerate(flatten_moras): mora.pitch = f0_list[i + 1] - mora.consonant_length = ( - phoneme_length[vowel_indexes_data[i + 1] - 1] - if mora.consonant is not None - else None - ) - mora.vowel_length = phoneme_length[vowel_indexes_data[i + 1]] return accent_phrases @@ -257,12 +279,12 @@ def synthesis(self, query: AudioQuery, speaker_id: int): phoneme_str_list = list(chain.from_iterable(phoneme_each_mora)) phoneme_str_list = ["pau"] + phoneme_str_list + ["pau"] - # yukarin_s phoneme_data_list = to_phoneme_data_list(phoneme_str_list) phoneme_list_s = numpy.array( [p.phoneme_id for p in phoneme_data_list], dtype=numpy.int64 ) + # length phoneme_length_list = ( [query.prePhonemeLength] + [ From 20a18a27470effa5481eaf0a72d802a6517249e4 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Thu, 9 Sep 2021 01:20:14 +0900 Subject: [PATCH 2/7] to 0.5.1 (#74) * to 0.5.0 * update core * update numpy version * update __file__ * 0.5.1 --- VERSION.txt | 2 +- run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION.txt b/VERSION.txt index 79a2734bb..5d4294b91 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -0.5.0 \ No newline at end of file +0.5.1 \ No newline at end of file diff --git a/run.py b/run.py index 7faf765eb..6627f4256 100644 --- a/run.py +++ b/run.py @@ -84,7 +84,7 @@ def generate_app(engine: SynthesisEngine) -> FastAPI: app = FastAPI( title="VOICEVOX ENGINE", description="VOICEVOXの音声合成エンジンです。", - version=(root_dir / "VERSION.txt").read_text(), + version=(root_dir / "VERSION.txt").read_text().strip(), ) app.add_middleware( From 46a0d08cfcc05340e15375c16a51499cc0ec3c3d Mon Sep 17 00:00:00 2001 From: ISHIDA Naoto Date: Thu, 9 Sep 2021 14:36:48 +0900 Subject: [PATCH 3/7] =?UTF-8?q?[Mock]=20/synthesis=20=E3=81=A7=E3=80=81?= =?UTF-8?q?=E9=9B=91=E9=9F=B3=EF=BC=88=E7=84=A1=E9=9F=B3=EF=BC=9F=EF=BC=89?= =?UTF-8?q?=E3=81=AE=E4=BB=A3=E3=82=8F=E3=82=8A=E3=81=AB=E3=80=81=E5=9B=BA?= =?UTF-8?q?=E5=AE=9A=E3=81=AE=E3=83=80=E3=83=9F=E3=83=BC=E3=83=BB=E3=83=86?= =?UTF-8?q?=E3=82=AD=E3=82=B9=E3=83=88=E3=82=92=E8=AA=AD=E3=81=BF=E4=B8=8A?= =?UTF-8?q?=E3=81=92=E3=82=8B=20#27=20(#77)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update: use dummy text instead of void audio for synthesis #27 * fix: add comment #27 --- voicevox_engine/dev/core/mock.py | 42 ++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/voicevox_engine/dev/core/mock.py b/voicevox_engine/dev/core/mock.py index 11f024f24..db5845111 100644 --- a/voicevox_engine/dev/core/mock.py +++ b/voicevox_engine/dev/core/mock.py @@ -2,9 +2,13 @@ from typing import Any, Dict, List import numpy as np +from pyopenjtalk import tts +from resampy import resample +DUMMY_TEXT = "これはダミーのテキストです" -def initialize(*args: List[Any]) -> None: + +def initialize(path: str, use_gpu: bool, *args: List[Any]) -> None: pass @@ -25,8 +29,42 @@ def yukarin_sa_forward(length: int, **kwargs: Dict[str, Any]) -> np.ndarray: def decode_forward(length: int, **kwargs: Dict[str, Any]) -> np.ndarray: + """ + 合成音声の波形データをNumPy配列で返します。ただし、常に固定の文言を読み上げます(DUMMY_TEXT) + 参照→SynthesisEngine のdocstring [Mock] + + Parameters + ---------- + length : int + フレームの長さ + + Returns + ------- + wave : np.ndarray + 音声合成した波形データ + + Note + ------- + ここで行う音声合成では、調声(ピッチ等)を反映しない + また、入力内容によらず常に固定の文言を読み上げる + + # pyopenjtalk.tts()の出力仕様 + dtype=np.float64, 16 bit, mono 48000 Hz + + # resampleの説明 + 本来はfloat64の入力でも問題ないのかと思われたが、実際には出力が音割れひどかった。 + 対策として、あらかじめint16に型変換しておくと、期待通りの結果になった。 + 非モックdecode_forwardと合わせるために、出力を24kHzに変換した。 + """ logger = getLogger("uvicorn") # FastAPI / Uvicorn 内からの利用のため logger.info( "Sorry, decode_forward() is a mock. Return values are incorrect.", ) - return np.ones(length * 256) + wave, sr = tts(DUMMY_TEXT) + wave = resample( + wave.astype("int16"), + sr, + 24000, + filter="kaiser_fast", + ) + return wave From ea79f9540f37b9ff73a7ab6e88d5dee0a5dcdc17 Mon Sep 17 00:00:00 2001 From: Yosshi999 Date: Thu, 9 Sep 2021 23:35:48 +0900 Subject: [PATCH 4/7] =?UTF-8?q?Softalk=E3=83=A9=E3=82=A4=E3=82=AF=E3=81=AA?= =?UTF-8?q?=E8=AA=AD=E3=81=BF=E4=BB=AE=E5=90=8D=E3=81=AE=E5=85=A5=E5=87=BA?= =?UTF-8?q?=E5=8A=9B=E5=AF=BE=E5=BF=9C=20(#73)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * support softalk-ish query * set 400 error * move definition to model.py * kana parser for /accent_phrases * apply pysen lint * change model * bugfix: unvoice in created kana * test for kana parser * apply pysen lint * remove unused package import * remove is_kana field from audio_query, use the term 'AquesTalk' * refactoring * replace mora data * rename badrequest --- run.py | 44 ++++- test/test_kana_parser.py | 76 ++++++++ voicevox_engine/kana_parser.py | 122 ++++++++++++ voicevox_engine/model.py | 40 +++- voicevox_engine/mora_list.py | 339 +++++++++++++++++---------------- 5 files changed, 449 insertions(+), 172 deletions(-) create mode 100644 test/test_kana_parser.py create mode 100644 voicevox_engine/kana_parser.py diff --git a/run.py b/run.py index 6627f4256..bcbd7b512 100644 --- a/run.py +++ b/run.py @@ -8,12 +8,20 @@ import resampy import soundfile import uvicorn -from fastapi import FastAPI, Response +from fastapi import FastAPI, HTTPException, Response from fastapi.middleware.cors import CORSMiddleware from starlette.responses import FileResponse from voicevox_engine.full_context_label import extract_full_context_label -from voicevox_engine.model import AccentPhrase, AudioQuery, Mora, Speaker +from voicevox_engine.kana_parser import create_kana, parse_kana +from voicevox_engine.model import ( + AccentPhrase, + AudioQuery, + Mora, + ParseKanaBadRequest, + ParseKanaError, + Speaker, +) from voicevox_engine.mora_list import openjtalk_mora2text from voicevox_engine.synthesis_engine import SynthesisEngine @@ -166,8 +174,9 @@ def audio_query(text: str, speaker: int): """ クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 """ + accent_phrases = create_accent_phrases(text, speaker_id=speaker) return AudioQuery( - accent_phrases=create_accent_phrases(text, speaker_id=speaker), + accent_phrases=accent_phrases, speedScale=1, pitchScale=0, intonationScale=1, @@ -176,6 +185,7 @@ def audio_query(text: str, speaker: int): postPhonemeLength=0.1, outputSamplingRate=default_sampling_rate, outputStereo=False, + kana=create_kana(accent_phrases), ) @app.post( @@ -183,9 +193,33 @@ def audio_query(text: str, speaker: int): response_model=List[AccentPhrase], tags=["クエリ編集"], summary="テキストからアクセント句を得る", + responses={ + 400: { + "description": "読み仮名のパースに失敗", + "model": ParseKanaBadRequest, + } + }, ) - def accent_phrases(text: str, speaker: int): - return create_accent_phrases(text, speaker_id=speaker) + def accent_phrases(text: str, speaker: int, is_kana: bool = False): + """ + テキストからアクセント句を得ます。 + is_kanaが`true`のとき、テキストは次のようなAquesTalkライクな記法に従う読み仮名として処理されます。デフォルトは`false`です。 + * 全てのカナはカタカナで記述される + * アクセント句は`/`または`、`で区切る。`、`で区切った場合に限り無音区間が挿入される。 + * カナの手前に`_`を入れるとそのカナは無声化される + * アクセント位置を`'`で指定する。全てのアクセント句にはアクセント位置を1つ指定する必要がある。 + """ + if is_kana: + try: + accent_phrases = parse_kana(text) + except ParseKanaError as err: + raise HTTPException( + status_code=400, + detail=ParseKanaBadRequest(err).dict(), + ) + return replace_mora_data(accent_phrases=accent_phrases, speaker_id=speaker) + else: + return create_accent_phrases(text, speaker_id=speaker) @app.post( "/mora_data", diff --git a/test/test_kana_parser.py b/test/test_kana_parser.py new file mode 100644 index 000000000..98ff4e303 --- /dev/null +++ b/test/test_kana_parser.py @@ -0,0 +1,76 @@ +from unittest import TestCase + +from voicevox_engine.kana_parser import create_kana, parse_kana +from voicevox_engine.model import ParseKanaError, ParseKanaErrorCode + + +class TestParseKana(TestCase): + def test_phrase_length(self): + self.assertEqual(len(parse_kana("ア'/ア'")), 2) + self.assertEqual(len(parse_kana("ア'、ア'")), 2) + self.assertEqual(len(parse_kana("ア'/ア'/ア'/ア'/ア'")), 5) + self.assertEqual(len(parse_kana("ス'")), 1) + self.assertEqual(len(parse_kana("_ス'")), 1) + self.assertEqual(len(parse_kana("ギェ'")), 1) + self.assertEqual(len(parse_kana("ギェ'、ギェ'/ギェ'")), 3) + + def test_accent(self): + self.assertEqual(parse_kana("シャ'シシュシェショ")[0].accent, 1) + self.assertEqual(parse_kana("シャ'_シシュシェショ")[0].accent, 1) + self.assertEqual(parse_kana("シャシ'シュシェショ")[0].accent, 2) + self.assertEqual(parse_kana("シャ_シ'シュシェショ")[0].accent, 2) + self.assertEqual(parse_kana("シャシシュ'シェショ")[0].accent, 3) + self.assertEqual(parse_kana("シャ_シシュ'シェショ")[0].accent, 3) + self.assertEqual(parse_kana("シャシシュシェショ'")[0].accent, 5) + self.assertEqual(parse_kana("シャ_シシュシェショ'")[0].accent, 5) + + def test_mora_length(self): + self.assertEqual(len(parse_kana("シャ'シシュシェショ")[0].moras), 5) + self.assertEqual(len(parse_kana("シャ'_シシュシェショ")[0].moras), 5) + self.assertEqual(len(parse_kana("シャシ'シュシェショ")[0].moras), 5) + self.assertEqual(len(parse_kana("シャ_シ'シュシェショ")[0].moras), 5) + self.assertEqual(len(parse_kana("シャシシュシェショ'")[0].moras), 5) + self.assertEqual(len(parse_kana("シャ_シシュシェショ'")[0].moras), 5) + + def test_pause(self): + self.assertIsNone(parse_kana("ア'/ア'")[0].pause_mora) + self.assertIsNone(parse_kana("ア'/ア'")[1].pause_mora) + self.assertIsNotNone(parse_kana("ア'、ア'")[0].pause_mora) + self.assertIsNone(parse_kana("ア'、ア'")[1].pause_mora) + + def test_unvoice(self): + self.assertEqual(parse_kana("ス'")[0].moras[0].vowel, "u") + self.assertEqual(parse_kana("_ス'")[0].moras[0].vowel, "U") + + def test_roundtrip(self): + for text in ["コンニチワ'", "ワタシワ'/シャチョオデ'_ス", "トテモ'、エラ'インデス"]: + self.assertEqual(create_kana(parse_kana(text)), text) + + for text in ["ヲ'", "ェ'"]: + self.assertEqual(create_kana(parse_kana(text)), text) + + +class TestParseKanaException(TestCase): + def _assert_error_code(self, kana: str, code: ParseKanaErrorCode): + with self.assertRaises(ParseKanaError) as err: + parse_kana(kana) + self.assertEqual(err.exception.errcode, code) + + def test_exceptions(self): + self._assert_error_code("アクセント", ParseKanaErrorCode.ACCENT_NOTFOUND) + self._assert_error_code("'アクセント", ParseKanaErrorCode.ACCENT_TOP) + self._assert_error_code("ア'ク'セント", ParseKanaErrorCode.ACCENT_TWICE) + self._assert_error_code("ひ'らがな", ParseKanaErrorCode.UNKNOWN_TEXT) + self._assert_error_code("__ス'", ParseKanaErrorCode.UNKNOWN_TEXT) + self._assert_error_code("ア'/", ParseKanaErrorCode.EMPTY_PHRASE) + self._assert_error_code("/ア'", ParseKanaErrorCode.EMPTY_PHRASE) + + with self.assertRaises(ParseKanaError) as err: + parse_kana("ヒト'ツメ/フタツメ") + self.assertEqual(err.exception.errcode, ParseKanaErrorCode.ACCENT_NOTFOUND) + self.assertEqual(err.exception.kwargs, {"text": "フタツメ"}) + + with self.assertRaises(ParseKanaError) as err: + parse_kana("ア'/") + self.assertEqual(err.exception.errcode, ParseKanaErrorCode.EMPTY_PHRASE) + self.assertEqual(err.exception.kwargs, {"position": "2"}) diff --git a/voicevox_engine/kana_parser.py b/voicevox_engine/kana_parser.py new file mode 100644 index 000000000..4092f01cd --- /dev/null +++ b/voicevox_engine/kana_parser.py @@ -0,0 +1,122 @@ +from typing import List, Optional + +from voicevox_engine.model import AccentPhrase, Mora, ParseKanaError, ParseKanaErrorCode +from voicevox_engine.mora_list import openjtalk_text2mora + +LOOP_LIMIT = 300 +UNVOICE_SYMBOL = "_" +ACCENT_SYMBOL = "'" +NOPAUSE_DELIMITER = "/" +PAUSE_DELIMITER = "、" + +text2mora_with_unvoice = {} +for text, (consonant, vowel) in openjtalk_text2mora.items(): + text2mora_with_unvoice[text] = Mora( + text=text, + consonant=consonant if len(consonant) > 0 else None, + consonant_length=0 if len(consonant) > 0 else None, + vowel=vowel, + vowel_length=0, + pitch=0, + ) + if vowel in ["a", "i", "u", "e", "o"]: + text2mora_with_unvoice[UNVOICE_SYMBOL + text] = Mora( + text=text, + consonant=consonant if len(consonant) > 0 else None, + consonant_length=0 if len(consonant) > 0 else None, + vowel=vowel.upper(), + vowel_length=0, + pitch=0, + ) + + +def _text_to_accent_phrase(phrase: str) -> List[AccentPhrase]: + """ + longest matchにより読み仮名からAccentPhraseを生成 + 入力長Nに対し計算量O(N^2) + """ + accent_index: Optional[int] = None + moras: List[Mora] = [] + + base_index = 0 # パース開始位置。ここから右の文字列をstackに詰めていく。 + stack = "" # 保留中の文字列 + matched_text: Optional[str] = None # 保留中の文字列内で最後にマッチした仮名 + + outer_loop = 0 + while base_index < len(phrase): + outer_loop += 1 + if phrase[base_index] == ACCENT_SYMBOL: + if len(moras) == 0: + raise ParseKanaError(ParseKanaErrorCode.ACCENT_TOP, text=phrase) + if accent_index is not None: + raise ParseKanaError(ParseKanaErrorCode.ACCENT_TWICE, text=phrase) + accent_index = len(moras) + base_index += 1 + continue + for watch_index in range(base_index, len(phrase)): + if phrase[watch_index] == ACCENT_SYMBOL: + break + # 普通の文字の場合 + stack += phrase[watch_index] + if stack in text2mora_with_unvoice: + matched_text = stack + # push mora + if matched_text is None: + raise ParseKanaError(ParseKanaErrorCode.UNKNOWN_TEXT, text=stack) + else: + moras.append(text2mora_with_unvoice[matched_text]) + base_index += len(matched_text) + stack = "" + matched_text = None + if outer_loop > LOOP_LIMIT: + raise ParseKanaError(ParseKanaErrorCode.INFINITE_LOOP) + if accent_index is None: + raise ParseKanaError(ParseKanaErrorCode.ACCENT_NOTFOUND, text=phrase) + else: + return AccentPhrase(moras=moras, accent=accent_index, pause_mora=None) + + +def parse_kana(text: str) -> List[AccentPhrase]: + """ + AquesTalkライクな読み仮名をパースして音長・音高未指定のaccent phraseに変換 + """ + parsed_results: List[AccentPhrase] = [] + phrase_base = 0 + for i in range(len(text) + 1): + if i == len(text) or text[i] in [PAUSE_DELIMITER, NOPAUSE_DELIMITER]: + phrase = text[phrase_base:i] + if len(phrase) == 0: + raise ParseKanaError( + ParseKanaErrorCode.EMPTY_PHRASE, + position=str(len(parsed_results) + 1), + ) + phrase_base = i + 1 + accent_phrase: AccentPhrase = _text_to_accent_phrase(phrase) + if i < len(text) and text[i] == PAUSE_DELIMITER: + accent_phrase.pause_mora = Mora( + text="、", + consonant=None, + consonant_length=None, + vowel="pau", + vowel_length=0, + pitch=0, + ) + parsed_results.append(accent_phrase) + return parsed_results + + +def create_kana(accent_phrases: List[AccentPhrase]) -> str: + text = "" + for i, phrase in enumerate(accent_phrases): + for j, mora in enumerate(phrase.moras): + if mora.vowel in ["A", "I", "U", "E", "O"]: + text += UNVOICE_SYMBOL + text += mora.text + if j + 1 == phrase.accent: + text += ACCENT_SYMBOL + if i < len(accent_phrases) - 1: + if phrase.pause_mora is None: + text += NOPAUSE_DELIMITER + else: + text += PAUSE_DELIMITER + return text diff --git a/voicevox_engine/model.py b/voicevox_engine/model.py index 0cf41721d..c94209056 100644 --- a/voicevox_engine/model.py +++ b/voicevox_engine/model.py @@ -1,4 +1,5 @@ -from typing import List, Optional +from enum import Enum +from typing import Dict, List, Optional from pydantic import BaseModel, Field @@ -40,6 +41,43 @@ class AudioQuery(BaseModel): postPhonemeLength: float = Field(title="音声の後の無音時間") outputSamplingRate: int = Field(title="音声データの出力サンプリングレート") outputStereo: bool = Field(title="音声データをステレオ出力するか否か") + kana: Optional[str] = Field(title="[読み取り専用]AquesTalkライクな読み仮名。音声合成クエリとしては無視される") + + +class ParseKanaErrorCode(Enum): + UNKNOWN_TEXT = "判別できない読み仮名があります: {text}" + ACCENT_TOP = "句頭にアクセントは置けません: {text}" + ACCENT_TWICE = "1つのアクセント句に二つ以上のアクセントは置けません: {text}" + ACCENT_NOTFOUND = "アクセントを指定していないアクセント句があります: {text}" + EMPTY_PHRASE = "{position}番目のアクセント句が空白です" + INFINITE_LOOP = "処理時に無限ループになってしまいました...バグ報告をお願いします。" + + +class ParseKanaError(Exception): + def __init__(self, errcode: ParseKanaErrorCode, **kwargs): + self.errcode = errcode + self.errname = errcode.name + self.kwargs: Dict[str, str] = kwargs + err_fmt: str = errcode.value + self.text = err_fmt.format(**kwargs) + + +class ParseKanaBadRequest(BaseModel): + text: str = Field(title="エラーメッセージ") + error_name: str = Field( + title="エラー名", + description="|name|description|\n|---|---|\n" + + "\n".join( + [ + "| {} | {} |".format(err.name, err.value) + for err in list(ParseKanaErrorCode) + ] + ), + ) + error_args: Dict[str, str] = Field(title="エラーを起こした箇所") + + def __init__(self, err: ParseKanaError): + super().__init__(text=err.text, error_name=err.errname, error_args=err.kwargs) class Speaker(BaseModel): diff --git a/voicevox_engine/mora_list.py b/voicevox_engine/mora_list.py index af7da55d9..5a49f4a3a 100644 --- a/voicevox_engine/mora_list.py +++ b/voicevox_engine/mora_list.py @@ -41,171 +41,178 @@ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ +_mora_list_minimum = [ + ["ヴォ", "v", "o"], + ["ヴェ", "v", "e"], + ["ヴィ", "v", "i"], + ["ヴァ", "v", "a"], + ["ヴ", "v", "u"], + ["ン", "", "N"], + ["ワ", "w", "a"], + ["ロ", "r", "o"], + ["レ", "r", "e"], + ["ル", "r", "u"], + ["リョ", "ry", "o"], + ["リュ", "ry", "u"], + ["リャ", "ry", "a"], + ["リェ", "ry", "e"], + ["リ", "r", "i"], + ["ラ", "r", "a"], + ["ヨ", "y", "o"], + ["ユ", "y", "u"], + ["ヤ", "y", "a"], + ["モ", "m", "o"], + ["メ", "m", "e"], + ["ム", "m", "u"], + ["ミョ", "my", "o"], + ["ミュ", "my", "u"], + ["ミャ", "my", "a"], + ["ミェ", "my", "e"], + ["ミ", "m", "i"], + ["マ", "m", "a"], + ["ポ", "p", "o"], + ["ボ", "b", "o"], + ["ホ", "h", "o"], + ["ペ", "p", "e"], + ["ベ", "b", "e"], + ["ヘ", "h", "e"], + ["プ", "p", "u"], + ["ブ", "b", "u"], + ["フォ", "f", "o"], + ["フェ", "f", "e"], + ["フィ", "f", "i"], + ["ファ", "f", "a"], + ["フ", "f", "u"], + ["ピョ", "py", "o"], + ["ピュ", "py", "u"], + ["ピャ", "py", "a"], + ["ピェ", "py", "e"], + ["ピ", "p", "i"], + ["ビョ", "by", "o"], + ["ビュ", "by", "u"], + ["ビャ", "by", "a"], + ["ビェ", "by", "e"], + ["ビ", "b", "i"], + ["ヒョ", "hy", "o"], + ["ヒュ", "hy", "u"], + ["ヒャ", "hy", "a"], + ["ヒェ", "hy", "e"], + ["ヒ", "h", "i"], + ["パ", "p", "a"], + ["バ", "b", "a"], + ["ハ", "h", "a"], + ["ノ", "n", "o"], + ["ネ", "n", "e"], + ["ヌ", "n", "u"], + ["ニョ", "ny", "o"], + ["ニュ", "ny", "u"], + ["ニャ", "ny", "a"], + ["ニェ", "ny", "e"], + ["ニ", "n", "i"], + ["ナ", "n", "a"], + ["ドゥ", "d", "u"], + ["ド", "d", "o"], + ["トゥ", "t", "u"], + ["ト", "t", "o"], + ["デョ", "dy", "o"], + ["デュ", "dy", "u"], + ["デャ", "dy", "a"], + ["デェ", "dy", "e"], + ["ディ", "d", "i"], + ["デ", "d", "e"], + ["テョ", "ty", "o"], + ["テュ", "ty", "u"], + ["テャ", "ty", "a"], + ["ティ", "t", "i"], + ["テ", "t", "e"], + ["ツォ", "ts", "o"], + ["ツェ", "ts", "e"], + ["ツィ", "ts", "i"], + ["ツァ", "ts", "a"], + ["ツ", "ts", "u"], + ["ッ", "", "cl"], + ["チョ", "ch", "o"], + ["チュ", "ch", "u"], + ["チャ", "ch", "a"], + ["チェ", "ch", "e"], + ["チ", "ch", "i"], + ["ダ", "d", "a"], + ["タ", "t", "a"], + ["ゾ", "z", "o"], + ["ソ", "s", "o"], + ["ゼ", "z", "e"], + ["セ", "s", "e"], + ["ズィ", "z", "i"], + ["ズ", "z", "u"], + ["スィ", "s", "i"], + ["ス", "s", "u"], + ["ジョ", "j", "o"], + ["ジュ", "j", "u"], + ["ジャ", "j", "a"], + ["ジェ", "j", "e"], + ["ジ", "j", "i"], + ["ショ", "sh", "o"], + ["シュ", "sh", "u"], + ["シャ", "sh", "a"], + ["シェ", "sh", "e"], + ["シ", "sh", "i"], + ["ザ", "z", "a"], + ["サ", "s", "a"], + ["ゴ", "g", "o"], + ["コ", "k", "o"], + ["ゲ", "g", "e"], + ["ケ", "k", "e"], + ["グヮ", "gw", "a"], + ["グ", "g", "u"], + ["クヮ", "kw", "a"], + ["ク", "k", "u"], + ["ギョ", "gy", "o"], + ["ギュ", "gy", "u"], + ["ギャ", "gy", "a"], + ["ギェ", "gy", "e"], + ["ギ", "g", "i"], + ["キョ", "ky", "o"], + ["キュ", "ky", "u"], + ["キャ", "ky", "a"], + ["キェ", "ky", "e"], + ["キ", "k", "i"], + ["ガ", "g", "a"], + ["カ", "k", "a"], + ["オ", "", "o"], + ["エ", "", "e"], + ["ウォ", "w", "o"], + ["ウェ", "w", "e"], + ["ウィ", "w", "i"], + ["ウ", "", "u"], + ["イェ", "y", "e"], + ["イ", "", "i"], + ["ア", "", "a"], +] +_mora_list_additional = [ + ["ヴョ", "by", "o"], + ["ヴュ", "by", "u"], + ["ヴャ", "by", "a"], + ["ヲ", "", "o"], + ["ヱ", "", "e"], + ["ヰ", "", "i"], + ["ヮ", "w", "a"], + ["ョ", "y", "o"], + ["ュ", "y", "u"], + ["ヅ", "z", "u"], + ["ヂ", "j", "i"], + ["ヶ", "k", "e"], + ["ャ", "y", "a"], + ["ォ", "", "o"], + ["ェ", "", "e"], + ["ゥ", "", "u"], + ["ィ", "", "i"], + ["ァ", "", "a"], +] + openjtalk_mora2text = { - consonant + vowel: text - for [text, consonant, vowel] in [ - # ["ヴョ", "by", "o"], - # ["ヴュ", "by", "u"], - # ["ヴャ", "by", "a"], - ["ヴォ", "v", "o"], - ["ヴェ", "v", "e"], - ["ヴィ", "v", "i"], - ["ヴァ", "v", "a"], - ["ヴ", "v", "u"], - ["ン", "", "N"], - # ["ヲ", "", "o"], - # ["ヱ", "", "e"], - # ["ヰ", "", "i"], - ["ワ", "w", "a"], - # ["ヮ", "w", "a"], - ["ロ", "r", "o"], - ["レ", "r", "e"], - ["ル", "r", "u"], - ["リョ", "ry", "o"], - ["リュ", "ry", "u"], - ["リャ", "ry", "a"], - ["リェ", "ry", "e"], - ["リ", "r", "i"], - ["ラ", "r", "a"], - ["ヨ", "y", "o"], - # ["ョ", "y", "o"], - ["ユ", "y", "u"], - # ["ュ", "y", "u"], - ["ヤ", "y", "a"], - # ["ャ", "y", "a"], - ["モ", "m", "o"], - ["メ", "m", "e"], - ["ム", "m", "u"], - ["ミョ", "my", "o"], - ["ミュ", "my", "u"], - ["ミャ", "my", "a"], - ["ミェ", "my", "e"], - ["ミ", "m", "i"], - ["マ", "m", "a"], - ["ポ", "p", "o"], - ["ボ", "b", "o"], - ["ホ", "h", "o"], - ["ペ", "p", "e"], - ["ベ", "b", "e"], - ["ヘ", "h", "e"], - ["プ", "p", "u"], - ["ブ", "b", "u"], - ["フォ", "f", "o"], - ["フェ", "f", "e"], - ["フィ", "f", "i"], - ["ファ", "f", "a"], - ["フ", "f", "u"], - ["ピョ", "py", "o"], - ["ピュ", "py", "u"], - ["ピャ", "py", "a"], - ["ピェ", "py", "e"], - ["ピ", "p", "i"], - ["ビョ", "by", "o"], - ["ビュ", "by", "u"], - ["ビャ", "by", "a"], - ["ビェ", "by", "e"], - ["ビ", "b", "i"], - ["ヒョ", "hy", "o"], - ["ヒュ", "hy", "u"], - ["ヒャ", "hy", "a"], - ["ヒェ", "hy", "e"], - ["ヒ", "h", "i"], - ["パ", "p", "a"], - ["バ", "b", "a"], - ["ハ", "h", "a"], - ["ノ", "n", "o"], - ["ネ", "n", "e"], - ["ヌ", "n", "u"], - ["ニョ", "ny", "o"], - ["ニュ", "ny", "u"], - ["ニャ", "ny", "a"], - ["ニェ", "ny", "e"], - ["ニ", "n", "i"], - ["ナ", "n", "a"], - ["ドゥ", "d", "u"], - ["ド", "d", "o"], - ["トゥ", "t", "u"], - ["ト", "t", "o"], - ["デョ", "dy", "o"], - ["デュ", "dy", "u"], - ["デャ", "dy", "a"], - ["デェ", "dy", "e"], - ["ディ", "d", "i"], - ["デ", "d", "e"], - ["テョ", "ty", "o"], - ["テュ", "ty", "u"], - ["テャ", "ty", "a"], - ["ティ", "t", "i"], - ["テ", "t", "e"], - # ["ヅ", "z", "u"], - ["ツォ", "ts", "o"], - ["ツェ", "ts", "e"], - ["ツィ", "ts", "i"], - ["ツァ", "ts", "a"], - ["ツ", "ts", "u"], - ["ッ", "", "cl"], - # ["ヂ", "j", "i"], - ["チョ", "ch", "o"], - ["チュ", "ch", "u"], - ["チャ", "ch", "a"], - ["チェ", "ch", "e"], - ["チ", "ch", "i"], - ["ダ", "d", "a"], - ["タ", "t", "a"], - ["ゾ", "z", "o"], - ["ソ", "s", "o"], - ["ゼ", "z", "e"], - ["セ", "s", "e"], - ["ズィ", "z", "i"], - ["ズ", "z", "u"], - ["スィ", "s", "i"], - ["ス", "s", "u"], - ["ジョ", "j", "o"], - ["ジュ", "j", "u"], - ["ジャ", "j", "a"], - ["ジェ", "j", "e"], - ["ジ", "j", "i"], - ["ショ", "sh", "o"], - ["シュ", "sh", "u"], - ["シャ", "sh", "a"], - ["シェ", "sh", "e"], - ["シ", "sh", "i"], - ["ザ", "z", "a"], - ["サ", "s", "a"], - ["ゴ", "g", "o"], - ["コ", "k", "o"], - ["ゲ", "g", "e"], - ["ケ", "k", "e"], - # ["ヶ", "k", "e"], - ["グヮ", "gw", "a"], - ["グ", "g", "u"], - ["クヮ", "kw", "a"], - ["ク", "k", "u"], - ["ギョ", "gy", "o"], - ["ギュ", "gy", "u"], - ["ギャ", "gy", "a"], - ["ギェ", "gy", "e"], - ["ギ", "g", "i"], - ["キョ", "ky", "o"], - ["キュ", "ky", "u"], - ["キャ", "ky", "a"], - ["キェ", "ky", "e"], - ["キ", "k", "i"], - ["ガ", "g", "a"], - ["カ", "k", "a"], - ["オ", "", "o"], - # ["ォ", "", "o"], - ["エ", "", "e"], - # ["ェ", "", "e"], - ["ウォ", "w", "o"], - ["ウェ", "w", "e"], - ["ウィ", "w", "i"], - ["ウ", "", "u"], - # ["ゥ", "", "u"], - ["イェ", "y", "e"], - ["イ", "", "i"], - # ["ィ", "", "i"], - ["ア", "", "a"], - # ["ァ", "", "a"], - ] + consonant + vowel: text for [text, consonant, vowel] in _mora_list_minimum +} +openjtalk_text2mora = { + text: (consonant, vowel) + for [text, consonant, vowel] in _mora_list_minimum + _mora_list_additional } From 5ed623e5edc6b714c80c9e91fcf66074d826d93a Mon Sep 17 00:00:00 2001 From: ISHIDA Naoto Date: Fri, 10 Sep 2021 00:43:25 +0900 Subject: [PATCH 5/7] =?UTF-8?q?[Mock]=20SynthesisEngine=E3=81=AE=E7=8E=87?= =?UTF-8?q?=E7=9B=B4=E3=81=AAMock=20ref.=20#27=20(#78)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add Mock class of SynthesisEngine w/ pyopenjtalk * update: use Mock class for SynthesisEngine if import core was failure. * fix: add new line for pass ci/lint * fix: type annotation. uses numpy.typing * fix: boolean value was reversed --- run.py | 19 ++- .../dev/synthesis_engine/__init__.py | 3 + voicevox_engine/dev/synthesis_engine/mock.py | 128 ++++++++++++++++++ 3 files changed, 146 insertions(+), 4 deletions(-) create mode 100644 voicevox_engine/dev/synthesis_engine/__init__.py create mode 100644 voicevox_engine/dev/synthesis_engine/mock.py diff --git a/run.py b/run.py index bcbd7b512..4b13f78d4 100644 --- a/run.py +++ b/run.py @@ -52,11 +52,14 @@ def make_synthesis_engine( if voicevox_dir.exists(): sys.path.insert(0, str(voicevox_dir)) + has_voicevox_core = True try: import core except ImportError: from voicevox_engine.dev import core + has_voicevox_core = False + # 音声ライブラリの Python モジュールをロードできなかった print( "Notice: mock-library will be used. Try re-run with valid --voicevox_dir", # noqa @@ -68,12 +71,20 @@ def make_synthesis_engine( core.initialize(voicelib_dir.as_posix() + "/", use_gpu) - return SynthesisEngine( - yukarin_s_forwarder=core.yukarin_s_forward, - yukarin_sa_forwarder=core.yukarin_sa_forward, - decode_forwarder=core.decode_forward, + if has_voicevox_core: + return SynthesisEngine( + yukarin_s_forwarder=core.yukarin_s_forward, + yukarin_sa_forwarder=core.yukarin_sa_forward, + decode_forwarder=core.decode_forward, + ) + + from voicevox_engine.dev.synthesis_engine import ( + SynthesisEngine as mock_synthesis_engine, ) + # モックで置き換える + return mock_synthesis_engine() + def mora_to_text(mora: str): if mora[-1:] in ["A", "I", "U", "E", "O"]: diff --git a/voicevox_engine/dev/synthesis_engine/__init__.py b/voicevox_engine/dev/synthesis_engine/__init__.py new file mode 100644 index 000000000..373ee86de --- /dev/null +++ b/voicevox_engine/dev/synthesis_engine/__init__.py @@ -0,0 +1,3 @@ +from .mock import SynthesisEngine + +__all__ = ["SynthesisEngine"] diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py new file mode 100644 index 000000000..d8b6a3449 --- /dev/null +++ b/voicevox_engine/dev/synthesis_engine/mock.py @@ -0,0 +1,128 @@ +from logging import getLogger +from typing import Any, Dict, List + +import numpy as np +import numpy.typing as npt +from pyopenjtalk import tts +from resampy import resample + +from voicevox_engine.model import AccentPhrase, AudioQuery +from voicevox_engine.synthesis_engine import to_flatten_moras + + +class SynthesisEngine: + """ + SynthesisEngine [Mock] + """ + + def __init__(self, **kwargs): + """ + __init__ [Mock] + """ + super().__init__() + + def replace_phoneme_length( + self, accent_phrases: List[AccentPhrase], speaker_id: int + ) -> List[AccentPhrase]: + """ + replace_phoneme_length 入力accent_phrasesを変更せずにそのまま返します [Mock] + + Parameters + ---------- + accent_phrases : List[AccentPhrase] + フレーズ句のリスト + speaker_id : int + 話者 + + Returns + ------- + List[AccentPhrase] + フレーズ句のリスト(変更なし) + """ + return accent_phrases + + def replace_mora_pitch( + self, accent_phrases: List[AccentPhrase], speaker_id: int + ) -> List[AccentPhrase]: + """ + replace_mora_pitch 入力accent_phrasesを変更せずにそのまま返します [Mock] + + Parameters + ---------- + accent_phrases : List[AccentPhrase] + フレーズ句のリスト + speaker_id : int + 話者 + + Returns + ------- + List[AccentPhrase] + フレーズ句のリスト(変更なし) + """ + return accent_phrases + + def synthesis(self, query: AudioQuery, speaker_id: int) -> npt.NDArray[np.int16]: + """ + synthesis voicevox coreを使わずに、音声合成する [Mock] + + Parameters + ---------- + query : AudioQuery + /audio_query APIで得たjson + speaker_id : int + 話者 + + Returns + ------- + wave [npt.NDArray[np.int16]] + 音声波形データをNumPy配列で返します + """ + # recall text in katakana + flatten_moras = to_flatten_moras(query.accent_phrases) + kana_text = "".join([mora.text for mora in flatten_moras]) + + wave = self.forward(kana_text) + + # volume + if query.volumeScale != 1: + wave *= query.volumeScale + + return wave.astype("int16") + + def forward(self, text: str, **kwargs: Dict[str, Any]) -> npt.NDArray[np.int16]: + """ + forward tts via pyopenjtalk.tts() + 参照→SynthesisEngine のdocstring [Mock] + + Parameters + ---------- + text : str + 入力文字列(例:読み上げたい文章をカタカナにした文字列、等) + + Returns + ------- + wave [npt.NDArray[np.int16]] + 音声波形データをNumPy配列で返します + + Note + ------- + ここで行う音声合成では、調声(ピッチ等)を反映しない + + # pyopenjtalk.tts()の出力仕様 + dtype=np.float64, 16 bit, mono 48000 Hz + + # resampleの説明 + 本来はfloat64の入力でも問題ないのかと思われたが、実際には出力が音割れひどかった。 + 対策として、あらかじめint16に型変換しておくと、期待通りの結果になった。 + 非モック実装(decode_forward)と合わせるために、出力を24kHzに変換した。 + """ + logger = getLogger("uvicorn") # FastAPI / Uvicorn 内からの利用のため + logger.info("[Mock] input text: %s" % text) + wave, sr = tts(text) + wave = resample( + wave.astype("int16"), + sr, + 24000, + filter="kaiser_fast", + ) + return wave.astype("int16") From f3f9f91c3068bbbba0a194f5783ed9998dd0e374 Mon Sep 17 00:00:00 2001 From: buckw6eat Date: Sat, 11 Sep 2021 00:50:59 +0900 Subject: [PATCH 6/7] fix: align numpy 1.20 syntax (#81) --- voicevox_engine/dev/synthesis_engine/mock.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py index d8b6a3449..801d5b348 100644 --- a/voicevox_engine/dev/synthesis_engine/mock.py +++ b/voicevox_engine/dev/synthesis_engine/mock.py @@ -2,7 +2,6 @@ from typing import Any, Dict, List import numpy as np -import numpy.typing as npt from pyopenjtalk import tts from resampy import resample @@ -61,7 +60,7 @@ def replace_mora_pitch( """ return accent_phrases - def synthesis(self, query: AudioQuery, speaker_id: int) -> npt.NDArray[np.int16]: + def synthesis(self, query: AudioQuery, speaker_id: int) -> np.ndarray: """ synthesis voicevox coreを使わずに、音声合成する [Mock] @@ -89,7 +88,7 @@ def synthesis(self, query: AudioQuery, speaker_id: int) -> npt.NDArray[np.int16] return wave.astype("int16") - def forward(self, text: str, **kwargs: Dict[str, Any]) -> npt.NDArray[np.int16]: + def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray: """ forward tts via pyopenjtalk.tts() 参照→SynthesisEngine のdocstring [Mock] From 264aff64cfdf4cf91f51972685a080df0f1271c9 Mon Sep 17 00:00:00 2001 From: buckw6eat Date: Sat, 11 Sep 2021 01:09:31 +0900 Subject: [PATCH 7/7] =?UTF-8?q?=E9=9F=B3=E7=B4=A0=E5=8C=96=E3=81=95?= =?UTF-8?q?=E3=82=8C=E3=81=AA=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88?= =?UTF-8?q?=E5=85=A5=E5=8A=9B=E3=81=AB=E5=AF=BE=E5=BF=9C=20(#82)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: make sure `utterance` isn't empty * Apply suggestions from code review テスト Co-authored-by: Hiroshiba --- run.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/run.py b/run.py index 4b13f78d4..cf59c1bfd 100644 --- a/run.py +++ b/run.py @@ -130,6 +130,9 @@ def create_accent_phrases(text: str, speaker_id: int) -> List[AccentPhrase]: return [] utterance = extract_full_context_label(text) + if len(utterance.breath_groups) == 0: + return [] + return replace_mora_data( accent_phrases=[ AccentPhrase(