From 7bc1b210a82f46fa4520b05106bb1891ca3635e6 Mon Sep 17 00:00:00 2001 From: Yuto Ashida Date: Mon, 22 Jan 2024 23:25:45 +0900 Subject: [PATCH] =?UTF-8?q?[project-s]=20=E3=83=8F=E3=83=9F=E3=83=B3?= =?UTF-8?q?=E3=82=B0=E7=94=A8API=E3=82=92=E8=BF=BD=E5=8A=A0=20(#1008)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update metas (add style type) * update engine manifest (add frame rate) * add sing api to core wrapper * add sing api to core adapter * add models for sing api * add sing process to tts engine * add sing api * fix miss * add fixme comment Co-authored-by: Hiroshiba * remove sing type * fix typo * remove optional * translate error detail * get -> create * fix docs * Revert "remove optional" This reverts commit 12b8fc6413049c115f98035b07a74f12394ea5de. * fix pytest * add comment * add fixme comment Co-authored-by: Hiroshiba * improve models --------- Co-authored-by: Hiroshiba --- engine_manifest.json | 1 + run.py | 73 ++++++ .../test_fetch_speakers_success.json | 30 ++- voicevox_engine/core_adapter.py | 81 +++++++ voicevox_engine/core_wrapper.py | 209 ++++++++++++++++++ .../engine_manifest/EngineManifest.py | 1 + .../engine_manifest/EngineManifestLoader.py | 1 + voicevox_engine/metas/Metas.py | 4 +- voicevox_engine/model.py | 40 ++++ voicevox_engine/tts_pipeline/tts_engine.py | 204 ++++++++++++++++- 10 files changed, 627 insertions(+), 17 deletions(-) diff --git a/engine_manifest.json b/engine_manifest.json index f4a5dfaa1..c4dd5f352 100644 --- a/engine_manifest.json +++ b/engine_manifest.json @@ -9,6 +9,7 @@ "port": 50021, "icon": "engine_manifest_assets/icon.png", "default_sampling_rate": 24000, + "frame_rate": 93.75, "terms_of_service": "engine_manifest_assets/terms_of_service.md", "update_infos": "engine_manifest_assets/update_infos.json", "dependency_licenses": "engine_manifest_assets/dependency_licenses.json", diff --git a/run.py b/run.py index 7041028cb..1843ba04c 100644 --- a/run.py +++ b/run.py @@ -42,10 +42,12 @@ AudioQuery, BaseLibraryInfo, DownloadableLibraryInfo, + FrameAudioQuery, InstalledLibraryInfo, MorphableTargetInfo, ParseKanaBadRequest, ParseKanaError, + Score, Speaker, SpeakerInfo, StyleIdNotFoundError, @@ -704,6 +706,77 @@ def _synthesis_morphing( background=BackgroundTask(delete_file, f.name), ) + @app.post( + "/sing_frame_audio_query", + response_model=FrameAudioQuery, + tags=["クエリ作成"], + summary="歌唱音声合成用のクエリを作成する", + ) + def sing_frame_audio_query( + score: Score, + style_id: StyleId | None = Query(default=None), # noqa: B008 + speaker: StyleId | None = Query(default=None, deprecated=True), # noqa: B008 + core_version: str | None = None, + ) -> FrameAudioQuery: + """ + 歌唱音声合成用のクエリの初期値を得ます。ここで得られたクエリはそのまま歌唱音声合成に利用できます。各値の意味は`Schemas`を参照してください。 + """ + style_id = get_style_id_from_deprecated( + style_id=style_id, deprecated_speaker=speaker + ) + engine = get_engine(core_version) + core = get_core(core_version) + phonemes, f0, volume = engine.create_sing_phoneme_and_f0_and_volume( + score, style_id + ) + + return FrameAudioQuery( + f0=f0, + volume=volume, + phonemes=phonemes, + volumeScale=1, + outputSamplingRate=core.default_sampling_rate, + outputStereo=False, + ) + + @app.post( + "/frame_synthesis", + response_class=FileResponse, + responses={ + 200: { + "content": { + "audio/wav": {"schema": {"type": "string", "format": "binary"}} + }, + } + }, + tags=["音声合成"], + ) + def frame_synthesis( + query: FrameAudioQuery, + style_id: StyleId | None = Query(default=None), # noqa: B008 + speaker: StyleId | None = 
Query(default=None, deprecated=True), # noqa: B008 + core_version: str | None = None, + ) -> FileResponse: + """ + 歌唱音声合成を行います。 + """ + style_id = get_style_id_from_deprecated( + style_id=style_id, deprecated_speaker=speaker + ) + engine = get_engine(core_version) + wave = engine.frame_synthsize_wave(query, style_id) + + with NamedTemporaryFile(delete=False) as f: + soundfile.write( + file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV" + ) + + return FileResponse( + f.name, + media_type="audio/wav", + background=BackgroundTask(delete_file, f.name), + ) + @app.post( "/connect_waves", response_class=FileResponse, diff --git a/test/e2e/__snapshots__/test_validate_speakers/test_fetch_speakers_success.json b/test/e2e/__snapshots__/test_validate_speakers/test_fetch_speakers_success.json index f948bf0f1..9464bec39 100644 --- a/test/e2e/__snapshots__/test_validate_speakers/test_fetch_speakers_success.json +++ b/test/e2e/__snapshots__/test_validate_speakers/test_fetch_speakers_success.json @@ -5,19 +5,23 @@ "styles": [ { "id": 0, - "name": "style0" + "name": "style0", + "type": null }, { "id": 2, - "name": "style1" + "name": "style1", + "type": null }, { "id": 4, - "name": "style2" + "name": "style2", + "type": null }, { "id": 6, - "name": "style3" + "name": "style3", + "type": null } ], "supported_features": { @@ -31,19 +35,23 @@ "styles": [ { "id": 1, - "name": "style0" + "name": "style0", + "type": null }, { "id": 3, - "name": "style1" + "name": "style1", + "type": null }, { "id": 5, - "name": "style2" + "name": "style2", + "type": null }, { "id": 7, - "name": "style3" + "name": "style3", + "type": null } ], "supported_features": { @@ -57,7 +65,8 @@ "styles": [ { "id": 8, - "name": "style0" + "name": "style0", + "type": null } ], "supported_features": { @@ -71,7 +80,8 @@ "styles": [ { "id": 9, - "name": "style0" + "name": "style0", + "type": null } ], "supported_features": { diff --git a/voicevox_engine/core_adapter.py b/voicevox_engine/core_adapter.py index 2fe77349a..5ff5d61f2 100644 --- a/voicevox_engine/core_adapter.py +++ b/voicevox_engine/core_adapter.py @@ -143,3 +143,84 @@ def safe_decode_forward( ) sr_wave = self.default_sampling_rate return wave, sr_wave + + def safe_predict_sing_consonant_length_forward( + self, + consonant: NDArray[np.int64], + vowel: NDArray[np.int64], + note_duration: NDArray[np.int64], + style_id: StyleId, + ) -> NDArray[np.int64]: + # 「指定スタイルを初期化」「mutexによる安全性」「コア仕様に従う無音付加」「系列長・データ型に関するアダプター」を提供する + self.initialize_style_id_synthesis(style_id, skip_reinit=True) + + with self.mutex: + consonant_length = self.core.predict_sing_consonant_length_forward( + length=consonant.shape[0], + consonant=consonant[np.newaxis], + vowel=vowel[np.newaxis], + note_duration=note_duration[np.newaxis], + style_id=np.array(style_id, dtype=np.int64).reshape(-1), + ) + + return consonant_length + + def safe_predict_sing_f0_forward( + self, + phoneme: NDArray[np.int64], + note: NDArray[np.int64], + style_id: StyleId, + ) -> NDArray[np.float32]: + # 「指定スタイルを初期化」「mutexによる安全性」「コア仕様に従う無音付加」「系列長・データ型に関するアダプター」を提供する + self.initialize_style_id_synthesis(style_id, skip_reinit=True) + + with self.mutex: + f0 = self.core.predict_sing_f0_forward( + length=phoneme.shape[0], + phoneme=phoneme[np.newaxis], + note=note[np.newaxis], + style_id=np.array(style_id, dtype=np.int64).reshape(-1), + ) + + return f0 + + def safe_predict_sing_volume_forward( + self, + phoneme: NDArray[np.int64], + note: NDArray[np.int64], + f0: NDArray[np.float32], + style_id: StyleId, + ) -> NDArray[np.float32]: 
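+ # NOTE: the core expects batched inputs, so each sequence gets a leading batch axis via [np.newaxis] and style_id is reshaped into a (1,)-shaped int64 array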
+ # 「指定スタイルを初期化」「mutexによる安全性」「コア仕様に従う無音付加」「系列長・データ型に関するアダプター」を提供する + self.initialize_style_id_synthesis(style_id, skip_reinit=True) + + with self.mutex: + volume = self.core.predict_sing_volume_forward( + length=phoneme.shape[0], + phoneme=phoneme[np.newaxis], + note=note[np.newaxis], + f0=f0[np.newaxis], + style_id=np.array(style_id, dtype=np.int64).reshape(-1), + ) + + return volume + + def safe_sf_decode_forward( + self, + phoneme: NDArray[np.int64], + f0: NDArray[np.float32], + volume: NDArray[np.float32], + style_id: StyleId, + ) -> tuple[NDArray[np.float32], int]: + # 「指定スタイルを初期化」「mutexによる安全性」「系列長・データ型に関するアダプター」を提供する + self.initialize_style_id_synthesis(style_id, skip_reinit=True) + with self.mutex: + wave = self.core.sf_decode_forward( + length=phoneme.shape[0], + phoneme=phoneme[np.newaxis], + f0=f0[np.newaxis], + volume=volume[np.newaxis], + style_id=np.array(style_id, dtype=np.int64).reshape(-1), + ) + sr_wave = self.default_sampling_rate + return wave, sr_wave diff --git a/voicevox_engine/core_wrapper.py b/voicevox_engine/core_wrapper.py index 83fa5f417..d8fb10f7f 100644 --- a/voicevox_engine/core_wrapper.py +++ b/voicevox_engine/core_wrapper.py @@ -432,6 +432,57 @@ def _type_decode_forward(core_cdll: CDLL) -> None: core_cdll.decode_forward.restype = c_bool +def _type_predict_sing_consonant_length_forward(core_cdll: CDLL) -> None: + """コアDLL `predict_sing_consonant_length_forward` 関数を型付けする""" + core_cdll.predict_sing_consonant_length_forward.argtypes = ( + c_int, + POINTER(c_long), + POINTER(c_long), + POINTER(c_long), + POINTER(c_long), + POINTER(c_long), + ) + core_cdll.predict_sing_consonant_length_forward.restype = c_bool + + +def _type_predict_sing_f0_forward(core_cdll: CDLL) -> None: + """コアDLL `predict_sing_f0_forward` 関数を型付けする""" + core_cdll.predict_sing_f0_forward.argtypes = ( + c_int, + POINTER(c_long), + POINTER(c_long), + POINTER(c_long), + POINTER(c_float), + ) + core_cdll.predict_sing_f0_forward.restype = c_bool + + +def _type_predict_sing_volume_forward(core_cdll: CDLL) -> None: + """コアDLL `predict_sing_volume_forward` 関数を型付けする""" + core_cdll.predict_sing_volume_forward.argtypes = ( + c_int, + POINTER(c_long), + POINTER(c_long), + POINTER(c_float), + POINTER(c_long), + POINTER(c_float), + ) + core_cdll.predict_sing_volume_forward.restype = c_bool + + +def _type_sf_decode_forward(core_cdll: CDLL) -> None: + """コアDLL `sf_decoder_forward` 関数を型付けする""" + core_cdll.sf_decode_forward.argtypes = ( + c_int, + POINTER(c_long), + POINTER(c_float), + POINTER(c_float), + POINTER(c_long), + POINTER(c_float), + ) + core_cdll.sf_decode_forward.restype = c_bool + + def _type_last_error_message(core_cdll: CDLL) -> None: """コアDLL `last_error_message` 関数を型付けする""" core_cdll.last_error_message.restype = c_char_p @@ -477,6 +528,10 @@ def __init__( _type_yukarin_s_forward(self.core) _type_yukarin_sa_forward(self.core) _type_decode_forward(self.core) + _type_predict_sing_consonant_length_forward(self.core) + _type_predict_sing_f0_forward(self.core) + _type_predict_sing_volume_forward(self.core) + _type_sf_decode_forward(self.core) _type_last_error_message(self.core) self.exist_supported_devices = False @@ -656,6 +711,160 @@ def decode_forward( ) return output + def predict_sing_consonant_length_forward( + self, + length: int, + consonant: NDArray[np.int64], + vowel: NDArray[np.int64], + note_duration: NDArray[np.int64], + style_id: NDArray[np.int64], + ) -> NDArray[np.int64]: + """ + 子音・母音列から、音素ごとの長さを求める関数 + Parameters + ---------- + length : int + 音素列の長さ + consonant : NDArray[np.int64] + 
子音列 + vowel : NDArray[np.int64] + 母音列 + note_duration : NDArray[np.int64] + ノート列 + style_id : NDArray[np.int64] + スタイル番号 + Returns + ------- + output : NDArray[np.int64] + 子音長 + """ + output = np.zeros((length,), dtype=np.int64) + self.assert_core_success( + self.core.predict_sing_consonant_length_forward( + c_int(length), + consonant.ctypes.data_as(POINTER(c_long)), + vowel.ctypes.data_as(POINTER(c_long)), + note_duration.ctypes.data_as(POINTER(c_long)), + style_id.ctypes.data_as(POINTER(c_long)), + output.ctypes.data_as(POINTER(c_long)), + ) + ) + return output + + def predict_sing_f0_forward( + self, + length: int, + phoneme: NDArray[np.int64], + note: NDArray[np.int64], + style_id: NDArray[np.int64], + ) -> NDArray[np.float32]: + """ + フレームごとの音素列とノート列から、フレームごとのF0を求める関数 + Parameters + ---------- + length : int + 音素列の長さ + phoneme : NDArray[np.int64] + 音素列 + note : NDArray[np.int64] + ノート列 + style_id : NDArray[np.int64] + スタイル番号 + Returns + ------- + output : NDArray[np.float32] + フレームごとのF0 + """ + output = np.zeros((length,), dtype=np.float32) + self.assert_core_success( + self.core.predict_sing_f0_forward( + c_int(length), + phoneme.ctypes.data_as(POINTER(c_long)), + note.ctypes.data_as(POINTER(c_long)), + style_id.ctypes.data_as(POINTER(c_long)), + output.ctypes.data_as(POINTER(c_float)), + ) + ) + return output + + def predict_sing_volume_forward( + self, + length: int, + phoneme: NDArray[np.int64], + note: NDArray[np.int64], + f0: NDArray[np.float32], + style_id: NDArray[np.int64], + ) -> NDArray[np.float32]: + """ + フレームごとの音素列とノート列から、フレームごとのvolumeを求める関数 + Parameters + ---------- + length : int + 音素列の長さ + phoneme : NDArray[np.int64] + 音素列 + note : NDArray[np.int64] + ノート列 + style_id : NDArray[np.int64] + スタイル番号 + Returns + ------- + output : NDArray[np.float32] + フレームごとのF0 + """ + output = np.zeros((length,), dtype=np.float32) + self.assert_core_success( + self.core.predict_sing_volume_forward( + c_int(length), + phoneme.ctypes.data_as(POINTER(c_long)), + note.ctypes.data_as(POINTER(c_long)), + f0.ctypes.data_as(POINTER(c_float)), + style_id.ctypes.data_as(POINTER(c_long)), + output.ctypes.data_as(POINTER(c_float)), + ) + ) + return output + + def sf_decode_forward( + self, + length: int, + phoneme: NDArray[np.int64], + f0: NDArray[np.float32], + volume: NDArray[np.float32], + style_id: NDArray[np.int64], + ) -> NDArray[np.float32]: + """ + フレームごとの音素と音高から波形を求める関数 + Parameters + ---------- + length : int + フレームの長さ + phoneme : NDArray[np.int64] + フレームごとの音素 + f0 : NDArray[np.float32] + フレームごとの音高 + volume : NDArray[np.float32] + フレームごとの音量 + style_id : NDArray[np.int64] + スタイル番号 + Returns + ------- + output : NDArray[np.float32] + 音声波形 + """ + output = np.zeros((length * 256,), dtype=np.float32) + self.assert_core_success( + self.core.sf_decode_forward( + c_int(length), + phoneme.ctypes.data_as(POINTER(c_long)), + f0.ctypes.data_as(POINTER(c_float)), + volume.ctypes.data_as(POINTER(c_float)), + style_id.ctypes.data_as(POINTER(c_long)), + output.ctypes.data_as(POINTER(c_float)), + ) + ) + return output + def supported_devices(self) -> str: """ coreから取得した対応デバイスに関するjsonデータの文字列 diff --git a/voicevox_engine/engine_manifest/EngineManifest.py b/voicevox_engine/engine_manifest/EngineManifest.py index a203767aa..f3a02e173 100644 --- a/voicevox_engine/engine_manifest/EngineManifest.py +++ b/voicevox_engine/engine_manifest/EngineManifest.py @@ -57,6 +57,7 @@ class EngineManifest(BaseModel): url: str = Field(title="エンジンのURL") icon: str = Field(title="エンジンのアイコンをBASE64エンコードしたもの") default_sampling_rate: 
int = Field(title="デフォルトのサンプリング周波数") + frame_rate: float = Field(title="エンジンのフレームレート") terms_of_service: str = Field(title="エンジンの利用規約") update_infos: List[UpdateInfo] = Field(title="エンジンのアップデート情報") dependency_licenses: List[LicenseInfo] = Field(title="依存関係のライセンス情報") diff --git a/voicevox_engine/engine_manifest/EngineManifestLoader.py b/voicevox_engine/engine_manifest/EngineManifestLoader.py index 5f6f2199d..5335dd2bc 100644 --- a/voicevox_engine/engine_manifest/EngineManifestLoader.py +++ b/voicevox_engine/engine_manifest/EngineManifestLoader.py @@ -20,6 +20,7 @@ def load_manifest(self) -> EngineManifest: uuid=manifest["uuid"], url=manifest["url"], default_sampling_rate=manifest["default_sampling_rate"], + frame_rate=manifest["frame_rate"], icon=b64encode((self.root_dir / manifest["icon"]).read_bytes()).decode( "utf-8" ), diff --git a/voicevox_engine/metas/Metas.py b/voicevox_engine/metas/Metas.py index bc615a16f..39c45cb64 100644 --- a/voicevox_engine/metas/Metas.py +++ b/voicevox_engine/metas/Metas.py @@ -1,11 +1,12 @@ from enum import Enum -from typing import List, NewType, Optional +from typing import List, Literal, NewType, Optional from pydantic import BaseModel, Field # NOTE: 循環importを防ぐためにとりあえずここに書いている # FIXME: 他のmodelに依存せず、全modelから参照できる場所に配置する StyleId = NewType("StyleId", int) +StyleType = Literal["talk", "humming", "sing_teacher"] class SpeakerStyle(BaseModel): @@ -15,6 +16,7 @@ class SpeakerStyle(BaseModel): name: str = Field(title="スタイル名") id: StyleId = Field(title="スタイルID") + type: Optional[StyleType] = Field(title="モデルの種類") class SpeakerSupportPermittedSynthesisMorphing(str, Enum): diff --git a/voicevox_engine/model.py b/voicevox_engine/model.py index cd92cab4f..f9e50b62c 100644 --- a/voicevox_engine/model.py +++ b/voicevox_engine/model.py @@ -72,6 +72,46 @@ def __hash__(self): return hash(tuple(sorted(items))) +class Note(BaseModel): + """ + 音符ごとの情報 + """ + + key: int | None = Field(title="音階") + frame_length: int = Field(title="音符のフレーム長") + lyric: str = Field(title="音符の歌詞") + + +class Score(BaseModel): + """ + 楽譜情報 + """ + + notes: List[Note] = Field(title="音符のリスト") + + +class FramePhoneme(BaseModel): + """ + 音素の情報 + """ + + phoneme: str = Field(title="音素") + frame_length: int = Field(title="音素のフレーム長") + + +class FrameAudioQuery(BaseModel): + """ + フレームごとの音声合成用のクエリ + """ + + f0: List[float] = Field(title="フレームごとの基本周波数") + volume: List[float] = Field(title="フレームごとの音量") + phonemes: List[FramePhoneme] = Field(title="音素のリスト") + volumeScale: float = Field(title="全体の音量") + outputSamplingRate: int = Field(title="音声データの出力サンプリングレート") + outputStereo: bool = Field(title="音声データをステレオ出力するか否か") + + class ParseKanaErrorCode(Enum): UNKNOWN_TEXT = "判別できない読み仮名があります: {text}" ACCENT_TOP = "句頭にアクセントは置けません: {text}" diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 610330bda..ceb2843b7 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -2,16 +2,19 @@ import math import numpy as np +from fastapi import HTTPException from numpy.typing import NDArray from soxr import resample from ..core_adapter import CoreAdapter from ..core_wrapper import CoreWrapper from ..metas.Metas import StyleId -from ..model import AccentPhrase, AudioQuery, Mora +from ..model import AccentPhrase, AudioQuery, FrameAudioQuery, Mora +from ..model import FramePhoneme +from ..model import Score from .acoustic_feature_extractor import Phoneme from .kana_converter import parse_kana -from .mora_list import 
mora_phonemes_to_mora_kana +from .mora_list import mora_kana_to_mora_phonemes, mora_phonemes_to_mora_kana from .text_analyzer import text_to_accent_phrases # 疑問文語尾定数 @@ -172,14 +175,14 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: def apply_volume_scale( - wave: NDArray[np.float32], query: AudioQuery + wave: NDArray[np.float32], query: AudioQuery | FrameAudioQuery ) -> NDArray[np.float32]: """音声波形へ音声合成用のクエリがもつ音量スケール(`volumeScale`)を適用する""" return wave * query.volumeScale def apply_output_sampling_rate( - wave: NDArray[np.float32], sr_wave: float, query: AudioQuery + wave: NDArray[np.float32], sr_wave: float, query: AudioQuery | FrameAudioQuery ) -> NDArray[np.float32]: """音声波形へ音声合成用のクエリがもつ出力サンプリングレート(`outputSamplingRate`)を適用する""" # サンプリングレート一致のときはスルー @@ -190,7 +193,7 @@ def apply_output_sampling_rate( def apply_output_stereo( - wave: NDArray[np.float32], query: AudioQuery + wave: NDArray[np.float32], query: AudioQuery | FrameAudioQuery ) -> NDArray[np.float32]: """音声波形へ音声合成用のクエリがもつステレオ出力設定(`outputStereo`)を適用する""" if query.outputStereo: @@ -223,7 +226,7 @@ def query_to_decoder_feature( def raw_wave_to_output_wave( - query: AudioQuery, wave: NDArray[np.float32], sr_wave: int + query: AudioQuery | FrameAudioQuery, wave: NDArray[np.float32], sr_wave: int ) -> NDArray[np.float32]: """生音声波形に音声合成用のクエリを適用して出力音声波形を生成する""" wave = apply_volume_scale(wave, query) @@ -232,6 +235,55 @@ def raw_wave_to_output_wave( return wave +def _hira_to_kana(text: str) -> str: + """ひらがなをカタカナに変換する""" + return "".join(chr(ord(c) + 96) if "ぁ" <= c <= "ゔ" else c for c in text) + + +def calc_phoneme_lengths( + consonant_lengths: NDArray[np.int64], + note_durations: NDArray[np.int64], +) -> NDArray[np.int64]: + """ + 子音長と音符長から音素長を計算する + ただし、母音はノートの頭にくるようにするため、 + 予測された子音長は前のノートの長さを超えないように調整される + """ + phoneme_durations = [] + for i in range(len(consonant_lengths)): + if i < len(consonant_lengths) - 1: + # 最初のノートは子音長が0の、pauである必要がある + if i == 0 and consonant_lengths[i] != 0: + raise HTTPException( + status_code=400, + detail=f"consonant_lengths[0] must be 0, but {consonant_lengths[0]}", + ) + + next_consonant_length = consonant_lengths[i + 1] + note_duration = note_durations[i] + + # もし、次のノートの子音長が負になる場合、現在のノートの半分にする + if next_consonant_length < 0: + next_consonant_length = consonant_lengths[i + 1] = note_duration // 2 + vowel_length = note_duration - next_consonant_length + + # もし、現在のノートの母音長が負になる場合、 + # 次のノートの子音長を現在のノートの半分にする + if vowel_length < 0: + next_consonant_length = consonant_lengths[i + 1] = note_duration // 2 + vowel_length = note_duration - next_consonant_length + + phoneme_durations.append(vowel_length) + if next_consonant_length > 0: + phoneme_durations.append(next_consonant_length) + else: + vowel_length = note_durations[i] + phoneme_durations.append(vowel_length) + + phoneme_durations_array = np.array(phoneme_durations, dtype=np.int64) + return phoneme_durations_array + + class TTSEngine: """音声合成器(core)の管理/実行/プロキシと音声合成フロー""" @@ -374,6 +426,146 @@ def synthesize_wave( wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave) return wave + # FIXME: sing用のエンジンに移すかクラス名変える + # 返す値の総称を考え、関数名を変更する + def create_sing_phoneme_and_f0_and_volume( + self, + score: Score, + style_id: StyleId, + ) -> tuple[list[FramePhoneme], list[float], list[float]]: + """歌声合成用のスコア・スタイルIDに基づいてフレームごとの音素・音高・音量を生成する""" + notes = score.notes + + # Scoreを分解し、ノート単位のデータ、音素単位のデータを作成する + note_lengths: list[int] = [] + note_consonants: list[int] = [] + note_vowels: list[int] = [] + phonemes: list[int] = [] + 
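+ # per-phoneme note keys (pitch); pau entries get -1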
phoneme_keys: list[int] = [] + + for note in notes: + if note.lyric == "": + if note.key is not None: + raise HTTPException( + status_code=400, + detail="lyricが空文字列の場合、keyはnullである必要があります。", + ) + note_lengths.append(note.frame_length) + note_consonants.append(-1) + note_vowels.append(0) # pau + phonemes.append(0) # pau + phoneme_keys.append(-1) + else: + if note.key is None: + raise HTTPException( + status_code=400, + detail="keyがnullの場合、lyricは空文字列である必要があります。", + ) + + # TODO: 1ノートに複数のモーラがある場合の処理 + mora_phonemes = mora_kana_to_mora_phonemes.get( + note.lyric # type: ignore + ) or mora_kana_to_mora_phonemes.get( + _hira_to_kana(note.lyric) # type: ignore + ) + if mora_phonemes is None: + raise HTTPException( + status_code=400, + detail=f"lyricが不正です: {note.lyric}", + ) + + consonant, vowel = mora_phonemes + if consonant is None: + consonant_id = -1 + else: + consonant_id = Phoneme(consonant).id + vowel_id = Phoneme(vowel).id + + note_lengths.append(note.frame_length) + note_consonants.append(consonant_id) + note_vowels.append(vowel_id) + if consonant_id != -1: + phonemes.append(consonant_id) + phoneme_keys.append(note.key) + phonemes.append(vowel_id) + phoneme_keys.append(note.key) + + # 各データをnumpy配列に変換する + note_lengths_array = np.array(note_lengths, dtype=np.int64) + note_consonants_array = np.array(note_consonants, dtype=np.int64) + note_vowels_array = np.array(note_vowels, dtype=np.int64) + phonemes_array = np.array(phonemes, dtype=np.int64) + phoneme_keys_array = np.array(phoneme_keys, dtype=np.int64) + + # コアを用いて子音長を生成する + consonant_lengths = self._core.safe_predict_sing_consonant_length_forward( + note_consonants_array, note_vowels_array, note_lengths_array, style_id + ) + + # 予測した子音長を元に、すべての音素長を計算する + phoneme_lengths = calc_phoneme_lengths(consonant_lengths, note_lengths_array) + + # 時間スケールを変更する(音素 → フレーム) + frame_phonemes = np.repeat(phonemes_array, phoneme_lengths) + frame_keys = np.repeat(phoneme_keys_array, phoneme_lengths) + + # コアを用いて音高を生成する + f0s = self._core.safe_predict_sing_f0_forward( + frame_phonemes, frame_keys, style_id + ) + + # コアを用いて音量を生成する + # FIXME: 変数名のsいらない? 
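+ # volume is predicted per frame from the phoneme, note-key and F0 sequences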
+ volumes = self._core.safe_predict_sing_volume_forward( + frame_phonemes, frame_keys, f0s, style_id + ) + + phoneme_data_list = [ + FramePhoneme( + phoneme=Phoneme._PHONEME_LIST[phoneme_id], + frame_length=phoneme_duration, + ) + for phoneme_id, phoneme_duration in zip(phonemes, phoneme_lengths) + ] + + return phoneme_data_list, f0s.tolist(), volumes.tolist() + + def frame_synthsize_wave( + self, + frame_audio_query: FrameAudioQuery, + style_id: StyleId, + ) -> NDArray[np.float32]: + """歌声合成用のクエリ・スタイルIDに基づいて音声波形を生成する""" + + # 各データを分解・numpy配列に変換する + phonemes = [] + phoneme_lengths = [] + + for phoneme in frame_audio_query.phonemes: + if phoneme.phoneme not in Phoneme._PHONEME_LIST: + raise HTTPException( + status_code=400, + detail=f"phoneme {phoneme.phoneme} is not valid", + ) + + phonemes.append(Phoneme(phoneme.phoneme).id) + phoneme_lengths.append(phoneme.frame_length) + + phonemes_array = np.array(phonemes, dtype=np.int64) + phoneme_lengths_array = np.array(phoneme_lengths, dtype=np.int64) + + frame_phonemes = np.repeat(phonemes_array, phoneme_lengths_array) + f0s = np.array(frame_audio_query.f0, dtype=np.float32) + volumes = np.array(frame_audio_query.volume, dtype=np.float32) + + # コアを用いて音声を生成する + raw_wave, sr_raw_wave = self._core.safe_sf_decode_forward( + frame_phonemes, f0s, volumes, style_id + ) + + wave = raw_wave_to_output_wave(frame_audio_query, raw_wave, sr_raw_wave) + return wave + def make_tts_engines_from_cores(cores: dict[str, CoreAdapter]) -> dict[str, TTSEngine]: """コア一覧からTTSエンジン一覧を生成する"""
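
The two new endpoints compose into a simple create-then-synthesize flow. A minimal sketch, assuming an engine running locally on the manifest's default port (50021); the style id 6000, the `post` helper, and the reading of `key` as a MIDI note number are illustrative assumptions, not fixed by this patch:

    import json
    import urllib.request

    BASE = "http://localhost:50021"
    STYLE_ID = 6000  # assumed singing-capable style; check /speakers for real ids

    def post(path: str, body: dict) -> bytes:
        req = urllib.request.Request(
            f"{BASE}{path}",
            data=json.dumps(body).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req) as res:
            return res.read()

    # Scores start with a rest: an empty lyric with key=null becomes a pau,
    # which is what the consonant_lengths[0] == 0 check in calc_phoneme_lengths
    # expects. frame_length counts frames at the manifest's frame_rate of 93.75
    # (256 samples per frame at 24000 Hz), so 45 frames is roughly 0.48 s.
    score = {
        "notes": [
            {"key": None, "frame_length": 15, "lyric": ""},
            {"key": 60, "frame_length": 45, "lyric": "ど"},
            {"key": 62, "frame_length": 45, "lyric": "れ"},
        ]
    }

    # 1. Score -> FrameAudioQuery (phonemes plus per-frame f0 and volume).
    query = json.loads(post(f"/sing_frame_audio_query?style_id={STYLE_ID}", score))

    # 2. FrameAudioQuery -> WAV. The query can be passed through unchanged,
    #    or its f0 / volume / phonemes lists can be edited per frame first.
    wave = post(f"/frame_synthesis?style_id={STYLE_ID}", query)
    with open("sing.wav", "wb") as f:
        f.write(wave)

Hiragana lyrics work because create_sing_phoneme_and_f0_and_volume falls back to _hira_to_kana before the mora_kana_to_mora_phonemes lookup; lyrics that map to no known mora are rejected with a 400.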