From 96bd927d79cd2a4a8baf03e8779b4e57e26886c7 Mon Sep 17 00:00:00 2001 From: Yuto Ashida Date: Tue, 9 Jan 2024 07:26:29 +0900 Subject: [PATCH] =?UTF-8?q?[project-s]=20=E3=83=8F=E3=83=9F=E3=83=B3?= =?UTF-8?q?=E3=82=B0=E6=A9=9F=E8=83=BD=E3=83=BB=E6=AD=8C=E6=A9=9F=E8=83=BD?= =?UTF-8?q?=E5=90=91=E3=81=91=E3=81=AE=E3=83=A2=E3=83=87=E3=83=AB=E3=83=BB?= =?UTF-8?q?API(compatible=5Fengine=E3=81=AE=E3=81=BF)=E3=82=92=E8=BF=BD?= =?UTF-8?q?=E5=8A=A0=20(#724)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove contour and rename to talk xxx * fix speaker id map * rename functions and variables * add models to model file * add sing style and source filter models to model file set * add new models to status * rename get model index and speaker id * add new models session * change i32 to i64 * add new predictor to inference core * add new predictor to core * add new predictor to compatible engine * rename source filter to sf decode * fix rename miss * rename sing style to sing teacher * fix rename miss * remove vector * add TODO comment (add sing tests) Co-authored-by: Hiroshiba * fix comment out * lint --------- Co-authored-by: Hiroshiba --- crates/voicevox_core/src/publish.rs | 447 ++++++++++++------ crates/voicevox_core/src/status.rs | 344 +++++++++++--- crates/voicevox_core/src/status/model_file.rs | 43 +- .../src/compatible_engine.rs | 117 ++++- 4 files changed, 707 insertions(+), 244 deletions(-) diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 9c18367d5..bc38ea859 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -140,21 +140,6 @@ impl VoicevoxCore { ) } - pub fn predict_contour( - &mut self, - length: usize, - f0_discrete: &[f32], - phoneme_vector: &[i64], - speaker_id: u32, - ) -> Result> { - self.synthesis_engine.inference_core_mut().predict_contour( - length, - f0_discrete, - phoneme_vector, - speaker_id, - ) - } - pub fn decode( &mut self, length: usize, @@ -172,6 +157,53 @@ impl VoicevoxCore { ) } + pub fn predict_sing_consonant_length( + &mut self, + consonant: &[i64], + vowel: &[i64], + note_duration: &[i64], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + .predict_sing_consonant_length(consonant, vowel, note_duration, speaker_id) + } + + pub fn predict_sing_f0( + &mut self, + phoneme: &[i64], + note: &[i64], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + .predict_sing_f0(phoneme, note, speaker_id) + } + + pub fn predict_sing_volume( + &mut self, + phoneme: &[i64], + note: &[i64], + f0: &[f32], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + .predict_sing_volume(phoneme, note, f0, speaker_id) + } + + pub fn sf_decode( + &mut self, + phoneme: &[i64], + f0: &[f32], + volume: &[f32], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + .sf_decode(phoneme, f0, volume, speaker_id) + } + pub fn audio_query( &mut self, text: &str, @@ -297,8 +329,14 @@ impl InferenceCore { status.load_metas()?; if load_all_models { - for model_index in 0..MODEL_FILE_SET.models_count() { - status.load_model(model_index)?; + for model_index in 0..MODEL_FILE_SET.talk_models_count() { + status.load_talk_model(model_index)?; + } + for model_index in 0..MODEL_FILE_SET.sing_teacher_models_count() { + status.load_sing_teacher_model(model_index)?; + } + for model_index in 0..MODEL_FILE_SET.sf_decode_models_count() { + status.load_sf_decode_model(model_index)?; } } @@ -326,10 +364,28 @@ impl InferenceCore { .status_option .as_mut() .ok_or(Error::UninitializedStatus)?; - if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) { - status.load_model(model_index) + if let Some((model_index, _)) = get_talk_model_index_and_speaker_id(speaker_id) { + status.load_talk_model(model_index) } else { - Err(Error::InvalidSpeakerId { speaker_id }) + // ハミング機能及び歌機能モデルはどちらかが存在しない事があるので、どちらかが存在しない場合でも無視する + let mut loaded = false; + if let Some((model_index, _)) = + get_sing_teacher_model_index_and_speaker_id(speaker_id) + { + status.load_sing_teacher_model(model_index)?; + loaded = true; + } + if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id) + { + status.load_sf_decode_model(model_index)?; + loaded = true; + } + + if loaded { + Ok(()) + } else { + Err(Error::InvalidSpeakerId { speaker_id }) + } } } else { Err(Error::UninitializedStatus) @@ -337,10 +393,21 @@ impl InferenceCore { } pub fn is_model_loaded(&self, speaker_id: u32) -> bool { if let Some(status) = self.status_option.as_ref() { - if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) { - status.is_model_loaded(model_index) + if let Some((model_index, _)) = get_talk_model_index_and_speaker_id(speaker_id) { + status.is_talk_model_loaded(model_index) } else { - false + // ハミング機能及び歌機能モデルはどちらかが存在しない事があるので、どちらかが存在しない場合でも無視する + let mut loaded = false; + if let Some((model_index, _)) = + get_sing_teacher_model_index_and_speaker_id(speaker_id) + { + loaded |= status.is_sing_teacher_model_loaded(model_index); + } + if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id) + { + loaded |= status.is_sf_decode_model_loaded(model_index); + } + loaded } } else { false @@ -369,14 +436,15 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); } - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_talk_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; - if model_index >= MODEL_FILE_SET.models_count() { + if model_index >= MODEL_FILE_SET.talk_models_count() { return Err(Error::InvalidModelIndex { model_index }); } @@ -422,14 +490,15 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); } - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_talk_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; - if model_index >= MODEL_FILE_SET.models_count() { + if model_index >= MODEL_FILE_SET.talk_models_count() { return Err(Error::InvalidModelIndex { model_index }); } @@ -459,59 +528,6 @@ impl InferenceCore { status.predict_intonation_session_run(model_index, input_tensors) } - pub fn predict_contour( - &mut self, - length: usize, - f0_discrete: &[f32], - phoneme_vector: &[i64], - speaker_id: u32, - ) -> Result> { - if !self.initialized { - return Err(Error::UninitializedStatus); - } - - let status = self - .status_option - .as_mut() - .ok_or(Error::UninitializedStatus)?; - - if !status.validate_speaker_id(speaker_id) { - return Err(Error::InvalidSpeakerId { speaker_id }); - } - - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; - - if model_index >= MODEL_FILE_SET.models_count() { - return Err(Error::InvalidModelIndex { model_index }); - } - - let mut f0_discrete_array = - NdArray::new(ndarray::arr1(f0_discrete).into_shape([length, 1]).unwrap()); - let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector)); - let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); - - let input_tensors: Vec<&mut dyn AnyArray> = vec![ - &mut f0_discrete_array, - &mut phoneme_vector_array, - &mut speaker_id_array, - ]; - - let (mut f0_contour, voiced) = - status.predict_contour_session_run(model_index, input_tensors)?; - for (f0_contour_item, voiced_item) in f0_contour.iter_mut().zip(voiced.iter()) { - if *voiced_item < 0.0 { - *f0_contour_item = 0.0; - } - } - - Ok(f0_contour) - } - pub fn decode( &mut self, length: usize, @@ -533,14 +549,15 @@ impl InferenceCore { return Err(Error::InvalidSpeakerId { speaker_id }); } - let (model_index, speaker_id) = - if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) { - (model_index, speaker_id) - } else { - return Err(Error::InvalidSpeakerId { speaker_id }); - }; + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_talk_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; - if model_index >= MODEL_FILE_SET.models_count() { + if model_index >= MODEL_FILE_SET.talk_models_count() { return Err(Error::InvalidModelIndex { model_index }); } @@ -580,6 +597,184 @@ impl InferenceCore { .map(|output| Self::trim_padding_from_output(output, padding_size)) } + pub fn predict_sing_consonant_length( + &mut self, + consonant: &[i64], + vowel: &[i64], + note_duration: &[i64], + speaker_id: u32, + ) -> Result> { + if !self.initialized { + return Err(Error::UninitializedStatus); + } + + let status = self + .status_option + .as_mut() + .ok_or(Error::UninitializedStatus)?; + + if !status.validate_speaker_id(speaker_id) { + return Err(Error::InvalidSpeakerId { speaker_id }); + } + + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_sing_teacher_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; + + if model_index >= MODEL_FILE_SET.sing_teacher_models_count() { + return Err(Error::InvalidModelIndex { model_index }); + } + + let mut consonant_array = NdArray::new(ndarray::arr1(consonant)); + let mut vowel_array = NdArray::new(ndarray::arr1(vowel)); + let mut note_duration_array = NdArray::new(ndarray::arr1(note_duration)); + let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); + + let input_tensors: Vec<&mut dyn AnyArray> = vec![ + &mut consonant_array, + &mut vowel_array, + &mut note_duration_array, + &mut speaker_id_array, + ]; + + status.predict_sing_consonant_length_session_run(model_index, input_tensors) + } + + pub fn predict_sing_f0( + &mut self, + phoneme: &[i64], + note: &[i64], + speaker_id: u32, + ) -> Result> { + if !self.initialized { + return Err(Error::UninitializedStatus); + } + + let status = self + .status_option + .as_mut() + .ok_or(Error::UninitializedStatus)?; + + if !status.validate_speaker_id(speaker_id) { + return Err(Error::InvalidSpeakerId { speaker_id }); + } + + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_sing_teacher_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; + + if model_index >= MODEL_FILE_SET.sing_teacher_models_count() { + return Err(Error::InvalidModelIndex { model_index }); + } + + let mut phoneme_array = NdArray::new(ndarray::arr1(phoneme)); + let mut note_array = NdArray::new(ndarray::arr1(note)); + let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); + + let input_tensors: Vec<&mut dyn AnyArray> = + vec![&mut phoneme_array, &mut note_array, &mut speaker_id_array]; + + status.predict_sing_f0_session_run(model_index, input_tensors) + } + + pub fn predict_sing_volume( + &mut self, + phoneme: &[i64], + note: &[i64], + _f0: &[f32], + speaker_id: u32, + ) -> Result> { + if !self.initialized { + return Err(Error::UninitializedStatus); + } + + let status = self + .status_option + .as_mut() + .ok_or(Error::UninitializedStatus)?; + + if !status.validate_speaker_id(speaker_id) { + return Err(Error::InvalidSpeakerId { speaker_id }); + } + + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_sing_teacher_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; + + if model_index >= MODEL_FILE_SET.sing_teacher_models_count() { + return Err(Error::InvalidModelIndex { model_index }); + } + + // TODO: f0を使う + let mut phoneme_array = NdArray::new(ndarray::arr1(phoneme)); + let mut note_array = NdArray::new(ndarray::arr1(note)); + let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); + + let input_tensors: Vec<&mut dyn AnyArray> = + vec![&mut phoneme_array, &mut note_array, &mut speaker_id_array]; + + status.predict_sing_volume_session_run(model_index, input_tensors) + } + + pub fn sf_decode( + &mut self, + phoneme: &[i64], + f0: &[f32], + volume: &[f32], + speaker_id: u32, + ) -> Result> { + if !self.initialized { + return Err(Error::UninitializedStatus); + } + + let status = self + .status_option + .as_mut() + .ok_or(Error::UninitializedStatus)?; + + if !status.validate_speaker_id(speaker_id) { + return Err(Error::InvalidSpeakerId { speaker_id }); + } + + let (model_index, speaker_id) = if let Some((model_index, speaker_id)) = + get_sf_decode_model_index_and_speaker_id(speaker_id) + { + (model_index, speaker_id) + } else { + return Err(Error::InvalidSpeakerId { speaker_id }); + }; + + if model_index >= MODEL_FILE_SET.sf_decode_models_count() { + return Err(Error::InvalidModelIndex { model_index }); + } + + let mut phoneme_array = NdArray::new(ndarray::arr1(phoneme)); + let mut f0_array = NdArray::new(ndarray::arr1(f0)); + let mut volume_array = NdArray::new(ndarray::arr1(volume)); + let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64])); + + let input_tensors: Vec<&mut dyn AnyArray> = vec![ + &mut phoneme_array, + &mut f0_array, + &mut volume_array, + &mut speaker_id_array, + ]; + + status.sf_decode_session_run(model_index, input_tensors) + } + fn make_f0_with_padding( f0_slice: &[f32], length_with_padding: usize, @@ -643,8 +838,22 @@ pub static SUPPORTED_DEVICES: Lazy = pub static SUPPORTED_DEVICES_CSTRING: Lazy = Lazy::new(|| CString::new(SUPPORTED_DEVICES.to_json().to_string()).unwrap()); -fn get_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { - MODEL_FILE_SET.speaker_id_map.get(&speaker_id).copied() +fn get_talk_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { + MODEL_FILE_SET.talk_speaker_id_map.get(&speaker_id).copied() +} + +fn get_sing_teacher_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { + MODEL_FILE_SET + .sing_teacher_speaker_id_map + .get(&speaker_id) + .copied() +} + +fn get_sf_decode_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> { + MODEL_FILE_SET + .sf_decode_speaker_id_map + .get(&speaker_id) + .copied() } pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str { @@ -853,7 +1062,7 @@ mod tests { #[case] speaker_id: u32, #[case] expected: Option<(usize, u32)>, ) { - let actual = get_model_index_and_speaker_id(speaker_id); + let actual = get_talk_model_index_and_speaker_id(speaker_id); assert_eq!(expected, actual); } @@ -921,43 +1130,7 @@ mod tests { assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len()); } - #[rstest] - fn predict_contour_works() { - let internal = VoicevoxCore::new_with_mutex(); - internal - .lock() - .unwrap() - .initialize(InitializeOptions { - load_all_models: true, - acceleration_mode: AccelerationMode::Cpu, - ..Default::default() - }) - .unwrap(); - - // 「テスト」という文章に対応する入力 - const F0_LENGTH: usize = 69; - let mut f0_discrete = [0.; F0_LENGTH]; - f0_discrete[9..24].fill(5.905218); - f0_discrete[37..60].fill(5.565851); - - let mut phoneme = [0; F0_LENGTH]; - phoneme[0..9].fill(0); - phoneme[9..13].fill(37); - phoneme[13..24].fill(14); - phoneme[24..30].fill(35); - phoneme[30..37].fill(6); - phoneme[37..45].fill(37); - phoneme[45..60].fill(30); - phoneme[60..69].fill(0); - - let result = internal - .lock() - .unwrap() - .predict_contour(F0_LENGTH, &f0_discrete, &phoneme, 2); - - assert!(result.is_ok(), "{result:?}"); - assert_eq!(result.unwrap().len(), F0_LENGTH); - } + // TODO: sing系のテストを足す #[rstest] fn decode_works() { diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 7274bdd9b..04932cf67 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -31,19 +31,30 @@ pub(crate) static MODEL_FILE_SET: Lazy = Lazy::new(|| { }); pub struct Status { - models: StatusModels, + talk_models: StatusTalkModels, + sing_teacher_models: StatusSingTeacherModels, + sf_decode_models: StatusSfModels, light_session_options: SessionOptions, // 軽いモデルはこちらを使う heavy_session_options: SessionOptions, // 重いモデルはこちらを使う supported_styles: BTreeSet, } -struct StatusModels { +struct StatusTalkModels { predict_duration: BTreeMap>, predict_intonation: BTreeMap>, - predict_contour: BTreeMap>>, decode: BTreeMap>, } +struct StatusSingTeacherModels { + predict_sing_consonant_length: BTreeMap>, + predict_sing_f0: BTreeMap>, + predict_sing_volume: BTreeMap>, +} + +struct StatusSfModels { + sf_decode: BTreeMap>, +} + #[derive(new, Getters)] struct SessionOptions { cpu_num_threads: u16, @@ -51,9 +62,13 @@ struct SessionOptions { } pub(crate) struct ModelFileSet { - pub(crate) speaker_id_map: BTreeMap, + pub(crate) talk_speaker_id_map: BTreeMap, + pub(crate) sing_teacher_speaker_id_map: BTreeMap, + pub(crate) sf_decode_speaker_id_map: BTreeMap, pub(crate) metas_str: String, - models: Vec, + talk_models: Vec, + sing_teacher_models: Vec, + sf_decode_models: Vec, } impl ModelFileSet { @@ -77,63 +92,124 @@ impl ModelFileSet { let metas_str = fs_err::read_to_string(path("metas.json"))?; - let models = model_file::MODEL_FILE_NAMES + let talk_models = model_file::TALK_MODEL_FILE_NAMES .iter() .map( - |&ModelFileNames { + |&TalkModelFileNames { predict_duration_model, predict_intonation_model, - predict_contour_model, decode_model, }| { let predict_duration_model = ModelFile::new(&path(predict_duration_model))?; let predict_intonation_model = ModelFile::new(&path(predict_intonation_model))?; - let predict_contour_model = predict_contour_model - .map(|s| ModelFile::new(&path(s))) - .transpose()?; let decode_model = ModelFile::new(&path(decode_model))?; - Ok(Model { + Ok(TalkModel { predict_duration_model, predict_intonation_model, - predict_contour_model, decode_model, }) }, ) .collect::>()?; + let sing_teacher_models = model_file::SING_TEACHER_MODEL_FILE_NAMES + .iter() + .map( + |&SingTeacherModelFileNames { + predict_sing_consonant_length_model, + predict_sing_f0_model, + predict_sing_volume_model, + }| { + let predict_sing_consonant_length_model = + ModelFile::new(&path(predict_sing_consonant_length_model))?; + let predict_sing_f0_model = ModelFile::new(&path(predict_sing_f0_model))?; + let predict_sing_volume_model = + ModelFile::new(&path(predict_sing_volume_model))?; + Ok(SingTeacherModel { + predict_sing_consonant_length_model, + predict_sing_f0_model, + predict_sing_volume_model, + }) + }, + ) + .collect::>()?; + + let sf_decode_models = model_file::SF_DECODE_MODEL_FILE_NAMES + .iter() + .map(|&SfDecodeModelFileNames { sf_decode_model }| { + let sf_decode_model = ModelFile::new(&path(sf_decode_model))?; + Ok(SfDecodeModel { sf_decode_model }) + }) + .collect::>()?; + return Ok(Self { - speaker_id_map: model_file::SPEAKER_ID_MAP.iter().copied().collect(), + talk_speaker_id_map: model_file::TALK_SPEAKER_ID_MAP.iter().copied().collect(), + sing_teacher_speaker_id_map: model_file::SING_TEACHER_SPEAKER_ID_MAP + .iter() + .copied() + .collect(), + sf_decode_speaker_id_map: model_file::SF_DECODE_SPEAKER_ID_MAP + .iter() + .copied() + .collect(), metas_str, - models, + talk_models, + sing_teacher_models, + sf_decode_models, }); const ROOT_DIR_ENV_NAME: &str = "VV_MODELS_ROOT_DIR"; } - pub(crate) fn models_count(&self) -> usize { - self.models.len() + pub(crate) fn talk_models_count(&self) -> usize { + self.talk_models.len() + } + + pub(crate) fn sing_teacher_models_count(&self) -> usize { + self.sing_teacher_models.len() + } + + pub(crate) fn sf_decode_models_count(&self) -> usize { + self.sf_decode_models.len() } } -struct ModelFileNames { +struct TalkModelFileNames { predict_duration_model: &'static str, predict_intonation_model: &'static str, - predict_contour_model: Option<&'static str>, decode_model: &'static str, } +struct SingTeacherModelFileNames { + predict_sing_consonant_length_model: &'static str, + predict_sing_f0_model: &'static str, + predict_sing_volume_model: &'static str, +} + +struct SfDecodeModelFileNames { + sf_decode_model: &'static str, +} + #[derive(thiserror::Error, Debug)] #[error("不正なモデルファイルです")] struct DecryptModelError; -struct Model { +struct TalkModel { predict_duration_model: ModelFile, predict_intonation_model: ModelFile, - predict_contour_model: Option, decode_model: ModelFile, } +struct SingTeacherModel { + predict_sing_consonant_length_model: ModelFile, + predict_sing_f0_model: ModelFile, + predict_sing_volume_model: ModelFile, +} + +struct SfDecodeModel { + sf_decode_model: ModelFile, +} + struct ModelFile { path: PathBuf, content: Vec, @@ -213,12 +289,19 @@ unsafe impl Send for Status {} impl Status { pub fn new(use_gpu: bool, cpu_num_threads: u16) -> Self { Self { - models: StatusModels { + talk_models: StatusTalkModels { predict_duration: BTreeMap::new(), predict_intonation: BTreeMap::new(), - predict_contour: BTreeMap::new(), decode: BTreeMap::new(), }, + sing_teacher_models: StatusSingTeacherModels { + predict_sing_consonant_length: BTreeMap::new(), + predict_sing_f0: BTreeMap::new(), + predict_sing_volume: BTreeMap::new(), + }, + sf_decode_models: StatusSfModels { + sf_decode: BTreeMap::new(), + }, light_session_options: SessionOptions::new(cpu_num_threads, false), heavy_session_options: SessionOptions::new(cpu_num_threads, use_gpu), supported_styles: BTreeSet::default(), @@ -238,32 +321,24 @@ impl Status { Ok(()) } - pub fn load_model(&mut self, model_index: usize) -> Result<()> { - if model_index < MODEL_FILE_SET.models.len() { - let model = &MODEL_FILE_SET.models[model_index]; + pub fn load_talk_model(&mut self, model_index: usize) -> Result<()> { + if model_index < MODEL_FILE_SET.talk_models.len() { + let model = &MODEL_FILE_SET.talk_models[model_index]; let predict_duration_session = self.new_session(&model.predict_duration_model, &self.light_session_options)?; let predict_intonation_session = self.new_session(&model.predict_intonation_model, &self.light_session_options)?; - let predict_contour_session = if let Some(model) = &model.predict_contour_model { - Some(self.new_session(model, &self.light_session_options)?) - } else { - None - }; let decode_model = self.new_session(&model.decode_model, &self.heavy_session_options)?; - self.models + self.talk_models .predict_duration .insert(model_index, predict_duration_session); - self.models + self.talk_models .predict_intonation .insert(model_index, predict_intonation_session); - self.models - .predict_contour - .insert(model_index, predict_contour_session); - self.models.decode.insert(model_index, decode_model); + self.talk_models.decode.insert(model_index, decode_model); Ok(()) } else { @@ -271,11 +346,77 @@ impl Status { } } - pub fn is_model_loaded(&self, model_index: usize) -> bool { - self.models.predict_duration.contains_key(&model_index) - && self.models.predict_intonation.contains_key(&model_index) - && self.models.predict_contour.contains_key(&model_index) - && self.models.decode.contains_key(&model_index) + pub fn is_talk_model_loaded(&self, model_index: usize) -> bool { + self.talk_models.predict_duration.contains_key(&model_index) + && self + .talk_models + .predict_intonation + .contains_key(&model_index) + && self.talk_models.decode.contains_key(&model_index) + } + + pub fn load_sing_teacher_model(&mut self, model_index: usize) -> Result<()> { + if model_index < MODEL_FILE_SET.sing_teacher_models.len() { + let model = &MODEL_FILE_SET.sing_teacher_models[model_index]; + let predict_sing_consonant_length_session = self.new_session( + &model.predict_sing_consonant_length_model, + &self.light_session_options, + )?; + let predict_sing_f0_session = + self.new_session(&model.predict_sing_f0_model, &self.light_session_options)?; + let predict_sing_volume_session = self.new_session( + &model.predict_sing_volume_model, + &self.light_session_options, + )?; + + self.sing_teacher_models + .predict_sing_consonant_length + .insert(model_index, predict_sing_consonant_length_session); + self.sing_teacher_models + .predict_sing_f0 + .insert(model_index, predict_sing_f0_session); + self.sing_teacher_models + .predict_sing_volume + .insert(model_index, predict_sing_volume_session); + + Ok(()) + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } + + pub fn is_sing_teacher_model_loaded(&self, model_index: usize) -> bool { + self.sing_teacher_models + .predict_sing_consonant_length + .contains_key(&model_index) + && self + .sing_teacher_models + .predict_sing_f0 + .contains_key(&model_index) + && self + .sing_teacher_models + .predict_sing_volume + .contains_key(&model_index) + } + + pub fn load_sf_decode_model(&mut self, model_index: usize) -> Result<()> { + if model_index < MODEL_FILE_SET.sf_decode_models.len() { + let model = &MODEL_FILE_SET.sf_decode_models[model_index]; + let sf_decode_session = + self.new_session(&model.sf_decode_model, &self.heavy_session_options)?; + + self.sf_decode_models + .sf_decode + .insert(model_index, sf_decode_session); + + Ok(()) + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } + + pub fn is_sf_decode_model_loaded(&self, model_index: usize) -> bool { + self.sf_decode_models.sf_decode.contains_key(&model_index) } fn new_session( @@ -329,7 +470,7 @@ impl Status { model_index: usize, inputs: Vec<&mut dyn AnyArray>, ) -> Result> { - if let Some(model) = self.models.predict_duration.get_mut(&model_index) { + if let Some(model) = self.talk_models.predict_duration.get_mut(&model_index) { if let Ok(output_tensors) = model.run(inputs) { Ok(output_tensors[0].as_slice().unwrap().to_owned()) } else { @@ -345,7 +486,7 @@ impl Status { model_index: usize, inputs: Vec<&mut dyn AnyArray>, ) -> Result> { - if let Some(model) = self.models.predict_intonation.get_mut(&model_index) { + if let Some(model) = self.talk_models.predict_intonation.get_mut(&model_index) { if let Ok(output_tensors) = model.run(inputs) { Ok(output_tensors[0].as_slice().unwrap().to_owned()) } else { @@ -356,35 +497,88 @@ impl Status { } } - pub fn predict_contour_session_run( + pub fn decode_session_run( &mut self, model_index: usize, inputs: Vec<&mut dyn AnyArray>, - ) -> Result<(Vec, Vec)> { - if let Some(model) = self.models.predict_contour.get_mut(&model_index) { - if let Some(model) = model { - if let Ok(output_tensors) = model.run(inputs) { - Ok(( - output_tensors[0].as_slice().unwrap().to_owned(), - output_tensors[1].as_slice().unwrap().to_owned(), - )) - } else { - Err(Error::InferenceFailed) - } + ) -> Result> { + if let Some(model) = self.talk_models.decode.get_mut(&model_index) { + if let Ok(output_tensors) = model.run(inputs) { + Ok(output_tensors[0].as_slice().unwrap().to_owned()) } else { - Err(Error::UnsupportedModel) + Err(Error::InferenceFailed) } } else { Err(Error::InvalidModelIndex { model_index }) } } - pub fn decode_session_run( + pub fn predict_sing_consonant_length_session_run( + &mut self, + model_index: usize, + inputs: Vec<&mut dyn AnyArray>, + ) -> Result> { + if let Some(model) = self + .sing_teacher_models + .predict_sing_consonant_length + .get_mut(&model_index) + { + if let Ok(output_tensors) = model.run(inputs) { + Ok(output_tensors[0].as_slice().unwrap().to_owned()) + } else { + Err(Error::InferenceFailed) + } + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } + + pub fn predict_sing_f0_session_run( &mut self, model_index: usize, inputs: Vec<&mut dyn AnyArray>, ) -> Result> { - if let Some(model) = self.models.decode.get_mut(&model_index) { + if let Some(model) = self + .sing_teacher_models + .predict_sing_f0 + .get_mut(&model_index) + { + if let Ok(output_tensors) = model.run(inputs) { + Ok(output_tensors[0].as_slice().unwrap().to_owned()) + } else { + Err(Error::InferenceFailed) + } + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } + + pub fn predict_sing_volume_session_run( + &mut self, + model_index: usize, + inputs: Vec<&mut dyn AnyArray>, + ) -> Result> { + if let Some(model) = self + .sing_teacher_models + .predict_sing_volume + .get_mut(&model_index) + { + if let Ok(output_tensors) = model.run(inputs) { + Ok(output_tensors[0].as_slice().unwrap().to_owned()) + } else { + Err(Error::InferenceFailed) + } + } else { + Err(Error::InvalidModelIndex { model_index }) + } + } + + pub fn sf_decode_session_run( + &mut self, + model_index: usize, + inputs: Vec<&mut dyn AnyArray>, + ) -> Result> { + if let Some(model) = self.sf_decode_models.sf_decode.get_mut(&model_index) { if let Ok(output_tensors) = model.run(inputs) { Ok(output_tensors[0].as_slice().unwrap().to_owned()) } else { @@ -422,10 +616,9 @@ mod tests { cpu_num_threads, status.heavy_session_options.cpu_num_threads ); - assert!(status.models.predict_duration.is_empty()); - assert!(status.models.predict_intonation.is_empty()); - assert!(status.models.predict_contour.is_empty()); - assert!(status.models.decode.is_empty()); + assert!(status.talk_models.predict_duration.is_empty()); + assert!(status.talk_models.predict_intonation.is_empty()); + assert!(status.talk_models.decode.is_empty()); assert!(status.supported_styles.is_empty()); } @@ -446,29 +639,30 @@ mod tests { } #[rstest] - fn status_load_model_works() { + fn status_load_talk_model_works() { let mut status = Status::new(false, 0); - let result = status.load_model(0); - assert_eq!(Ok(()), result); - assert_eq!(1, status.models.predict_duration.len()); - assert_eq!(1, status.models.predict_intonation.len()); - assert_eq!(1, status.models.predict_contour.len()); - assert_eq!(1, status.models.decode.len()); + let result = status.load_talk_model(0); + assert_debug_fmt_eq!(Ok(()), result); + assert_eq!(1, status.talk_models.predict_duration.len()); + assert_eq!(1, status.talk_models.predict_intonation.len()); + assert_eq!(1, status.talk_models.decode.len()); } #[rstest] - fn status_is_model_loaded_works() { + fn status_is_talk_model_loaded_works() { let mut status = Status::new(false, 0); let model_index = 0; assert!( - !status.is_model_loaded(model_index), + !status.is_talk_model_loaded(model_index), "model should not be loaded" ); - let result = status.load_model(model_index); - assert_eq!(Ok(()), result); + let result = status.load_talk_model(model_index); + assert_debug_fmt_eq!(Ok(()), result); assert!( - status.is_model_loaded(model_index), + status.is_talk_model_loaded(model_index), "model should be loaded" ); } + + // TODO: sing系のテスト足す } diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs index aa618be76..eb8f8913c 100644 --- a/crates/voicevox_core/src/status/model_file.rs +++ b/crates/voicevox_core/src/status/model_file.rs @@ -1,23 +1,32 @@ -use super::{DecryptModelError, ModelFileNames}; +use super::{ + DecryptModelError, SfDecodeModelFileNames, SingTeacherModelFileNames, TalkModelFileNames, +}; pub(super) fn decrypt(content: &[u8]) -> std::result::Result, DecryptModelError> { Ok(content.to_owned()) } -pub(super) const SPEAKER_ID_MAP: &[(u32, (usize, u32))] = - &[(0, (0, 0)), (1, (0, 1)), (2, (1, 0)), (3, (1, 1))]; +pub(super) const TALK_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; -pub(super) const MODEL_FILE_NAMES: &[ModelFileNames] = &[ - ModelFileNames { - predict_duration_model: "predict_duration-0.onnx", - predict_intonation_model: "predict_intonation-0.onnx", - predict_contour_model: None, - decode_model: "decode-0.onnx", - }, - ModelFileNames { - predict_duration_model: "predict_duration-1.onnx", - predict_intonation_model: "predict_intonation-1.onnx", - predict_contour_model: Some("predict_contour-1.onnx"), - decode_model: "decode-1.onnx", - }, -]; +pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[TalkModelFileNames { + predict_duration_model: "predict_duration-0.onnx", + predict_intonation_model: "predict_intonation-0.onnx", + decode_model: "decode-0.onnx", +}]; + +// TODO: 変更する +pub(super) const SING_TEACHER_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; + +pub(super) const SING_TEACHER_MODEL_FILE_NAMES: &[SingTeacherModelFileNames] = + &[SingTeacherModelFileNames { + predict_sing_consonant_length_model: "predict_duration-1.onnx", + predict_sing_f0_model: "predict_intonation-1.onnx", + predict_sing_volume_model: "predict_intonation-1.onnx", + }]; + +pub(super) const SF_DECODE_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))]; + +pub(super) const SF_DECODE_MODEL_FILE_NAMES: &[SfDecodeModelFileNames] = + &[SfDecodeModelFileNames { + sf_decode_model: "decode-1.onnx", + }]; diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs index 02a3b4efb..d919f72f0 100644 --- a/crates/voicevox_core_c_api/src/compatible_engine.rs +++ b/crates/voicevox_core_c_api/src/compatible_engine.rs @@ -128,22 +128,82 @@ pub extern "C" fn yukarin_sa_forward( } #[no_mangle] -pub extern "C" fn yukarin_sosf_forward( +pub extern "C" fn decode_forward( + length: i64, + phoneme_size: i64, + f0: *mut f32, + phoneme: *mut f32, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + let length = length as usize; + let phoneme_size = phoneme_size as usize; + let result = lock_internal().decode( + length, + phoneme_size, + unsafe { std::slice::from_raw_parts(f0, length) }, + unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) }, + unsafe { *speaker_id as u32 }, + ); + match result { + Ok(output_vec) => { + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length * 256) }; + output_slice.clone_from_slice(&output_vec); + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +#[no_mangle] +pub extern "C" fn predict_sing_consonant_length_forward( + length: i64, + consonant: *mut i64, + vowel: *mut i64, + note_duration: *mut i64, + speaker_id: *mut i64, + output: *mut i64, +) -> bool { + let length = length as usize; + let result = lock_internal().predict_sing_consonant_length( + unsafe { std::slice::from_raw_parts(consonant, length) }, + unsafe { std::slice::from_raw_parts(vowel, length) }, + unsafe { std::slice::from_raw_parts(note_duration, length) }, + unsafe { *speaker_id as u32 }, + ); + match result { + Ok(output_vec) => { + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) }; + output_slice.clone_from_slice(&output_vec); + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +#[no_mangle] +pub extern "C" fn predict_sing_f0_forward( length: i64, - f0_discrete: *mut f32, phoneme: *mut i64, + note: *mut i64, speaker_id: *mut i64, output: *mut f32, ) -> bool { - let result = lock_internal().predict_contour( - length as usize, - unsafe { std::slice::from_raw_parts(f0_discrete, length as usize) }, - unsafe { std::slice::from_raw_parts(phoneme, length as usize) }, + let length = length as usize; + let result = lock_internal().predict_sing_f0( + unsafe { std::slice::from_raw_parts(phoneme, length) }, + unsafe { std::slice::from_raw_parts(note, length) }, unsafe { *speaker_id as u32 }, ); match result { Ok(output_vec) => { - let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) }; + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) }; output_slice.clone_from_slice(&output_vec); true } @@ -155,21 +215,48 @@ pub extern "C" fn yukarin_sosf_forward( } #[no_mangle] -pub extern "C" fn decode_forward( +pub extern "C" fn predict_sing_volume_forward( length: i64, - phoneme_size: i64, + phoneme: *mut i64, + note: *mut i64, f0: *mut f32, - phoneme: *mut f32, speaker_id: *mut i64, output: *mut f32, ) -> bool { let length = length as usize; - let phoneme_size = phoneme_size as usize; - let result = lock_internal().decode( - length, - phoneme_size, + let result = lock_internal().predict_sing_volume( + unsafe { std::slice::from_raw_parts(phoneme, length) }, + unsafe { std::slice::from_raw_parts(note, length) }, unsafe { std::slice::from_raw_parts(f0, length) }, - unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) }, + unsafe { *speaker_id as u32 }, + ); + match result { + Ok(output_vec) => { + let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) }; + output_slice.clone_from_slice(&output_vec); + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +#[no_mangle] +pub extern "C" fn sf_decode_forward( + length: i64, + phoneme: *mut i64, + f0: *mut f32, + volume: *mut f32, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + let length = length as usize; + let result = lock_internal().sf_decode( + unsafe { std::slice::from_raw_parts(phoneme, length) }, + unsafe { std::slice::from_raw_parts(f0, length) }, + unsafe { std::slice::from_raw_parts(volume, length) }, unsafe { *speaker_id as u32 }, ); match result {