diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs
index 5dbb333a9..9c2675209 100644
--- a/crates/voicevox_core/src/publish.rs
+++ b/crates/voicevox_core/src/publish.rs
@@ -140,21 +140,6 @@ impl VoicevoxCore {
         )
     }
 
-    pub fn predict_contour(
-        &mut self,
-        length: usize,
-        f0_discrete: &[f32],
-        phoneme_vector: &[i64],
-        speaker_id: u32,
-    ) -> Result<Vec<f32>> {
-        self.synthesis_engine.inference_core_mut().predict_contour(
-            length,
-            f0_discrete,
-            phoneme_vector,
-            speaker_id,
-        )
-    }
-
     pub fn decode(
         &mut self,
         length: usize,
@@ -172,6 +157,53 @@ impl VoicevoxCore {
         )
     }
 
+    pub fn predict_sing_consonant_length(
+        &mut self,
+        consonant: &[i64],
+        vowel: &[i64],
+        note_duration: &[i64],
+        speaker_id: u32,
+    ) -> Result<Vec<i64>> {
+        self.synthesis_engine
+            .inference_core_mut()
+            .predict_sing_consonant_length(consonant, vowel, note_duration, speaker_id)
+    }
+
+    pub fn predict_sing_f0(
+        &mut self,
+        phoneme: &[i64],
+        note: &[i64],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        self.synthesis_engine
+            .inference_core_mut()
+            .predict_sing_f0(phoneme, note, speaker_id)
+    }
+
+    pub fn predict_sing_volume(
+        &mut self,
+        phoneme: &[i64],
+        note: &[i64],
+        f0: &[f32],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        self.synthesis_engine
+            .inference_core_mut()
+            .predict_sing_volume(phoneme, note, f0, speaker_id)
+    }
+
+    pub fn sf_decode(
+        &mut self,
+        phoneme: &[i64],
+        f0: &[f32],
+        volume: &[f32],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        self.synthesis_engine
+            .inference_core_mut()
+            .sf_decode(phoneme, f0, volume, speaker_id)
+    }
+
     pub fn audio_query(
         &mut self,
         text: &str,
@@ -359,8 +391,14 @@ impl InferenceCore {
         status.load_metas()?;
 
         if load_all_models {
-            for model_index in 0..MODEL_FILE_SET.models_count() {
-                status.load_model(model_index)?;
+            for model_index in 0..MODEL_FILE_SET.talk_models_count() {
+                status.load_talk_model(model_index)?;
+            }
+            for model_index in 0..MODEL_FILE_SET.sing_teacher_models_count() {
+                status.load_sing_teacher_model(model_index)?;
+            }
+            for model_index in 0..MODEL_FILE_SET.sf_decode_models_count() {
+                status.load_sf_decode_model(model_index)?;
             }
         }
 
@@ -388,10 +426,28 @@ impl InferenceCore {
                 .status_option
                 .as_mut()
                 .ok_or(Error::UninitializedStatus)?;
-            if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) {
-                status.load_model(model_index)
+            if let Some((model_index, _)) = get_talk_model_index_and_speaker_id(speaker_id) {
+                status.load_talk_model(model_index)
             } else {
-                Err(Error::InvalidSpeakerId { speaker_id })
+                // Either the humming (sing teacher) model or the song (sf_decode) model may be
+                // missing, so tolerate whichever one is absent.
+                let mut loaded = false;
+                if let Some((model_index, _)) =
+                    get_sing_teacher_model_index_and_speaker_id(speaker_id)
+                {
+                    status.load_sing_teacher_model(model_index)?;
+                    loaded = true;
+                }
+                if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id)
+                {
+                    status.load_sf_decode_model(model_index)?;
+                    loaded = true;
+                }
+
+                if loaded {
+                    Ok(())
+                } else {
+                    Err(Error::InvalidSpeakerId { speaker_id })
+                }
             }
         } else {
             Err(Error::UninitializedStatus)
@@ -399,10 +455,21 @@ impl InferenceCore {
     }
 
     pub fn is_model_loaded(&self, speaker_id: u32) -> bool {
         if let Some(status) = self.status_option.as_ref() {
-            if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) {
-                status.is_model_loaded(model_index)
+            if let Some((model_index, _)) = get_talk_model_index_and_speaker_id(speaker_id) {
+                status.is_talk_model_loaded(model_index)
             } else {
-                false
+                // Either the humming (sing teacher) model or the song (sf_decode) model may be
+                // missing, so tolerate whichever one is absent.
+                let mut loaded = false;
+                if let Some((model_index, _)) =
+                    get_sing_teacher_model_index_and_speaker_id(speaker_id)
+                {
+                    loaded |= status.is_sing_teacher_model_loaded(model_index);
+                }
+                if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id)
+                {
+                    loaded |= status.is_sf_decode_model_loaded(model_index);
+                }
+                loaded
             }
         } else {
             false
@@ -431,14 +498,15 @@ impl InferenceCore {
             return Err(Error::InvalidSpeakerId { speaker_id });
         }
 
-        let (model_index, speaker_id) =
-            if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) {
-                (model_index, speaker_id)
-            } else {
-                return Err(Error::InvalidSpeakerId { speaker_id });
-            };
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_talk_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
 
-        if model_index >= MODEL_FILE_SET.models_count() {
+        if model_index >= MODEL_FILE_SET.talk_models_count() {
             return Err(Error::InvalidModelIndex { model_index });
         }
 
@@ -484,14 +552,15 @@ impl InferenceCore {
             return Err(Error::InvalidSpeakerId { speaker_id });
         }
 
-        let (model_index, speaker_id) =
-            if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) {
-                (model_index, speaker_id)
-            } else {
-                return Err(Error::InvalidSpeakerId { speaker_id });
-            };
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_talk_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
 
-        if model_index >= MODEL_FILE_SET.models_count() {
+        if model_index >= MODEL_FILE_SET.talk_models_count() {
             return Err(Error::InvalidModelIndex { model_index });
         }
 
@@ -521,59 +590,6 @@ impl InferenceCore {
         status.predict_intonation_session_run(model_index, input_tensors)
     }
 
-    pub fn predict_contour(
-        &mut self,
-        length: usize,
-        f0_discrete: &[f32],
-        phoneme_vector: &[i64],
-        speaker_id: u32,
-    ) -> Result<Vec<f32>> {
-        if !self.initialized {
-            return Err(Error::UninitializedStatus);
-        }
-
-        let status = self
-            .status_option
-            .as_mut()
-            .ok_or(Error::UninitializedStatus)?;
-
-        if !status.validate_speaker_id(speaker_id) {
-            return Err(Error::InvalidSpeakerId { speaker_id });
-        }
-
-        let (model_index, speaker_id) =
-            if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) {
-                (model_index, speaker_id)
-            } else {
-                return Err(Error::InvalidSpeakerId { speaker_id });
-            };
-
-        if model_index >= MODEL_FILE_SET.models_count() {
-            return Err(Error::InvalidModelIndex { model_index });
-        }
-
-        let mut f0_discrete_array =
-            NdArray::new(ndarray::arr1(f0_discrete).into_shape([length, 1]).unwrap());
-        let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector));
-        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
-
-        let input_tensors: Vec<&mut dyn AnyArray> = vec![
-            &mut f0_discrete_array,
-            &mut phoneme_vector_array,
-            &mut speaker_id_array,
-        ];
-
-        let (mut f0_contour, voiced) =
-            status.predict_contour_session_run(model_index, input_tensors)?;
-        for (f0_contour_item, voiced_item) in f0_contour.iter_mut().zip(voiced.iter()) {
-            if *voiced_item < 0.0 {
-                *f0_contour_item = 0.0;
-            }
-        }
-
-        Ok(f0_contour)
-    }
-
     pub fn decode(
         &mut self,
         length: usize,
@@ -595,14 +611,15 @@ impl InferenceCore {
             return Err(Error::InvalidSpeakerId { speaker_id });
         }
 
-        let (model_index, speaker_id) =
-            if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) {
-                (model_index, speaker_id)
-            } else {
-                return Err(Error::InvalidSpeakerId { speaker_id });
-            };
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_talk_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
 
-        if model_index >= MODEL_FILE_SET.models_count() {
+        if model_index >= MODEL_FILE_SET.talk_models_count() {
             return Err(Error::InvalidModelIndex { model_index });
         }
 
@@ -642,6 +659,184 @@ impl InferenceCore {
             .map(|output| Self::trim_padding_from_output(output, padding_size))
     }
 
+    pub fn predict_sing_consonant_length(
+        &mut self,
+        consonant: &[i64],
+        vowel: &[i64],
+        note_duration: &[i64],
+        speaker_id: u32,
+    ) -> Result<Vec<i64>> {
+        if !self.initialized {
+            return Err(Error::UninitializedStatus);
+        }
+
+        let status = self
+            .status_option
+            .as_mut()
+            .ok_or(Error::UninitializedStatus)?;
+
+        if !status.validate_speaker_id(speaker_id) {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        }
+
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_sing_teacher_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
+
+        if model_index >= MODEL_FILE_SET.sing_teacher_models_count() {
+            return Err(Error::InvalidModelIndex { model_index });
+        }
+
+        let mut consonant_array = NdArray::new(ndarray::arr1(consonant));
+        let mut vowel_array = NdArray::new(ndarray::arr1(vowel));
+        let mut note_duration_array = NdArray::new(ndarray::arr1(note_duration));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> = vec![
+            &mut consonant_array,
+            &mut vowel_array,
+            &mut note_duration_array,
+            &mut speaker_id_array,
+        ];
+
+        status.predict_sing_consonant_length_session_run(model_index, input_tensors)
+    }
+
+    pub fn predict_sing_f0(
+        &mut self,
+        phoneme: &[i64],
+        note: &[i64],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        if !self.initialized {
+            return Err(Error::UninitializedStatus);
+        }
+
+        let status = self
+            .status_option
+            .as_mut()
+            .ok_or(Error::UninitializedStatus)?;
+
+        if !status.validate_speaker_id(speaker_id) {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        }
+
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_sing_teacher_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
+
+        if model_index >= MODEL_FILE_SET.sing_teacher_models_count() {
+            return Err(Error::InvalidModelIndex { model_index });
+        }
+
+        let mut phoneme_array = NdArray::new(ndarray::arr1(phoneme));
+        let mut note_array = NdArray::new(ndarray::arr1(note));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> =
+            vec![&mut phoneme_array, &mut note_array, &mut speaker_id_array];
+
+        status.predict_sing_f0_session_run(model_index, input_tensors)
+    }
+
+    pub fn predict_sing_volume(
+        &mut self,
+        phoneme: &[i64],
+        note: &[i64],
+        _f0: &[f32],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        if !self.initialized {
+            return Err(Error::UninitializedStatus);
+        }
+
+        let status = self
+            .status_option
+            .as_mut()
+            .ok_or(Error::UninitializedStatus)?;
+
+        if !status.validate_speaker_id(speaker_id) {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        }
+
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_sing_teacher_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
+
+        if model_index >= MODEL_FILE_SET.sing_teacher_models_count() {
+            return Err(Error::InvalidModelIndex { model_index });
+        }
+
+        // TODO: use f0
+        let mut phoneme_array = NdArray::new(ndarray::arr1(phoneme));
+        let mut note_array = NdArray::new(ndarray::arr1(note));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> =
+            vec![&mut phoneme_array, &mut note_array, &mut speaker_id_array];
+
+        status.predict_sing_volume_session_run(model_index, input_tensors)
+    }
+
+    pub fn sf_decode(
+        &mut self,
+        phoneme: &[i64],
+        f0: &[f32],
+        volume: &[f32],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        if !self.initialized {
+            return Err(Error::UninitializedStatus);
+        }
+
+        let status = self
+            .status_option
+            .as_mut()
+            .ok_or(Error::UninitializedStatus)?;
+
+        if !status.validate_speaker_id(speaker_id) {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        }
+
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_sf_decode_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
+
+        if model_index >= MODEL_FILE_SET.sf_decode_models_count() {
+            return Err(Error::InvalidModelIndex { model_index });
+        }
+
+        let mut phoneme_array = NdArray::new(ndarray::arr1(phoneme));
+        let mut f0_array = NdArray::new(ndarray::arr1(f0));
+        let mut volume_array = NdArray::new(ndarray::arr1(volume));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> = vec![
+            &mut phoneme_array,
+            &mut f0_array,
+            &mut volume_array,
+            &mut speaker_id_array,
+        ];
+
+        status.sf_decode_session_run(model_index, input_tensors)
+    }
+
     fn make_f0_with_padding(
         f0_slice: &[f32],
         length_with_padding: usize,
@@ -705,8 +900,22 @@ pub static SUPPORTED_DEVICES: Lazy<SupportedDevices> =
 pub static SUPPORTED_DEVICES_CSTRING: Lazy<CString> =
     Lazy::new(|| CString::new(SUPPORTED_DEVICES.to_json().to_string()).unwrap());
 
-fn get_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> {
-    MODEL_FILE_SET.speaker_id_map.get(&speaker_id).copied()
+fn get_talk_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> {
+    MODEL_FILE_SET.talk_speaker_id_map.get(&speaker_id).copied()
+}
+
+fn get_sing_teacher_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> {
+    MODEL_FILE_SET
+        .sing_teacher_speaker_id_map
+        .get(&speaker_id)
+        .copied()
+}
+
+fn get_sf_decode_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> {
+    MODEL_FILE_SET
+        .sf_decode_speaker_id_map
+        .get(&speaker_id)
+        .copied()
 }
 
 pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str {
@@ -919,7 +1128,7 @@ mod tests {
         #[case] speaker_id: u32,
        #[case] expected: Option<(usize, u32)>,
     ) {
-        let actual = get_model_index_and_speaker_id(speaker_id);
+        let actual = get_talk_model_index_and_speaker_id(speaker_id);
 
         assert_eq!(expected, actual);
     }
@@ -987,43 +1196,7 @@ mod tests {
         assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len());
     }
 
-    #[rstest]
-    fn predict_contour_works() {
-        let internal = VoicevoxCore::new_with_mutex();
-        internal
-            .lock()
-            .unwrap()
-            .initialize(InitializeOptions {
-                load_all_models: true,
-                acceleration_mode: AccelerationMode::Cpu,
-                ..Default::default()
-            })
-            .unwrap();
-
-        // Input corresponding to the text "テスト"
-        const F0_LENGTH: usize = 69;
-        let mut f0_discrete = [0.; F0_LENGTH];
-        f0_discrete[9..24].fill(5.905218);
-        f0_discrete[37..60].fill(5.565851);
-
-        let mut phoneme = [0; F0_LENGTH];
-        phoneme[0..9].fill(0);
-        phoneme[9..13].fill(37);
-        phoneme[13..24].fill(14);
-        phoneme[24..30].fill(35);
-        phoneme[30..37].fill(6);
-        phoneme[37..45].fill(37);
-        phoneme[45..60].fill(30);
-        phoneme[60..69].fill(0);
-
-        let result = internal
-            .lock()
-            .unwrap()
-            .predict_contour(F0_LENGTH, &f0_discrete, &phoneme, 2);
-
-        assert!(result.is_ok(), "{result:?}");
-        assert_eq!(result.unwrap().len(), F0_LENGTH);
-    }
+    // TODO: add tests for the sing APIs
 
     #[rstest]
     fn decode_works() {
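Reviewer note: the four new public methods are meant to be chained. Below is a minimal, untested sketch of that flow, modeled on the removed `predict_contour_works` test; the phoneme/note IDs and the speaker ID are made-up placeholders, and the imports assume the same crate-root re-exports the existing tests use (`VoicevoxCore`, `InitializeOptions`, `AccelerationMode`). `predict_sing_consonant_length` sits upstream of this, turning per-note consonant/vowel/duration triples into consonant lengths before the note sequence is expanded to frames.

```rust
use voicevox_core::{AccelerationMode, InitializeOptions, VoicevoxCore};

fn sing_sketch() -> Vec<f32> {
    let internal = VoicevoxCore::new_with_mutex();
    let mut core = internal.lock().unwrap();
    core.initialize(InitializeOptions {
        load_all_models: true,
        acceleration_mode: AccelerationMode::Cpu,
        ..Default::default()
    })
    .unwrap();

    // Placeholder frame-level inputs: one sung phoneme between silences.
    // Real phoneme and note numbers depend on the actual model files.
    let phoneme = [0i64, 30, 30, 30, 0];
    let note = [0i64, 60, 60, 60, 0];
    let speaker_id = 0;

    // Sing-teacher models: pitch first, then volume conditioned on that pitch
    // (the f0 argument is accepted but still unused; see the TODO above).
    let f0 = core.predict_sing_f0(&phoneme, &note, speaker_id).unwrap();
    let volume = core
        .predict_sing_volume(&phoneme, &note, &f0, speaker_id)
        .unwrap();

    // The source-filter (sf) decoder renders the frame-level features to audio.
    core.sf_decode(&phoneme, &f0, &volume, speaker_id).unwrap()
}
```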
diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs
index 81bd74509..d6a559947 100644
--- a/crates/voicevox_core/src/status.rs
+++ b/crates/voicevox_core/src/status.rs
@@ -31,19 +31,30 @@ pub(crate) static MODEL_FILE_SET: Lazy<ModelFileSet> = Lazy::new(|| {
 });
 
 pub struct Status {
-    models: StatusModels,
+    talk_models: StatusTalkModels,
+    sing_teacher_models: StatusSingTeacherModels,
+    sf_decode_models: StatusSfModels,
     light_session_options: SessionOptions, // used for the light models
     heavy_session_options: SessionOptions, // used for the heavy models
     supported_styles: BTreeSet<u32>,
 }
 
-struct StatusModels {
+struct StatusTalkModels {
     predict_duration: BTreeMap<usize, Session<'static>>,
     predict_intonation: BTreeMap<usize, Session<'static>>,
-    predict_contour: BTreeMap<usize, Option<Session<'static>>>,
     decode: BTreeMap<usize, Session<'static>>,
 }
 
+struct StatusSingTeacherModels {
+    predict_sing_consonant_length: BTreeMap<usize, Session<'static>>,
+    predict_sing_f0: BTreeMap<usize, Session<'static>>,
+    predict_sing_volume: BTreeMap<usize, Session<'static>>,
+}
+
+struct StatusSfModels {
+    sf_decode: BTreeMap<usize, Session<'static>>,
+}
+
 #[derive(new, Getters)]
 struct SessionOptions {
     cpu_num_threads: u16,
@@ -51,9 +62,13 @@ struct SessionOptions {
 }
 
 pub(crate) struct ModelFileSet {
-    pub(crate) speaker_id_map: BTreeMap<u32, (usize, u32)>,
+    pub(crate) talk_speaker_id_map: BTreeMap<u32, (usize, u32)>,
+    pub(crate) sing_teacher_speaker_id_map: BTreeMap<u32, (usize, u32)>,
+    pub(crate) sf_decode_speaker_id_map: BTreeMap<u32, (usize, u32)>,
     pub(crate) metas_str: String,
-    models: Vec<Model>,
+    talk_models: Vec<TalkModel>,
+    sing_teacher_models: Vec<SingTeacherModel>,
+    sf_decode_models: Vec<SfDecodeModel>,
 }
 
 impl ModelFileSet {
@@ -77,63 +92,124 @@ impl ModelFileSet {
 
         let metas_str = fs_err::read_to_string(path("metas.json"))?;
 
-        let models = model_file::MODEL_FILE_NAMES
+        let talk_models = model_file::TALK_MODEL_FILE_NAMES
             .iter()
             .map(
-                |&ModelFileNames {
+                |&TalkModelFileNames {
                      predict_duration_model,
                      predict_intonation_model,
-                     predict_contour_model,
                      decode_model,
                  }| {
                     let predict_duration_model = ModelFile::new(&path(predict_duration_model))?;
                     let predict_intonation_model =
                         ModelFile::new(&path(predict_intonation_model))?;
-                    let predict_contour_model = predict_contour_model
-                        .map(|s| ModelFile::new(&path(s)))
-                        .transpose()?;
                     let decode_model = ModelFile::new(&path(decode_model))?;
-                    Ok(Model {
+                    Ok(TalkModel {
                         predict_duration_model,
                         predict_intonation_model,
-                        predict_contour_model,
                         decode_model,
                     })
                 },
             )
             .collect::<anyhow::Result<_>>()?;
 
+        let sing_teacher_models = model_file::SING_TEACHER_MODEL_FILE_NAMES
+            .iter()
+            .map(
+                |&SingTeacherModelFileNames {
+                     predict_sing_consonant_length_model,
+                     predict_sing_f0_model,
+                     predict_sing_volume_model,
+                 }| {
+                    let predict_sing_consonant_length_model =
+                        ModelFile::new(&path(predict_sing_consonant_length_model))?;
+                    let predict_sing_f0_model = ModelFile::new(&path(predict_sing_f0_model))?;
+                    let predict_sing_volume_model =
+                        ModelFile::new(&path(predict_sing_volume_model))?;
+                    Ok(SingTeacherModel {
+                        predict_sing_consonant_length_model,
+                        predict_sing_f0_model,
+                        predict_sing_volume_model,
+                    })
+                },
+            )
+            .collect::<anyhow::Result<_>>()?;
+
+        let sf_decode_models = model_file::SF_DECODE_MODEL_FILE_NAMES
+            .iter()
+            .map(|&SfDecodeModelFileNames { sf_decode_model }| {
+                let sf_decode_model = ModelFile::new(&path(sf_decode_model))?;
+                Ok(SfDecodeModel { sf_decode_model })
+            })
+            .collect::<anyhow::Result<_>>()?;
+
         return Ok(Self {
-            speaker_id_map: model_file::SPEAKER_ID_MAP.iter().copied().collect(),
+            talk_speaker_id_map: model_file::TALK_SPEAKER_ID_MAP.iter().copied().collect(),
+            sing_teacher_speaker_id_map: model_file::SING_TEACHER_SPEAKER_ID_MAP
+                .iter()
+                .copied()
+                .collect(),
+            sf_decode_speaker_id_map: model_file::SF_DECODE_SPEAKER_ID_MAP
+                .iter()
+                .copied()
+                .collect(),
             metas_str,
-            models,
+            talk_models,
+            sing_teacher_models,
+            sf_decode_models,
         });
 
         const ROOT_DIR_ENV_NAME: &str = "VV_MODELS_ROOT_DIR";
     }
 
-    pub(crate) fn models_count(&self) -> usize {
-        self.models.len()
+    pub(crate) fn talk_models_count(&self) -> usize {
+        self.talk_models.len()
+    }
+
+    pub(crate) fn sing_teacher_models_count(&self) -> usize {
+        self.sing_teacher_models.len()
+    }
+
+    pub(crate) fn sf_decode_models_count(&self) -> usize {
+        self.sf_decode_models.len()
     }
 }
 
-struct ModelFileNames {
+struct TalkModelFileNames {
     predict_duration_model: &'static str,
     predict_intonation_model: &'static str,
-    predict_contour_model: Option<&'static str>,
     decode_model: &'static str,
 }
 
+struct SingTeacherModelFileNames {
+    predict_sing_consonant_length_model: &'static str,
+    predict_sing_f0_model: &'static str,
+    predict_sing_volume_model: &'static str,
+}
+
+struct SfDecodeModelFileNames {
+    sf_decode_model: &'static str,
+}
+
 #[derive(thiserror::Error, Debug)]
 #[error("不正なモデルファイルです")]
 struct DecryptModelError;
 
-struct Model {
+struct TalkModel {
     predict_duration_model: ModelFile,
     predict_intonation_model: ModelFile,
-    predict_contour_model: Option<ModelFile>,
     decode_model: ModelFile,
 }
 
+struct SingTeacherModel {
+    predict_sing_consonant_length_model: ModelFile,
+    predict_sing_f0_model: ModelFile,
+    predict_sing_volume_model: ModelFile,
+}
+
+struct SfDecodeModel {
+    sf_decode_model: ModelFile,
+}
+
 struct ModelFile {
     path: PathBuf,
     content: Vec<u8>,
@@ -213,12 +289,19 @@ unsafe impl Send for Status {}
 
 impl Status {
     pub fn new(use_gpu: bool, cpu_num_threads: u16) -> Self {
         Self {
-            models: StatusModels {
+            talk_models: StatusTalkModels {
                 predict_duration: BTreeMap::new(),
                 predict_intonation: BTreeMap::new(),
-                predict_contour: BTreeMap::new(),
                 decode: BTreeMap::new(),
             },
+            sing_teacher_models: StatusSingTeacherModels {
+                predict_sing_consonant_length: BTreeMap::new(),
+                predict_sing_f0: BTreeMap::new(),
+                predict_sing_volume: BTreeMap::new(),
+            },
+            sf_decode_models: StatusSfModels {
+                sf_decode: BTreeMap::new(),
+            },
             light_session_options: SessionOptions::new(cpu_num_threads, false),
             heavy_session_options: SessionOptions::new(cpu_num_threads, use_gpu),
             supported_styles: BTreeSet::default(),
@@ -238,32 +321,63 @@ impl Status {
         Ok(())
     }
 
-    pub fn load_model(&mut self, model_index: usize) -> Result<()> {
-        if model_index < MODEL_FILE_SET.models.len() {
-            let model = &MODEL_FILE_SET.models[model_index];
+    pub fn load_talk_model(&mut self, model_index: usize) -> Result<()> {
+        if model_index < MODEL_FILE_SET.talk_models.len() {
+            let model = &MODEL_FILE_SET.talk_models[model_index];
             let predict_duration_session =
                 self.new_session(&model.predict_duration_model, &self.light_session_options)?;
             let predict_intonation_session =
                 self.new_session(&model.predict_intonation_model, &self.light_session_options)?;
-            let predict_contour_session = if let Some(model) = &model.predict_contour_model {
-                Some(self.new_session(model, &self.light_session_options)?)
-            } else {
-                None
-            };
             let decode_model =
                 self.new_session(&model.decode_model, &self.heavy_session_options)?;
 
-            self.models
+            self.talk_models
                 .predict_duration
                 .insert(model_index, predict_duration_session);
-            self.models
+            self.talk_models
                 .predict_intonation
                 .insert(model_index, predict_intonation_session);
-            self.models
-                .predict_contour
-                .insert(model_index, predict_contour_session);
-            self.models.decode.insert(model_index, decode_model);
+            self.talk_models.decode.insert(model_index, decode_model);
+
+            Ok(())
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn is_talk_model_loaded(&self, model_index: usize) -> bool {
+        self.talk_models.predict_duration.contains_key(&model_index)
+            && self
+                .talk_models
+                .predict_intonation
+                .contains_key(&model_index)
+            && self.talk_models.decode.contains_key(&model_index)
+    }
+
+    pub fn load_sing_teacher_model(&mut self, model_index: usize) -> Result<()> {
+        if model_index < MODEL_FILE_SET.sing_teacher_models.len() {
+            let model = &MODEL_FILE_SET.sing_teacher_models[model_index];
+            let predict_sing_consonant_length_session = self.new_session(
+                &model.predict_sing_consonant_length_model,
+                &self.light_session_options,
+            )?;
+            let predict_sing_f0_session =
+                self.new_session(&model.predict_sing_f0_model, &self.light_session_options)?;
+            let predict_sing_volume_session = self.new_session(
+                &model.predict_sing_volume_model,
+                &self.light_session_options,
+            )?;
+
+            self.sing_teacher_models
+                .predict_sing_consonant_length
+                .insert(model_index, predict_sing_consonant_length_session);
+            self.sing_teacher_models
+                .predict_sing_f0
+                .insert(model_index, predict_sing_f0_session);
+            self.sing_teacher_models
+                .predict_sing_volume
+                .insert(model_index, predict_sing_volume_session);
 
             Ok(())
         } else {
@@ -271,11 +385,38 @@
         }
     }
 
-    pub fn is_model_loaded(&self, model_index: usize) -> bool {
-        self.models.predict_duration.contains_key(&model_index)
-            && self.models.predict_intonation.contains_key(&model_index)
-            && self.models.predict_contour.contains_key(&model_index)
-            && self.models.decode.contains_key(&model_index)
+    pub fn is_sing_teacher_model_loaded(&self, model_index: usize) -> bool {
+        self.sing_teacher_models
+            .predict_sing_consonant_length
+            .contains_key(&model_index)
+            && self
+                .sing_teacher_models
+                .predict_sing_f0
+                .contains_key(&model_index)
+            && self
+                .sing_teacher_models
+                .predict_sing_volume
+                .contains_key(&model_index)
+    }
+
+    pub fn load_sf_decode_model(&mut self, model_index: usize) -> Result<()> {
+        if model_index < MODEL_FILE_SET.sf_decode_models.len() {
+            let model = &MODEL_FILE_SET.sf_decode_models[model_index];
+            let sf_decode_session =
+                self.new_session(&model.sf_decode_model, &self.heavy_session_options)?;
+
+            self.sf_decode_models
+                .sf_decode
+                .insert(model_index, sf_decode_session);
+
+            Ok(())
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn is_sf_decode_model_loaded(&self, model_index: usize) -> bool {
+        self.sf_decode_models.sf_decode.contains_key(&model_index)
     }
 
     fn new_session(
@@ -329,7 +470,7 @@
         model_index: usize,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model) = self.models.predict_duration.get_mut(&model_index) {
+        if let Some(model) = self.talk_models.predict_duration.get_mut(&model_index) {
             if let Ok(output_tensors) = model.run(inputs) {
                 Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
@@ -345,7 +486,7 @@
         model_index: usize,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model) = self.models.predict_intonation.get_mut(&model_index) {
+        if let Some(model) = self.talk_models.predict_intonation.get_mut(&model_index) {
             if let Ok(output_tensors) = model.run(inputs) {
                 Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
@@ -356,35 +497,88 @@
         }
     }
 
-    pub fn predict_contour_session_run(
+    pub fn decode_session_run(
         &mut self,
         model_index: usize,
         inputs: Vec<&mut dyn AnyArray>,
-    ) -> Result<(Vec<f32>, Vec<f32>)> {
-        if let Some(model) = self.models.predict_contour.get_mut(&model_index) {
-            if let Some(model) = model {
-                if let Ok(output_tensors) = model.run(inputs) {
-                    Ok((
-                        output_tensors[0].as_slice().unwrap().to_owned(),
-                        output_tensors[1].as_slice().unwrap().to_owned(),
-                    ))
-                } else {
-                    Err(Error::InferenceFailed)
-                }
+    ) -> Result<Vec<f32>> {
+        if let Some(model) = self.talk_models.decode.get_mut(&model_index) {
+            if let Ok(output_tensors) = model.run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
-                Err(Error::UnsupportedModel)
+                Err(Error::InferenceFailed)
             }
         } else {
             Err(Error::InvalidModelIndex { model_index })
         }
     }
 
-    pub fn decode_session_run(
+    pub fn predict_sing_consonant_length_session_run(
+        &mut self,
+        model_index: usize,
+        inputs: Vec<&mut dyn AnyArray>,
+    ) -> Result<Vec<i64>> {
+        if let Some(model) = self
+            .sing_teacher_models
+            .predict_sing_consonant_length
+            .get_mut(&model_index)
+        {
+            if let Ok(output_tensors) = model.run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
+            } else {
+                Err(Error::InferenceFailed)
+            }
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn predict_sing_f0_session_run(
+        &mut self,
+        model_index: usize,
+        inputs: Vec<&mut dyn AnyArray>,
+    ) -> Result<Vec<f32>> {
+        if let Some(model) = self
+            .sing_teacher_models
+            .predict_sing_f0
+            .get_mut(&model_index)
+        {
+            if let Ok(output_tensors) = model.run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
+            } else {
+                Err(Error::InferenceFailed)
+            }
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn predict_sing_volume_session_run(
+        &mut self,
+        model_index: usize,
+        inputs: Vec<&mut dyn AnyArray>,
+    ) -> Result<Vec<f32>> {
+        if let Some(model) = self
+            .sing_teacher_models
+            .predict_sing_volume
+            .get_mut(&model_index)
+        {
+            if let Ok(output_tensors) = model.run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
+            } else {
+                Err(Error::InferenceFailed)
+            }
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn sf_decode_session_run(
         &mut self,
         model_index: usize,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model) = self.models.decode.get_mut(&model_index) {
+        if let Some(model) = self.sf_decode_models.sf_decode.get_mut(&model_index) {
             if let Ok(output_tensors) = model.run(inputs) {
                 Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
@@ -423,10 +617,9 @@ mod tests {
             cpu_num_threads,
             status.heavy_session_options.cpu_num_threads
         );
-        assert!(status.models.predict_duration.is_empty());
-        assert!(status.models.predict_intonation.is_empty());
-        assert!(status.models.predict_contour.is_empty());
-        assert!(status.models.decode.is_empty());
+        assert!(status.talk_models.predict_duration.is_empty());
+        assert!(status.talk_models.predict_intonation.is_empty());
+        assert!(status.talk_models.decode.is_empty());
         assert!(status.supported_styles.is_empty());
     }
 
@@ -447,29 +640,30 @@ mod tests {
     }
 
     #[rstest]
-    fn status_load_model_works() {
+    fn status_load_talk_model_works() {
        let mut status = Status::new(false, 0);
-        let result = status.load_model(0);
+        let result = status.load_talk_model(0);
         assert_debug_fmt_eq!(Ok(()), result);
-        assert_eq!(1, status.models.predict_duration.len());
-        assert_eq!(1, status.models.predict_intonation.len());
-        assert_eq!(1, status.models.predict_contour.len());
-        assert_eq!(1, status.models.decode.len());
+        assert_eq!(1, status.talk_models.predict_duration.len());
+        assert_eq!(1, status.talk_models.predict_intonation.len());
+        assert_eq!(1, status.talk_models.decode.len());
     }
 
     #[rstest]
-    fn status_is_model_loaded_works() {
+    fn status_is_talk_model_loaded_works() {
         let mut status = Status::new(false, 0);
         let model_index = 0;
         assert!(
-            !status.is_model_loaded(model_index),
+            !status.is_talk_model_loaded(model_index),
             "model should not be loaded"
         );
-        let result = status.load_model(model_index);
+        let result = status.load_talk_model(model_index);
         assert_debug_fmt_eq!(Ok(()), result);
         assert!(
-            status.is_model_loaded(model_index),
+            status.is_talk_model_loaded(model_index),
             "model should be loaded"
        );
     }
+
+    // TODO: add tests for the sing models
 }
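Reviewer note: the `// TODO` above could be filled in along the same lines as `status_load_talk_model_works`. A rough, untested sketch follows; it assumes the same `mod tests` context (`rstest`, `assert_debug_fmt_eq!`) and relies on the placeholder model files currently listed in model_file.rs:

```rust
#[rstest]
fn status_load_sing_teacher_model_works() {
    let mut status = Status::new(false, 0);
    let result = status.load_sing_teacher_model(0);
    assert_debug_fmt_eq!(Ok(()), result);
    assert_eq!(
        1,
        status
            .sing_teacher_models
            .predict_sing_consonant_length
            .len()
    );
    assert_eq!(1, status.sing_teacher_models.predict_sing_f0.len());
    assert_eq!(1, status.sing_teacher_models.predict_sing_volume.len());
}

#[rstest]
fn status_load_sf_decode_model_works() {
    let mut status = Status::new(false, 0);
    let result = status.load_sf_decode_model(0);
    assert_debug_fmt_eq!(Ok(()), result);
    assert_eq!(1, status.sf_decode_models.sf_decode.len());
}
```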
diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs
index aa618be76..eb8f8913c 100644
--- a/crates/voicevox_core/src/status/model_file.rs
+++ b/crates/voicevox_core/src/status/model_file.rs
@@ -1,23 +1,32 @@
-use super::{DecryptModelError, ModelFileNames};
+use super::{
+    DecryptModelError, SfDecodeModelFileNames, SingTeacherModelFileNames, TalkModelFileNames,
+};
 
 pub(super) fn decrypt(content: &[u8]) -> std::result::Result<Vec<u8>, DecryptModelError> {
     Ok(content.to_owned())
 }
 
-pub(super) const SPEAKER_ID_MAP: &[(u32, (usize, u32))] =
-    &[(0, (0, 0)), (1, (0, 1)), (2, (1, 0)), (3, (1, 1))];
+pub(super) const TALK_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))];
 
-pub(super) const MODEL_FILE_NAMES: &[ModelFileNames] = &[
-    ModelFileNames {
-        predict_duration_model: "predict_duration-0.onnx",
-        predict_intonation_model: "predict_intonation-0.onnx",
-        predict_contour_model: None,
-        decode_model: "decode-0.onnx",
-    },
-    ModelFileNames {
-        predict_duration_model: "predict_duration-1.onnx",
-        predict_intonation_model: "predict_intonation-1.onnx",
-        predict_contour_model: Some("predict_contour-1.onnx"),
-        decode_model: "decode-1.onnx",
-    },
-];
+pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[TalkModelFileNames {
+    predict_duration_model: "predict_duration-0.onnx",
+    predict_intonation_model: "predict_intonation-0.onnx",
+    decode_model: "decode-0.onnx",
+}];
+
+// TODO: change these
+pub(super) const SING_TEACHER_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))];
+
+pub(super) const SING_TEACHER_MODEL_FILE_NAMES: &[SingTeacherModelFileNames] =
+    &[SingTeacherModelFileNames {
+        predict_sing_consonant_length_model: "predict_duration-1.onnx",
+        predict_sing_f0_model: "predict_intonation-1.onnx",
+        predict_sing_volume_model: "predict_intonation-1.onnx",
+    }];
+
+pub(super) const SF_DECODE_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))];
+
+pub(super) const SF_DECODE_MODEL_FILE_NAMES: &[SfDecodeModelFileNames] =
+    &[SfDecodeModelFileNames {
+        sf_decode_model: "decode-1.onnx",
+    }];
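Reviewer note: each `*_SPEAKER_ID_MAP` entry maps a global speaker ID to `(model index, speaker index inside that model)`; the production code routes these tables through the `BTreeMap`s in `MODEL_FILE_SET`. A standalone illustration of the semantics, with a hypothetical lookup helper that is not part of this PR:

```rust
// Hypothetical helper mirroring what the BTreeMap lookup in
// get_talk_model_index_and_speaker_id does.
fn model_index_and_inner_speaker_id(
    map: &[(u32, (usize, u32))],
    speaker_id: u32,
) -> Option<(usize, u32)> {
    map.iter()
        .find(|&&(id, _)| id == speaker_id)
        .map(|&(_, entry)| entry)
}

fn main() {
    // TALK_SPEAKER_ID_MAP as defined above.
    let talk: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))];
    assert_eq!(Some((0, 0)), model_index_and_inner_speaker_id(talk, 0));
    assert_eq!(Some((0, 1)), model_index_and_inner_speaker_id(talk, 1));
    // Speakers 2 and 3 (the removed second talk model) no longer resolve here.
    assert_eq!(None, model_index_and_inner_speaker_id(talk, 2));
}
```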
diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs
index 02a3b4efb..d919f72f0 100644
--- a/crates/voicevox_core_c_api/src/compatible_engine.rs
+++ b/crates/voicevox_core_c_api/src/compatible_engine.rs
@@ -128,22 +128,82 @@ pub extern "C" fn yukarin_sa_forward(
 }
 
 #[no_mangle]
-pub extern "C" fn yukarin_sosf_forward(
+pub extern "C" fn decode_forward(
+    length: i64,
+    phoneme_size: i64,
+    f0: *mut f32,
+    phoneme: *mut f32,
+    speaker_id: *mut i64,
+    output: *mut f32,
+) -> bool {
+    let length = length as usize;
+    let phoneme_size = phoneme_size as usize;
+    let result = lock_internal().decode(
+        length,
+        phoneme_size,
+        unsafe { std::slice::from_raw_parts(f0, length) },
+        unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) },
+        unsafe { *speaker_id as u32 },
+    );
+    match result {
+        Ok(output_vec) => {
+            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length * 256) };
+            output_slice.clone_from_slice(&output_vec);
+            true
+        }
+        Err(err) => {
+            set_message(&format!("{err}"));
+            false
+        }
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn predict_sing_consonant_length_forward(
+    length: i64,
+    consonant: *mut i64,
+    vowel: *mut i64,
+    note_duration: *mut i64,
+    speaker_id: *mut i64,
+    output: *mut i64,
+) -> bool {
+    let length = length as usize;
+    let result = lock_internal().predict_sing_consonant_length(
+        unsafe { std::slice::from_raw_parts(consonant, length) },
+        unsafe { std::slice::from_raw_parts(vowel, length) },
+        unsafe { std::slice::from_raw_parts(note_duration, length) },
+        unsafe { *speaker_id as u32 },
+    );
+    match result {
+        Ok(output_vec) => {
+            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) };
+            output_slice.clone_from_slice(&output_vec);
+            true
+        }
+        Err(err) => {
+            set_message(&format!("{err}"));
+            false
+        }
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn predict_sing_f0_forward(
     length: i64,
-    f0_discrete: *mut f32,
     phoneme: *mut i64,
+    note: *mut i64,
     speaker_id: *mut i64,
     output: *mut f32,
 ) -> bool {
-    let result = lock_internal().predict_contour(
-        length as usize,
-        unsafe { std::slice::from_raw_parts(f0_discrete, length as usize) },
-        unsafe { std::slice::from_raw_parts(phoneme, length as usize) },
+    let length = length as usize;
+    let result = lock_internal().predict_sing_f0(
+        unsafe { std::slice::from_raw_parts(phoneme, length) },
+        unsafe { std::slice::from_raw_parts(note, length) },
         unsafe { *speaker_id as u32 },
     );
     match result {
         Ok(output_vec) => {
-            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) };
+            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) };
             output_slice.clone_from_slice(&output_vec);
             true
         }
@@ -155,21 +215,48 @@ pub extern "C" fn yukarin_sosf_forward(
 }
 
 #[no_mangle]
-pub extern "C" fn decode_forward(
+pub extern "C" fn predict_sing_volume_forward(
     length: i64,
-    phoneme_size: i64,
+    phoneme: *mut i64,
+    note: *mut i64,
     f0: *mut f32,
-    phoneme: *mut f32,
     speaker_id: *mut i64,
     output: *mut f32,
 ) -> bool {
     let length = length as usize;
-    let phoneme_size = phoneme_size as usize;
-    let result = lock_internal().decode(
-        length,
-        phoneme_size,
+    let result = lock_internal().predict_sing_volume(
+        unsafe { std::slice::from_raw_parts(phoneme, length) },
+        unsafe { std::slice::from_raw_parts(note, length) },
         unsafe { std::slice::from_raw_parts(f0, length) },
-        unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) },
+        unsafe { *speaker_id as u32 },
+    );
+    match result {
+        Ok(output_vec) => {
+            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) };
+            output_slice.clone_from_slice(&output_vec);
+            true
+        }
+        Err(err) => {
+            set_message(&format!("{err}"));
+            false
+        }
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn sf_decode_forward(
+    length: i64,
+    phoneme: *mut i64,
+    f0: *mut f32,
+    volume: *mut f32,
+    speaker_id: *mut i64,
+    output: *mut f32,
+) -> bool {
+    let length = length as usize;
+    let result = lock_internal().sf_decode(
+        unsafe { std::slice::from_raw_parts(phoneme, length) },
+        unsafe { std::slice::from_raw_parts(f0, length) },
+        unsafe { std::slice::from_raw_parts(volume, length) },
         unsafe { *speaker_id as u32 },
     );
     match result {