From aa803881d05dcdb408b3434607713927fc2e7d9c Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 24 Dec 2023 19:19:03 +0900 Subject: [PATCH 01/46] =?UTF-8?q?`to=5Fwav`=E3=82=92=E7=A7=BB=E5=8B=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/synthesizer.rs | 92 ++++++++++++------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 202e917c7..0098f3b37 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -381,52 +381,6 @@ pub(crate) mod blocking { pitch, ) } - - fn to_wav(wave: &[f32], audio_query: &AudioQueryModel) -> Vec { - let volume_scale = *audio_query.volume_scale(); - let output_stereo = *audio_query.output_stereo(); - let output_sampling_rate = *audio_query.output_sampling_rate(); - - // TODO: 44.1kHzなどの対応 - - let num_channels: u16 = if output_stereo { 2 } else { 1 }; - let bit_depth: u16 = 16; - let repeat_count: u32 = - (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; - let block_size: u16 = bit_depth * num_channels / 8; - - let bytes_size = wave.len() as u32 * repeat_count * 2; - let wave_size = bytes_size + 44; - - let buf: Vec = Vec::with_capacity(wave_size as usize); - let mut cur = Cursor::new(buf); - - cur.write_all("RIFF".as_bytes()).unwrap(); - cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); - cur.write_all("WAVEfmt ".as_bytes()).unwrap(); - cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length - cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM - cur.write_all(&num_channels.to_le_bytes()).unwrap(); - cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); - - let block_rate = output_sampling_rate * block_size as u32; - - cur.write_all(&block_rate.to_le_bytes()).unwrap(); - cur.write_all(&block_size.to_le_bytes()).unwrap(); - cur.write_all(&bit_depth.to_le_bytes()).unwrap(); - cur.write_all("data".as_bytes()).unwrap(); - cur.write_all(&bytes_size.to_le_bytes()).unwrap(); - - for value in wave { - let v = (value * volume_scale).clamp(-1., 1.); - let data = (v * 0x7fff as f32) as i16; - for _ in 0..repeat_count { - cur.write_all(&data.to_le_bytes()).unwrap(); - } - } - - cur.into_inner() - } } /// AquesTalk風記法からAccentPhrase (アクセント句)の配列を生成する。 @@ -1207,6 +1161,52 @@ pub(crate) mod blocking { ) } } + + fn to_wav(wave: &[f32], audio_query: &AudioQueryModel) -> Vec { + let volume_scale = *audio_query.volume_scale(); + let output_stereo = *audio_query.output_stereo(); + let output_sampling_rate = *audio_query.output_sampling_rate(); + + // TODO: 44.1kHzなどの対応 + + let num_channels: u16 = if output_stereo { 2 } else { 1 }; + let bit_depth: u16 = 16; + let repeat_count: u32 = + (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; + let block_size: u16 = bit_depth * num_channels / 8; + + let bytes_size = wave.len() as u32 * repeat_count * 2; + let wave_size = bytes_size + 44; + + let buf: Vec = Vec::with_capacity(wave_size as usize); + let mut cur = Cursor::new(buf); + + cur.write_all("RIFF".as_bytes()).unwrap(); + cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); + cur.write_all("WAVEfmt ".as_bytes()).unwrap(); + cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length + cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM + cur.write_all(&num_channels.to_le_bytes()).unwrap(); + cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); + + let block_rate = output_sampling_rate * block_size as u32; + + cur.write_all(&block_rate.to_le_bytes()).unwrap(); + cur.write_all(&block_size.to_le_bytes()).unwrap(); + cur.write_all(&bit_depth.to_le_bytes()).unwrap(); + cur.write_all("data".as_bytes()).unwrap(); + cur.write_all(&bytes_size.to_le_bytes()).unwrap(); + + for value in wave { + let v = (value * volume_scale).clamp(-1., 1.); + let data = (v * 0x7fff as f32) as i16; + for _ in 0..repeat_count { + cur.write_all(&data.to_le_bytes()).unwrap(); + } + } + + cur.into_inner() + } } pub(crate) mod tokio { From 883803a407deab503892cba2a923b976d0f89e77 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 24 Dec 2023 22:21:16 +0900 Subject: [PATCH 02/46] =?UTF-8?q?=E3=83=A2=E3=83=BC=E3=83=95=E3=82=A3?= =?UTF-8?q?=E3=83=B3=E3=82=B0=E6=A9=9F=E8=83=BD=E3=82=92=E8=BF=BD=E5=8A=A0?= =?UTF-8?q?=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 70 +++++- Cargo.toml | 7 + crates/voicevox_core/Cargo.toml | 4 + crates/voicevox_core/src/engine/mod.rs | 4 +- crates/voicevox_core/src/engine/model.rs | 5 + crates/voicevox_core/src/engine/morph.rs | 212 ++++++++++++++++++ crates/voicevox_core/src/error.rs | 9 +- crates/voicevox_core/src/lib.rs | 2 +- crates/voicevox_core/src/metas.rs | 22 ++ crates/voicevox_core/src/synthesizer.rs | 124 ++++++++-- crates/voicevox_core_c_api/src/helpers.rs | 1 + crates/voicevox_core_c_api/src/result_code.rs | 3 + crates/voicevox_core_java_api/src/common.rs | 1 + .../voicevox_core_python_api/src/convert.rs | 1 + 14 files changed, 445 insertions(+), 20 deletions(-) create mode 100644 crates/voicevox_core/src/engine/morph.rs diff --git a/Cargo.lock b/Cargo.lock index 72f8b7cca..5b239fdb8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -374,6 +374,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "az" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973" + [[package]] name = "backtrace" version = "0.3.66" @@ -436,6 +442,28 @@ dependencies = [ "which", ] +[[package]] +name = "bindgen" +version = "0.64.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 1.0.102", + "which", +] + [[package]] name = "binstall-tar" version = "0.4.39" @@ -569,11 +597,12 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.73" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ "jobserver", + "libc", ] [[package]] @@ -2527,7 +2556,7 @@ name = "open_jtalk-sys" version = "0.16.111" source = "git+https://github.com/VOICEVOX/open_jtalk-rs.git?rev=a16714ce16dec76fd0e3041a7acfa484921db3b5#a16714ce16dec76fd0e3041a7acfa484921db3b5" dependencies = [ - "bindgen", + "bindgen 0.60.1", "cmake", "link-cplusplus", ] @@ -3068,6 +3097,17 @@ dependencies = [ "num_cpus", ] +[[package]] +name = "readonly" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8f439da1766942fe069954da6058b2e6c1760eb878bae76f5be9fc29f56f574" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.38", +] + [[package]] name = "redox_syscall" version = "0.2.16" @@ -4344,6 +4384,7 @@ version = "0.0.0" dependencies = [ "anyhow", "async_zip", + "az", "derive-getters", "derive-new", "derive_more", @@ -4359,12 +4400,14 @@ dependencies = [ "itertools 0.10.5", "nanoid", "ndarray", + "num-traits", "once_cell", "onnxruntime", "open_jtalk", "ouroboros", "pretty_assertions", "rayon", + "readonly", "regex", "rstest", "serde", @@ -4377,6 +4420,7 @@ dependencies = [ "uuid", "voicevox_core_macros", "windows", + "world", "zip", ] @@ -4870,6 +4914,26 @@ dependencies = [ "winapi", ] +[[package]] +name = "world" +version = "0.1.0" +source = "git+https://github.com/White-Green/WORLD_rs.git?rev=2337a30bfa47eebd32ef418c60ae5c7b39e43b99#2337a30bfa47eebd32ef418c60ae5c7b39e43b99" +dependencies = [ + "once_cell", + "world_sys", +] + +[[package]] +name = "world_sys" +version = "0.1.0" +source = "git+https://github.com/White-Green/WORLD_rs.git?rev=2337a30bfa47eebd32ef418c60ae5c7b39e43b99#2337a30bfa47eebd32ef418c60ae5c7b39e43b99" +dependencies = [ + "bindgen 0.64.0", + "cc", + "once_cell", + "regex", +] + [[package]] name = "xattr" version = "0.2.3" diff --git a/Cargo.toml b/Cargo.toml index acaa1300b..f4e6e6f22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ anyhow = "1.0.65" assert_cmd = "2.0.8" async-std = "1.12.0" async_zip = "0.0.11" +az = "1.2.1" binstall-tar = "0.4.39" bytes = "1.1.0" cbindgen = "0.24.3" @@ -46,6 +47,7 @@ log = "0.4.17" nanoid = "0.4.0" ndarray = "0.15.6" ndarray-stats = "0.5.1" +num-traits = "0.2.15" octocrab = { version = "0.19.0", default-features = false } once_cell = "1.18.0" ouroboros = "0.18.0" @@ -57,6 +59,7 @@ pyo3-asyncio = "0.19.0" pyo3-log = "0.9.0" quote = "1.0.33" rayon = "1.6.1" +readonly = "0.2.11" regex = "1.10.0" reqwest = { version = "0.11.13", default-features = false } rstest = "0.15.0" @@ -94,6 +97,10 @@ rev = "a16714ce16dec76fd0e3041a7acfa484921db3b5" git = "https://github.com/VOICEVOX/process_path.git" rev = "de226a26e8e18edbdb1d6f986afe37bbbf35fbf4" +[workspace.dependencies.world] +git = "https://github.com/White-Green/WORLD_rs.git" +rev = "2337a30bfa47eebd32ef418c60ae5c7b39e43b99" + [workspace.package] version = "0.0.0" edition = "2021" diff --git a/crates/voicevox_core/Cargo.toml b/crates/voicevox_core/Cargo.toml index 8d7c70e7e..f632bd3bd 100644 --- a/crates/voicevox_core/Cargo.toml +++ b/crates/voicevox_core/Cargo.toml @@ -11,6 +11,7 @@ directml = ["onnxruntime/directml"] [dependencies] anyhow.workspace = true async_zip = { workspace = true, features = ["full"] } +az.workspace = true derive-getters.workspace = true derive-new.workspace = true derive_more.workspace = true @@ -24,11 +25,13 @@ indexmap = { workspace = true, features = ["serde"] } itertools.workspace = true nanoid.workspace = true ndarray.workspace = true +num-traits.workspace = true once_cell.workspace = true onnxruntime.workspace = true open_jtalk.workspace = true ouroboros.workspace = true rayon.workspace = true +readonly.workspace = true regex.workspace = true serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = ["preserve_order"] } @@ -38,6 +41,7 @@ tokio = { workspace = true, features = ["rt"] } # FIXME: feature-gateする tracing.workspace = true uuid = { workspace = true, features = ["v4", "serde"] } voicevox_core_macros = { path = "../voicevox_core_macros" } +world.workspace = true zip.workspace = true [dev-dependencies] diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs index 1c7422e76..3e18273c8 100644 --- a/crates/voicevox_core/src/engine/mod.rs +++ b/crates/voicevox_core/src/engine/mod.rs @@ -3,11 +3,13 @@ mod full_context_label; mod kana_parser; mod model; mod mora_list; +mod morph; pub(crate) mod open_jtalk; pub(crate) use self::acoustic_feature_extractor::OjtPhoneme; pub(crate) use self::full_context_label::{FullContextLabelError, Utterance}; pub(crate) use self::kana_parser::{create_kana, parse_kana, KanaParseError}; -pub use self::model::{AccentPhraseModel, AudioQueryModel, MoraModel}; +pub use self::model::{AccentPhraseModel, AudioQueryModel, MoraModel, MorphableTargetInfo}; pub(crate) use self::mora_list::mora2text; +pub(crate) use self::morph::{MorphError, MorphingPair}; pub use self::open_jtalk::FullcontextExtractor; diff --git a/crates/voicevox_core/src/engine/model.rs b/crates/voicevox_core/src/engine/model.rs index 77adbebe7..705ffe668 100644 --- a/crates/voicevox_core/src/engine/model.rs +++ b/crates/voicevox_core/src/engine/model.rs @@ -82,6 +82,11 @@ impl AudioQueryModel { } } +#[derive(Deserialize, Serialize)] +pub struct MorphableTargetInfo { + pub is_morphable: bool, +} + #[cfg(test)] mod tests { use pretty_assertions::assert_eq; diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs new file mode 100644 index 000000000..47ee0798f --- /dev/null +++ b/crates/voicevox_core/src/engine/morph.rs @@ -0,0 +1,212 @@ +use thiserror::Error; +use world::{ + signal_analyzer::{AnalyzeResult, SignalAnalyzerBuilder}, + spectrogram_like::SpectrogramLike, +}; + +use crate::{error::ErrorRepr, AudioQueryModel, SpeakerMeta, StyleId}; + +use self::permission::Permission; + +// FIXME: 許可対象外のときと、WORLDがなんかエラーを吐いたときとに分割する +#[derive(Error, Debug)] +#[error("指定された話者ペアでのモーフィングに失敗しました")] +pub(crate) struct MorphError; + +impl crate::blocking::Synthesizer { + pub(crate) fn is_synthesis_morphing_permitted( + &self, + style_ids: MorphingPair, + metas: &[SpeakerMeta], + ) -> crate::Result { + let metas = style_ids.lookup_speakers(metas)?; + Ok(Permission::new(metas).is_ok()) + } + + pub(crate) fn synthesis_morphing_( + &self, + audio_query: &AudioQueryModel, + style_ids: MorphingPair, + morph_rate: f32, + ) -> crate::Result> { + let metas = &self.metas(); + let metas = style_ids.lookup_speakers(metas)?; + + Permission::new(metas)?.synthesis_morphing(self, audio_query, style_ids, morph_rate) + } +} + +impl<'speakers> Permission<'speakers> { + fn synthesis_morphing( + self, + synthesizer: &crate::blocking::Synthesizer, + audio_query: &AudioQueryModel, + style_ids: MorphingPair, + morph_rate: f32, + ) -> crate::Result> { + let morph_rate = f64::from(morph_rate); + + if *audio_query.output_sampling_rate() != 24000 || *audio_query.output_stereo() { + todo!(); + } + + let synthesis = + |style_id| synthesizer.synthesis_impl(audio_query, style_id, &Default::default()); + + let waves = MorphingPair { + base: &*synthesis(style_ids.base)?, + target: &synthesis(style_ids.target)?, + }; + + let morph_param = MorphingParameter::new(waves); + + let mut morph_spectrogram = SpectrogramLike::::new( + morph_param.base_spectrogram.time_axis_size(), + morph_param.base_spectrogram.frequency_axis_size(), + ); + + // FIXME: サイズ違いの場合は"resize"する + for (morph_spectrogram, (base_spectrogram, target_spectrogram)) in itertools::zip_eq( + morph_spectrogram.lines_mut(), + itertools::zip_eq( + morph_param.base_spectrogram.lines(), + morph_param.target_spectrogram.lines(), + ), + ) { + for (morph_spectrogram, (base_spectrogram, target_spectrogram)) in itertools::zip_eq( + morph_spectrogram, + itertools::zip_eq(base_spectrogram, target_spectrogram), + ) { + *morph_spectrogram = base_spectrogram * (1. - morph_rate) + target_spectrogram; + } + } + + return world::synthesis::synthesis( + &morph_param.base_f0, + &morph_spectrogram, + &morph_param.base_aperiodicity, + None, + FRAME_PERIOD, + 24000, + ) + .map_err(|_| todo!()); + + const FRAME_PERIOD: f64 = 1.; + + struct MorphingParameter { + base_f0: Box<[f64]>, + base_aperiodicity: SpectrogramLike, + base_spectrogram: SpectrogramLike, + target_spectrogram: SpectrogramLike, + } + + impl MorphingParameter { + fn new(waves_24khb: MorphingPair<&[f32]>) -> Self { + let (base_f0, base_spectrogram, base_aperiodicity) = analyze(waves_24khb.base); + let (_, target_spectrogram, _) = analyze(waves_24khb.target); + + Self { + base_f0, + base_aperiodicity, + base_spectrogram, + target_spectrogram, + } + } + } + + fn analyze(wave: &[f32]) -> (Box<[f64]>, SpectrogramLike, SpectrogramLike) { + let analyzer = { + let mut analyzer = SignalAnalyzerBuilder::new(24000); + analyzer.harvest_option_mut().set_frame_period(FRAME_PERIOD); + analyzer.build(wave.iter().copied().map(Into::into).collect()) + }; + + analyzer.calc_all(); + + let AnalyzeResult { + f0, + spectrogram, + aperiodicity, + .. + } = analyzer.into_result(); + + let f0 = f0.expect("should be present"); + let spectrogram = spectrogram.expect("should be present"); + let aperiodicity = aperiodicity.expect("should be present"); + + (f0, spectrogram, aperiodicity) + } + } +} + +#[derive(Clone, Copy)] +pub(crate) struct MorphingPair { + pub(crate) base: T, + pub(crate) target: T, +} + +impl MorphingPair { + fn lookup_speakers(self, metas: &[SpeakerMeta]) -> crate::Result> { + let lookup_speaker = |style_id| { + metas + .iter() + .find(|m| m.styles().iter().any(|m| *m.id() == style_id)) + .ok_or(ErrorRepr::StyleNotFound { style_id }) + }; + + let base = lookup_speaker(self.base)?; + let target = lookup_speaker(self.target)?; + + Ok(MorphingPair { base, target }) + } +} + +// ==========================================WARNING============================================== +// +// DO NOT BYPASS THIS OR YOU MAY VIOLATE THE ToS OF THE MODELS +// +// =============================================================================================== +mod permission { + use crate::{metas::PermittedSynthesisMorphing, SpeakerMeta}; + + use super::MorphError; + use super::MorphingPair; + + #[readonly::make] + pub(super) struct Permission<'speakers> { + pub(super) metas: MorphingPair<&'speakers SpeakerMeta>, + } + + impl<'speakers> Permission<'speakers> { + pub(super) fn new( + metas: MorphingPair<&'speakers SpeakerMeta>, + ) -> std::result::Result { + match metas.permissions() { + MorphingPair { + base: PermittedSynthesisMorphing::All, + target: PermittedSynthesisMorphing::All, + } => {} + + MorphingPair { + base: PermittedSynthesisMorphing::SelfOnly, + target: PermittedSynthesisMorphing::SelfOnly, + } if metas.base.speaker_uuid() == metas.target.speaker_uuid() => {} + + _ => return Err(MorphError), + } + + Ok(Self { metas }) + } + } + + impl<'speakers> MorphingPair<&'speakers SpeakerMeta> { + fn permissions(self) -> MorphingPair { + let Self { base, target } = self; + + MorphingPair { + base: base.supported_features().permitted_synthesis_morphing, + target: target.supported_features().permitted_synthesis_morphing, + } + } + } +} diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs index 19d464d21..96d95a052 100644 --- a/crates/voicevox_core/src/error.rs +++ b/crates/voicevox_core/src/error.rs @@ -1,5 +1,5 @@ use crate::{ - engine::{FullContextLabelError, KanaParseError}, + engine::{FullContextLabelError, KanaParseError, MorphError}, user_dict::InvalidWordError, StyleId, VoiceModelId, }; @@ -20,6 +20,7 @@ pub struct Error(#[from] ErrorRepr); [ FullContextLabelError ]; [ KanaParseError ]; [ InvalidWordError ]; + [ MorphError ]; )] impl From for Error { fn from(err: E) -> Self { @@ -51,6 +52,7 @@ impl Error { ErrorRepr::WordNotFound(_) => ErrorKind::WordNotFound, ErrorRepr::UseUserDict(_) => ErrorKind::UseUserDict, ErrorRepr::InvalidWord(_) => ErrorKind::InvalidWord, + ErrorRepr::Morph(_) => ErrorKind::Morph, } } } @@ -104,6 +106,9 @@ pub(crate) enum ErrorRepr { #[error(transparent)] InvalidWord(#[from] InvalidWordError), + + #[error(transparent)] + Morph(#[from] MorphError), } /// エラーの種類。 @@ -145,6 +150,8 @@ pub enum ErrorKind { UseUserDict, /// ユーザー辞書の単語のバリデーションに失敗した。 InvalidWord, + /// 指定された話者ペアでのモーフィングに失敗した。 + Morph, } pub(crate) type LoadModelResult = std::result::Result; diff --git a/crates/voicevox_core/src/lib.rs b/crates/voicevox_core/src/lib.rs index ea74c9f7c..de5a4301a 100644 --- a/crates/voicevox_core/src/lib.rs +++ b/crates/voicevox_core/src/lib.rs @@ -25,7 +25,7 @@ mod test_util; pub use self::{ devices::SupportedDevices, - engine::{AccentPhraseModel, AudioQueryModel, FullcontextExtractor}, + engine::{AccentPhraseModel, AudioQueryModel, FullcontextExtractor, MorphableTargetInfo}, error::{Error, ErrorKind}, metas::{ RawStyleId, RawStyleVersion, SpeakerMeta, StyleId, StyleMeta, StyleVersion, VoiceModelMeta, diff --git a/crates/voicevox_core/src/metas.rs b/crates/voicevox_core/src/metas.rs index 77cb3a9fc..828ba7cd4 100644 --- a/crates/voicevox_core/src/metas.rs +++ b/crates/voicevox_core/src/metas.rs @@ -65,6 +65,9 @@ pub struct SpeakerMeta { version: StyleVersion, /// 話者のUUID。 speaker_uuid: String, + /// 話者の対応機能。 + #[serde(default)] + supported_features: SpeakerSupportedFeatures, } /// **スタイル**(_style_)のメタ情報。 @@ -75,3 +78,22 @@ pub struct StyleMeta { /// スタイル名。 name: String, } + +#[derive(Default, Deserialize, Serialize, Clone)] +pub struct SpeakerSupportedFeatures { + pub(crate) permitted_synthesis_morphing: PermittedSynthesisMorphing, +} + +#[derive(Deserialize, Serialize, Default, Clone, Copy)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +pub(crate) enum PermittedSynthesisMorphing { + /// 全て許可。 + All, + + /// 同じ話者内でのみ許可。 + SelfOnly, + + /// 全て禁止。 + #[default] + Nothing, +} diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 0098f3b37..8754fa1f3 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -9,6 +9,12 @@ pub struct SynthesisOptions { pub enable_interrogative_upspeak: bool, } +impl Default for SynthesisOptions { + fn default() -> Self { + (&TtsOptions::default()).into() + } +} + impl AsRef for SynthesisOptions { fn as_ref(&self) -> &SynthesisOptions { self @@ -75,12 +81,20 @@ pub(crate) mod blocking { // (ブロッキング版をpublic APIにするならの話ではあるが)ブロッキング版はブロッキング版でコード例 // を用意する - use std::io::{Cursor, Write as _}; + use std::{ + collections::BTreeMap, + io::{Cursor, Write as _}, + }; + use az::{Az as _, Cast}; use enum_map::enum_map; + use num_traits::Float; use crate::{ - engine::{self, create_kana, parse_kana, MoraModel, OjtPhoneme, Utterance}, + engine::{ + self, create_kana, parse_kana, MoraModel, MorphableTargetInfo, MorphingPair, + OjtPhoneme, Utterance, + }, error::ErrorRepr, infer::{ domain::{ @@ -92,8 +106,8 @@ pub(crate) mod blocking { InferenceSessionOptions, }, numerics::F32Ext as _, - AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, StyleId, - SupportedDevices, SynthesisOptions, VoiceModelId, VoiceModelMeta, + AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, SpeakerMeta, StyleId, + StyleMeta, SupportedDevices, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; use super::{AccelerationMode, InferenceRuntimeImpl, InitializeOptions, TtsOptions}; @@ -222,6 +236,27 @@ pub(crate) mod blocking { self.status.metas() } + pub fn morphable_targets( + &self, + style_id: StyleId, + ) -> Result> { + let metas = &self.metas(); + + metas + .iter() + .flat_map(SpeakerMeta::styles) + .map(StyleMeta::id) + .map(|&target| { + let style_ids = MorphingPair { + base: style_id, + target, + }; + let is_morphable = self.is_synthesis_morphing_permitted(style_ids, metas)?; + Ok((target, MorphableTargetInfo { is_morphable })) + }) + .collect() + } + /// AudioQueryから音声合成を行う。 pub fn synthesis( &self, @@ -229,6 +264,31 @@ pub(crate) mod blocking { style_id: StyleId, options: &SynthesisOptions, ) -> Result> { + let wave = &self.synthesis_impl(audio_query, style_id, options)?; + Ok(to_wav(wave, audio_query)) + } + + pub fn synthesis_morphing( + &self, + audio_query: &AudioQueryModel, + base_style_id: StyleId, + target_style_id: StyleId, + morph_rate: f32, + ) -> crate::Result> { + let style_ids = MorphingPair { + base: base_style_id, + target: target_style_id, + }; + let wave = &self.synthesis_morphing_(audio_query, style_ids, morph_rate)?; + Ok(to_wav(wave, audio_query)) + } + + pub(crate) fn synthesis_impl( + &self, + audio_query: &AudioQueryModel, + style_id: StyleId, + options: &SynthesisOptions, + ) -> Result> { let speed_scale = *audio_query.speed_scale(); let pitch_scale = *audio_query.pitch_scale(); let intonation_scale = *audio_query.intonation_scale(); @@ -324,14 +384,13 @@ pub(crate) mod blocking { // 2次元のvectorを1次元に変換し、アドレスを連続させる let flatten_phoneme = phoneme.into_iter().flatten().collect::>(); - let wave = &self.decode( + return self.decode( f0.len(), OjtPhoneme::num_phoneme(), &f0, &flatten_phoneme, style_id, - )?; - return Ok(to_wav(wave, audio_query)); + ); fn adjust_interrogative_accent_phrases( accent_phrases: &[AccentPhraseModel], @@ -1162,7 +1221,12 @@ pub(crate) mod blocking { } } - fn to_wav(wave: &[f32], audio_query: &AudioQueryModel) -> Vec { + fn to_wav + From + Cast>( + wave: &[T], + audio_query: &AudioQueryModel, + ) -> Vec { + // TODO: ライブラリ(e.g. https://docs.rs/hound)を使う + let volume_scale = *audio_query.volume_scale(); let output_stereo = *audio_query.output_stereo(); let output_sampling_rate = *audio_query.output_sampling_rate(); @@ -1197,9 +1261,13 @@ pub(crate) mod blocking { cur.write_all("data".as_bytes()).unwrap(); cur.write_all(&bytes_size.to_le_bytes()).unwrap(); - for value in wave { - let v = (value * volume_scale).clamp(-1., 1.); - let data = (v * 0x7fff as f32) as i16; + for &value in wave { + let v = num_traits::clamp( + value * >::from(volume_scale), + -T::one(), + T::one(), + ); + let data = (v * >::from(0x7fff)).az::(); for _ in 0..repeat_count { cur.write_all(&data.to_le_bytes()).unwrap(); } @@ -1210,11 +1278,11 @@ pub(crate) mod blocking { } pub(crate) mod tokio { - use std::sync::Arc; + use std::{collections::BTreeMap, sync::Arc}; use crate::{ - AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, StyleId, - SynthesisOptions, VoiceModelId, VoiceModelMeta, + AccentPhraseModel, AudioQueryModel, FullcontextExtractor, MorphableTargetInfo, Result, + StyleId, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; use super::{InitializeOptions, TtsOptions}; @@ -1257,6 +1325,13 @@ pub(crate) mod tokio { self.0.metas() } + pub fn morphable_targets( + &self, + style_id: StyleId, + ) -> Result> { + self.0.morphable_targets(style_id) + } + pub async fn synthesis( &self, audio_query: &AudioQueryModel, @@ -1271,6 +1346,27 @@ pub(crate) mod tokio { .await } + pub async fn synthesis_morphing( + &self, + audio_query: &AudioQueryModel, + base_style_id: StyleId, + target_style_id: StyleId, + morph_rate: f32, + ) -> crate::Result> { + let blocking = self.0.clone(); + let audio_query = audio_query.clone(); + + crate::task::asyncify(move || { + blocking.synthesis_morphing( + &audio_query, + base_style_id, + target_style_id, + morph_rate, + ) + }) + .await + } + pub async fn create_accent_phrases_from_kana( &self, kana: &str, diff --git a/crates/voicevox_core_c_api/src/helpers.rs b/crates/voicevox_core_c_api/src/helpers.rs index d69641c34..9cc346033 100644 --- a/crates/voicevox_core_c_api/src/helpers.rs +++ b/crates/voicevox_core_c_api/src/helpers.rs @@ -51,6 +51,7 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxRes WordNotFound => VOICEVOX_RESULT_USER_DICT_WORD_NOT_FOUND_ERROR, UseUserDict => VOICEVOX_RESULT_USE_USER_DICT_ERROR, InvalidWord => VOICEVOX_RESULT_INVALID_USER_DICT_WORD_ERROR, + Morph => VOICEVOX_RESULT_MORPH_ERROR, }, Err(InvalidUtf8Input) => VOICEVOX_RESULT_INVALID_UTF8_INPUT_ERROR, Err(InvalidAudioQuery(_)) => VOICEVOX_RESULT_INVALID_AUDIO_QUERY_ERROR, diff --git a/crates/voicevox_core_c_api/src/result_code.rs b/crates/voicevox_core_c_api/src/result_code.rs index 65236ada4..be66cb702 100644 --- a/crates/voicevox_core_c_api/src/result_code.rs +++ b/crates/voicevox_core_c_api/src/result_code.rs @@ -55,6 +55,8 @@ pub enum VoicevoxResultCode { VOICEVOX_RESULT_INVALID_USER_DICT_WORD_ERROR = 24, /// UUIDの変換に失敗した VOICEVOX_RESULT_INVALID_UUID_ERROR = 25, + /// 指定された話者ペアでのモーフィングが不可能 + VOICEVOX_RESULT_MORPH_ERROR = 28, } pub(crate) const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static CStr { @@ -107,5 +109,6 @@ pub(crate) const fn error_result_to_message(result_code: VoicevoxResultCode) -> cstr!("ユーザー辞書の単語のバリデーションに失敗しました") } VOICEVOX_RESULT_INVALID_UUID_ERROR => cstr!("UUIDの変換に失敗しました"), + VOICEVOX_RESULT_MORPH_ERROR => cstr!("指定された話者ペアでのモーフィングはできません"), } } diff --git a/crates/voicevox_core_java_api/src/common.rs b/crates/voicevox_core_java_api/src/common.rs index c2987e207..fb93c030a 100644 --- a/crates/voicevox_core_java_api/src/common.rs +++ b/crates/voicevox_core_java_api/src/common.rs @@ -146,6 +146,7 @@ where WordNotFound, UseUserDict, InvalidWord, + Morph, // TODO ); let mut sources = diff --git a/crates/voicevox_core_python_api/src/convert.rs b/crates/voicevox_core_python_api/src/convert.rs index 4b908c48b..758a8e513 100644 --- a/crates/voicevox_core_python_api/src/convert.rs +++ b/crates/voicevox_core_python_api/src/convert.rs @@ -203,6 +203,7 @@ pub impl voicevox_core::Result { ErrorKind::WordNotFound => WordNotFoundError::new_err(msg), ErrorKind::UseUserDict => UseUserDictError::new_err(msg), ErrorKind::InvalidWord => InvalidWordError::new_err(msg), + ErrorKind::Morph => todo!(), }; [top] From 06556c935ae847cce2c3f55201df1a843c938cb9 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Mon, 25 Dec 2023 13:51:21 +0900 Subject: [PATCH 03/46] =?UTF-8?q?`Permission`=E3=81=AB`StyleId`=E3=82=92?= =?UTF-8?q?=E6=8C=81=E3=81=9F=E3=81=9B=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 90 +++++++++++++----------- 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 47ee0798f..388710047 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -30,9 +30,9 @@ impl crate::blocking::Synthesizer { morph_rate: f32, ) -> crate::Result> { let metas = &self.metas(); - let metas = style_ids.lookup_speakers(metas)?; + let pair = style_ids.lookup_speakers(metas)?; - Permission::new(metas)?.synthesis_morphing(self, audio_query, style_ids, morph_rate) + Permission::new(pair)?.synthesis_morphing(self, audio_query, morph_rate) } } @@ -41,7 +41,6 @@ impl<'speakers> Permission<'speakers> { self, synthesizer: &crate::blocking::Synthesizer, audio_query: &AudioQueryModel, - style_ids: MorphingPair, morph_rate: f32, ) -> crate::Result> { let morph_rate = f64::from(morph_rate); @@ -50,13 +49,9 @@ impl<'speakers> Permission<'speakers> { todo!(); } - let synthesis = - |style_id| synthesizer.synthesis_impl(audio_query, style_id, &Default::default()); - - let waves = MorphingPair { - base: &*synthesis(style_ids.base)?, - target: &synthesis(style_ids.target)?, - }; + let waves = &self.styles.try_map(|style_id| { + synthesizer.synthesis_impl(audio_query, style_id, &Default::default()) + })?; let morph_param = MorphingParameter::new(waves); @@ -101,9 +96,9 @@ impl<'speakers> Permission<'speakers> { } impl MorphingParameter { - fn new(waves_24khb: MorphingPair<&[f32]>) -> Self { - let (base_f0, base_spectrogram, base_aperiodicity) = analyze(waves_24khb.base); - let (_, target_spectrogram, _) = analyze(waves_24khb.target); + fn new(waves_24khz: &MorphingPair>) -> Self { + let (base_f0, base_spectrogram, base_aperiodicity) = analyze(&waves_24khz.base); + let (_, target_spectrogram, _) = analyze(&waves_24khz.target); Self { base_f0, @@ -145,19 +140,35 @@ pub(crate) struct MorphingPair { pub(crate) target: T, } +impl MorphingPair { + fn map(self, mut f: impl FnMut(T) -> S) -> MorphingPair { + let base = f(self.base); + let target = f(self.target); + MorphingPair { base, target } + } + + fn try_map( + self, + mut f: impl FnMut(T) -> std::result::Result, + ) -> std::result::Result, E> { + let base = f(self.base)?; + let target = f(self.target)?; + Ok(MorphingPair { base, target }) + } +} + impl MorphingPair { - fn lookup_speakers(self, metas: &[SpeakerMeta]) -> crate::Result> { - let lookup_speaker = |style_id| { + fn lookup_speakers( + self, + metas: &[SpeakerMeta], + ) -> crate::Result> { + self.try_map(|style_id| { metas .iter() .find(|m| m.styles().iter().any(|m| *m.id() == style_id)) - .ok_or(ErrorRepr::StyleNotFound { style_id }) - }; - - let base = lookup_speaker(self.base)?; - let target = lookup_speaker(self.target)?; - - Ok(MorphingPair { base, target }) + .ok_or_else(|| ErrorRepr::StyleNotFound { style_id }.into()) + .map(|speaker| (style_id, speaker)) + }) } } @@ -167,21 +178,26 @@ impl MorphingPair { // // =============================================================================================== mod permission { - use crate::{metas::PermittedSynthesisMorphing, SpeakerMeta}; + use std::marker::PhantomData; + + use crate::{metas::PermittedSynthesisMorphing, SpeakerMeta, StyleId}; - use super::MorphError; - use super::MorphingPair; + use super::{MorphError, MorphingPair}; + // FIXME: Rust Analyzerが脱糖後の可視性を勘違いして激怒するので、`readonly`はやめて普通に + // getterを生やす #[readonly::make] pub(super) struct Permission<'speakers> { - pub(super) metas: MorphingPair<&'speakers SpeakerMeta>, + pub(super) styles: MorphingPair, + marker: PhantomData<&'speakers ()>, } impl<'speakers> Permission<'speakers> { pub(super) fn new( - metas: MorphingPair<&'speakers SpeakerMeta>, + pair: MorphingPair<(StyleId, &'speakers SpeakerMeta)>, ) -> std::result::Result { - match metas.permissions() { + match pair.map(|(_, speaker)| speaker.supported_features().permitted_synthesis_morphing) + { MorphingPair { base: PermittedSynthesisMorphing::All, target: PermittedSynthesisMorphing::All, @@ -190,23 +206,15 @@ mod permission { MorphingPair { base: PermittedSynthesisMorphing::SelfOnly, target: PermittedSynthesisMorphing::SelfOnly, - } if metas.base.speaker_uuid() == metas.target.speaker_uuid() => {} + } if pair.base.1.speaker_uuid() == pair.target.1.speaker_uuid() => {} _ => return Err(MorphError), } - Ok(Self { metas }) - } - } - - impl<'speakers> MorphingPair<&'speakers SpeakerMeta> { - fn permissions(self) -> MorphingPair { - let Self { base, target } = self; - - MorphingPair { - base: base.supported_features().permitted_synthesis_morphing, - target: target.supported_features().permitted_synthesis_morphing, - } + Ok(Self { + styles: pair.map(|(style_id, _)| style_id), + marker: PhantomData, + }) } } } From 21e0715ee1d0b11303dcb60a23747029a99cc6d1 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Mon, 25 Dec 2023 13:53:30 +0900 Subject: [PATCH 04/46] Minor refactor --- crates/voicevox_core/src/engine/morph.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 388710047..3c9ae67b1 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -196,17 +196,21 @@ mod permission { pub(super) fn new( pair: MorphingPair<(StyleId, &'speakers SpeakerMeta)>, ) -> std::result::Result { - match pair.map(|(_, speaker)| speaker.supported_features().permitted_synthesis_morphing) - { + match pair.map(|(_, speaker)| { + ( + speaker.supported_features().permitted_synthesis_morphing, + speaker.speaker_uuid(), + ) + }) { MorphingPair { - base: PermittedSynthesisMorphing::All, - target: PermittedSynthesisMorphing::All, + base: (PermittedSynthesisMorphing::All, _), + target: (PermittedSynthesisMorphing::All, _), } => {} MorphingPair { - base: PermittedSynthesisMorphing::SelfOnly, - target: PermittedSynthesisMorphing::SelfOnly, - } if pair.base.1.speaker_uuid() == pair.target.1.speaker_uuid() => {} + base: (PermittedSynthesisMorphing::SelfOnly, base_speaker_uuid), + target: (PermittedSynthesisMorphing::SelfOnly, target_speaker_uuid), + } if base_speaker_uuid == target_speaker_uuid => {} _ => return Err(MorphError), } From a31bd96f76202dd1e7683592b0979f9ec60c8a5b Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Mon, 25 Dec 2023 13:53:56 +0900 Subject: [PATCH 05/46] =?UTF-8?q?voicevox=5Fcore.h=E3=82=92=E3=82=A2?= =?UTF-8?q?=E3=83=83=E3=83=97=E3=83=87=E3=83=BC=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core_c_api/include/voicevox_core.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/voicevox_core_c_api/include/voicevox_core.h b/crates/voicevox_core_c_api/include/voicevox_core.h index 2275bd1d7..20d962845 100644 --- a/crates/voicevox_core_c_api/include/voicevox_core.h +++ b/crates/voicevox_core_c_api/include/voicevox_core.h @@ -178,6 +178,10 @@ enum VoicevoxResultCode * UUIDの変換に失敗した */ VOICEVOX_RESULT_INVALID_UUID_ERROR = 25, + /** + * 指定された話者ペアでのモーフィングが不可能 + */ + VOICEVOX_RESULT_MORPH_ERROR = 28, }; #ifndef __cplusplus typedef int32_t VoicevoxResultCode; From 2acd5e83b57b745200300fccea3c6be716924e81 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Mon, 25 Dec 2023 13:57:35 +0900 Subject: [PATCH 06/46] =?UTF-8?q?`readonly`=E3=82=92=E3=82=84=E3=82=81?= =?UTF-8?q?=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 12 ------------ Cargo.toml | 1 - crates/voicevox_core/Cargo.toml | 1 - crates/voicevox_core/src/engine/morph.rs | 11 ++++++----- 4 files changed, 6 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b239fdb8..86780eaec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3097,17 +3097,6 @@ dependencies = [ "num_cpus", ] -[[package]] -name = "readonly" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8f439da1766942fe069954da6058b2e6c1760eb878bae76f5be9fc29f56f574" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.38", -] - [[package]] name = "redox_syscall" version = "0.2.16" @@ -4407,7 +4396,6 @@ dependencies = [ "ouroboros", "pretty_assertions", "rayon", - "readonly", "regex", "rstest", "serde", diff --git a/Cargo.toml b/Cargo.toml index f4e6e6f22..2039d941d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,7 +59,6 @@ pyo3-asyncio = "0.19.0" pyo3-log = "0.9.0" quote = "1.0.33" rayon = "1.6.1" -readonly = "0.2.11" regex = "1.10.0" reqwest = { version = "0.11.13", default-features = false } rstest = "0.15.0" diff --git a/crates/voicevox_core/Cargo.toml b/crates/voicevox_core/Cargo.toml index f632bd3bd..1d27d2616 100644 --- a/crates/voicevox_core/Cargo.toml +++ b/crates/voicevox_core/Cargo.toml @@ -31,7 +31,6 @@ onnxruntime.workspace = true open_jtalk.workspace = true ouroboros.workspace = true rayon.workspace = true -readonly.workspace = true regex.workspace = true serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = ["preserve_order"] } diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 3c9ae67b1..67905ff6b 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -49,7 +49,7 @@ impl<'speakers> Permission<'speakers> { todo!(); } - let waves = &self.styles.try_map(|style_id| { + let waves = &self.styles().try_map(|style_id| { synthesizer.synthesis_impl(audio_query, style_id, &Default::default()) })?; @@ -184,11 +184,8 @@ mod permission { use super::{MorphError, MorphingPair}; - // FIXME: Rust Analyzerが脱糖後の可視性を勘違いして激怒するので、`readonly`はやめて普通に - // getterを生やす - #[readonly::make] pub(super) struct Permission<'speakers> { - pub(super) styles: MorphingPair, + styles: MorphingPair, marker: PhantomData<&'speakers ()>, } @@ -220,5 +217,9 @@ mod permission { marker: PhantomData, }) } + + pub(super) fn styles(&self) -> MorphingPair { + self.styles + } } } From 11621028601d576831a082abbedcb165400d6979 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Mon, 25 Dec 2023 14:03:29 +0900 Subject: [PATCH 07/46] =?UTF-8?q?`Permission`=20=E2=86=92=20`MorphablePair?= =?UTF-8?q?`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 67905ff6b..f38dcea29 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -6,7 +6,7 @@ use world::{ use crate::{error::ErrorRepr, AudioQueryModel, SpeakerMeta, StyleId}; -use self::permission::Permission; +use self::permit::MorphablePair; // FIXME: 許可対象外のときと、WORLDがなんかエラーを吐いたときとに分割する #[derive(Error, Debug)] @@ -20,7 +20,7 @@ impl crate::blocking::Synthesizer { metas: &[SpeakerMeta], ) -> crate::Result { let metas = style_ids.lookup_speakers(metas)?; - Ok(Permission::new(metas).is_ok()) + Ok(MorphablePair::permit(metas).is_ok()) } pub(crate) fn synthesis_morphing_( @@ -32,11 +32,11 @@ impl crate::blocking::Synthesizer { let metas = &self.metas(); let pair = style_ids.lookup_speakers(metas)?; - Permission::new(pair)?.synthesis_morphing(self, audio_query, morph_rate) + MorphablePair::permit(pair)?.synthesis_morphing(self, audio_query, morph_rate) } } -impl<'speakers> Permission<'speakers> { +impl<'speakers> MorphablePair<'speakers> { fn synthesis_morphing( self, synthesizer: &crate::blocking::Synthesizer, @@ -49,7 +49,7 @@ impl<'speakers> Permission<'speakers> { todo!(); } - let waves = &self.styles().try_map(|style_id| { + let waves = &self.get().try_map(|style_id| { synthesizer.synthesis_impl(audio_query, style_id, &Default::default()) })?; @@ -177,20 +177,20 @@ impl MorphingPair { // DO NOT BYPASS THIS OR YOU MAY VIOLATE THE ToS OF THE MODELS // // =============================================================================================== -mod permission { +mod permit { use std::marker::PhantomData; use crate::{metas::PermittedSynthesisMorphing, SpeakerMeta, StyleId}; use super::{MorphError, MorphingPair}; - pub(super) struct Permission<'speakers> { - styles: MorphingPair, + pub(super) struct MorphablePair<'speakers> { + inner: MorphingPair, marker: PhantomData<&'speakers ()>, } - impl<'speakers> Permission<'speakers> { - pub(super) fn new( + impl<'speakers> MorphablePair<'speakers> { + pub(super) fn permit( pair: MorphingPair<(StyleId, &'speakers SpeakerMeta)>, ) -> std::result::Result { match pair.map(|(_, speaker)| { @@ -213,13 +213,13 @@ mod permission { } Ok(Self { - styles: pair.map(|(style_id, _)| style_id), + inner: pair.map(|(style_id, _)| style_id), marker: PhantomData, }) } - pub(super) fn styles(&self) -> MorphingPair { - self.styles + pub(super) fn get(&self) -> MorphingPair { + self.inner } } } From 82260caa9c182e4c7f34e1e77ab94d6ed436a12d Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Mon, 25 Dec 2023 14:04:20 +0900 Subject: [PATCH 08/46] =?UTF-8?q?[skip=20ci]=20`MorphablePair`=20=E2=86=92?= =?UTF-8?q?=20`MorphableTargets`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index f38dcea29..36e831e85 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -6,7 +6,7 @@ use world::{ use crate::{error::ErrorRepr, AudioQueryModel, SpeakerMeta, StyleId}; -use self::permit::MorphablePair; +use self::permit::MorphableTargets; // FIXME: 許可対象外のときと、WORLDがなんかエラーを吐いたときとに分割する #[derive(Error, Debug)] @@ -20,7 +20,7 @@ impl crate::blocking::Synthesizer { metas: &[SpeakerMeta], ) -> crate::Result { let metas = style_ids.lookup_speakers(metas)?; - Ok(MorphablePair::permit(metas).is_ok()) + Ok(MorphableTargets::permit(metas).is_ok()) } pub(crate) fn synthesis_morphing_( @@ -32,11 +32,11 @@ impl crate::blocking::Synthesizer { let metas = &self.metas(); let pair = style_ids.lookup_speakers(metas)?; - MorphablePair::permit(pair)?.synthesis_morphing(self, audio_query, morph_rate) + MorphableTargets::permit(pair)?.synthesis_morphing(self, audio_query, morph_rate) } } -impl<'speakers> MorphablePair<'speakers> { +impl<'speakers> MorphableTargets<'speakers> { fn synthesis_morphing( self, synthesizer: &crate::blocking::Synthesizer, @@ -184,12 +184,12 @@ mod permit { use super::{MorphError, MorphingPair}; - pub(super) struct MorphablePair<'speakers> { + pub(super) struct MorphableTargets<'speakers> { inner: MorphingPair, marker: PhantomData<&'speakers ()>, } - impl<'speakers> MorphablePair<'speakers> { + impl<'speakers> MorphableTargets<'speakers> { pub(super) fn permit( pair: MorphingPair<(StyleId, &'speakers SpeakerMeta)>, ) -> std::result::Result { From ae080b598e4b2c6944cf653b6cea1158c0fd7c7b Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Mon, 25 Dec 2023 14:24:52 +0900 Subject: [PATCH 09/46] [skip ci] Minor refactor --- crates/voicevox_core/src/engine/morph.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 36e831e85..ad9ad61f8 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -19,8 +19,8 @@ impl crate::blocking::Synthesizer { style_ids: MorphingPair, metas: &[SpeakerMeta], ) -> crate::Result { - let metas = style_ids.lookup_speakers(metas)?; - Ok(MorphableTargets::permit(metas).is_ok()) + let pair = style_ids.lookup_speakers(metas)?; + Ok(MorphableTargets::permit(pair).is_ok()) } pub(crate) fn synthesis_morphing_( @@ -36,7 +36,7 @@ impl crate::blocking::Synthesizer { } } -impl<'speakers> MorphableTargets<'speakers> { +impl<'metas> MorphableTargets<'metas> { fn synthesis_morphing( self, synthesizer: &crate::blocking::Synthesizer, @@ -184,14 +184,14 @@ mod permit { use super::{MorphError, MorphingPair}; - pub(super) struct MorphableTargets<'speakers> { + pub(super) struct MorphableTargets<'metas> { inner: MorphingPair, - marker: PhantomData<&'speakers ()>, + marker: PhantomData<&'metas ()>, } - impl<'speakers> MorphableTargets<'speakers> { + impl<'metas> MorphableTargets<'metas> { pub(super) fn permit( - pair: MorphingPair<(StyleId, &'speakers SpeakerMeta)>, + pair: MorphingPair<(StyleId, &'metas SpeakerMeta)>, ) -> std::result::Result { match pair.map(|(_, speaker)| { ( From 26a72e02ed66915a53ad9a95dcb0e1020fa86154 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Mon, 25 Dec 2023 14:26:56 +0900 Subject: [PATCH 10/46] [skip ci] Minor refactor --- crates/voicevox_core/src/engine/morph.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index ad9ad61f8..7eed13793 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -163,11 +163,11 @@ impl MorphingPair { metas: &[SpeakerMeta], ) -> crate::Result> { self.try_map(|style_id| { - metas + let speaker = metas .iter() .find(|m| m.styles().iter().any(|m| *m.id() == style_id)) - .ok_or_else(|| ErrorRepr::StyleNotFound { style_id }.into()) - .map(|speaker| (style_id, speaker)) + .ok_or(ErrorRepr::StyleNotFound { style_id })?; + Ok((style_id, speaker)) }) } } From 58d6d7d098fdbfad455cbf08578c6800ee2c1406 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sat, 30 Dec 2023 10:29:40 +0900 Subject: [PATCH 11/46] =?UTF-8?q?snapshots.toml=E3=82=92=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/e2e/snapshots.toml | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index 8f3fa4f3b..262c5ec0c 100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -10,7 +10,10 @@ metas = ''' } ], "version": "0.0.1", - "speaker_uuid": "574bc678-8370-44be-b941-08e46e7b47d7" + "speaker_uuid": "574bc678-8370-44be-b941-08e46e7b47d7", + "supported_features": { + "permitted_synthesis_morphing": "NOTHING" + } }, { "name": "dummy2", @@ -21,7 +24,10 @@ metas = ''' } ], "version": "0.0.1", - "speaker_uuid": "dd9ccd75-75f6-40ce-a3db-960cbed2e905" + "speaker_uuid": "dd9ccd75-75f6-40ce-a3db-960cbed2e905", + "supported_features": { + "permitted_synthesis_morphing": "NOTHING" + } }, { "name": "dummy3", @@ -36,7 +42,10 @@ metas = ''' } ], "version": "0.0.1", - "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3" + "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3", + "supported_features": { + "permitted_synthesis_morphing": "NOTHING" + } } ]''' stderr.windows = ''' @@ -93,7 +102,10 @@ metas = ''' } ], "version": "0.0.1", - "speaker_uuid": "574bc678-8370-44be-b941-08e46e7b47d7" + "speaker_uuid": "574bc678-8370-44be-b941-08e46e7b47d7", + "supported_features": { + "permitted_synthesis_morphing": "NOTHING" + } }, { "name": "dummy2", @@ -104,7 +116,10 @@ metas = ''' } ], "version": "0.0.1", - "speaker_uuid": "dd9ccd75-75f6-40ce-a3db-960cbed2e905" + "speaker_uuid": "dd9ccd75-75f6-40ce-a3db-960cbed2e905", + "supported_features": { + "permitted_synthesis_morphing": "NOTHING" + } }, { "name": "dummy3", @@ -119,7 +134,10 @@ metas = ''' } ], "version": "0.0.1", - "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3" + "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3", + "supported_features": { + "permitted_synthesis_morphing": "NOTHING" + } } ]''' stderr.windows = ''' From 66be03fec9b6fd163db747c2770433c6e054570e Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sat, 30 Dec 2023 23:28:50 +0900 Subject: [PATCH 12/46] =?UTF-8?q?`mingw-w64-x86=5F64-clang`=E3=82=92?= =?UTF-8?q?=E3=82=A4=E3=83=B3=E3=82=B9=E3=83=88=E3=83=BC=E3=83=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build_and_deploy.yml | 5 +++++ .github/workflows/test.yml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/build_and_deploy.yml b/.github/workflows/build_and_deploy.yml index 80a7f807c..b643d3769 100644 --- a/.github/workflows/build_and_deploy.yml +++ b/.github/workflows/build_and_deploy.yml @@ -208,6 +208,11 @@ jobs: git fetch private refs/tags/${{ env.PRODUCTION_REPOSITORY_TAG }} git -c user.name=dummy -c user.email=dummy@dummy.dummy merge FETCH_HEAD ) > /dev/null 2>&1 + - name: Install mingw-w64-x86_64-clang + if: matrix.os == 'windows-2019' + uses: msys2/setup-msys2@v2 + with: + install: mingw-w64-x86_64-clang - name: Set up Python 3.8 if: matrix.whl_local_version uses: actions/setup-python@v4 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3e47256f3..98a04c1c8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -125,6 +125,11 @@ jobs: runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 + - name: Install mingw-w64-x86_64-clang + if: matrix.os == 'windows-2019' + uses: msys2/setup-msys2@v2 + with: + install: mingw-w64-x86_64-clang - name: Set up Python 3.8 uses: actions/setup-python@v4 with: From ccd3c81eeb836066a4aba35172fc331906c2bc6d Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sat, 30 Dec 2023 23:29:22 +0900 Subject: [PATCH 13/46] =?UTF-8?q?`windows-x86-cpu`=E3=81=AE`can=5Fskip=5Fi?= =?UTF-8?q?n=5Fsimple=5Ftest`=E3=82=92=E5=A4=96=E3=81=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build_and_deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_deploy.yml b/.github/workflows/build_and_deploy.yml index b643d3769..1d6ac586f 100644 --- a/.github/workflows/build_and_deploy.yml +++ b/.github/workflows/build_and_deploy.yml @@ -86,7 +86,7 @@ jobs: "artifact_name": "windows-x86-cpu", "whl_local_version": "cpu", "use_cuda": false, - "can_skip_in_simple_test": true + "can_skip_in_simple_test": false }, { "os": "ubuntu-20.04", From 459a88144b96ac3431d3818bf28a8266fe22109b Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 00:04:47 +0900 Subject: [PATCH 14/46] =?UTF-8?q?KyleMayes/install-llvm-action=E3=82=92?= =?UTF-8?q?=E4=BD=BF=E3=81=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build_and_deploy.yml | 8 ++++---- .github/workflows/test.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_and_deploy.yml b/.github/workflows/build_and_deploy.yml index 1d6ac586f..1e74d7532 100644 --- a/.github/workflows/build_and_deploy.yml +++ b/.github/workflows/build_and_deploy.yml @@ -208,11 +208,11 @@ jobs: git fetch private refs/tags/${{ env.PRODUCTION_REPOSITORY_TAG }} git -c user.name=dummy -c user.email=dummy@dummy.dummy merge FETCH_HEAD ) > /dev/null 2>&1 - - name: Install mingw-w64-x86_64-clang - if: matrix.os == 'windows-2019' - uses: msys2/setup-msys2@v2 + - if: matrix.os == 'windows-2019' + name: Install Clang + uses: KyleMayes/install-llvm-action@v1 with: - install: mingw-w64-x86_64-clang + version: "16.0" - name: Set up Python 3.8 if: matrix.whl_local_version uses: actions/setup-python@v4 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 98a04c1c8..53cf4457f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -125,11 +125,11 @@ jobs: runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 - - name: Install mingw-w64-x86_64-clang - if: matrix.os == 'windows-2019' - uses: msys2/setup-msys2@v2 + - if: matrix.os == 'windows-2019' + name: Install Clang + uses: KyleMayes/install-llvm-action@v1 with: - install: mingw-w64-x86_64-clang + version: "16.0" - name: Set up Python 3.8 uses: actions/setup-python@v4 with: From 7205c39c0b728171ccd56ccbeaaa6b1285c7829e Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 00:31:12 +0900 Subject: [PATCH 15/46] =?UTF-8?q?`i686-pc-windows-msvc`=E3=81=8B=E3=82=89C?= =?UTF-8?q?lang=E3=81=AE=E3=82=A4=E3=83=B3=E3=82=B9=E3=83=88=E3=83=BC?= =?UTF-8?q?=E3=83=AB=E3=82=92=E5=A4=96=E3=81=97=E3=81=A6=E3=81=BF=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build_and_deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_deploy.yml b/.github/workflows/build_and_deploy.yml index 1e74d7532..553b5574e 100644 --- a/.github/workflows/build_and_deploy.yml +++ b/.github/workflows/build_and_deploy.yml @@ -208,7 +208,7 @@ jobs: git fetch private refs/tags/${{ env.PRODUCTION_REPOSITORY_TAG }} git -c user.name=dummy -c user.email=dummy@dummy.dummy merge FETCH_HEAD ) > /dev/null 2>&1 - - if: matrix.os == 'windows-2019' + - if: matrix.os == 'windows-2019' && matrix.target == 'x86_64-pc-windows-msvc' name: Install Clang uses: KyleMayes/install-llvm-action@v1 with: From 471264dd731a426d8ead629278842e81360d85d8 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 00:45:00 +0900 Subject: [PATCH 16/46] =?UTF-8?q?Revert=20"`windows-x86-cpu`=E3=81=AE`can?= =?UTF-8?q?=5Fskip=5Fin=5Fsimple=5Ftest`=E3=82=92=E5=A4=96=E3=81=99"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit ccd3c81eeb836066a4aba35172fc331906c2bc6d. --- .github/workflows/build_and_deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_deploy.yml b/.github/workflows/build_and_deploy.yml index 553b5574e..7dc125740 100644 --- a/.github/workflows/build_and_deploy.yml +++ b/.github/workflows/build_and_deploy.yml @@ -86,7 +86,7 @@ jobs: "artifact_name": "windows-x86-cpu", "whl_local_version": "cpu", "use_cuda": false, - "can_skip_in_simple_test": false + "can_skip_in_simple_test": true }, { "os": "ubuntu-20.04", From 58f6f906fef232ff5a07abaa869cf805c0af9919 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 00:45:21 +0900 Subject: [PATCH 17/46] =?UTF-8?q?Revert=20"`i686-pc-windows-msvc`=E3=81=8B?= =?UTF-8?q?=E3=82=89Clang=E3=81=AE=E3=82=A4=E3=83=B3=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=83=BC=E3=83=AB=E3=82=92=E5=A4=96=E3=81=97=E3=81=A6=E3=81=BF?= =?UTF-8?q?=E3=82=8B"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 7205c39c0b728171ccd56ccbeaaa6b1285c7829e. --- .github/workflows/build_and_deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_deploy.yml b/.github/workflows/build_and_deploy.yml index 7dc125740..e64509bb1 100644 --- a/.github/workflows/build_and_deploy.yml +++ b/.github/workflows/build_and_deploy.yml @@ -208,7 +208,7 @@ jobs: git fetch private refs/tags/${{ env.PRODUCTION_REPOSITORY_TAG }} git -c user.name=dummy -c user.email=dummy@dummy.dummy merge FETCH_HEAD ) > /dev/null 2>&1 - - if: matrix.os == 'windows-2019' && matrix.target == 'x86_64-pc-windows-msvc' + - if: matrix.os == 'windows-2019' name: Install Clang uses: KyleMayes/install-llvm-action@v1 with: From 8ff5a5e9cd0184e143bc566d3f933f249b8eb75c Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 03:05:31 +0900 Subject: [PATCH 18/46] =?UTF-8?q?sample.vvm=E3=82=92=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/e2e/snapshots.toml | 8 ++++---- model/sample.vvm | Bin 53465033 -> 53465028 bytes 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index 262c5ec0c..f86103385 100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -26,7 +26,7 @@ metas = ''' "version": "0.0.1", "speaker_uuid": "dd9ccd75-75f6-40ce-a3db-960cbed2e905", "supported_features": { - "permitted_synthesis_morphing": "NOTHING" + "permitted_synthesis_morphing": "ALL" } }, { @@ -44,7 +44,7 @@ metas = ''' "version": "0.0.1", "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3", "supported_features": { - "permitted_synthesis_morphing": "NOTHING" + "permitted_synthesis_morphing": "SELF_ONLY" } } ]''' @@ -118,7 +118,7 @@ metas = ''' "version": "0.0.1", "speaker_uuid": "dd9ccd75-75f6-40ce-a3db-960cbed2e905", "supported_features": { - "permitted_synthesis_morphing": "NOTHING" + "permitted_synthesis_morphing": "ALL" } }, { @@ -136,7 +136,7 @@ metas = ''' "version": "0.0.1", "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3", "supported_features": { - "permitted_synthesis_morphing": "NOTHING" + "permitted_synthesis_morphing": "SELF_ONLY" } } ]''' diff --git a/model/sample.vvm b/model/sample.vvm index 48d23745d99f811b58e6dab20d5dafb5b7f78e31..381db2577c15a9b9bb7cb42269c684715de74cf0 100644 GIT binary patch delta 2696 zcmZXRY1Ge!)`tK0{d>%FWXen#G8IXt5XzJ}rBdN3l*nAjvPLfU7>(lkXdF$VY1|OaqItB48{?*E8Lgsq zw28KHbF_>0(IGlUr|2AA;+E(dx5jPJEpCrH;?C$Ecg5XtPxOeM(JRv9-ncJ%N1x~$ z_s0X#FCL8kF(4j_fiWls$B-Br!(w&ybVLTnrL`FOti(+v+7fa&#cp+YlrLioQ z$BI}PFU8C8O00@kV|Bb1ug98L8|z|yY=}2vW4sxg;;q;mZ^xE+C*F;%u`S+<_v3@u z9y{X0_$YS9uGk$P$DY_5`{I+>9|z)K9Ewlla2$!x;%IyxU&NR3ReT-C;+yz3zKieU zhxjpmil5_`_%(iu<8dNR#_w?|PRAedXPk+%@mKsE=i;9@9~a`{Bq3puh%}K!WEI&& zc9BEm6uCrhkw@ee`9ywEKwKgUibCR2QCJiaMMW`DTwEqfh?1g|C@so}vZ9h=%Tg(x2#XK=z zED#ID)8ZMCA)XbB#A5NBSR$SmFNhb#Qn5@d7c0a{@sfC1ydqYKSH)`ans{BT5o^Ud zv0iKtZ-|ZJO|eP5B{qw<#TN07cvox{+r)d~eer?VE_R3y#YbYN*d=z0kHsFbSL_p? zi2dS#I4BN@PsL$zM0_TWiqFLt;!E+B_*xtj--vI;cj9~TgZNSWBz_jZh+oBT;B~2+)+LSS6O*vEER4|vDD@;XG$y7E~%$269 zsb;F18m6YX%3N(~ncC(WQ^#Cu>YD3JJyYK_Fbz#3bG>P7nwX~M2Gh(mH!aMK<|fn9 zv@)$t8`IX@Y}%Rjri1BdI+@O2IE%`+y$JZl!2 z#pXG)#5`|aFfW>=W|>)TR+yFMCG)a*#jG-~n$_kt^SW7M)|z!@z1d*iFdNOAW|Mi# zY&LJ3E#@8buGwm~nfJ{5<^!|c>@XjikIYW9%j`BEn>}W)*=Ig6`^^D!&>S+Kn#1OZ z`OF+OpPMhtm*y+;wK-fA2G3U(%bJ3F$l;DIUAuST zP%xoT!lem?6N)4hO(>R7JmIp05(y;}N+py|D3ee&py9rD`RqRL)e1RKN7W_YA7ucToQayL8TxN-B1n zkzTgVjPxPHbET9v_NP?p|6@(Zwyn3;%)jYO*I5hew?4e2?XjKXpP5ps!S>a&Mz%S+ z`24e_o6bCceC_@*9SaZ3SE}jUYfnu)(|z@skr#V5daGUEE@d~IoLq9n=)O5yo|y3G zyQe3f=lH?HZ7=p%S!>#>LtE89c6wu@m(Fad_CSYCJ?2b5R(uW*69)T_O_zy zpUAP~*6fv*r%f|oSo6|;-mxhhi9~Joup;t+g&3mI(@+M>dEE- z%WEWEDzr`trBbQvssH_b(ab)T%2bhBGWP4G)sz1;k>%ee{>wr}j!b$L-IP%~^L9#` j*r!I4u_`l)Wu)(_krXJDCF}nYvt-VUOl+ylE|vNh&yEJ} delta 2620 zcmZXQXZ+WL8is%O{VffZ(jMBS(z16bvy7ZlqqvleQX~|Ty-9;)WJP2P zWy?$`#rdCeUYr-_#r-^=&;5L^=f$;b_LBO`%W_#1r%1V!&sM3Bim8;!**aBHHPy0B zs;5S_P0iFw?bJ!#)Jy#|NW(PBc4?d@X_{tfo)&4DR%xB>(UPmaxTIX)-k#GI7A>6eq!Kc{3sPR(fX2(PIa6|Nrsld_pBr*x zre%6=%8bm+&ABDBa%*Piw%ndOGADC$XYR_~nU{MqKlkRo+@A;XU>?fDc_feKu{@q9 z@?@UM(^-&bvM`IXI7{+up3BlapJjO=FXpAZoaK2XujaM9o;R{0Z|1G6%-eYqM2weT8NgSm1r%t7i~ma(N63j+KUdNqv#|$iyg&IVrQ|7 z*j4N%b{BhyJ;h#PZ?TWqS9B5kiT%X^qO0g84ipE8gT*1@P;r>(E_#T=#S!92ag^vO zjuyv=UZS_?BaRiviQ~ly;zV(h=qvh(lSO}ViWneH6{m@TVvsmpoFUE>XNke$Y%xR( z6~n}EagI1woF_(zkz$lMUyK$PhzrFo%(kYcsby-LI;O6vXX={< zrlDzMwlj@Q6Vuc*GtEs4)6%put2FRk1I(%BG&9f)GN+p}%$epaGuWJMhM1vdm>F)) zG3T1|%m_2mj56n&(dGhkp}EM6F&CRl%%x_mxy)Q{#+mUZW`eoGTxlknN#-hZwZU9t zCYvedS~JyLXRbFlm>bPBGu_-|W|*1gW^;>~Wo|XI&28p(bBCE@=9)XrUFL2x&)j3? zn|sZD=6>^ldC)v$9yX7dN6lmAar1g ztTkVlFU?oxYx9ly)_iBaH|xv~=123B`Puwpel_dOZ)St}-TYzxG#kw(^OxCdwwS-o zKb}~i0v8kog1QCu z3hEa$C}>#Fs9?K-#sy6Znie!GXkO5wpk+a;g4PAw7qls8ThOjxhl2J69g2k=i*@rF zU)g^1v26w{`E+%uRGQ*asd8yZY3QJl14eWlJYx8;*<A3VPDkVbzE*=OUX88w@{wB^;$TimtV z;5lEm@BdMg>ITmY*|l~1Nh6p4`F*XK{nl6O*`p{|tIS2k zvNS6-&xFmh7fExVWgd;NcJHOh!mNu{P`bIrVt#s4h!I zcllC From 706fdac1fba02c6b294f841d9f8f71f64d0c03ea Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 03:13:02 +0900 Subject: [PATCH 19/46] =?UTF-8?q?`morphable=5Ftargets`=E3=81=AE=E5=8D=98?= =?UTF-8?q?=E4=BD=93=E3=83=86=E3=82=B9=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 7 ++ Cargo.toml | 1 + crates/voicevox_core/Cargo.toml | 1 + crates/voicevox_core/src/engine/model.rs | 2 +- crates/voicevox_core/src/metas.rs | 4 +- crates/voicevox_core/src/synthesizer.rs | 102 ++++++++++++++++++++++- 6 files changed, 112 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 86780eaec..72ca4f57c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2200,6 +2200,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "lit2" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcd0c289759ba04eac993bb32289580c7b37955957f8928cab7b29d54cdf89de" + [[package]] name = "lock_api" version = "0.4.9" @@ -4387,6 +4393,7 @@ dependencies = [ "humansize", "indexmap 2.0.0", "itertools 0.10.5", + "lit2", "nanoid", "ndarray", "num-traits", diff --git a/Cargo.toml b/Cargo.toml index 2039d941d..cd6dd157c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ jni = "0.21.1" libc = "0.2.134" libloading = "0.7.3" libtest-mimic = "0.6.0" +lit2 = "1.0.9" log = "0.4.17" nanoid = "0.4.0" ndarray = "0.15.6" diff --git a/crates/voicevox_core/Cargo.toml b/crates/voicevox_core/Cargo.toml index 1d27d2616..55f6419ef 100644 --- a/crates/voicevox_core/Cargo.toml +++ b/crates/voicevox_core/Cargo.toml @@ -45,6 +45,7 @@ zip.workspace = true [dev-dependencies] heck.workspace = true +lit2.workspace = true pretty_assertions.workspace = true rstest.workspace = true test_util.workspace = true diff --git a/crates/voicevox_core/src/engine/model.rs b/crates/voicevox_core/src/engine/model.rs index 705ffe668..6cb1272cc 100644 --- a/crates/voicevox_core/src/engine/model.rs +++ b/crates/voicevox_core/src/engine/model.rs @@ -82,7 +82,7 @@ impl AudioQueryModel { } } -#[derive(Deserialize, Serialize)] +#[derive(Deserialize, Serialize, PartialEq, Debug)] pub struct MorphableTargetInfo { pub is_morphable: bool, } diff --git a/crates/voicevox_core/src/metas.rs b/crates/voicevox_core/src/metas.rs index 828ba7cd4..31f75dff4 100644 --- a/crates/voicevox_core/src/metas.rs +++ b/crates/voicevox_core/src/metas.rs @@ -15,7 +15,7 @@ pub type RawStyleId = u32; /// /// [**話者**(_speaker_)]: SpeakerMeta /// [**スタイル**(_style_)]: StyleMeta -#[derive(PartialEq, Eq, Clone, Copy, Ord, PartialOrd, Deserialize, Serialize, new, Debug)] +#[derive(PartialEq, Eq, Clone, Copy, Ord, PartialOrd, Hash, Deserialize, Serialize, new, Debug)] pub struct StyleId(RawStyleId); impl StyleId { @@ -84,7 +84,7 @@ pub struct SpeakerSupportedFeatures { pub(crate) permitted_synthesis_morphing: PermittedSynthesisMorphing, } -#[derive(Deserialize, Serialize, Default, Clone, Copy)] +#[derive(Deserialize, Serialize, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] #[serde(rename_all = "SCREAMING_SNAKE_CASE")] pub(crate) enum PermittedSynthesisMorphing { /// 全て許可。 diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 8754fa1f3..eb13fafb3 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -1481,10 +1481,12 @@ mod tests { use super::{blocking::PerformInference as _, AccelerationMode, InitializeOptions}; use crate::{ - engine::MoraModel, macros::tests::assert_debug_fmt_eq, test_util::open_default_vvm_file, - AccentPhraseModel, Result, StyleId, + engine::MoraModel, macros::tests::assert_debug_fmt_eq, metas::PermittedSynthesisMorphing, + test_util::open_default_vvm_file, AccentPhraseModel, MorphableTargetInfo, Result, StyleId, }; use ::test_util::OPEN_JTALK_DIC_DIR; + use indexmap::{indexmap, IndexMap}; + use lit2::btreemap; use rstest::rstest; #[rstest] @@ -1555,6 +1557,102 @@ mod tests { ); } + #[tokio::test] + async fn morphable_targets_works() { + let (permissions, morphable_targets) = { + let synthesizer = super::tokio::Synthesizer::new( + (), + &InitializeOptions { + acceleration_mode: AccelerationMode::Cpu, + ..Default::default() + }, + ) + .unwrap(); + + let model = &crate::tokio::VoiceModel::sample().await.unwrap(); + synthesizer.load_voice_model(model).await.unwrap(); + + let permissions = synthesizer + .metas() + .iter() + .map(|speaker| { + let permissions = speaker + .styles() + .iter() + .map(move |style| { + ( + *style.id(), + speaker.supported_features().permitted_synthesis_morphing, + ) + }) + .collect(); + (speaker.speaker_uuid().clone(), permissions) + }) + .collect::>>(); + + let morphable_targets = + move |style_id| synthesizer.morphable_targets(style_id).unwrap(); + + (permissions, morphable_targets) + }; + + pretty_assertions::assert_eq!( + indexmap! { + "574bc678-8370-44be-b941-08e46e7b47d7".to_owned() => indexmap! { + StyleId::new(0) => PermittedSynthesisMorphing::Nothing, + }, + "dd9ccd75-75f6-40ce-a3db-960cbed2e905".to_owned() => indexmap! { + StyleId::new(1) => PermittedSynthesisMorphing::All, + }, + "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3".to_owned() => indexmap! { + StyleId::new(302) => PermittedSynthesisMorphing::SelfOnly, + StyleId::new(303) => PermittedSynthesisMorphing::SelfOnly, + }, + }, + permissions, + ); + + pretty_assertions::assert_eq!( + btreemap! { + StyleId::new(0) => MorphableTargetInfo { is_morphable: false }, + StyleId::new(1) => MorphableTargetInfo { is_morphable: false }, + StyleId::new(302) => MorphableTargetInfo { is_morphable: false }, + StyleId::new(303) => MorphableTargetInfo { is_morphable: false }, + }, + morphable_targets(StyleId::new(0)), + ); + + pretty_assertions::assert_eq!( + btreemap! { + StyleId::new(0) => MorphableTargetInfo { is_morphable: false }, + StyleId::new(1) => MorphableTargetInfo { is_morphable: true }, + StyleId::new(302) => MorphableTargetInfo { is_morphable: false }, + StyleId::new(303) => MorphableTargetInfo { is_morphable: false }, + }, + morphable_targets(StyleId::new(1)), + ); + + pretty_assertions::assert_eq!( + btreemap! { + StyleId::new(0) => MorphableTargetInfo { is_morphable: false }, + StyleId::new(1) => MorphableTargetInfo { is_morphable: false }, + StyleId::new(302) => MorphableTargetInfo { is_morphable: true }, + StyleId::new(303) => MorphableTargetInfo { is_morphable: true }, + }, + morphable_targets(StyleId::new(302)), + ); + + pretty_assertions::assert_eq!( + btreemap! { + StyleId::new(0) => MorphableTargetInfo { is_morphable: false }, + StyleId::new(1) => MorphableTargetInfo { is_morphable: false }, + StyleId::new(302) => MorphableTargetInfo { is_morphable: true }, + StyleId::new(303) => MorphableTargetInfo { is_morphable: true }, + }, + morphable_targets(StyleId::new(303)), + ); + } + #[rstest] #[tokio::test] async fn predict_duration_works() { From e21c61c4cafb091074010f48a601446b13f77023 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 22:22:15 +0900 Subject: [PATCH 20/46] =?UTF-8?q?`24000`=20=E2=86=92=20`DEFAULT=5FSAMPLING?= =?UTF-8?q?=5FRATE`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 18 +++++++++++------- crates/voicevox_core/src/synthesizer.rs | 4 ++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 7eed13793..47e94daf6 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -4,7 +4,9 @@ use world::{ spectrogram_like::SpectrogramLike, }; -use crate::{error::ErrorRepr, AudioQueryModel, SpeakerMeta, StyleId}; +use crate::{ + error::ErrorRepr, synthesizer::DEFAULT_SAMPLING_RATE, AudioQueryModel, SpeakerMeta, StyleId, +}; use self::permit::MorphableTargets; @@ -45,7 +47,9 @@ impl<'metas> MorphableTargets<'metas> { ) -> crate::Result> { let morph_rate = f64::from(morph_rate); - if *audio_query.output_sampling_rate() != 24000 || *audio_query.output_stereo() { + if *audio_query.output_sampling_rate() != DEFAULT_SAMPLING_RATE + || *audio_query.output_stereo() + { todo!(); } @@ -82,7 +86,7 @@ impl<'metas> MorphableTargets<'metas> { &morph_param.base_aperiodicity, None, FRAME_PERIOD, - 24000, + DEFAULT_SAMPLING_RATE, ) .map_err(|_| todo!()); @@ -96,9 +100,9 @@ impl<'metas> MorphableTargets<'metas> { } impl MorphingParameter { - fn new(waves_24khz: &MorphingPair>) -> Self { - let (base_f0, base_spectrogram, base_aperiodicity) = analyze(&waves_24khz.base); - let (_, target_spectrogram, _) = analyze(&waves_24khz.target); + fn new(wave: &MorphingPair>) -> Self { + let (base_f0, base_spectrogram, base_aperiodicity) = analyze(&wave.base); + let (_, target_spectrogram, _) = analyze(&wave.target); Self { base_f0, @@ -111,7 +115,7 @@ impl<'metas> MorphableTargets<'metas> { fn analyze(wave: &[f32]) -> (Box<[f64]>, SpectrogramLike, SpectrogramLike) { let analyzer = { - let mut analyzer = SignalAnalyzerBuilder::new(24000); + let mut analyzer = SignalAnalyzerBuilder::new(DEFAULT_SAMPLING_RATE); analyzer.harvest_option_mut().set_frame_period(FRAME_PERIOD); analyzer.build(wave.iter().copied().map(Into::into).collect()) }; diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index eb13fafb3..7a9375fcd 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -1,5 +1,7 @@ use crate::infer::runtimes::Onnxruntime; +pub(crate) const DEFAULT_SAMPLING_RATE: u32 = 24000; + /// [`blocking::Synthesizer::synthesis`]および[`tokio::Synthesizer::synthesis`]のオプション。 /// /// [`blocking::Synthesizer::synthesis`]: blocking::Synthesizer::synthesis @@ -112,8 +114,6 @@ pub(crate) mod blocking { use super::{AccelerationMode, InferenceRuntimeImpl, InitializeOptions, TtsOptions}; - const DEFAULT_SAMPLING_RATE: u32 = 24000; - /// 音声シンセサイザ。 pub struct Synthesizer { pub(super) status: Status, From f53fa113f494b47552aaadac2f77990d8d275d11 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 22:33:14 +0900 Subject: [PATCH 21/46] =?UTF-8?q?FIXME=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 47e94daf6..09966c216 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -88,7 +88,11 @@ impl<'metas> MorphableTargets<'metas> { FRAME_PERIOD, DEFAULT_SAMPLING_RATE, ) - .map_err(|_| todo!()); + .map_err(|_| { + // FIXME: ここをどうするか考える。ただしここのエラーは入力配列が巨大すぎる + // (`world::synthesis::SynthesisError::TooLargeValue`)ときに限るはず + todo!() + }); const FRAME_PERIOD: f64 = 1.; From 57a81f33ca3cb2d66bf857e5703eed6f8c966a19 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 22:35:09 +0900 Subject: [PATCH 22/46] =?UTF-8?q?=E5=86=85=E9=83=A8=E3=83=A1=E3=82=BD?= =?UTF-8?q?=E3=83=83=E3=83=89=E5=90=8D=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 4 ++-- crates/voicevox_core/src/synthesizer.rs | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 09966c216..cdfe2dc79 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -25,7 +25,7 @@ impl crate::blocking::Synthesizer { Ok(MorphableTargets::permit(pair).is_ok()) } - pub(crate) fn synthesis_morphing_( + pub(crate) fn synthesis_morphing_wave( &self, audio_query: &AudioQueryModel, style_ids: MorphingPair, @@ -54,7 +54,7 @@ impl<'metas> MorphableTargets<'metas> { } let waves = &self.get().try_map(|style_id| { - synthesizer.synthesis_impl(audio_query, style_id, &Default::default()) + synthesizer.synthesis_wave(audio_query, style_id, &Default::default()) })?; let morph_param = MorphingParameter::new(waves); diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 7a9375fcd..b117b7c30 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -112,7 +112,10 @@ pub(crate) mod blocking { StyleMeta, SupportedDevices, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; - use super::{AccelerationMode, InferenceRuntimeImpl, InitializeOptions, TtsOptions}; + use super::{ + AccelerationMode, InferenceRuntimeImpl, InitializeOptions, TtsOptions, + DEFAULT_SAMPLING_RATE, + }; /// 音声シンセサイザ。 pub struct Synthesizer { @@ -264,7 +267,7 @@ pub(crate) mod blocking { style_id: StyleId, options: &SynthesisOptions, ) -> Result> { - let wave = &self.synthesis_impl(audio_query, style_id, options)?; + let wave = &self.synthesis_wave(audio_query, style_id, options)?; Ok(to_wav(wave, audio_query)) } @@ -279,11 +282,11 @@ pub(crate) mod blocking { base: base_style_id, target: target_style_id, }; - let wave = &self.synthesis_morphing_(audio_query, style_ids, morph_rate)?; + let wave = &self.synthesis_morphing_wave(audio_query, style_ids, morph_rate)?; Ok(to_wav(wave, audio_query)) } - pub(crate) fn synthesis_impl( + pub(crate) fn synthesis_wave( &self, audio_query: &AudioQueryModel, style_id: StyleId, From 0b896ead54bff38e8afa3511c4da475c5a0b6c2d Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 23:23:55 +0900 Subject: [PATCH 23/46] =?UTF-8?q?`Morph`=20=E2=86=92=20`SpeakerFeature`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/mod.rs | 2 +- crates/voicevox_core/src/engine/morph.rs | 38 ++++++++++++------- crates/voicevox_core/src/error.rs | 29 +++++++++++--- crates/voicevox_core_c_api/src/helpers.rs | 2 +- crates/voicevox_core_c_api/src/result_code.rs | 8 ++-- crates/voicevox_core_java_api/src/common.rs | 2 +- .../voicevox_core_python_api/src/convert.rs | 2 +- 7 files changed, 56 insertions(+), 27 deletions(-) diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs index 3e18273c8..b99bf811a 100644 --- a/crates/voicevox_core/src/engine/mod.rs +++ b/crates/voicevox_core/src/engine/mod.rs @@ -11,5 +11,5 @@ pub(crate) use self::full_context_label::{FullContextLabelError, Utterance}; pub(crate) use self::kana_parser::{create_kana, parse_kana, KanaParseError}; pub use self::model::{AccentPhraseModel, AudioQueryModel, MoraModel, MorphableTargetInfo}; pub(crate) use self::mora_list::mora2text; -pub(crate) use self::morph::{MorphError, MorphingPair}; +pub(crate) use self::morph::MorphingPair; pub use self::open_jtalk::FullcontextExtractor; diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index cdfe2dc79..31dbccd7b 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -1,4 +1,3 @@ -use thiserror::Error; use world::{ signal_analyzer::{AnalyzeResult, SignalAnalyzerBuilder}, spectrogram_like::SpectrogramLike, @@ -10,11 +9,6 @@ use crate::{ use self::permit::MorphableTargets; -// FIXME: 許可対象外のときと、WORLDがなんかエラーを吐いたときとに分割する -#[derive(Error, Debug)] -#[error("指定された話者ペアでのモーフィングに失敗しました")] -pub(crate) struct MorphError; - impl crate::blocking::Synthesizer { pub(crate) fn is_synthesis_morphing_permitted( &self, @@ -188,9 +182,13 @@ impl MorphingPair { mod permit { use std::marker::PhantomData; - use crate::{metas::PermittedSynthesisMorphing, SpeakerMeta, StyleId}; + use crate::{ + error::{SpeakerFeatureError, SpeakerFeatureErrorKind}, + metas::PermittedSynthesisMorphing, + SpeakerMeta, StyleId, + }; - use super::{MorphError, MorphingPair}; + use super::MorphingPair; pub(super) struct MorphableTargets<'metas> { inner: MorphingPair, @@ -200,11 +198,11 @@ mod permit { impl<'metas> MorphableTargets<'metas> { pub(super) fn permit( pair: MorphingPair<(StyleId, &'metas SpeakerMeta)>, - ) -> std::result::Result { + ) -> std::result::Result { match pair.map(|(_, speaker)| { ( speaker.supported_features().permitted_synthesis_morphing, - speaker.speaker_uuid(), + speaker, ) }) { MorphingPair { @@ -213,11 +211,23 @@ mod permit { } => {} MorphingPair { - base: (PermittedSynthesisMorphing::SelfOnly, base_speaker_uuid), - target: (PermittedSynthesisMorphing::SelfOnly, target_speaker_uuid), - } if base_speaker_uuid == target_speaker_uuid => {} + base: (PermittedSynthesisMorphing::SelfOnly, base), + target: (PermittedSynthesisMorphing::SelfOnly, target), + } if base.speaker_uuid() == target.speaker_uuid() => {} - _ => return Err(MorphError), + MorphingPair { + base: (_, base), + target: (_, target), + } => { + return Err(SpeakerFeatureError { + speaker_name: base.name().clone(), + speaker_uuid: base.speaker_uuid().clone(), + context: SpeakerFeatureErrorKind::Morph { + target_speaker_name: target.name().clone(), + target_speaker_uuid: target.speaker_uuid().clone(), + }, + }) + } } Ok(Self { diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs index 96d95a052..b4ef177c6 100644 --- a/crates/voicevox_core/src/error.rs +++ b/crates/voicevox_core/src/error.rs @@ -1,5 +1,5 @@ use crate::{ - engine::{FullContextLabelError, KanaParseError, MorphError}, + engine::{FullContextLabelError, KanaParseError}, user_dict::InvalidWordError, StyleId, VoiceModelId, }; @@ -20,7 +20,7 @@ pub struct Error(#[from] ErrorRepr); [ FullContextLabelError ]; [ KanaParseError ]; [ InvalidWordError ]; - [ MorphError ]; + [ SpeakerFeatureError ]; )] impl From for Error { fn from(err: E) -> Self { @@ -52,7 +52,7 @@ impl Error { ErrorRepr::WordNotFound(_) => ErrorKind::WordNotFound, ErrorRepr::UseUserDict(_) => ErrorKind::UseUserDict, ErrorRepr::InvalidWord(_) => ErrorKind::InvalidWord, - ErrorRepr::Morph(_) => ErrorKind::Morph, + ErrorRepr::SpeakerFeature(_) => ErrorKind::SpeakerFeature, } } } @@ -108,7 +108,7 @@ pub(crate) enum ErrorRepr { InvalidWord(#[from] InvalidWordError), #[error(transparent)] - Morph(#[from] MorphError), + SpeakerFeature(#[from] SpeakerFeatureError), } /// エラーの種類。 @@ -150,8 +150,8 @@ pub enum ErrorKind { UseUserDict, /// ユーザー辞書の単語のバリデーションに失敗した。 InvalidWord, - /// 指定された話者ペアでのモーフィングに失敗した。 - Morph, + /// 要求された機能を話者が持っていない。 + SpeakerFeature, } pub(crate) type LoadModelResult = std::result::Result; @@ -179,3 +179,20 @@ pub(crate) enum LoadModelErrorKind { #[display(fmt = "モデルデータを読むことができませんでした")] InvalidModelData, } + +#[derive(Error, Debug)] +#[error("`{speaker_name}` ({speaker_uuid})は以下の機能を持ちません: {context}")] +pub(crate) struct SpeakerFeatureError { + pub(crate) speaker_name: String, + pub(crate) speaker_uuid: String, + pub(crate) context: SpeakerFeatureErrorKind, +} + +#[derive(derive_more::Display, Debug)] +pub(crate) enum SpeakerFeatureErrorKind { + #[display(fmt = "`{target_speaker_name}` ({target_speaker_uuid})に対するモーフィング")] + Morph { + target_speaker_name: String, + target_speaker_uuid: String, + }, +} diff --git a/crates/voicevox_core_c_api/src/helpers.rs b/crates/voicevox_core_c_api/src/helpers.rs index 9cc346033..5c70ce5e5 100644 --- a/crates/voicevox_core_c_api/src/helpers.rs +++ b/crates/voicevox_core_c_api/src/helpers.rs @@ -51,7 +51,7 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxRes WordNotFound => VOICEVOX_RESULT_USER_DICT_WORD_NOT_FOUND_ERROR, UseUserDict => VOICEVOX_RESULT_USE_USER_DICT_ERROR, InvalidWord => VOICEVOX_RESULT_INVALID_USER_DICT_WORD_ERROR, - Morph => VOICEVOX_RESULT_MORPH_ERROR, + SpeakerFeature => VOICEVOX_RESULT_SPEAKER_FEATURE_ERROR, }, Err(InvalidUtf8Input) => VOICEVOX_RESULT_INVALID_UTF8_INPUT_ERROR, Err(InvalidAudioQuery(_)) => VOICEVOX_RESULT_INVALID_AUDIO_QUERY_ERROR, diff --git a/crates/voicevox_core_c_api/src/result_code.rs b/crates/voicevox_core_c_api/src/result_code.rs index be66cb702..44c2b860d 100644 --- a/crates/voicevox_core_c_api/src/result_code.rs +++ b/crates/voicevox_core_c_api/src/result_code.rs @@ -55,8 +55,8 @@ pub enum VoicevoxResultCode { VOICEVOX_RESULT_INVALID_USER_DICT_WORD_ERROR = 24, /// UUIDの変換に失敗した VOICEVOX_RESULT_INVALID_UUID_ERROR = 25, - /// 指定された話者ペアでのモーフィングが不可能 - VOICEVOX_RESULT_MORPH_ERROR = 28, + /// 要求された機能を話者が持っていない + VOICEVOX_RESULT_SPEAKER_FEATURE_ERROR = 28, } pub(crate) const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static CStr { @@ -109,6 +109,8 @@ pub(crate) const fn error_result_to_message(result_code: VoicevoxResultCode) -> cstr!("ユーザー辞書の単語のバリデーションに失敗しました") } VOICEVOX_RESULT_INVALID_UUID_ERROR => cstr!("UUIDの変換に失敗しました"), - VOICEVOX_RESULT_MORPH_ERROR => cstr!("指定された話者ペアでのモーフィングはできません"), + VOICEVOX_RESULT_SPEAKER_FEATURE_ERROR => { + cstr!("要求された機能を話者は持っていません") + } } } diff --git a/crates/voicevox_core_java_api/src/common.rs b/crates/voicevox_core_java_api/src/common.rs index fb93c030a..3fe791e38 100644 --- a/crates/voicevox_core_java_api/src/common.rs +++ b/crates/voicevox_core_java_api/src/common.rs @@ -146,7 +146,7 @@ where WordNotFound, UseUserDict, InvalidWord, - Morph, // TODO + SpeakerFeature, // TODO ); let mut sources = diff --git a/crates/voicevox_core_python_api/src/convert.rs b/crates/voicevox_core_python_api/src/convert.rs index 758a8e513..6a1037982 100644 --- a/crates/voicevox_core_python_api/src/convert.rs +++ b/crates/voicevox_core_python_api/src/convert.rs @@ -203,7 +203,7 @@ pub impl voicevox_core::Result { ErrorKind::WordNotFound => WordNotFoundError::new_err(msg), ErrorKind::UseUserDict => UseUserDictError::new_err(msg), ErrorKind::InvalidWord => InvalidWordError::new_err(msg), - ErrorKind::Morph => todo!(), + ErrorKind::SpeakerFeature => todo!(), }; [top] From c8c85b0893a66c9169bf7e565ca84439dd600909 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 23:41:17 +0900 Subject: [PATCH 24/46] =?UTF-8?q?`to=5Fwav`=E3=82=92=E7=A7=BB=E5=8B=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/audio_file.rs | 60 +++++++++++++++++ crates/voicevox_core/src/engine/mod.rs | 2 + crates/voicevox_core/src/engine/morph.rs | 12 ++-- crates/voicevox_core/src/synthesizer.rs | 67 +------------------ 4 files changed, 72 insertions(+), 69 deletions(-) create mode 100644 crates/voicevox_core/src/engine/audio_file.rs diff --git a/crates/voicevox_core/src/engine/audio_file.rs b/crates/voicevox_core/src/engine/audio_file.rs new file mode 100644 index 000000000..f6facea6d --- /dev/null +++ b/crates/voicevox_core/src/engine/audio_file.rs @@ -0,0 +1,60 @@ +use std::io::{Cursor, Write as _}; + +use az::{Az as _, Cast}; +use num_traits::Float; + +use crate::{synthesizer::DEFAULT_SAMPLING_RATE, AudioQueryModel}; + +pub(crate) fn to_wav + From + Cast>( + wave: &[T], + audio_query: &AudioQueryModel, +) -> Vec { + // TODO: ライブラリ(e.g. https://docs.rs/hound)を使う + + let volume_scale = *audio_query.volume_scale(); + let output_stereo = *audio_query.output_stereo(); + let output_sampling_rate = *audio_query.output_sampling_rate(); + + // TODO: 44.1kHzなどの対応 + + let num_channels: u16 = if output_stereo { 2 } else { 1 }; + let bit_depth: u16 = 16; + let repeat_count: u32 = (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; + let block_size: u16 = bit_depth * num_channels / 8; + + let bytes_size = wave.len() as u32 * repeat_count * 2; + let wave_size = bytes_size + 44; + + let buf: Vec = Vec::with_capacity(wave_size as usize); + let mut cur = Cursor::new(buf); + + cur.write_all("RIFF".as_bytes()).unwrap(); + cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); + cur.write_all("WAVEfmt ".as_bytes()).unwrap(); + cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length + cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM + cur.write_all(&num_channels.to_le_bytes()).unwrap(); + cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); + + let block_rate = output_sampling_rate * block_size as u32; + + cur.write_all(&block_rate.to_le_bytes()).unwrap(); + cur.write_all(&block_size.to_le_bytes()).unwrap(); + cur.write_all(&bit_depth.to_le_bytes()).unwrap(); + cur.write_all("data".as_bytes()).unwrap(); + cur.write_all(&bytes_size.to_le_bytes()).unwrap(); + + for &value in wave { + let v = num_traits::clamp( + value * >::from(volume_scale), + -T::one(), + T::one(), + ); + let data = (v * >::from(0x7fff)).az::(); + for _ in 0..repeat_count { + cur.write_all(&data.to_le_bytes()).unwrap(); + } + } + + cur.into_inner() +} diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs index b99bf811a..64b36d8c7 100644 --- a/crates/voicevox_core/src/engine/mod.rs +++ b/crates/voicevox_core/src/engine/mod.rs @@ -1,4 +1,5 @@ mod acoustic_feature_extractor; +pub(crate) mod audio_file; mod full_context_label; mod kana_parser; mod model; @@ -7,6 +8,7 @@ mod morph; pub(crate) mod open_jtalk; pub(crate) use self::acoustic_feature_extractor::OjtPhoneme; +pub(crate) use self::audio_file::to_wav; pub(crate) use self::full_context_label::{FullContextLabelError, Utterance}; pub(crate) use self::kana_parser::{create_kana, parse_kana, KanaParseError}; pub use self::model::{AccentPhraseModel, AudioQueryModel, MoraModel, MorphableTargetInfo}; diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 31dbccd7b..5bb1d0948 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -19,12 +19,12 @@ impl crate::blocking::Synthesizer { Ok(MorphableTargets::permit(pair).is_ok()) } - pub(crate) fn synthesis_morphing_wave( + pub(crate) fn synthesis_morphing_( &self, audio_query: &AudioQueryModel, style_ids: MorphingPair, morph_rate: f32, - ) -> crate::Result> { + ) -> crate::Result> { let metas = &self.metas(); let pair = style_ids.lookup_speakers(metas)?; @@ -38,7 +38,7 @@ impl<'metas> MorphableTargets<'metas> { synthesizer: &crate::blocking::Synthesizer, audio_query: &AudioQueryModel, morph_rate: f32, - ) -> crate::Result> { + ) -> crate::Result> { let morph_rate = f64::from(morph_rate); if *audio_query.output_sampling_rate() != DEFAULT_SAMPLING_RATE @@ -74,7 +74,7 @@ impl<'metas> MorphableTargets<'metas> { } } - return world::synthesis::synthesis( + let wave = &world::synthesis::synthesis( &morph_param.base_f0, &morph_spectrogram, &morph_param.base_aperiodicity, @@ -82,12 +82,14 @@ impl<'metas> MorphableTargets<'metas> { FRAME_PERIOD, DEFAULT_SAMPLING_RATE, ) - .map_err(|_| { + .unwrap_or_else(|_| { // FIXME: ここをどうするか考える。ただしここのエラーは入力配列が巨大すぎる // (`world::synthesis::SynthesisError::TooLargeValue`)ときに限るはず todo!() }); + return Ok(super::to_wav(wave, audio_query)); + const FRAME_PERIOD: f64 = 1.; struct MorphingParameter { diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index b117b7c30..2236c0a68 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -83,14 +83,9 @@ pub(crate) mod blocking { // (ブロッキング版をpublic APIにするならの話ではあるが)ブロッキング版はブロッキング版でコード例 // を用意する - use std::{ - collections::BTreeMap, - io::{Cursor, Write as _}, - }; + use std::collections::BTreeMap; - use az::{Az as _, Cast}; use enum_map::enum_map; - use num_traits::Float; use crate::{ engine::{ @@ -268,7 +263,7 @@ pub(crate) mod blocking { options: &SynthesisOptions, ) -> Result> { let wave = &self.synthesis_wave(audio_query, style_id, options)?; - Ok(to_wav(wave, audio_query)) + Ok(engine::to_wav(wave, audio_query)) } pub fn synthesis_morphing( @@ -282,8 +277,7 @@ pub(crate) mod blocking { base: base_style_id, target: target_style_id, }; - let wave = &self.synthesis_morphing_wave(audio_query, style_ids, morph_rate)?; - Ok(to_wav(wave, audio_query)) + self.synthesis_morphing_(audio_query, style_ids, morph_rate) } pub(crate) fn synthesis_wave( @@ -1223,61 +1217,6 @@ pub(crate) mod blocking { ) } } - - fn to_wav + From + Cast>( - wave: &[T], - audio_query: &AudioQueryModel, - ) -> Vec { - // TODO: ライブラリ(e.g. https://docs.rs/hound)を使う - - let volume_scale = *audio_query.volume_scale(); - let output_stereo = *audio_query.output_stereo(); - let output_sampling_rate = *audio_query.output_sampling_rate(); - - // TODO: 44.1kHzなどの対応 - - let num_channels: u16 = if output_stereo { 2 } else { 1 }; - let bit_depth: u16 = 16; - let repeat_count: u32 = - (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32; - let block_size: u16 = bit_depth * num_channels / 8; - - let bytes_size = wave.len() as u32 * repeat_count * 2; - let wave_size = bytes_size + 44; - - let buf: Vec = Vec::with_capacity(wave_size as usize); - let mut cur = Cursor::new(buf); - - cur.write_all("RIFF".as_bytes()).unwrap(); - cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); - cur.write_all("WAVEfmt ".as_bytes()).unwrap(); - cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length - cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM - cur.write_all(&num_channels.to_le_bytes()).unwrap(); - cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); - - let block_rate = output_sampling_rate * block_size as u32; - - cur.write_all(&block_rate.to_le_bytes()).unwrap(); - cur.write_all(&block_size.to_le_bytes()).unwrap(); - cur.write_all(&bit_depth.to_le_bytes()).unwrap(); - cur.write_all("data".as_bytes()).unwrap(); - cur.write_all(&bytes_size.to_le_bytes()).unwrap(); - - for &value in wave { - let v = num_traits::clamp( - value * >::from(volume_scale), - -T::one(), - T::one(), - ); - let data = (v * >::from(0x7fff)).az::(); - for _ in 0..repeat_count { - cur.write_all(&data.to_le_bytes()).unwrap(); - } - } - - cur.into_inner() - } } pub(crate) mod tokio { From e1f94b122179e34e2e4e862aaa2c086144572a83 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 23:43:27 +0900 Subject: [PATCH 25/46] =?UTF-8?q?FIXME=E3=82=B3=E3=83=A1=E3=83=B3=E3=83=88?= =?UTF-8?q?=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 5bb1d0948..ee459897a 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -83,7 +83,7 @@ impl<'metas> MorphableTargets<'metas> { DEFAULT_SAMPLING_RATE, ) .unwrap_or_else(|_| { - // FIXME: ここをどうするか考える。ただしここのエラーは入力配列が巨大すぎる + // FIXME: ここをどうするか考える。ただしここのエラーはspectrogramが巨大すぎる // (`world::synthesis::SynthesisError::TooLargeValue`)ときに限るはず todo!() }); From 38b8732b785f283e8a06129be1d0d1d1e7e396b1 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 23:50:40 +0900 Subject: [PATCH 26/46] =?UTF-8?q?"WARNING"=E3=82=92=E6=B6=88=E3=81=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index ee459897a..1991b90c1 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -176,11 +176,6 @@ impl MorphingPair { } } -// ==========================================WARNING============================================== -// -// DO NOT BYPASS THIS OR YOU MAY VIOLATE THE ToS OF THE MODELS -// -// =============================================================================================== mod permit { use std::marker::PhantomData; From e283209f4c96b823bf27d8fd2f7498a82af26b3c Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 31 Dec 2023 23:52:00 +0900 Subject: [PATCH 27/46] =?UTF-8?q?voicevox=5Fcore.h=E3=82=92=E3=82=A2?= =?UTF-8?q?=E3=83=83=E3=83=97=E3=83=87=E3=83=BC=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core_c_api/include/voicevox_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/voicevox_core_c_api/include/voicevox_core.h b/crates/voicevox_core_c_api/include/voicevox_core.h index 20d962845..7b837b9e9 100644 --- a/crates/voicevox_core_c_api/include/voicevox_core.h +++ b/crates/voicevox_core_c_api/include/voicevox_core.h @@ -179,9 +179,9 @@ enum VoicevoxResultCode */ VOICEVOX_RESULT_INVALID_UUID_ERROR = 25, /** - * 指定された話者ペアでのモーフィングが不可能 + * 要求された機能を話者が持っていない */ - VOICEVOX_RESULT_MORPH_ERROR = 28, + VOICEVOX_RESULT_SPEAKER_FEATURE_ERROR = 28, }; #ifndef __cplusplus typedef int32_t VoicevoxResultCode; From bdf874f88885e8ec834c83a9369cf61d2895c09b Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Tue, 2 Jan 2024 07:42:44 +0900 Subject: [PATCH 28/46] =?UTF-8?q?C=20API=E5=AE=9F=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../include/voicevox_core.h | 34 +++ crates/voicevox_core_c_api/src/lib.rs | 60 ++++++ .../tests/e2e/snapshots.toml | 27 +++ .../voicevox_core_c_api/tests/e2e/symbols.rs | 25 ++- .../tests/e2e/testcases.rs | 1 + .../tests/e2e/testcases/morph.rs | 197 ++++++++++++++++++ 6 files changed, 343 insertions(+), 1 deletion(-) create mode 100644 crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs diff --git a/crates/voicevox_core_c_api/include/voicevox_core.h b/crates/voicevox_core_c_api/include/voicevox_core.h index 7b837b9e9..9b242596e 100644 --- a/crates/voicevox_core_c_api/include/voicevox_core.h +++ b/crates/voicevox_core_c_api/include/voicevox_core.h @@ -643,6 +643,19 @@ __declspec(dllimport) #endif VoicevoxResultCode voicevox_create_supported_devices_json(char **output_supported_devices_json); +/** + * \safety{ + * - `synthesizer`は ::voicevox_synthesizer_new で得たものでなければならず、また ::voicevox_synthesizer_delete で解放されていてはいけない。 + * - `output`は書き込みについて有効でなければならない。 + * } + */ +#ifdef _WIN32 +__declspec(dllimport) +#endif +VoicevoxResultCode voicevox_synthesizer_create_morphable_targets_json(const struct VoicevoxSynthesizer *synthesizer, + VoicevoxStyleId style_id, + char **output); + /** * AquesTalk風記法から、AudioQueryをJSONとして生成する。 * @@ -902,6 +915,25 @@ VoicevoxResultCode voicevox_synthesizer_synthesis(const struct VoicevoxSynthesiz uintptr_t *output_wav_length, uint8_t **output_wav); +/** + * \safety{ + * - `synthesizer`は ::voicevox_synthesizer_new で得たものでなければならず、また ::voicevox_synthesizer_delete で解放されていてはいけない。 + * - `audio_query_json`はヌル終端文字列を指し、かつ読み込みについて有効でなければならない。 + * - `output_wav_length`は書き込みについて有効でなければならない。 + * - `output_wav`は書き込みについて有効でなければならない。 + * } + */ +#ifdef _WIN32 +__declspec(dllimport) +#endif +VoicevoxResultCode voicevox_synthesizer_synthesis_morphing(const struct VoicevoxSynthesizer *synthesizer, + const char *audio_query_json, + VoicevoxStyleId base_style_id, + VoicevoxStyleId target_style_id, + float morph_rate, + uintptr_t *output_wav_length, + uint8_t **output_wav); + /** * デフォルトのテキスト音声合成オプションを生成する * @return テキスト音声合成オプション @@ -982,6 +1014,7 @@ VoicevoxResultCode voicevox_synthesizer_tts(const struct VoicevoxSynthesizer *sy * - `json`は以下のAPIで得られたポインタでなくてはいけない。 * - ::voicevox_create_supported_devices_json * - ::voicevox_synthesizer_create_metas_json + * - ::voicevox_synthesizer_create_morphable_targets_json * - ::voicevox_synthesizer_create_audio_query * - ::voicevox_synthesizer_create_accent_phrases * - ::voicevox_synthesizer_replace_mora_data @@ -1006,6 +1039,7 @@ void voicevox_json_free(char *json); * \safety{ * - `wav`は以下のAPIで得られたポインタでなくてはいけない。 * - ::voicevox_synthesizer_synthesis + * - ::voicevox_synthesizer_synthesis_morphing * - ::voicevox_synthesizer_tts * - `wav`は読み込みと書き込みについて有効でなければならない。 * - `wav`は以後ダングリングポインタ(_dangling pointer_)として扱われなくてはならない。 diff --git a/crates/voicevox_core_c_api/src/lib.rs b/crates/voicevox_core_c_api/src/lib.rs index d2946f02b..dbec261fa 100644 --- a/crates/voicevox_core_c_api/src/lib.rs +++ b/crates/voicevox_core_c_api/src/lib.rs @@ -505,6 +505,32 @@ pub unsafe extern "C" fn voicevox_create_supported_devices_json( })()) } +/// \safety{ +/// - `synthesizer`は ::voicevox_synthesizer_new で得たものでなければならず、また ::voicevox_synthesizer_delete で解放されていてはいけない。 +/// - `output`は書き込みについて有効でなければならない。 +/// } +#[no_mangle] +pub unsafe extern "C" fn voicevox_synthesizer_create_morphable_targets_json( + synthesizer: &VoicevoxSynthesizer, + style_id: VoicevoxStyleId, + output: NonNull<*mut c_char>, +) -> VoicevoxResultCode { + init_logger_once(); + into_result_code_with_error((|| { + let morphable_targets = &synthesizer + .synthesizer + .morphable_targets(StyleId::new(style_id))?; + let morphable_targets = serde_json::to_string(morphable_targets).expect("should not fail"); + let morphable_targets = CString::new(morphable_targets).expect("should not end with NUL"); + output.as_ptr().write_unaligned( + C_STRING_DROP_CHECKER + .whitelist(morphable_targets) + .into_raw(), + ); + Ok(()) + })()) +} + /// AquesTalk風記法から、AudioQueryをJSONとして生成する。 /// /// 生成したJSON文字列を解放するには ::voicevox_json_free を使う。 @@ -878,6 +904,38 @@ pub unsafe extern "C" fn voicevox_synthesizer_synthesis( })()) } +/// \safety{ +/// - `synthesizer`は ::voicevox_synthesizer_new で得たものでなければならず、また ::voicevox_synthesizer_delete で解放されていてはいけない。 +/// - `audio_query_json`はヌル終端文字列を指し、かつ読み込みについて有効でなければならない。 +/// - `output_wav_length`は書き込みについて有効でなければならない。 +/// - `output_wav`は書き込みについて有効でなければならない。 +/// } +#[no_mangle] +pub unsafe extern "C" fn voicevox_synthesizer_synthesis_morphing( + synthesizer: &VoicevoxSynthesizer, + audio_query_json: *const c_char, + base_style_id: VoicevoxStyleId, + target_style_id: VoicevoxStyleId, + morph_rate: f32, + output_wav_length: NonNull, + output_wav: NonNull<*mut u8>, +) -> VoicevoxResultCode { + init_logger_once(); + into_result_code_with_error((|| { + let audio_query_json = ensure_utf8(CStr::from_ptr(audio_query_json))?; + let audio_query = &serde_json::from_str::(audio_query_json) + .map_err(CApiError::InvalidAudioQuery)?; + let wav = synthesizer.synthesizer().synthesis_morphing( + audio_query, + StyleId::new(base_style_id), + StyleId::new(target_style_id), + morph_rate, + )?; + U8_SLICE_OWNER.own_and_lend(wav, output_wav, output_wav_length); + Ok(()) + })()) +} + /// ::voicevox_synthesizer_tts のオプション。 #[repr(C)] pub struct VoicevoxTtsOptions { @@ -983,6 +1041,7 @@ pub unsafe extern "C" fn voicevox_synthesizer_tts( /// - `json`は以下のAPIで得られたポインタでなくてはいけない。 /// - ::voicevox_create_supported_devices_json /// - ::voicevox_synthesizer_create_metas_json +/// - ::voicevox_synthesizer_create_morphable_targets_json /// - ::voicevox_synthesizer_create_audio_query /// - ::voicevox_synthesizer_create_accent_phrases /// - ::voicevox_synthesizer_replace_mora_data @@ -1006,6 +1065,7 @@ pub unsafe extern "C" fn voicevox_json_free(json: *mut c_char) { /// \safety{ /// - `wav`は以下のAPIで得られたポインタでなくてはいけない。 /// - ::voicevox_synthesizer_synthesis +/// - ::voicevox_synthesizer_synthesis_morphing /// - ::voicevox_synthesizer_tts /// - `wav`は読み込みと書き込みについて有効でなければならない。 /// - `wav`は以後ダングリングポインタ(_dangling pointer_)として扱われなくてはならない。 diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index f86103385..6236955da 100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -81,8 +81,35 @@ result_messages.22 = "ユーザー辞書に単語が見つかりませんでし result_messages.23 = "OpenJTalkのユーザー辞書の設定に失敗しました" result_messages.24 = "ユーザー辞書の単語のバリデーションに失敗しました" result_messages.25 = "UUIDの変換に失敗しました" +result_messages.28 = "要求された機能を話者は持っていません" stderr = "" +[morph.'{"text":"こんにちは、音声合成の世界へようこそ","base_style":1,"target_style":1}'] +ok = true +stderr.windows = ''' +{windows-video-cards} +''' +stderr.unix = "" + +[morph.'{"text":"こんにちは、音声合成の世界へようこそ","base_style":302,"target_style":303}'] +ok = true +stderr.windows = ''' +{windows-video-cards} +''' +stderr.unix = "" + +[morph.'{"text":"こんにちは、音声合成の世界へようこそ","base_style":1,"target_style":302}'] +ok = false +stderr.windows = ''' +{windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +''' + [simple_tts] output."こんにちは、音声合成の世界へようこそ".wav_length = 176172 stderr.windows = ''' diff --git a/crates/voicevox_core_c_api/tests/e2e/symbols.rs b/crates/voicevox_core_c_api/tests/e2e/symbols.rs index f6ecd4a3b..17d06bbdc 100644 --- a/crates/voicevox_core_c_api/tests/e2e/symbols.rs +++ b/crates/voicevox_core_c_api/tests/e2e/symbols.rs @@ -59,6 +59,14 @@ pub(crate) struct Symbols<'lib> { Symbol<'lib, unsafe extern "C" fn(*const VoicevoxSynthesizer) -> *mut c_char>, pub(crate) voicevox_create_supported_devices_json: Symbol<'lib, unsafe extern "C" fn(*mut *mut c_char) -> VoicevoxResultCode>, + pub(crate) voicevox_synthesizer_create_morphable_targets_json: Symbol< + 'lib, + unsafe extern "C" fn( + *const VoicevoxSynthesizer, + VoicevoxStyleId, + *mut *mut c_char, + ) -> VoicevoxResultCode, + >, pub(crate) voicevox_synthesizer_create_audio_query_from_kana: Symbol< 'lib, unsafe extern "C" fn( @@ -90,6 +98,18 @@ pub(crate) struct Symbols<'lib> { *mut *mut u8, ) -> VoicevoxResultCode, >, + pub(crate) voicevox_synthesizer_synthesis_morphing: Symbol< + 'lib, + unsafe extern "C" fn( + *const VoicevoxSynthesizer, + *const c_char, + VoicevoxStyleId, + VoicevoxStyleId, + f32, + *mut usize, + *mut *mut u8, + ) -> VoicevoxResultCode, + >, pub(crate) voicevox_make_default_tts_options: Symbol<'lib, unsafe extern "C" fn() -> VoicevoxTtsOptions>, pub(crate) voicevox_synthesizer_tts_from_kana: Symbol< @@ -222,10 +242,12 @@ impl<'lib> Symbols<'lib> { voicevox_synthesizer_is_loaded_voice_model, voicevox_synthesizer_create_metas_json, voicevox_create_supported_devices_json, + voicevox_synthesizer_create_morphable_targets_json, voicevox_synthesizer_create_audio_query_from_kana, voicevox_synthesizer_create_audio_query, voicevox_make_default_synthesis_options, voicevox_synthesizer_synthesis, + voicevox_synthesizer_synthesis_morphing, voicevox_make_default_tts_options, voicevox_synthesizer_tts_from_kana, voicevox_synthesizer_tts, @@ -260,7 +282,7 @@ type OpenJtalkRc = c_void; type VoicevoxVoiceModel = c_void; type VoicevoxVoiceModelId = *const c_char; type VoicevoxSynthesizer = c_void; -type VoicevoxStyleId = u32; +pub(crate) type VoicevoxStyleId = u32; #[repr(i32)] #[derive(Debug, PartialEq, Eq, Clone, Copy, EnumIter)] @@ -289,6 +311,7 @@ pub(crate) enum VoicevoxResultCode { VOICEVOX_RESULT_USE_USER_DICT_ERROR = 23, VOICEVOX_RESULT_INVALID_USER_DICT_WORD_ERROR = 24, VOICEVOX_RESULT_INVALID_UUID_ERROR = 25, + VOICEVOX_RESULT_SPEAKER_FEATURE = 28, } #[repr(i32)] diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases.rs b/crates/voicevox_core_c_api/tests/e2e/testcases.rs index 31eb9cdfe..36cc1fc8b 100644 --- a/crates/voicevox_core_c_api/tests/e2e/testcases.rs +++ b/crates/voicevox_core_c_api/tests/e2e/testcases.rs @@ -1,6 +1,7 @@ mod compatible_engine; mod compatible_engine_load_model_before_initialize; mod global_info; +mod morph; mod simple_tts; mod synthesizer_new_output_json; mod tts_via_audio_query; diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs new file mode 100644 index 000000000..d73a12f88 --- /dev/null +++ b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs @@ -0,0 +1,197 @@ +use std::{ + collections::HashMap, + ffi::{CStr, CString}, + fmt::{self, Display}, + mem::MaybeUninit, +}; + +use anyhow::bail; +use assert_cmd::assert::AssertResult; +use cstr::cstr; +use libloading::Library; +use once_cell::sync::Lazy; +use serde::{Deserialize, Serialize}; +use test_util::OPEN_JTALK_DIC_DIR; + +use crate::{ + assert_cdylib::{self, case, Utf8Output}, + snapshots, + symbols::{ + Symbols, VoicevoxAccelerationMode, VoicevoxInitializeOptions, VoicevoxResultCode, + VoicevoxStyleId, + }, +}; + +case!(TestCase { + text: "こんにちは、音声合成の世界へようこそ".to_owned(), + base_style: 1, + target_style: 1, +}); +case!(TestCase { + text: "こんにちは、音声合成の世界へようこそ".to_owned(), + base_style: 302, + target_style: 303, +}); +case!(TestCase { + text: "こんにちは、音声合成の世界へようこそ".to_owned(), + base_style: 1, + target_style: 302, +}); + +#[derive(Serialize, Deserialize)] +struct TestCase { + text: String, + base_style: VoicevoxStyleId, + target_style: VoicevoxStyleId, +} + +impl Display for TestCase { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&serde_json::to_string(self).unwrap()) + } +} + +#[typetag::serde(name = "morph")] +impl assert_cdylib::TestCase for TestCase { + unsafe fn exec(&self, lib: &Library) -> anyhow::Result<()> { + let Symbols { + voicevox_open_jtalk_rc_new, + voicevox_open_jtalk_rc_delete, + voicevox_make_default_initialize_options, + voicevox_voice_model_new_from_path, + voicevox_voice_model_delete, + voicevox_synthesizer_new, + voicevox_synthesizer_delete, + voicevox_synthesizer_load_voice_model, + voicevox_synthesizer_create_morphable_targets_json, + voicevox_synthesizer_create_audio_query, + voicevox_synthesizer_synthesis_morphing, + voicevox_json_free, + voicevox_wav_free, + .. + } = Symbols::new(lib)?; + + let model = { + let mut model = MaybeUninit::uninit(); + assert_ok(voicevox_voice_model_new_from_path( + cstr!("../../model/sample.vvm").as_ptr(), + model.as_mut_ptr(), + )); + model.assume_init() + }; + + let openjtalk = { + let mut openjtalk = MaybeUninit::uninit(); + let open_jtalk_dic_dir = CString::new(OPEN_JTALK_DIC_DIR).unwrap(); + assert_ok(voicevox_open_jtalk_rc_new( + open_jtalk_dic_dir.as_ptr(), + openjtalk.as_mut_ptr(), + )); + openjtalk.assume_init() + }; + + let synthesizer = { + let mut synthesizer = MaybeUninit::uninit(); + assert_ok(voicevox_synthesizer_new( + openjtalk, + VoicevoxInitializeOptions { + acceleration_mode: VoicevoxAccelerationMode::VOICEVOX_ACCELERATION_MODE_CPU, + ..voicevox_make_default_initialize_options() + }, + synthesizer.as_mut_ptr(), + )); + synthesizer.assume_init() + }; + + assert_ok(voicevox_synthesizer_load_voice_model(synthesizer, model)); + + let audio_query = { + let mut audio_query = MaybeUninit::uninit(); + let text = CString::new(&*self.text).unwrap(); + assert_ok(voicevox_synthesizer_create_audio_query( + synthesizer, + text.as_ptr(), + self.base_style, + audio_query.as_mut_ptr(), + )); + audio_query.assume_init() + }; + + let morphable_targets = { + let mut morphable_target = MaybeUninit::uninit(); + assert_ok(voicevox_synthesizer_create_morphable_targets_json( + synthesizer, + self.base_style, + morphable_target.as_mut_ptr(), + )); + morphable_target.assume_init() + }; + + let MorphableTargetInfo { is_morphable } = + serde_json::from_slice::>( + CStr::from_ptr(morphable_targets).to_bytes(), + )?[&self.target_style]; + + let result = { + const MORPH_RATE: f32 = 0.5; + + let mut wav_length = MaybeUninit::uninit(); + let mut wav = MaybeUninit::uninit(); + let result = voicevox_synthesizer_synthesis_morphing( + synthesizer, + audio_query, + self.base_style, + self.target_style, + MORPH_RATE, + wav_length.as_mut_ptr(), + wav.as_mut_ptr(), + ); + match result { + VoicevoxResultCode::VOICEVOX_RESULT_OK => Ok(wav.assume_init()), + VoicevoxResultCode::VOICEVOX_RESULT_SPEAKER_FEATURE => Err(()), + result => bail!("code = {result:?}"), + } + }; + + std::assert_eq!(is_morphable, result.is_ok()); + std::assert_eq!(SNAPSHOTS[&self.to_string()].ok, result.is_ok()); + + voicevox_voice_model_delete(model); + voicevox_open_jtalk_rc_delete(openjtalk); + voicevox_synthesizer_delete(synthesizer); + voicevox_json_free(audio_query); + voicevox_json_free(morphable_targets); + if let Ok(wav) = result { + voicevox_wav_free(wav); + } + return Ok(()); + + fn assert_ok(result_code: VoicevoxResultCode) { + std::assert_eq!(VoicevoxResultCode::VOICEVOX_RESULT_OK, result_code); + } + + #[derive(Deserialize)] + struct MorphableTargetInfo { + is_morphable: bool, + } + } + + fn assert_output(&self, output: Utf8Output) -> AssertResult { + output + .mask_timestamps() + .mask_windows_video_cards() + .assert() + .try_success()? + .try_stdout("")? + .try_stderr(&*SNAPSHOTS[&self.to_string()].stderr) + } +} + +static SNAPSHOTS: Lazy> = snapshots::section!(morph); + +#[derive(Deserialize)] +struct Snapshot { + ok: bool, + #[serde(deserialize_with = "snapshots::deserialize_platform_specific_snapshot")] + stderr: String, +} From 503f035479e6dfb03a884fce287b6dc72467f3f9 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Tue, 2 Jan 2024 13:39:35 +0900 Subject: [PATCH 29/46] =?UTF-8?q?Python=20API=E3=81=AE=E5=AE=9F=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/metas.rs | 1 + .../python/voicevox_core/__init__.py | 6 ++ .../python/voicevox_core/_models.py | 26 ++++++ .../python/voicevox_core/_rust/__init__.pyi | 5 ++ .../python/voicevox_core/_rust/asyncio.pyi | 11 +++ .../python/voicevox_core/_rust/blocking.pyi | 11 +++ .../voicevox_core_python_api/src/convert.rs | 6 +- crates/voicevox_core_python_api/src/lib.rs | 87 +++++++++++++++++++ 8 files changed, 150 insertions(+), 3 deletions(-) diff --git a/crates/voicevox_core/src/metas.rs b/crates/voicevox_core/src/metas.rs index 31f75dff4..eaf059076 100644 --- a/crates/voicevox_core/src/metas.rs +++ b/crates/voicevox_core/src/metas.rs @@ -81,6 +81,7 @@ pub struct StyleMeta { #[derive(Default, Deserialize, Serialize, Clone)] pub struct SpeakerSupportedFeatures { + #[serde(default)] pub(crate) permitted_synthesis_morphing: PermittedSynthesisMorphing, } diff --git a/crates/voicevox_core_python_api/python/voicevox_core/__init__.py b/crates/voicevox_core_python_api/python/voicevox_core/__init__.py index 4ccbad3fe..9b93407fd 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/__init__.py +++ b/crates/voicevox_core_python_api/python/voicevox_core/__init__.py @@ -6,7 +6,10 @@ AccentPhrase, AudioQuery, Mora, + MorphableTargetInfo, + PermittedSynthesisMorphing, SpeakerMeta, + SpeakerSupportedFeatures, StyleId, StyleVersion, SupportedDevices, @@ -56,12 +59,15 @@ "ModelAlreadyLoadedError", "ModelNotFoundError", "Mora", + "MorphableTargetInfo", "NotLoadedOpenjtalkDictError", "OpenZipFileError", "ParseKanaError", + "PermittedSynthesisMorphing", "ReadZipEntryError", "SaveUserDictError", "SpeakerMeta", + "SpeakerSupportedFeatures", "StyleAlreadyLoadedError", "StyleId", "StyleNotFoundError", diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_models.py b/crates/voicevox_core_python_api/python/voicevox_core/_models.py index 195154629..1a40dc0d2 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_models.py +++ b/crates/voicevox_core_python_api/python/voicevox_core/_models.py @@ -34,6 +34,24 @@ """ +class PermittedSynthesisMorphing(str, Enum): + ALL = "ALL" + """全て許可。""" + + SELF_ONLY = "SELF_ONLY" + """同じ話者内でのみ許可。""" + + NOTHING = "NOTHING" + """全て禁止。""" + + +@pydantic.dataclasses.dataclass +class SpeakerSupportedFeatures: + permitted_synthesis_morphing: PermittedSynthesisMorphing = ( + PermittedSynthesisMorphing.NOTHING + ) + + @pydantic.dataclasses.dataclass class StyleMeta: """**スタイル** (_style_)のメタ情報。""" @@ -61,6 +79,14 @@ class SpeakerMeta: version: StyleVersion """話者のUUID。""" + supported_features: SpeakerSupportedFeatures = SpeakerSupportedFeatures() + """話者の対応機能。""" + + +@pydantic.dataclasses.dataclass(frozen=True) +class MorphableTargetInfo: + is_morphable: bool + @pydantic.dataclasses.dataclass class SupportedDevices: diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_rust/__init__.pyi b/crates/voicevox_core_python_api/python/voicevox_core/_rust/__init__.pyi index 3a47ef02b..0d61eb2aa 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_rust/__init__.pyi +++ b/crates/voicevox_core_python_api/python/voicevox_core/_rust/__init__.pyi @@ -107,5 +107,10 @@ class InvalidWordError(ValueError): ... +class SpeakerFeatureError(ValueError): + """要求された機能を話者が持っていない。""" + + ... + def _validate_pronunciation(pronunciation: str) -> None: ... def _to_zenkaku(text: str) -> str: ... diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_rust/asyncio.pyi b/crates/voicevox_core_python_api/python/voicevox_core/_rust/asyncio.pyi index 7a6596008..58ed675c8 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_rust/asyncio.pyi +++ b/crates/voicevox_core_python_api/python/voicevox_core/_rust/asyncio.pyi @@ -7,6 +7,7 @@ if TYPE_CHECKING: AccelerationMode, AccentPhrase, AudioQuery, + MorphableTargetInfo, SpeakerMeta, StyleId, UserDictWord, @@ -99,6 +100,9 @@ class Synthesizer: def metas(self) -> List[SpeakerMeta]: """メタ情報。""" ... + def morphable_targets( + self, style_id: Union[StyleId, int] + ) -> Dict[StyleId, MorphableTargetInfo]: ... async def load_voice_model(self, model: VoiceModel) -> None: """ モデルを読み込む。 @@ -294,6 +298,13 @@ class Synthesizer: WAVデータ。 """ ... + async def synthesis_morphing( + self, + audio_query: AudioQuery, + base_style_id: Union[StyleId, int], + target_style_id: Union[StyleId, int], + morph_rate: float, + ) -> bytes: ... async def tts_from_kana( self, kana: str, diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_rust/blocking.pyi b/crates/voicevox_core_python_api/python/voicevox_core/_rust/blocking.pyi index 3a208fb33..74d488884 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_rust/blocking.pyi +++ b/crates/voicevox_core_python_api/python/voicevox_core/_rust/blocking.pyi @@ -7,6 +7,7 @@ if TYPE_CHECKING: AccelerationMode, AccentPhrase, AudioQuery, + MorphableTargetInfo, SpeakerMeta, StyleId, UserDictWord, @@ -94,6 +95,9 @@ class Synthesizer: def metas(self) -> List[SpeakerMeta]: """メタ情報。""" ... + def morphable_targets( + self, style_id: Union[StyleId, int] + ) -> Dict[StyleId, MorphableTargetInfo]: ... def load_voice_model(self, model: VoiceModel) -> None: """ モデルを読み込む。 @@ -289,6 +293,13 @@ class Synthesizer: WAVデータ。 """ ... + def synthesis_morphing( + self, + audio_query: AudioQuery, + base_style_id: Union[StyleId, int], + target_style_id: Union[StyleId, int], + morph_rate: float, + ) -> bytes: ... def tts_from_kana( self, kana: str, diff --git a/crates/voicevox_core_python_api/src/convert.rs b/crates/voicevox_core_python_api/src/convert.rs index 6a1037982..1a957cd14 100644 --- a/crates/voicevox_core_python_api/src/convert.rs +++ b/crates/voicevox_core_python_api/src/convert.rs @@ -17,8 +17,8 @@ use crate::{ ExtractFullContextLabelError, GetSupportedDevicesError, GpuSupportError, InferenceFailedError, InvalidModelDataError, InvalidWordError, LoadUserDictError, ModelAlreadyLoadedError, ModelNotFoundError, NotLoadedOpenjtalkDictError, OpenZipFileError, ParseKanaError, - ReadZipEntryError, SaveUserDictError, StyleAlreadyLoadedError, StyleNotFoundError, - UseUserDictError, WordNotFoundError, + ReadZipEntryError, SaveUserDictError, SpeakerFeatureError, StyleAlreadyLoadedError, + StyleNotFoundError, UseUserDictError, WordNotFoundError, }; pub fn from_acceleration_mode(ob: &PyAny) -> PyResult { @@ -203,7 +203,7 @@ pub impl voicevox_core::Result { ErrorKind::WordNotFound => WordNotFoundError::new_err(msg), ErrorKind::UseUserDict => UseUserDictError::new_err(msg), ErrorKind::InvalidWord => InvalidWordError::new_err(msg), - ErrorKind::SpeakerFeature => todo!(), + ErrorKind::SpeakerFeature => SpeakerFeatureError::new_err(msg), }; [top] diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs index 9d36cafcb..32cd1113a 100644 --- a/crates/voicevox_core_python_api/src/lib.rs +++ b/crates/voicevox_core_python_api/src/lib.rs @@ -92,6 +92,7 @@ exceptions! { WordNotFoundError: PyKeyError; UseUserDictError: PyException; InvalidWordError: PyValueError; + SpeakerFeatureError: PyValueError; } #[pyclass] @@ -226,6 +227,23 @@ impl Synthesizer { to_pydantic_voice_model_meta(&synthesizer.metas(), py) } + fn morphable_targets<'py>(&self, style_id: u32, py: Python<'py>) -> PyResult<&'py PyDict> { + let class = py.import("voicevox_core")?.getattr("MorphableTargetInfo")?; + + let morphable_targets = self + .synthesizer + .get()? + .morphable_targets(StyleId::new(style_id)) + .into_py_result(py)? + .into_iter() + .map(|(k, v)| { + let v = crate::convert::to_pydantic_dataclass(v, class)?; + Ok((k.raw_id(), v)) + }) + .collect::>>()?; + Ok(morphable_targets.into_py_dict(py)) + } + fn load_voice_model<'py>( &mut self, model: &'py PyAny, @@ -430,6 +448,37 @@ impl Synthesizer { ) } + fn synthesis_morphing<'py>( + &self, + #[pyo3(from_py_with = "crate::convert::from_dataclass")] audio_query: AudioQueryModel, + base_style_id: u32, + target_style_id: u32, + morph_rate: f32, + py: Python<'py>, + ) -> PyResult<&'py PyAny> { + let synthesizer = self.synthesizer.get()?.clone(); + + pyo3_asyncio::tokio::future_into_py_with_locals( + py, + pyo3_asyncio::tokio::get_current_locals(py)?, + async move { + let wav = synthesizer + .synthesis_morphing( + &audio_query, + StyleId::new(base_style_id), + StyleId::new(target_style_id), + morph_rate, + ) + .await; + + Python::with_gil(|py| { + let wav = wav.into_py_result(py)?; + Ok(PyBytes::new(py, &wav).to_object(py)) + }) + }, + ) + } + #[pyo3(signature=( kana, style_id, @@ -769,6 +818,23 @@ mod blocking { crate::convert::to_pydantic_voice_model_meta(&synthesizer.metas(), py) } + fn morphable_targets<'py>(&self, style_id: u32, py: Python<'py>) -> PyResult<&'py PyDict> { + let class = py.import("voicevox_core")?.getattr("MorphableTargetInfo")?; + + let morphable_targets = self + .synthesizer + .get()? + .morphable_targets(StyleId::new(style_id)) + .into_py_result(py)? + .into_iter() + .map(|(k, v)| { + let v = crate::convert::to_pydantic_dataclass(v, class)?; + Ok((k.raw_id(), v)) + }) + .collect::>>()?; + Ok(morphable_targets.into_py_dict(py)) + } + fn load_voice_model(&mut self, model: &PyAny, py: Python<'_>) -> PyResult<()> { let model: VoiceModel = model.extract()?; self.synthesizer @@ -932,6 +998,27 @@ mod blocking { Ok(PyBytes::new(py, wav)) } + fn synthesis_morphing<'py>( + &self, + #[pyo3(from_py_with = "crate::convert::from_dataclass")] audio_query: AudioQueryModel, + base_style_id: u32, + target_style_id: u32, + morph_rate: f32, + py: Python<'py>, + ) -> PyResult<&'py PyBytes> { + let wav = &self + .synthesizer + .get()? + .synthesis_morphing( + &audio_query, + StyleId::new(base_style_id), + StyleId::new(target_style_id), + morph_rate, + ) + .into_py_result(py)?; + Ok(PyBytes::new(py, wav)) + } + #[pyo3(signature=( kana, style_id, From 9c70222b6a246342aad631026bcdb083d373f73e Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Tue, 2 Jan 2024 13:42:33 +0900 Subject: [PATCH 30/46] =?UTF-8?q?`morph=5Frate`=E3=82=92`f32`=E3=81=8B?= =?UTF-8?q?=E3=82=89`f64`=E3=81=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 6 ++---- crates/voicevox_core/src/synthesizer.rs | 4 ++-- crates/voicevox_core_c_api/include/voicevox_core.h | 2 +- crates/voicevox_core_c_api/src/lib.rs | 2 +- crates/voicevox_core_c_api/tests/e2e/symbols.rs | 2 +- crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs | 2 +- crates/voicevox_core_python_api/src/lib.rs | 4 ++-- 7 files changed, 10 insertions(+), 12 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 1991b90c1..73b9175a6 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -23,7 +23,7 @@ impl crate::blocking::Synthesizer { &self, audio_query: &AudioQueryModel, style_ids: MorphingPair, - morph_rate: f32, + morph_rate: f64, ) -> crate::Result> { let metas = &self.metas(); let pair = style_ids.lookup_speakers(metas)?; @@ -37,10 +37,8 @@ impl<'metas> MorphableTargets<'metas> { self, synthesizer: &crate::blocking::Synthesizer, audio_query: &AudioQueryModel, - morph_rate: f32, + morph_rate: f64, ) -> crate::Result> { - let morph_rate = f64::from(morph_rate); - if *audio_query.output_sampling_rate() != DEFAULT_SAMPLING_RATE || *audio_query.output_stereo() { diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 2236c0a68..e99cd6948 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -271,7 +271,7 @@ pub(crate) mod blocking { audio_query: &AudioQueryModel, base_style_id: StyleId, target_style_id: StyleId, - morph_rate: f32, + morph_rate: f64, ) -> crate::Result> { let style_ids = MorphingPair { base: base_style_id, @@ -1293,7 +1293,7 @@ pub(crate) mod tokio { audio_query: &AudioQueryModel, base_style_id: StyleId, target_style_id: StyleId, - morph_rate: f32, + morph_rate: f64, ) -> crate::Result> { let blocking = self.0.clone(); let audio_query = audio_query.clone(); diff --git a/crates/voicevox_core_c_api/include/voicevox_core.h b/crates/voicevox_core_c_api/include/voicevox_core.h index 9b242596e..fe4b18443 100644 --- a/crates/voicevox_core_c_api/include/voicevox_core.h +++ b/crates/voicevox_core_c_api/include/voicevox_core.h @@ -930,7 +930,7 @@ VoicevoxResultCode voicevox_synthesizer_synthesis_morphing(const struct Voicevox const char *audio_query_json, VoicevoxStyleId base_style_id, VoicevoxStyleId target_style_id, - float morph_rate, + double morph_rate, uintptr_t *output_wav_length, uint8_t **output_wav); diff --git a/crates/voicevox_core_c_api/src/lib.rs b/crates/voicevox_core_c_api/src/lib.rs index dbec261fa..865e1f971 100644 --- a/crates/voicevox_core_c_api/src/lib.rs +++ b/crates/voicevox_core_c_api/src/lib.rs @@ -916,7 +916,7 @@ pub unsafe extern "C" fn voicevox_synthesizer_synthesis_morphing( audio_query_json: *const c_char, base_style_id: VoicevoxStyleId, target_style_id: VoicevoxStyleId, - morph_rate: f32, + morph_rate: f64, output_wav_length: NonNull, output_wav: NonNull<*mut u8>, ) -> VoicevoxResultCode { diff --git a/crates/voicevox_core_c_api/tests/e2e/symbols.rs b/crates/voicevox_core_c_api/tests/e2e/symbols.rs index 17d06bbdc..cc3d5e1e3 100644 --- a/crates/voicevox_core_c_api/tests/e2e/symbols.rs +++ b/crates/voicevox_core_c_api/tests/e2e/symbols.rs @@ -105,7 +105,7 @@ pub(crate) struct Symbols<'lib> { *const c_char, VoicevoxStyleId, VoicevoxStyleId, - f32, + f64, *mut usize, *mut *mut u8, ) -> VoicevoxResultCode, diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs index d73a12f88..b2622c46c 100644 --- a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs +++ b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs @@ -133,7 +133,7 @@ impl assert_cdylib::TestCase for TestCase { )?[&self.target_style]; let result = { - const MORPH_RATE: f32 = 0.5; + const MORPH_RATE: f64 = 0.5; let mut wav_length = MaybeUninit::uninit(); let mut wav = MaybeUninit::uninit(); diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs index 32cd1113a..c30d8af62 100644 --- a/crates/voicevox_core_python_api/src/lib.rs +++ b/crates/voicevox_core_python_api/src/lib.rs @@ -453,7 +453,7 @@ impl Synthesizer { #[pyo3(from_py_with = "crate::convert::from_dataclass")] audio_query: AudioQueryModel, base_style_id: u32, target_style_id: u32, - morph_rate: f32, + morph_rate: f64, py: Python<'py>, ) -> PyResult<&'py PyAny> { let synthesizer = self.synthesizer.get()?.clone(); @@ -1003,7 +1003,7 @@ mod blocking { #[pyo3(from_py_with = "crate::convert::from_dataclass")] audio_query: AudioQueryModel, base_style_id: u32, target_style_id: u32, - morph_rate: f32, + morph_rate: f64, py: Python<'py>, ) -> PyResult<&'py PyBytes> { let wav = &self From 27a4c7a3a190fb94ad54063f1d9538c5342632ac Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Tue, 2 Jan 2024 16:00:25 +0900 Subject: [PATCH 31/46] =?UTF-8?q?Java=20API=E3=81=AE=E5=AE=9F=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hiroshiba/voicevoxcore/Synthesizer.java | 40 ++++++++++++ .../voicevoxcore/SynthesizerTest.java | 41 ++++++++++++ .../voicevox_core_java_api/src/synthesizer.rs | 62 ++++++++++++++++++- 3 files changed, 141 insertions(+), 2 deletions(-) diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java index a3fe0de6c..c0c998c5c 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java @@ -1,10 +1,14 @@ package jp.hiroshiba.voicevoxcore; import com.google.gson.Gson; +import com.google.gson.annotations.Expose; +import com.google.gson.annotations.SerializedName; +import com.google.gson.reflect.TypeToken; import jakarta.annotation.Nonnull; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; import jp.hiroshiba.voicevoxcore.exceptions.InferenceFailedException; import jp.hiroshiba.voicevoxcore.exceptions.InvalidModelDataException; @@ -50,6 +54,17 @@ public VoiceModel.SpeakerMeta[] metas() { return rawMetas; } + @Nonnull + public Map morphableTargets(int styleId) { + String json = rsMorphableTargetsJson(styleId); + Map ret = + new Gson().fromJson(json, new TypeToken>() {}.getType()); + if (ret == null) { + throw new NullPointerException(); + } + return ret; + } + /** * モデルを読み込む。 * @@ -239,6 +254,14 @@ public SynthesisConfigurator synthesis(AudioQuery audioQuery, int styleId) { return new SynthesisConfigurator(this, audioQuery, styleId); } + @Nonnull + public byte[] synthesisMorphing( + AudioQuery audioQuery, int baseStyleId, int targetStyleId, double morphRate) + throws InferenceFailedException { + String audioQueryJson = new Gson().toJson(audioQuery); + return rsSynthesisMorphing(audioQueryJson, baseStyleId, targetStyleId, morphRate); + } + /** * AquesTalk風記法をもとに音声合成を実行するためのオブジェクトを生成する。 * @@ -272,6 +295,8 @@ public TtsConfigurator tts(String text, int styleId) { @Nonnull private native String rsGetMetasJson(); + private native String rsMorphableTargetsJson(int styleId); + private native void rsLoadVoiceModel(VoiceModel voiceModel) throws InvalidModelDataException; private native void rsUnloadVoiceModel(String voiceModelId); @@ -309,6 +334,11 @@ private native byte[] rsSynthesis( String queryJson, int styleId, boolean enableInterrogativeUpspeak) throws InferenceFailedException; + @Nonnull + private native byte[] rsSynthesisMorphing( + String queryJson, int baseStyleId, int targetStyleId, double morphRate) + throws InferenceFailedException; + @Nonnull private native byte[] rsTtsFromKana(String kana, int styleId, boolean enableInterrogativeUpspeak) throws InferenceFailedException; @@ -476,6 +506,16 @@ public byte[] execute() throws InferenceFailedException { } } + public static class MorphableTargetInfo { + @SerializedName("is_morphable") + @Expose + public final boolean isMorphable; + + private MorphableTargetInfo() { + isMorphable = false; + } + } + /** {@link Synthesizer#tts} のオプション。 */ public class TtsConfigurator { private Synthesizer synthesizer; diff --git a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java index 1eb8fe057..7fc6cb5df 100644 --- a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java +++ b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java @@ -8,8 +8,11 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.List; +import java.util.Map; +import jp.hiroshiba.voicevoxcore.Synthesizer.MorphableTargetInfo; import jp.hiroshiba.voicevoxcore.exceptions.InferenceFailedException; import jp.hiroshiba.voicevoxcore.exceptions.InvalidModelDataException; +import jp.hiroshiba.voicevoxcore.exceptions.StyleNotFoundException; import org.junit.jupiter.api.Test; class SynthesizerTest extends TestUtils { @@ -26,6 +29,44 @@ void checkIsGpuMode() { assertFalse(synthesizer.isGpuMode()); } + @Test + void checkMorphableTargets() throws InvalidModelDataException { + OpenJtalk openJtalk = loadOpenJtalk(); + Synthesizer synthesizer = + Synthesizer.builder(openJtalk).accelerationMode(Synthesizer.AccelerationMode.CPU).build(); + + synthesizer.loadVoiceModel(loadModel()); + + Map morphableTargets = synthesizer.morphableTargets(0); + assertFalse(morphableTargets.get(0).isMorphable); + assertFalse(morphableTargets.get(1).isMorphable); + assertFalse(morphableTargets.get(302).isMorphable); + assertFalse(morphableTargets.get(303).isMorphable); + + morphableTargets = synthesizer.morphableTargets(1); + assertFalse(morphableTargets.get(0).isMorphable); + assertTrue(morphableTargets.get(1).isMorphable); + assertFalse(morphableTargets.get(302).isMorphable); + assertFalse(morphableTargets.get(303).isMorphable); + + morphableTargets = synthesizer.morphableTargets(302); + assertFalse(morphableTargets.get(0).isMorphable); + assertFalse(morphableTargets.get(1).isMorphable); + assertTrue(morphableTargets.get(302).isMorphable); + assertTrue(morphableTargets.get(303).isMorphable); + + morphableTargets = synthesizer.morphableTargets(303); + assertFalse(morphableTargets.get(0).isMorphable); + assertFalse(morphableTargets.get(1).isMorphable); + assertTrue(morphableTargets.get(302).isMorphable); + assertTrue(morphableTargets.get(303).isMorphable); + + try { + synthesizer.morphableTargets(2); + } catch (StyleNotFoundException e) { + } + } + boolean checkAllMoras( List accentPhrases, List otherAccentPhrases, diff --git a/crates/voicevox_core_java_api/src/synthesizer.rs b/crates/voicevox_core_java_api/src/synthesizer.rs index fee5bc132..abeb8ab8f 100644 --- a/crates/voicevox_core_java_api/src/synthesizer.rs +++ b/crates/voicevox_core_java_api/src/synthesizer.rs @@ -5,10 +5,10 @@ use crate::{ use jni::{ objects::{JObject, JString}, - sys::{jboolean, jint, jobject}, + sys::{jboolean, jdouble, jint, jobject}, JNIEnv, }; -use std::sync::Arc; +use std::{borrow::Cow, sync::Arc}; #[no_mangle] unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsNew<'local>( @@ -91,6 +91,29 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsGetMetasJ }) } +#[no_mangle] +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsMorphableTargetsJson< + 'local, +>( + env: JNIEnv<'local>, + this: JObject<'local>, + style_id: jint, +) -> jobject { + throw_if_err(env, std::ptr::null_mut(), |env| { + let internal = env + .get_rust_field::<_, _, Arc>>( + &this, "handle", + )? + .clone(); + + let style_id = voicevox_core::StyleId::new(style_id as _); + + let json = &internal.morphable_targets(style_id)?; + let json = env.new_string(serde_json::to_string(json).expect("should not fail"))?; + Ok(json.into_raw()) + }) +} + #[no_mangle] unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsLoadVoiceModel<'local>( env: JNIEnv<'local>, @@ -398,6 +421,41 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsSynthesis }) } +#[no_mangle] +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsSynthesisMorphing<'local>( + env: JNIEnv<'local>, + this: JObject<'local>, + audio_query: JString<'local>, + base_style_id: jint, + target_style_id: jint, + morph_rate: jdouble, +) -> jobject { + throw_if_err(env, std::ptr::null_mut(), |env| { + let audio_query = &env.get_string(&audio_query)?; + let audio_query = &Cow::::from(audio_query); + let audio_query = &serde_json::from_str::(audio_query) + .map_err(JavaApiError::DeJson)?; + + let base_style_id = voicevox_core::StyleId::new(base_style_id as _); + let target_style_id = voicevox_core::StyleId::new(target_style_id as _); + + let internal = env + .get_rust_field::<_, _, Arc>>( + &this, "handle", + )? + .clone(); + + let wav = &internal.synthesis_morphing( + audio_query, + base_style_id, + target_style_id, + morph_rate, + )?; + let wav = env.byte_array_from_slice(wav)?; + Ok(wav.into_raw()) + }) +} + #[no_mangle] unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTtsFromKana<'local>( env: JNIEnv<'local>, From 5b014ec80caa3d9eefd6498dab467035aa7298e0 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Tue, 2 Jan 2024 17:34:14 +0900 Subject: [PATCH 32/46] =?UTF-8?q?docstring=E3=82=92=E6=9B=B8=E3=81=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/synthesizer.rs | 10 +++++ .../include/voicevox_core.h | 33 ++++++++++++++++ crates/voicevox_core_c_api/src/lib.rs | 33 ++++++++++++++++ .../hiroshiba/voicevoxcore/Synthesizer.java | 16 ++++++++ .../python/voicevox_core/_rust/asyncio.pyi | 38 ++++++++++++++++++- .../python/voicevox_core/_rust/blocking.pyi | 38 ++++++++++++++++++- 6 files changed, 164 insertions(+), 4 deletions(-) diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index e99cd6948..70f033f99 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -234,6 +234,10 @@ pub(crate) mod blocking { self.status.metas() } + /// 全スタイルごとに、指定されたスタイルとのペアでモーフィング機能が利用可能かどうかを返す。 + /// + /// 話者およびそのメタ情報の`.supported_features.permitted_synthesis_morphing`の組み合わせに + /// よって決定される。 pub fn morphable_targets( &self, style_id: StyleId, @@ -266,6 +270,7 @@ pub(crate) mod blocking { Ok(engine::to_wav(wave, audio_query)) } + /// 2人の話者でモーフィングした音声を合成する。 pub fn synthesis_morphing( &self, audio_query: &AudioQueryModel, @@ -1267,6 +1272,10 @@ pub(crate) mod tokio { self.0.metas() } + /// 全スタイルごとに、指定されたスタイルとのペアでモーフィング機能が利用可能かどうかを返す。 + /// + /// 話者およびそのメタ情報の`.supported_features.permitted_synthesis_morphing`の組み合わせに + /// よって決定される。 pub fn morphable_targets( &self, style_id: StyleId, @@ -1288,6 +1297,7 @@ pub(crate) mod tokio { .await } + /// 2人の話者でモーフィングした音声を合成する。 pub async fn synthesis_morphing( &self, audio_query: &AudioQueryModel, diff --git a/crates/voicevox_core_c_api/include/voicevox_core.h b/crates/voicevox_core_c_api/include/voicevox_core.h index fe4b18443..5c6bde781 100644 --- a/crates/voicevox_core_c_api/include/voicevox_core.h +++ b/crates/voicevox_core_c_api/include/voicevox_core.h @@ -644,6 +644,25 @@ __declspec(dllimport) VoicevoxResultCode voicevox_create_supported_devices_json(char **output_supported_devices_json); /** + * 全スタイルごとに、指定されたスタイルとのペアでモーフィング機能が利用可能かどうかを返す。 + * + * 話者およびそのメタ情報の `.supported_features.permitted_synthesis_morphing` の組み合わせによって決定される。 + * + * JSONの解放は ::voicevox_json_free で行う。 + * + * @param [in] synthesizer 音声シンセサイザ + * @param [in] style_id スタイルID + * + * @returns 結果コード + * + * \example{ + * ```c + * char *morphable_targets; + * VoicevoxResultCode result = voicevox_synthesizer_create_morphable_targets_json( + * synthesizer, style_id, &morphable_targets); + * ``` + * } + * * \safety{ * - `synthesizer`は ::voicevox_synthesizer_new で得たものでなければならず、また ::voicevox_synthesizer_delete で解放されていてはいけない。 * - `output`は書き込みについて有効でなければならない。 @@ -916,6 +935,20 @@ VoicevoxResultCode voicevox_synthesizer_synthesis(const struct VoicevoxSynthesiz uint8_t **output_wav); /** + * 2人の話者でモーフィングした音声を合成する。 + * + * 生成したWAVデータを解放するには ::voicevox_wav_free を使う。 + * + * @param [in] synthesizer 音声シンセサイザ + * @param [in] audio_query_json AudioQueryのJSON文字列 + * @param [in] base_style_id ベースのスタイルのID + * @param [in] target_style_id モーフィング先スタイルのID + * @param [in] morph_rate モーフィングの割合 + * @param [out] output_wav_length 出力のバイト長 + * @param [out] output_wav 出力先 + * + * @returns 結果コード + * * \safety{ * - `synthesizer`は ::voicevox_synthesizer_new で得たものでなければならず、また ::voicevox_synthesizer_delete で解放されていてはいけない。 * - `audio_query_json`はヌル終端文字列を指し、かつ読み込みについて有効でなければならない。 diff --git a/crates/voicevox_core_c_api/src/lib.rs b/crates/voicevox_core_c_api/src/lib.rs index 865e1f971..804338e83 100644 --- a/crates/voicevox_core_c_api/src/lib.rs +++ b/crates/voicevox_core_c_api/src/lib.rs @@ -505,6 +505,25 @@ pub unsafe extern "C" fn voicevox_create_supported_devices_json( })()) } +/// 全スタイルごとに、指定されたスタイルとのペアでモーフィング機能が利用可能かどうかを返す。 +/// +/// 話者およびそのメタ情報の `.supported_features.permitted_synthesis_morphing` の組み合わせによって決定される。 +/// +/// JSONの解放は ::voicevox_json_free で行う。 +/// +/// @param [in] synthesizer 音声シンセサイザ +/// @param [in] style_id スタイルID +/// +/// @returns 結果コード +/// +/// \example{ +/// ```c +/// char *morphable_targets; +/// VoicevoxResultCode result = voicevox_synthesizer_create_morphable_targets_json( +/// synthesizer, style_id, &morphable_targets); +/// ``` +/// } +/// /// \safety{ /// - `synthesizer`は ::voicevox_synthesizer_new で得たものでなければならず、また ::voicevox_synthesizer_delete で解放されていてはいけない。 /// - `output`は書き込みについて有効でなければならない。 @@ -904,6 +923,20 @@ pub unsafe extern "C" fn voicevox_synthesizer_synthesis( })()) } +/// 2人の話者でモーフィングした音声を合成する。 +/// +/// 生成したWAVデータを解放するには ::voicevox_wav_free を使う。 +/// +/// @param [in] synthesizer 音声シンセサイザ +/// @param [in] audio_query_json AudioQueryのJSON文字列 +/// @param [in] base_style_id ベースのスタイルのID +/// @param [in] target_style_id モーフィング先スタイルのID +/// @param [in] morph_rate モーフィングの割合 +/// @param [out] output_wav_length 出力のバイト長 +/// @param [out] output_wav 出力先 +/// +/// @returns 結果コード +/// /// \safety{ /// - `synthesizer`は ::voicevox_synthesizer_new で得たものでなければならず、また ::voicevox_synthesizer_delete で解放されていてはいけない。 /// - `audio_query_json`はヌル終端文字列を指し、かつ読み込みについて有効でなければならない。 diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java index c0c998c5c..54731f17a 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java @@ -54,6 +54,12 @@ public VoiceModel.SpeakerMeta[] metas() { return rawMetas; } + /** + * 全スタイルごとに、指定されたスタイルとのペアでモーフィング機能が利用可能かどうかを返す。 + * + * @param styleId スタイルID。 + * @return モーフィング機能の利用可否の一覧。 + */ @Nonnull public Map morphableTargets(int styleId) { String json = rsMorphableTargetsJson(styleId); @@ -254,6 +260,16 @@ public SynthesisConfigurator synthesis(AudioQuery audioQuery, int styleId) { return new SynthesisConfigurator(this, audioQuery, styleId); } + /** + * 2人の話者でモーフィングした音声を合成する。 + * + * @param audioQuery {@link AudioQuery}。 + * @param baseStyleId ベースのスタイルのID。 + * @param targetStyleId モーフィング先スタイルのID。 + * @param morphRate モーフィングの割合。 + * @return WAVデータ。 + * @throws InferenceFailedException 推論に失敗した場合。 + */ @Nonnull public byte[] synthesisMorphing( AudioQuery audioQuery, int baseStyleId, int targetStyleId, double morphRate) diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_rust/asyncio.pyi b/crates/voicevox_core_python_api/python/voicevox_core/_rust/asyncio.pyi index 58ed675c8..78cd43a66 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_rust/asyncio.pyi +++ b/crates/voicevox_core_python_api/python/voicevox_core/_rust/asyncio.pyi @@ -102,7 +102,22 @@ class Synthesizer: ... def morphable_targets( self, style_id: Union[StyleId, int] - ) -> Dict[StyleId, MorphableTargetInfo]: ... + ) -> Dict[StyleId, MorphableTargetInfo]: + """ + 全スタイルごとに、指定されたスタイルとのペアでモーフィング機能が利用可能かどうかを返す。 + + 話者およびそのメタ情報の ``.supported_features.permitted_synthesis_morphing`` の組み合わせによって決定される。 + + Parameters + ---------- + style_id + スタイルID。 + + Returns + ------- + モーフィング機能の利用可否の一覧。 + """ + ... async def load_voice_model(self, model: VoiceModel) -> None: """ モデルを読み込む。 @@ -304,7 +319,26 @@ class Synthesizer: base_style_id: Union[StyleId, int], target_style_id: Union[StyleId, int], morph_rate: float, - ) -> bytes: ... + ) -> bytes: + """ + 2人の話者でモーフィングした音声を合成する。 + + Parameters + ---------- + audio_query + :class:`AudioQuery` 。 + base_style_id + ベースのスタイルのID。 + target_style_id + モーフィング先スタイルのID。 + morph_rate + モーフィングの割合。 + + Returns + ------- + WAVデータ。 + """ + ... async def tts_from_kana( self, kana: str, diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_rust/blocking.pyi b/crates/voicevox_core_python_api/python/voicevox_core/_rust/blocking.pyi index 74d488884..a37f66f2b 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_rust/blocking.pyi +++ b/crates/voicevox_core_python_api/python/voicevox_core/_rust/blocking.pyi @@ -97,7 +97,22 @@ class Synthesizer: ... def morphable_targets( self, style_id: Union[StyleId, int] - ) -> Dict[StyleId, MorphableTargetInfo]: ... + ) -> Dict[StyleId, MorphableTargetInfo]: + """ + 全スタイルごとに、指定されたスタイルとのペアでモーフィング機能が利用可能かどうかを返す。 + + 話者およびそのメタ情報の ``.supported_features.permitted_synthesis_morphing`` の組み合わせによって決定される。 + + Parameters + ---------- + style_id + スタイルID。 + + Returns + ------- + モーフィング機能の利用可否の一覧。 + """ + ... def load_voice_model(self, model: VoiceModel) -> None: """ モデルを読み込む。 @@ -299,7 +314,26 @@ class Synthesizer: base_style_id: Union[StyleId, int], target_style_id: Union[StyleId, int], morph_rate: float, - ) -> bytes: ... + ) -> bytes: + """ + 2人の話者でモーフィングした音声を合成する。 + + Parameters + ---------- + audio_query + :class:`AudioQuery` 。 + base_style_id + ベースのスタイルのID。 + target_style_id + モーフィング先スタイルのID。 + morph_rate + モーフィングの割合。 + + Returns + ------- + WAVデータ。 + """ + ... def tts_from_kana( self, kana: str, From 13dcca2675b2471afaa05d4c83679d8cac0446dc Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Tue, 2 Jan 2024 17:42:02 +0900 Subject: [PATCH 33/46] =?UTF-8?q?=E3=82=B9=E3=83=9A=E3=82=AF=E3=83=88?= =?UTF-8?q?=E3=83=AD=E3=82=B0=E3=83=A9=E3=83=A0=E3=81=AE=E8=A8=88=E7=AE=97?= =?UTF-8?q?=E3=82=92=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 73b9175a6..ac5ae5eb5 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -68,7 +68,8 @@ impl<'metas> MorphableTargets<'metas> { morph_spectrogram, itertools::zip_eq(base_spectrogram, target_spectrogram), ) { - *morph_spectrogram = base_spectrogram * (1. - morph_rate) + target_spectrogram; + *morph_spectrogram = + base_spectrogram * (1. - morph_rate) + target_spectrogram * morph_rate; } } From 6d2eb8046a01955437780cf20087968bd25d579d Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Wed, 3 Jan 2024 11:30:10 +0900 Subject: [PATCH 34/46] =?UTF-8?q?`SpeakerFeatureException`=E3=81=AE?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../exceptions/SpeakerFeatureException.java | 11 +++++++++++ crates/voicevox_core_java_api/src/common.rs | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/SpeakerFeatureException.java diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/SpeakerFeatureException.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/SpeakerFeatureException.java new file mode 100644 index 000000000..1353d3db9 --- /dev/null +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/exceptions/SpeakerFeatureException.java @@ -0,0 +1,11 @@ +package jp.hiroshiba.voicevoxcore.exceptions; + +public class SpeakerFeatureException extends UnsupportedOperationException { + public SpeakerFeatureException(String message) { + super(message); + } + + public SpeakerFeatureException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/crates/voicevox_core_java_api/src/common.rs b/crates/voicevox_core_java_api/src/common.rs index 3fe791e38..a9a62e3a2 100644 --- a/crates/voicevox_core_java_api/src/common.rs +++ b/crates/voicevox_core_java_api/src/common.rs @@ -146,7 +146,7 @@ where WordNotFound, UseUserDict, InvalidWord, - SpeakerFeature, // TODO + SpeakerFeature, ); let mut sources = From dbbf89c5bdc946b4145e3b1ba311f2365102031d Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Wed, 3 Jan 2024 12:56:07 +0900 Subject: [PATCH 35/46] =?UTF-8?q?=E4=B8=8D=E8=A6=81=E3=81=AA`todo!`?= =?UTF-8?q?=E5=88=86=E5=B2=90=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/audio_file.rs | 2 +- crates/voicevox_core/src/engine/morph.rs | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/crates/voicevox_core/src/engine/audio_file.rs b/crates/voicevox_core/src/engine/audio_file.rs index f6facea6d..73f0b9cf1 100644 --- a/crates/voicevox_core/src/engine/audio_file.rs +++ b/crates/voicevox_core/src/engine/audio_file.rs @@ -9,7 +9,7 @@ pub(crate) fn to_wav + From + Cast>( wave: &[T], audio_query: &AudioQueryModel, ) -> Vec { - // TODO: ライブラリ(e.g. https://docs.rs/hound)を使う + // TODO: ライブラリ(e.g. https://docs.rs/rubato & https://docs.rs/hound)を使う let volume_scale = *audio_query.volume_scale(); let output_stereo = *audio_query.output_stereo(); diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index ac5ae5eb5..1b5cb8338 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -39,12 +39,6 @@ impl<'metas> MorphableTargets<'metas> { audio_query: &AudioQueryModel, morph_rate: f64, ) -> crate::Result> { - if *audio_query.output_sampling_rate() != DEFAULT_SAMPLING_RATE - || *audio_query.output_stereo() - { - todo!(); - } - let waves = &self.get().try_map(|style_id| { synthesizer.synthesis_wave(audio_query, style_id, &Default::default()) })?; From f04380eb610daea333f6482c42d30edd7fddfdec Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Wed, 3 Jan 2024 16:44:10 +0900 Subject: [PATCH 36/46] =?UTF-8?q?`Synthesizer`=E3=81=AEimpl=E3=82=92`morph?= =?UTF-8?q?`=E5=81=B4=E3=81=AB=E5=AF=84=E3=81=9B=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/mod.rs | 1 - crates/voicevox_core/src/engine/morph.rs | 44 ++++++++++++++++++++---- crates/voicevox_core/src/synthesizer.rs | 29 +++------------- 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs index 64b36d8c7..f2daeaa75 100644 --- a/crates/voicevox_core/src/engine/mod.rs +++ b/crates/voicevox_core/src/engine/mod.rs @@ -13,5 +13,4 @@ pub(crate) use self::full_context_label::{FullContextLabelError, Utterance}; pub(crate) use self::kana_parser::{create_kana, parse_kana, KanaParseError}; pub use self::model::{AccentPhraseModel, AudioQueryModel, MoraModel, MorphableTargetInfo}; pub(crate) use self::mora_list::mora2text; -pub(crate) use self::morph::MorphingPair; pub use self::open_jtalk::FullcontextExtractor; diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 1b5cb8338..30a16b6df 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -1,16 +1,40 @@ +use std::collections::BTreeMap; + use world::{ signal_analyzer::{AnalyzeResult, SignalAnalyzerBuilder}, spectrogram_like::SpectrogramLike, }; use crate::{ - error::ErrorRepr, synthesizer::DEFAULT_SAMPLING_RATE, AudioQueryModel, SpeakerMeta, StyleId, + error::ErrorRepr, synthesizer::DEFAULT_SAMPLING_RATE, AudioQueryModel, MorphableTargetInfo, + SpeakerMeta, StyleId, StyleMeta, }; use self::permit::MorphableTargets; impl crate::blocking::Synthesizer { - pub(crate) fn is_synthesis_morphing_permitted( + pub(crate) fn morphable_targets_( + &self, + style_id: StyleId, + ) -> crate::Result> { + let metas = &self.metas(); + + metas + .iter() + .flat_map(SpeakerMeta::styles) + .map(StyleMeta::id) + .map(|&target| { + let style_ids = MorphingPair { + base: style_id, + target, + }; + let is_morphable = self.is_synthesis_morphing_permitted(style_ids, metas)?; + Ok((target, MorphableTargetInfo { is_morphable })) + }) + .collect() + } + + fn is_synthesis_morphing_permitted( &self, style_ids: MorphingPair, metas: &[SpeakerMeta], @@ -22,11 +46,17 @@ impl crate::blocking::Synthesizer { pub(crate) fn synthesis_morphing_( &self, audio_query: &AudioQueryModel, - style_ids: MorphingPair, + base_style_id: StyleId, + target_style_id: StyleId, morph_rate: f64, ) -> crate::Result> { let metas = &self.metas(); - let pair = style_ids.lookup_speakers(metas)?; + + let pair = MorphingPair { + base: base_style_id, + target: target_style_id, + } + .lookup_speakers(metas)?; MorphableTargets::permit(pair)?.synthesis_morphing(self, audio_query, morph_rate) } @@ -132,9 +162,9 @@ impl<'metas> MorphableTargets<'metas> { } #[derive(Clone, Copy)] -pub(crate) struct MorphingPair { - pub(crate) base: T, - pub(crate) target: T, +struct MorphingPair { + base: T, + target: T, } impl MorphingPair { diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 70f033f99..8d5360314 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -89,8 +89,7 @@ pub(crate) mod blocking { use crate::{ engine::{ - self, create_kana, parse_kana, MoraModel, MorphableTargetInfo, MorphingPair, - OjtPhoneme, Utterance, + self, create_kana, parse_kana, MoraModel, MorphableTargetInfo, OjtPhoneme, Utterance, }, error::ErrorRepr, infer::{ @@ -103,8 +102,8 @@ pub(crate) mod blocking { InferenceSessionOptions, }, numerics::F32Ext as _, - AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, SpeakerMeta, StyleId, - StyleMeta, SupportedDevices, SynthesisOptions, VoiceModelId, VoiceModelMeta, + AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, StyleId, + SupportedDevices, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; use super::{ @@ -242,21 +241,7 @@ pub(crate) mod blocking { &self, style_id: StyleId, ) -> Result> { - let metas = &self.metas(); - - metas - .iter() - .flat_map(SpeakerMeta::styles) - .map(StyleMeta::id) - .map(|&target| { - let style_ids = MorphingPair { - base: style_id, - target, - }; - let is_morphable = self.is_synthesis_morphing_permitted(style_ids, metas)?; - Ok((target, MorphableTargetInfo { is_morphable })) - }) - .collect() + self.morphable_targets_(style_id) } /// AudioQueryから音声合成を行う。 @@ -278,11 +263,7 @@ pub(crate) mod blocking { target_style_id: StyleId, morph_rate: f64, ) -> crate::Result> { - let style_ids = MorphingPair { - base: base_style_id, - target: target_style_id, - }; - self.synthesis_morphing_(audio_query, style_ids, morph_rate) + self.synthesis_morphing_(audio_query, base_style_id, target_style_id, morph_rate) } pub(crate) fn synthesis_wave( From 03d5055c823cdeca5515e98f00eed80598e5e376 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Thu, 4 Jan 2024 09:21:58 +0900 Subject: [PATCH 37/46] =?UTF-8?q?FIXME=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core_c_api/tests/e2e/snapshots.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index 6236955da..ccecff692 100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -100,6 +100,8 @@ stderr.unix = "" [morph.'{"text":"こんにちは、音声合成の世界へようこそ","base_style":1,"target_style":302}'] ok = false +# FIXME: 以下の部分が`#[error(transparent)]`ではなく`#[error("{0}")]`となっているために、このようなエラー表示になってしまっている +# https://github.com/VOICEVOX/voicevox_core/blob/4e13bca5a55a08d7aea08af4f949462bd284b1c1/crates/voicevox_core_c_api/src/helpers.rs#L67 stderr.windows = ''' {windows-video-cards} {timestamp} ERROR voicevox_core::helpers: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング From 9c41398fdbad2244f9e2ea8953ab78233dc11546 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Thu, 4 Jan 2024 11:37:46 +0900 Subject: [PATCH 38/46] =?UTF-8?q?`synthesis=5Fmorphing`=E3=81=AE=E3=83=86?= =?UTF-8?q?=E3=82=B9=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/e2e/testcases/morph.rs | 1 + .../voicevoxcore/SynthesizerTest.java | 149 +++++++++++++----- .../python/test/test_asyncio_morph.py | 110 +++++++++++++ .../python/test/test_blocking_morph.py | 108 +++++++++++++ .../python/voicevox_core/__init__.py | 2 + 5 files changed, 332 insertions(+), 38 deletions(-) create mode 100644 crates/voicevox_core_python_api/python/test/test_asyncio_morph.py create mode 100644 crates/voicevox_core_python_api/python/test/test_blocking_morph.py diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs index b2622c46c..20b79004d 100644 --- a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs +++ b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs @@ -132,6 +132,7 @@ impl assert_cdylib::TestCase for TestCase { CStr::from_ptr(morphable_targets).to_bytes(), )?[&self.target_style]; + // TODO: スナップショットテストをやる let result = { const MORPH_RATE: f64 = 0.5; diff --git a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java index 7fc6cb5df..282df0264 100644 --- a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java +++ b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java @@ -9,11 +9,17 @@ import java.util.List; import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.stream.Stream; import jp.hiroshiba.voicevoxcore.Synthesizer.MorphableTargetInfo; import jp.hiroshiba.voicevoxcore.exceptions.InferenceFailedException; import jp.hiroshiba.voicevoxcore.exceptions.InvalidModelDataException; +import jp.hiroshiba.voicevoxcore.exceptions.SpeakerFeatureException; import jp.hiroshiba.voicevoxcore.exceptions.StyleNotFoundException; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; class SynthesizerTest extends TestUtils { @FunctionalInterface @@ -29,44 +35,6 @@ void checkIsGpuMode() { assertFalse(synthesizer.isGpuMode()); } - @Test - void checkMorphableTargets() throws InvalidModelDataException { - OpenJtalk openJtalk = loadOpenJtalk(); - Synthesizer synthesizer = - Synthesizer.builder(openJtalk).accelerationMode(Synthesizer.AccelerationMode.CPU).build(); - - synthesizer.loadVoiceModel(loadModel()); - - Map morphableTargets = synthesizer.morphableTargets(0); - assertFalse(morphableTargets.get(0).isMorphable); - assertFalse(morphableTargets.get(1).isMorphable); - assertFalse(morphableTargets.get(302).isMorphable); - assertFalse(morphableTargets.get(303).isMorphable); - - morphableTargets = synthesizer.morphableTargets(1); - assertFalse(morphableTargets.get(0).isMorphable); - assertTrue(morphableTargets.get(1).isMorphable); - assertFalse(morphableTargets.get(302).isMorphable); - assertFalse(morphableTargets.get(303).isMorphable); - - morphableTargets = synthesizer.morphableTargets(302); - assertFalse(morphableTargets.get(0).isMorphable); - assertFalse(morphableTargets.get(1).isMorphable); - assertTrue(morphableTargets.get(302).isMorphable); - assertTrue(morphableTargets.get(303).isMorphable); - - morphableTargets = synthesizer.morphableTargets(303); - assertFalse(morphableTargets.get(0).isMorphable); - assertFalse(morphableTargets.get(1).isMorphable); - assertTrue(morphableTargets.get(302).isMorphable); - assertTrue(morphableTargets.get(303).isMorphable); - - try { - synthesizer.morphableTargets(2); - } catch (StyleNotFoundException e) { - } - } - boolean checkAllMoras( List accentPhrases, List otherAccentPhrases, @@ -152,4 +120,109 @@ void checkTts() throws InferenceFailedException, InvalidModelDataException { synthesizer.loadVoiceModel(model); synthesizer.tts("こんにちは", model.metas[0].styles[0].id); } + + @ParameterizedTest + @MethodSource("morphParamsProvider") + void checkMorphing(MorphParams params) + throws InvalidModelDataException, InferenceFailedException { + OpenJtalk openJtalk = loadOpenJtalk(); + Synthesizer synthesizer = + Synthesizer.builder(openJtalk).accelerationMode(Synthesizer.AccelerationMode.CPU).build(); + + synthesizer.loadVoiceModel(loadModel()); + + int baseStyleId = params.getBaseStyleId(); + AudioQuery query = synthesizer.createAudioQuery("こんにちは", baseStyleId); + Map morphableTargets = synthesizer.morphableTargets(baseStyleId); + + for (Map.Entry entry : params.getTargets().entrySet()) { + int targetStyleId = entry.getKey(); + boolean shouldSuccess = entry.getValue(); + + assertTrue(morphableTargets.get(targetStyleId).isMorphable == shouldSuccess); + + try { + // TODO: スナップショットテストをやる + synthesizer.synthesisMorphing(query, baseStyleId, targetStyleId, 0.5); + assertTrue(shouldSuccess); + } catch (SpeakerFeatureException e) { + assertFalse(shouldSuccess); + } + } + } + + static Stream morphParamsProvider() { + return Stream.of( + new MorphParams( + 0, + new TreeMap() { + { + put(0, false); + put(1, false); + put(302, false); + put(303, false); + } + }), + new MorphParams( + 1, + new TreeMap() { + { + put(0, false); + put(1, true); + put(302, false); + put(303, false); + } + }), + new MorphParams( + 302, + new TreeMap() { + { + put(0, false); + put(1, false); + put(302, true); + put(303, true); + } + }), + new MorphParams( + 303, + new TreeMap() { + { + put(0, false); + put(1, false); + put(302, true); + put(303, true); + } + })); + } + + // TODO: Lombokを使う + static class MorphParams { + private final int baseStyleId; + private final SortedMap targets; + + MorphParams(int baseStyleId, SortedMap targets) { + this.baseStyleId = baseStyleId; + this.targets = targets; + } + + int getBaseStyleId() { + return baseStyleId; + } + + SortedMap getTargets() { + return targets; + } + } + + @Test + void checkMorphableTargetsDeniesUnknownStyle() { + OpenJtalk openJtalk = loadOpenJtalk(); + Synthesizer synthesizer = + Synthesizer.builder(openJtalk).accelerationMode(Synthesizer.AccelerationMode.CPU).build(); + + try { + synthesizer.morphableTargets(0); + } catch (StyleNotFoundException e) { + } + } } diff --git a/crates/voicevox_core_python_api/python/test/test_asyncio_morph.py b/crates/voicevox_core_python_api/python/test/test_asyncio_morph.py new file mode 100644 index 000000000..4adcfc6f0 --- /dev/null +++ b/crates/voicevox_core_python_api/python/test/test_asyncio_morph.py @@ -0,0 +1,110 @@ +""" +モーフィング機能をテストする。 + +``test_blocking_morph`` と対になる。 +""" + +from typing import Dict + +import conftest +import pytest +import pytest_asyncio +from voicevox_core import SpeakerFeatureError, StyleId, StyleNotFoundError +from voicevox_core.asyncio import OpenJtalk, Synthesizer, VoiceModel + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "base, targets", + [ + ( + 0, + { + 0: False, + 1: False, + 302: False, + 303: False, + }, + ), + ( + 1, + { + 0: False, + 1: True, + 302: False, + 303: False, + }, + ), + ( + 302, + { + 0: False, + 1: False, + 302: True, + 303: True, + }, + ), + ( + 303, + { + 0: False, + 1: False, + 302: True, + 303: True, + }, + ), + ], +) +async def test_morph( + synthesizer: Synthesizer, base: StyleId, targets: Dict[StyleId, bool] +) -> None: + TEXT = "こんにちは" + MORPH_RATE = 0.5 + + query = await synthesizer.audio_query(TEXT, base) + + for target, should_success in targets.items(): + is_morphable = synthesizer.morphable_targets(base)[target].is_morphable + assert is_morphable == should_success + + if should_success: + # TODO: スナップショットテストをやる + await synthesizer.synthesis_morphing(query, base, target, MORPH_RATE) + else: + with pytest.raises( + SpeakerFeatureError, + match=( + r"^`dummy[1-3]` \([0-9a-f-]{36}\)は以下の機能を持ちません: " + r"`dummy[1-3]` \([0-9a-f-]{36}\)に対するモーフィング$" + ), + ): + await synthesizer.synthesis_morphing(query, base, target, MORPH_RATE) + + +def test_morphable_targets_raises_for_unknown_style(synthesizer: Synthesizer) -> None: + STYLE_ID = StyleId(9999) + + # FIXME: `KeyError.__init__`を通しているため、メッセージが`repr`で表示されてしまう + # https://github.com/VOICEVOX/voicevox_core/blob/4e13bca5a55a08d7aea08af4f949462bd284b1c1/crates/voicevox_core_python_api/src/convert.rs#L186-L206 + with pytest.raises( + StyleNotFoundError, + match=f"^'`{STYLE_ID}`に対するスタイルが見つかりませんでした。音声モデルが読み込まれていないか、読み込みが解除されています'$", + ): + synthesizer.morphable_targets(STYLE_ID) + + +@pytest_asyncio.fixture +async def synthesizer(open_jtalk: OpenJtalk, model: VoiceModel) -> Synthesizer: + synthesizer = Synthesizer(open_jtalk) + await synthesizer.load_voice_model(model) + return synthesizer + + +@pytest_asyncio.fixture +async def open_jtalk() -> OpenJtalk: + return await OpenJtalk.new(conftest.open_jtalk_dic_dir) + + +@pytest_asyncio.fixture +async def model() -> VoiceModel: + return await VoiceModel.from_path(conftest.model_dir) diff --git a/crates/voicevox_core_python_api/python/test/test_blocking_morph.py b/crates/voicevox_core_python_api/python/test/test_blocking_morph.py new file mode 100644 index 000000000..6551e9eb8 --- /dev/null +++ b/crates/voicevox_core_python_api/python/test/test_blocking_morph.py @@ -0,0 +1,108 @@ +""" +モーフィング機能をテストする。 + +``test_asyncio_morph`` と対になる。 +""" + +from typing import Dict + +import conftest +import pytest +from voicevox_core import SpeakerFeatureError, StyleId, StyleNotFoundError +from voicevox_core.blocking import OpenJtalk, Synthesizer, VoiceModel + + +@pytest.mark.parametrize( + "base, targets", + [ + ( + 0, + { + 0: False, + 1: False, + 302: False, + 303: False, + }, + ), + ( + 1, + { + 0: False, + 1: True, + 302: False, + 303: False, + }, + ), + ( + 302, + { + 0: False, + 1: False, + 302: True, + 303: True, + }, + ), + ( + 303, + { + 0: False, + 1: False, + 302: True, + 303: True, + }, + ), + ], +) +def test_morph( + synthesizer: Synthesizer, base: StyleId, targets: Dict[StyleId, bool] +) -> None: + TEXT = "こんにちは" + MORPH_RATE = 0.5 + + query = synthesizer.audio_query(TEXT, base) + + for target, should_success in targets.items(): + is_morphable = synthesizer.morphable_targets(base)[target].is_morphable + assert is_morphable == should_success + + if should_success: + # TODO: スナップショットテストをやる + synthesizer.synthesis_morphing(query, base, target, MORPH_RATE) + else: + with pytest.raises( + SpeakerFeatureError, + match=( + r"^`dummy[1-3]` \([0-9a-f-]{36}\)は以下の機能を持ちません: " + r"`dummy[1-3]` \([0-9a-f-]{36}\)に対するモーフィング$" + ), + ): + synthesizer.synthesis_morphing(query, base, target, MORPH_RATE) + + +def test_morphable_targets_raises_for_unknown_style(synthesizer: Synthesizer) -> None: + STYLE_ID = StyleId(9999) + + # FIXME: `KeyError.__init__`を通しているため、メッセージが`repr`で表示されてしまう + # https://github.com/VOICEVOX/voicevox_core/blob/4e13bca5a55a08d7aea08af4f949462bd284b1c1/crates/voicevox_core_python_api/src/convert.rs#L186-L206 + with pytest.raises( + StyleNotFoundError, + match=f"^'`{STYLE_ID}`に対するスタイルが見つかりませんでした。音声モデルが読み込まれていないか、読み込みが解除されています'$", + ): + synthesizer.morphable_targets(STYLE_ID) + + +@pytest.fixture +def synthesizer(open_jtalk: OpenJtalk, model: VoiceModel) -> Synthesizer: + synthesizer = Synthesizer(open_jtalk) + synthesizer.load_voice_model(model) + return synthesizer + + +@pytest.fixture(scope="session") +def open_jtalk() -> OpenJtalk: + return OpenJtalk(conftest.open_jtalk_dic_dir) + + +@pytest.fixture(scope="session") +def model() -> VoiceModel: + return VoiceModel.from_path(conftest.model_dir) diff --git a/crates/voicevox_core_python_api/python/voicevox_core/__init__.py b/crates/voicevox_core_python_api/python/voicevox_core/__init__.py index 9b93407fd..2dfca2b43 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/__init__.py +++ b/crates/voicevox_core_python_api/python/voicevox_core/__init__.py @@ -32,6 +32,7 @@ ParseKanaError, ReadZipEntryError, SaveUserDictError, + SpeakerFeatureError, StyleAlreadyLoadedError, StyleNotFoundError, UseUserDictError, @@ -66,6 +67,7 @@ "PermittedSynthesisMorphing", "ReadZipEntryError", "SaveUserDictError", + "SpeakerFeatureError", "SpeakerMeta", "SpeakerSupportedFeatures", "StyleAlreadyLoadedError", From 8891bb82fbaa65ad8dac65a9edd5ae0679183941 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Thu, 4 Jan 2024 19:15:39 +0900 Subject: [PATCH 39/46] =?UTF-8?q?`MorphableTargets`=20=E2=86=92=20`Morphab?= =?UTF-8?q?leStyles`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/morph.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 30a16b6df..885ca581d 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -10,7 +10,7 @@ use crate::{ SpeakerMeta, StyleId, StyleMeta, }; -use self::permit::MorphableTargets; +use self::permit::MorphableStyles; impl crate::blocking::Synthesizer { pub(crate) fn morphable_targets_( @@ -40,7 +40,7 @@ impl crate::blocking::Synthesizer { metas: &[SpeakerMeta], ) -> crate::Result { let pair = style_ids.lookup_speakers(metas)?; - Ok(MorphableTargets::permit(pair).is_ok()) + Ok(MorphableStyles::permit(pair).is_ok()) } pub(crate) fn synthesis_morphing_( @@ -58,11 +58,11 @@ impl crate::blocking::Synthesizer { } .lookup_speakers(metas)?; - MorphableTargets::permit(pair)?.synthesis_morphing(self, audio_query, morph_rate) + MorphableStyles::permit(pair)?.synthesis_morphing(self, audio_query, morph_rate) } } -impl<'metas> MorphableTargets<'metas> { +impl<'metas> MorphableStyles<'metas> { fn synthesis_morphing( self, synthesizer: &crate::blocking::Synthesizer, @@ -210,12 +210,12 @@ mod permit { use super::MorphingPair; - pub(super) struct MorphableTargets<'metas> { + pub(super) struct MorphableStyles<'metas> { inner: MorphingPair, marker: PhantomData<&'metas ()>, } - impl<'metas> MorphableTargets<'metas> { + impl<'metas> MorphableStyles<'metas> { pub(super) fn permit( pair: MorphingPair<(StyleId, &'metas SpeakerMeta)>, ) -> std::result::Result { From 8904be288cee5e6849d46f3138c8c0198a54540a Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Fri, 5 Jan 2024 00:09:20 +0900 Subject: [PATCH 40/46] =?UTF-8?q?=E3=82=B9=E3=83=9A=E3=82=AF=E3=83=88?= =?UTF-8?q?=E3=83=AD=E3=82=B0=E3=83=A9=E3=83=A0=E3=82=92ndarray=E3=81=A7?= =?UTF-8?q?=E6=8D=8C=E3=81=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 70 ++++++++++++------- Cargo.toml | 6 +- crates/voicevox_core/Cargo.toml | 2 +- crates/voicevox_core/src/engine/morph.rs | 87 ++++++++++++++++-------- 4 files changed, 106 insertions(+), 59 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 72ca4f57c..d4de6baa0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -425,7 +425,7 @@ version = "0.60.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cexpr", "clang-sys", "clap 3.2.22", @@ -444,23 +444,24 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.64.0" +version = "0.69.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" +checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2" dependencies = [ - "bitflags", + "bitflags 2.4.1", "cexpr", "clang-sys", "lazy_static", "lazycell", "log", "peeking_take_while", + "prettyplease", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", - "syn 1.0.102", + "syn 2.0.38", "which", ] @@ -481,6 +482,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + [[package]] name = "block-buffer" version = "0.9.0" @@ -681,7 +688,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750" dependencies = [ "atty", - "bitflags", + "bitflags 1.3.2", "clap_lex 0.2.4", "indexmap 1.9.1", "strsim", @@ -696,7 +703,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b1a0a4208c6c483b952ad35c6eed505fc13b46f08f631b81e828084a9318d74" dependencies = [ "atty", - "bitflags", + "bitflags 1.3.2", "clap_derive", "clap_lex 0.3.0", "once_cell", @@ -803,7 +810,7 @@ version = "0.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "318d6c16e73b3a900eb212ad6a82fc7d298c5ab8184c7a9998646455bc474a16" dependencies = [ - "bitflags", + "bitflags 1.3.2", "concolor-query", "is-terminal", ] @@ -2514,9 +2521,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "onnxruntime" @@ -2863,6 +2870,16 @@ dependencies = [ "yansi", ] +[[package]] +name = "prettyplease" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" +dependencies = [ + "proc-macro2", + "syn 2.0.38", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -3109,7 +3126,7 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -3118,19 +3135,19 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] name = "regex" -version = "1.10.0" +version = "1.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d119d7c7ca818f8a53c300863d4f87566aac09943aef5b355bb83969dae75d87" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.1", - "regex-syntax 0.8.1", + "regex-automata 0.4.3", + "regex-syntax 0.8.2", ] [[package]] @@ -3144,13 +3161,13 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.1" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465c6fc0621e4abc4187a2bda0937bfd4f722c2730b29562e19689ea796c9a4b" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.1", + "regex-syntax 0.8.2", ] [[package]] @@ -3167,9 +3184,9 @@ checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "regex-syntax" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56d84fdd47036b038fc80dd333d10b6aab10d5d31f4a366e20014def75328d33" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "reqwest" @@ -3287,7 +3304,7 @@ version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" dependencies = [ - "bitflags", + "bitflags 1.3.2", "errno 0.2.8", "io-lifetimes", "libc", @@ -3301,7 +3318,7 @@ version = "0.37.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" dependencies = [ - "bitflags", + "bitflags 1.3.2", "errno 0.3.1", "io-lifetimes", "libc", @@ -4912,8 +4929,9 @@ dependencies = [ [[package]] name = "world" version = "0.1.0" -source = "git+https://github.com/White-Green/WORLD_rs.git?rev=2337a30bfa47eebd32ef418c60ae5c7b39e43b99#2337a30bfa47eebd32ef418c60ae5c7b39e43b99" +source = "git+https://github.com/White-Green/WORLD_rs.git?rev=37c0d11691afd42e37c627a2a964459c9eaf77b3#37c0d11691afd42e37c627a2a964459c9eaf77b3" dependencies = [ + "ndarray", "once_cell", "world_sys", ] @@ -4921,9 +4939,9 @@ dependencies = [ [[package]] name = "world_sys" version = "0.1.0" -source = "git+https://github.com/White-Green/WORLD_rs.git?rev=2337a30bfa47eebd32ef418c60ae5c7b39e43b99#2337a30bfa47eebd32ef418c60ae5c7b39e43b99" +source = "git+https://github.com/White-Green/WORLD_rs.git?rev=37c0d11691afd42e37c627a2a964459c9eaf77b3#37c0d11691afd42e37c627a2a964459c9eaf77b3" dependencies = [ - "bindgen 0.64.0", + "bindgen 0.69.1", "cc", "once_cell", "regex", diff --git a/Cargo.toml b/Cargo.toml index cd6dd157c..eb1c62161 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,7 +50,7 @@ ndarray = "0.15.6" ndarray-stats = "0.5.1" num-traits = "0.2.15" octocrab = { version = "0.19.0", default-features = false } -once_cell = "1.18.0" +once_cell = "1.19.0" ouroboros = "0.18.0" parse-display = "0.8.2" pretty_assertions = "1.3.0" @@ -60,7 +60,7 @@ pyo3-asyncio = "0.19.0" pyo3-log = "0.9.0" quote = "1.0.33" rayon = "1.6.1" -regex = "1.10.0" +regex = "1.10.2" reqwest = { version = "0.11.13", default-features = false } rstest = "0.15.0" serde = "1.0.145" @@ -99,7 +99,7 @@ rev = "de226a26e8e18edbdb1d6f986afe37bbbf35fbf4" [workspace.dependencies.world] git = "https://github.com/White-Green/WORLD_rs.git" -rev = "2337a30bfa47eebd32ef418c60ae5c7b39e43b99" +rev = "37c0d11691afd42e37c627a2a964459c9eaf77b3" [workspace.package] version = "0.0.0" diff --git a/crates/voicevox_core/Cargo.toml b/crates/voicevox_core/Cargo.toml index 55f6419ef..85e36435a 100644 --- a/crates/voicevox_core/Cargo.toml +++ b/crates/voicevox_core/Cargo.toml @@ -40,7 +40,7 @@ tokio = { workspace = true, features = ["rt"] } # FIXME: feature-gateする tracing.workspace = true uuid = { workspace = true, features = ["v4", "serde"] } voicevox_core_macros = { path = "../voicevox_core_macros" } -world.workspace = true +world = { workspace = true, features = ["ndarray"] } zip.workspace = true [dev-dependencies] diff --git a/crates/voicevox_core/src/engine/morph.rs b/crates/voicevox_core/src/engine/morph.rs index 885ca581d..fb7257c36 100644 --- a/crates/voicevox_core/src/engine/morph.rs +++ b/crates/voicevox_core/src/engine/morph.rs @@ -1,5 +1,8 @@ -use std::collections::BTreeMap; +use std::{collections::BTreeMap, iter}; +use easy_ext::ext; +use ndarray::{Array, Array2}; +use num_traits::Zero; use world::{ signal_analyzer::{AnalyzeResult, SignalAnalyzerBuilder}, spectrogram_like::SpectrogramLike, @@ -73,34 +76,20 @@ impl<'metas> MorphableStyles<'metas> { synthesizer.synthesis_wave(audio_query, style_id, &Default::default()) })?; - let morph_param = MorphingParameter::new(waves); - - let mut morph_spectrogram = SpectrogramLike::::new( - morph_param.base_spectrogram.time_axis_size(), - morph_param.base_spectrogram.frequency_axis_size(), - ); - - // FIXME: サイズ違いの場合は"resize"する - for (morph_spectrogram, (base_spectrogram, target_spectrogram)) in itertools::zip_eq( - morph_spectrogram.lines_mut(), - itertools::zip_eq( - morph_param.base_spectrogram.lines(), - morph_param.target_spectrogram.lines(), - ), - ) { - for (morph_spectrogram, (base_spectrogram, target_spectrogram)) in itertools::zip_eq( - morph_spectrogram, - itertools::zip_eq(base_spectrogram, target_spectrogram), - ) { - *morph_spectrogram = - base_spectrogram * (1. - morph_rate) + target_spectrogram * morph_rate; - } - } + let MorphingParameter { + base_f0, + base_aperiodicity, + base_spectrogram, + target_spectrogram, + } = &MorphingParameter::new(waves); + + let morph_spectrogram = + &(base_spectrogram * (1. - morph_rate) + target_spectrogram * morph_rate).into(); let wave = &world::synthesis::synthesis( - &morph_param.base_f0, - &morph_spectrogram, - &morph_param.base_aperiodicity, + base_f0, + morph_spectrogram, + base_aperiodicity, None, FRAME_PERIOD, DEFAULT_SAMPLING_RATE, @@ -118,8 +107,8 @@ impl<'metas> MorphableStyles<'metas> { struct MorphingParameter { base_f0: Box<[f64]>, base_aperiodicity: SpectrogramLike, - base_spectrogram: SpectrogramLike, - target_spectrogram: SpectrogramLike, + base_spectrogram: Array2, + target_spectrogram: Array2, } impl MorphingParameter { @@ -127,6 +116,10 @@ impl<'metas> MorphableStyles<'metas> { let (base_f0, base_spectrogram, base_aperiodicity) = analyze(&wave.base); let (_, target_spectrogram, _) = analyze(&wave.target); + let base_spectrogram = Array::from(base_spectrogram); + let target_spectrogram = + Array::from(target_spectrogram).resize(base_spectrogram.dim()); + Self { base_f0, base_aperiodicity, @@ -199,6 +192,23 @@ impl MorphingPair { } } +#[ext(Array2Ext)] +impl Array2 { + fn resize(self, (nrows, ncols): (usize, usize)) -> Self { + if self.dim() == (nrows, ncols) { + return self; + } + + let mut ret = Array2::zeros((nrows, ncols)); + for (ret, this) in iter::zip(ret.rows_mut(), self.rows()) { + for (ret, this) in iter::zip(ret, this) { + *ret = *this; + } + } + ret + } +} + mod permit { use std::marker::PhantomData; @@ -261,3 +271,22 @@ mod permit { } } } + +#[cfg(test)] +mod tests { + use ndarray::{array, Array2}; + use rstest::rstest; + + use super::Array2Ext as _; + + #[rstest] + #[case(array![[1]], (2, 2), array![[1, 0], [0, 0]])] + #[case(array![[1, 1], [1, 1]], (1, 1), array![[1]])] + fn resize_works( + #[case] arr: Array2, + #[case] dim: (usize, usize), + #[case] expected: Array2, + ) { + pretty_assertions::assert_eq!(expected, arr.resize(dim)); + } +} From 4444c694fb0db7da608c339301c7cab56261f386 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Fri, 5 Jan 2024 04:39:30 +0900 Subject: [PATCH 41/46] Minor refactor --- crates/voicevox_core_c_api/src/helpers.rs | 6 ++++++ crates/voicevox_core_c_api/src/lib.rs | 5 ++--- crates/voicevox_core_c_api/src/result_code.rs | 4 +--- crates/voicevox_core_c_api/tests/e2e/snapshots.toml | 2 -- .../java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java | 4 ++-- .../python/test/test_asyncio_morph.py | 4 +--- .../python/test/test_blocking_morph.py | 4 +--- crates/voicevox_core_python_api/src/convert.rs | 1 + 8 files changed, 14 insertions(+), 16 deletions(-) diff --git a/crates/voicevox_core_c_api/src/helpers.rs b/crates/voicevox_core_c_api/src/helpers.rs index 5c70ce5e5..1864de0c9 100644 --- a/crates/voicevox_core_c_api/src/helpers.rs +++ b/crates/voicevox_core_c_api/src/helpers.rs @@ -65,6 +65,12 @@ pub(crate) type CApiResult = std::result::Result; #[derive(Error, Debug)] pub enum CApiError { + // FIXME: こんな感じになってしまう。`#[error(transparent)]`とするべき + // + // ``` + // {timestamp} ERROR voicevox_core::helpers: `0`に対するスタイルが見つかりませんでした。音声モデルが読み込まれていないか、読み込みが解除されています + // {timestamp} ERROR voicevox_core::helpers: Caused by: `0`に対するスタイルが見つかりませんでした。音声モデルが読み込まれていないか、読み込みが解除されています + // ``` #[error("{0}")] RustApi(#[from] voicevox_core::Error), #[error("UTF-8として不正な入力です")] diff --git a/crates/voicevox_core_c_api/src/lib.rs b/crates/voicevox_core_c_api/src/lib.rs index 804338e83..1b6ec2e10 100644 --- a/crates/voicevox_core_c_api/src/lib.rs +++ b/crates/voicevox_core_c_api/src/lib.rs @@ -536,9 +536,8 @@ pub unsafe extern "C" fn voicevox_synthesizer_create_morphable_targets_json( ) -> VoicevoxResultCode { init_logger_once(); into_result_code_with_error((|| { - let morphable_targets = &synthesizer - .synthesizer - .morphable_targets(StyleId::new(style_id))?; + let style_id = StyleId::new(style_id); + let morphable_targets = &synthesizer.synthesizer.morphable_targets(style_id)?; let morphable_targets = serde_json::to_string(morphable_targets).expect("should not fail"); let morphable_targets = CString::new(morphable_targets).expect("should not end with NUL"); output.as_ptr().write_unaligned( diff --git a/crates/voicevox_core_c_api/src/result_code.rs b/crates/voicevox_core_c_api/src/result_code.rs index 44c2b860d..a881aed02 100644 --- a/crates/voicevox_core_c_api/src/result_code.rs +++ b/crates/voicevox_core_c_api/src/result_code.rs @@ -109,8 +109,6 @@ pub(crate) const fn error_result_to_message(result_code: VoicevoxResultCode) -> cstr!("ユーザー辞書の単語のバリデーションに失敗しました") } VOICEVOX_RESULT_INVALID_UUID_ERROR => cstr!("UUIDの変換に失敗しました"), - VOICEVOX_RESULT_SPEAKER_FEATURE_ERROR => { - cstr!("要求された機能を話者は持っていません") - } + VOICEVOX_RESULT_SPEAKER_FEATURE_ERROR => cstr!("要求された機能を話者は持っていません"), } } diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index ccecff692..6236955da 100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -100,8 +100,6 @@ stderr.unix = "" [morph.'{"text":"こんにちは、音声合成の世界へようこそ","base_style":1,"target_style":302}'] ok = false -# FIXME: 以下の部分が`#[error(transparent)]`ではなく`#[error("{0}")]`となっているために、このようなエラー表示になってしまっている -# https://github.com/VOICEVOX/voicevox_core/blob/4e13bca5a55a08d7aea08af4f949462bd284b1c1/crates/voicevox_core_c_api/src/helpers.rs#L67 stderr.windows = ''' {windows-video-cards} {timestamp} ERROR voicevox_core::helpers: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング diff --git a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java index 282df0264..a50ce6cc5 100644 --- a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java +++ b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java @@ -151,7 +151,7 @@ void checkMorphing(MorphParams params) } } - static Stream morphParamsProvider() { + private static Stream morphParamsProvider() { return Stream.of( new MorphParams( 0, @@ -196,7 +196,7 @@ static Stream morphParamsProvider() { } // TODO: Lombokを使う - static class MorphParams { + private static class MorphParams { private final int baseStyleId; private final SortedMap targets; diff --git a/crates/voicevox_core_python_api/python/test/test_asyncio_morph.py b/crates/voicevox_core_python_api/python/test/test_asyncio_morph.py index 4adcfc6f0..0c1c4cd7e 100644 --- a/crates/voicevox_core_python_api/python/test/test_asyncio_morph.py +++ b/crates/voicevox_core_python_api/python/test/test_asyncio_morph.py @@ -81,11 +81,9 @@ async def test_morph( await synthesizer.synthesis_morphing(query, base, target, MORPH_RATE) -def test_morphable_targets_raises_for_unknown_style(synthesizer: Synthesizer) -> None: +def test_morphable_targets_denies_unknown_style(synthesizer: Synthesizer) -> None: STYLE_ID = StyleId(9999) - # FIXME: `KeyError.__init__`を通しているため、メッセージが`repr`で表示されてしまう - # https://github.com/VOICEVOX/voicevox_core/blob/4e13bca5a55a08d7aea08af4f949462bd284b1c1/crates/voicevox_core_python_api/src/convert.rs#L186-L206 with pytest.raises( StyleNotFoundError, match=f"^'`{STYLE_ID}`に対するスタイルが見つかりませんでした。音声モデルが読み込まれていないか、読み込みが解除されています'$", diff --git a/crates/voicevox_core_python_api/python/test/test_blocking_morph.py b/crates/voicevox_core_python_api/python/test/test_blocking_morph.py index 6551e9eb8..07fe394e4 100644 --- a/crates/voicevox_core_python_api/python/test/test_blocking_morph.py +++ b/crates/voicevox_core_python_api/python/test/test_blocking_morph.py @@ -79,11 +79,9 @@ def test_morph( synthesizer.synthesis_morphing(query, base, target, MORPH_RATE) -def test_morphable_targets_raises_for_unknown_style(synthesizer: Synthesizer) -> None: +def test_morphable_targets_denies_unknown_style(synthesizer: Synthesizer) -> None: STYLE_ID = StyleId(9999) - # FIXME: `KeyError.__init__`を通しているため、メッセージが`repr`で表示されてしまう - # https://github.com/VOICEVOX/voicevox_core/blob/4e13bca5a55a08d7aea08af4f949462bd284b1c1/crates/voicevox_core_python_api/src/convert.rs#L186-L206 with pytest.raises( StyleNotFoundError, match=f"^'`{STYLE_ID}`に対するスタイルが見つかりませんでした。音声モデルが読み込まれていないか、読み込みが解除されています'$", diff --git a/crates/voicevox_core_python_api/src/convert.rs b/crates/voicevox_core_python_api/src/convert.rs index 1a957cd14..39e03e77a 100644 --- a/crates/voicevox_core_python_api/src/convert.rs +++ b/crates/voicevox_core_python_api/src/convert.rs @@ -183,6 +183,7 @@ pub impl voicevox_core::Result { use voicevox_core::ErrorKind; self.map_err(|err| { + // FIXME: `KeyError`を継承しているエラーでは、`msg`が`repr`で表示されてしまう let msg = err.to_string(); let top = match err.kind() { ErrorKind::NotLoadedOpenjtalkDict => NotLoadedOpenjtalkDictError::new_err(msg), From bdb7c3b8357a04a545a6bbb3737e4f0a965483a4 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Fri, 5 Jan 2024 05:22:02 +0900 Subject: [PATCH 42/46] =?UTF-8?q?C=20API=E3=81=A7=E3=82=8216=E9=80=9A?= =?UTF-8?q?=E3=82=8A=E3=83=86=E3=82=B9=E3=83=88=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/e2e/snapshots.toml | 151 +++++++++++++++++- .../tests/e2e/testcases/morph.rs | 69 ++++++-- 2 files changed, 206 insertions(+), 14 deletions(-) diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index 6236955da..6068213f3 100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -84,21 +84,86 @@ result_messages.25 = "UUIDの変換に失敗しました" result_messages.28 = "要求された機能を話者は持っていません" stderr = "" -[morph.'{"text":"こんにちは、音声合成の世界へようこそ","base_style":1,"target_style":1}'] -ok = true +[morph.'{"base_style":0,"target_style":0}'] +ok = false stderr.windows = ''' {windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +''' + +[morph.'{"base_style":0,"target_style":1}'] +ok = false +stderr.windows = ''' +{windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +''' + +[morph.'{"base_style":0,"target_style":302}'] +ok = false +stderr.windows = ''' +{windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +''' + +[morph.'{"base_style":0,"target_style":303}'] +ok = false +stderr.windows = ''' +{windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +''' + +[morph.'{"base_style":1,"target_style":0}'] +ok = false +stderr.windows = ''' +{windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング ''' -stderr.unix = "" -[morph.'{"text":"こんにちは、音声合成の世界へようこそ","base_style":302,"target_style":303}'] +[morph.'{"base_style":1,"target_style":1}'] ok = true stderr.windows = ''' {windows-video-cards} ''' stderr.unix = "" -[morph.'{"text":"こんにちは、音声合成の世界へようこそ","base_style":1,"target_style":302}'] +[morph.'{"base_style":1,"target_style":302}'] +ok = false +stderr.windows = ''' +{windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング +''' + +[morph.'{"base_style":1,"target_style":303}'] ok = false stderr.windows = ''' {windows-video-cards} @@ -110,6 +175,82 @@ stderr.unix = ''' {timestamp} ERROR voicevox_core::helpers: Caused by: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)は以下の機能を持ちません: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)に対するモーフィング ''' +[morph.'{"base_style":302,"target_style":0}'] +ok = false +stderr.windows = ''' +{windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +''' + +[morph.'{"base_style":302,"target_style":1}'] +ok = false +stderr.windows = ''' +{windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +''' + +[morph.'{"base_style":302,"target_style":302}'] +ok = true +stderr.windows = ''' +{windows-video-cards} +''' +stderr.unix = "" + +[morph.'{"base_style":302,"target_style":303}'] +ok = true +stderr.windows = ''' +{windows-video-cards} +''' +stderr.unix = "" + +[morph.'{"base_style":303,"target_style":0}'] +ok = false +stderr.windows = ''' +{windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy1` (574bc678-8370-44be-b941-08e46e7b47d7)に対するモーフィング +''' + +[morph.'{"base_style":303,"target_style":1}'] +ok = false +stderr.windows = ''' +{windows-video-cards} +{timestamp} ERROR voicevox_core::helpers: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +''' +stderr.unix = ''' +{timestamp} ERROR voicevox_core::helpers: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +{timestamp} ERROR voicevox_core::helpers: Caused by: `dummy3` (5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3)は以下の機能を持ちません: `dummy2` (dd9ccd75-75f6-40ce-a3db-960cbed2e905)に対するモーフィング +''' + +[morph.'{"base_style":303,"target_style":302}'] +ok = true +stderr.windows = ''' +{windows-video-cards} +''' +stderr.unix = "" + +[morph.'{"base_style":303,"target_style":303}'] +ok = true +stderr.windows = ''' +{windows-video-cards} +''' +stderr.unix = "" + [simple_tts] output."こんにちは、音声合成の世界へようこそ".wav_length = 176172 stderr.windows = ''' diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs index 20b79004d..7d4e9b653 100644 --- a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs +++ b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs @@ -22,25 +22,79 @@ use crate::{ }, }; +const TEXT: &CStr = cstr!("こんにちは、音声合成の世界へようこそ"); +const MORPH_RATE: f64 = 0.5; + +case!(TestCase { + base_style: 0, + target_style: 0, +}); +case!(TestCase { + base_style: 0, + target_style: 1, +}); +case!(TestCase { + base_style: 0, + target_style: 302, +}); +case!(TestCase { + base_style: 0, + target_style: 303, +}); + +case!(TestCase { + base_style: 1, + target_style: 0, +}); case!(TestCase { - text: "こんにちは、音声合成の世界へようこそ".to_owned(), base_style: 1, target_style: 1, }); case!(TestCase { - text: "こんにちは、音声合成の世界へようこそ".to_owned(), + base_style: 1, + target_style: 302, +}); +case!(TestCase { + base_style: 1, + target_style: 303, +}); + +case!(TestCase { + base_style: 302, + target_style: 0, +}); +case!(TestCase { + base_style: 302, + target_style: 1, +}); +case!(TestCase { + base_style: 302, + target_style: 302, +}); +case!(TestCase { base_style: 302, target_style: 303, }); + case!(TestCase { - text: "こんにちは、音声合成の世界へようこそ".to_owned(), - base_style: 1, + base_style: 303, + target_style: 0, +}); +case!(TestCase { + base_style: 303, + target_style: 1, +}); +case!(TestCase { + base_style: 303, target_style: 302, }); +case!(TestCase { + base_style: 303, + target_style: 303, +}); #[derive(Serialize, Deserialize)] struct TestCase { - text: String, base_style: VoicevoxStyleId, target_style: VoicevoxStyleId, } @@ -107,10 +161,9 @@ impl assert_cdylib::TestCase for TestCase { let audio_query = { let mut audio_query = MaybeUninit::uninit(); - let text = CString::new(&*self.text).unwrap(); assert_ok(voicevox_synthesizer_create_audio_query( synthesizer, - text.as_ptr(), + TEXT.as_ptr(), self.base_style, audio_query.as_mut_ptr(), )); @@ -134,8 +187,6 @@ impl assert_cdylib::TestCase for TestCase { // TODO: スナップショットテストをやる let result = { - const MORPH_RATE: f64 = 0.5; - let mut wav_length = MaybeUninit::uninit(); let mut wav = MaybeUninit::uninit(); let result = voicevox_synthesizer_synthesis_morphing( From 503574092d071af1516a114116d19f5075f5191c Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 11 Feb 2024 03:38:14 +0900 Subject: [PATCH 43/46] =?UTF-8?q?=E3=83=86=E3=82=B9=E3=83=88=E3=82=92?= =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/metas.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/crates/voicevox_core/src/metas.rs b/crates/voicevox_core/src/metas.rs index 8169a0e9a..e5aa3d08e 100644 --- a/crates/voicevox_core/src/metas.rs +++ b/crates/voicevox_core/src/metas.rs @@ -191,6 +191,9 @@ mod tests { ], "version": "0.0.0", "speaker_uuid": "f34ab151-c0f5-4e0a-9ad2-51ce30dba24d", + "supported_features": { + "permitted_synthesis_morphing": "SELF_ONLY" + }, "order": 1 }, { @@ -204,6 +207,9 @@ mod tests { ], "version": "0.0.0", "speaker_uuid": "d6fd707c-a451-48e9-8f00-fe9ee3bf6264", + "supported_features": { + "permitted_synthesis_morphing": "SELF_ONLY" + }, "order": 0 }, { @@ -222,6 +228,9 @@ mod tests { ], "version": "0.0.0", "speaker_uuid": "d6fd707c-a451-48e9-8f00-fe9ee3bf6264", + "supported_features": { + "permitted_synthesis_morphing": "SELF_ONLY" + }, "order": 0 } ]) @@ -250,6 +259,9 @@ mod tests { ], "version": "0.0.0", "speaker_uuid": "d6fd707c-a451-48e9-8f00-fe9ee3bf6264", + "supported_features": { + "permitted_synthesis_morphing": "SELF_ONLY" + }, "order": 0 }, { @@ -263,6 +275,9 @@ mod tests { ], "version": "0.0.0", "speaker_uuid": "f34ab151-c0f5-4e0a-9ad2-51ce30dba24d", + "supported_features": { + "permitted_synthesis_morphing": "SELF_ONLY" + }, "order": 1 } ]) From 2827328f356cec86c13f47376ef33c3dbeb6a320 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sat, 16 Mar 2024 22:49:47 +0900 Subject: [PATCH 44/46] =?UTF-8?q?TODO=E3=82=B3=E3=83=A1=E3=83=B3=E3=83=88?= =?UTF-8?q?=E3=82=92=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/engine/audio_file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/voicevox_core/src/engine/audio_file.rs b/crates/voicevox_core/src/engine/audio_file.rs index 73f0b9cf1..a5f625228 100644 --- a/crates/voicevox_core/src/engine/audio_file.rs +++ b/crates/voicevox_core/src/engine/audio_file.rs @@ -9,7 +9,7 @@ pub(crate) fn to_wav + From + Cast>( wave: &[T], audio_query: &AudioQueryModel, ) -> Vec { - // TODO: ライブラリ(e.g. https://docs.rs/rubato & https://docs.rs/hound)を使う + // TODO: https://github.com/VOICEVOX/voicevox_core/issues/762 let volume_scale = *audio_query.volume_scale(); let output_stereo = *audio_query.output_stereo(); From 3b8f4297f9c7cd1d845dea5049f7ca38d0b2df07 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sun, 19 May 2024 18:43:49 +0900 Subject: [PATCH 45/46] Fix a test --- crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs index 94024f34a..f43d66680 100644 --- a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs +++ b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs @@ -112,7 +112,7 @@ impl assert_cdylib::TestCase for TestCase { let model = { let mut model = MaybeUninit::uninit(); assert_ok(lib.voicevox_voice_model_new_from_path( - cstr!("../../model/sample.vvm").as_ptr(), + cstr!("../test_util/data/sample.vvm").as_ptr(), model.as_mut_ptr(), )); model.assume_init() From 97b2e81c90d90c291bb7bc192288a481cb7925cd Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Wed, 22 May 2024 14:06:25 +0900 Subject: [PATCH 46/46] fixup! Fix a test --- crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs index f43d66680..73da15b10 100644 --- a/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs +++ b/crates/voicevox_core_c_api/tests/e2e/testcases/morph.rs @@ -112,7 +112,7 @@ impl assert_cdylib::TestCase for TestCase { let model = { let mut model = MaybeUninit::uninit(); assert_ok(lib.voicevox_voice_model_new_from_path( - cstr!("../test_util/data/sample.vvm").as_ptr(), + c_api::SAMPLE_VOICE_MODEL_FILE_PATH.as_ptr(), model.as_mut_ptr(), )); model.assume_init()