Merge commit '0864f106085e91a90ca60241f1f375db4ea4dfcc' into HEAD

以下のようにしてマージ。

- ソングのモデルもsample.vvmの中に入れて一つにする。それに伴い、`release-0.15`で増えた一部のテストは削除。
- モデルのロードに関しては、現行のVVMを前提にした処理に置き換える。
- 推論の実行に関しては`ndarray`を前提とした処理に置き換える。
VOICEVOX · Dec 18, 2024 · a2452a1
2 parents 66582bd + 0864f10
commit a2452a1
Show file tree

Hide file tree

Showing 18 changed files with 1,092 additions and 68 deletions.
diff --git a/_typos.toml b/_typos.toml
@@ -12,4 +12,4 @@ NdArray="NdArray" # onnxruntime::session::NdArray
 [default.extend-words]
 
 [files]
-extend-exclude = ["*.svg"]
+extend-exclude = ["*.svg", "*.onnx"]
diff --git a/crates/voicevox_core/src/infer.rs b/crates/voicevox_core/src/infer.rs
@@ -197,23 +197,33 @@ pub(crate) trait OutputScalar: Sized {
     fn extract(tensor: OutputTensor) -> std::result::Result<ArrayD<Self>, ExtractError>;
 }
 
-impl OutputScalar for f32 {
-    const KIND: OutputScalarKind = OutputScalarKind::Float32;
+#[duplicate_item(
+    T        Kind;
+    [ i64 ] [ Int64 ];
+    [ f32 ] [ Float32 ];
+)]
+impl OutputScalar for T {
+    const KIND: OutputScalarKind = OutputScalarKind::Kind;
 
     fn extract(tensor: OutputTensor) -> std::result::Result<ArrayD<Self>, ExtractError> {
         match tensor {
-            OutputTensor::Float32(tensor) => Ok(tensor),
+            OutputTensor::Kind(tensor) => Ok(tensor),
+            _ => Err(ExtractError::Datatype),
         }
     }
 }
 
 #[derive(Clone, Copy, PartialEq, derive_more::Display)]
 pub(crate) enum OutputScalarKind {
+    #[display("int64_t")]
+    Int64,
+
     #[display("float")]
     Float32,
 }
 
 pub(crate) enum OutputTensor {
+    Int64(ArrayD<i64>),
     Float32(ArrayD<f32>),
 }
 
@@ -246,8 +256,12 @@ pub(crate) struct InferenceSessionOptions {
     pub(crate) device: DeviceSpec,
 }
 
+// TODO: `ShapeError`を直接扱い、データ型違いはパニックにすべきでは？
 #[derive(Error, Debug)]
 pub(crate) enum ExtractError {
+    #[error("wrong datatype")]
+    Datatype,
+
     #[error(transparent)]
     Shape(#[from] ShapeError),
 }

diff --git a/crates/voicevox_core/src/infer/domains.rs b/crates/voicevox_core/src/infer/domains.rs
@@ -1,75 +1,134 @@
+mod frame_decode;
+mod singing_teacher;
 mod talk;
 
 use educe::Educe;
 use serde::{Deserialize, Deserializer};
 
-pub(crate) use self::talk::{
-    GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
-    PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
-    RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
+pub(crate) use self::{
+    frame_decode::{FrameDecodeDomain, FrameDecodeOperation, SfDecodeInput, SfDecodeOutput},
+    singing_teacher::{
+        PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, PredictSingF0Input,
+        PredictSingF0Output, PredictSingVolumeInput, PredictSingVolumeOutput, SingingTeacherDomain,
+        SingingTeacherOperation,
+    },
+    talk::{
+        GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
+        PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
+        RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
+    },
 };
 
 #[derive(Educe)]
 // TODO: `bounds`に`V: ?Sized`も入れようとすると、よくわからない理由で弾かれる。最新版のeduce
 // でもそうなのか？また最新版でも駄目だとしたら、弾いている理由は何なのか？
-#[educe(Clone(bound = "V: InferenceDomainMapValues, V::Talk: Clone"))]
+#[educe(Clone(
+    bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone"
+))]
 pub(crate) struct InferenceDomainMap<V: InferenceDomainMapValues + ?Sized> {
     pub(crate) talk: V::Talk,
+    pub(crate) singing_teacher: V::SingingTeacher,
+    pub(crate) frame_decode: V::FrameDecode,
 }
 
-impl<T> InferenceDomainMap<(T,)> {
-    pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T,)> {
+impl<T, S, F> InferenceDomainMap<(T, S, F)> {
+    pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &S, &F)> {
         let talk = &self.talk;
-        InferenceDomainMap { talk }
+        let singing_teacher = &self.singing_teacher;
+        let frame_decode = &self.frame_decode;
+        InferenceDomainMap {
+            talk,
+            singing_teacher,
+            frame_decode,
+        }
     }
 
-    pub(crate) fn map<T2, Ft: FnOnce(T) -> T2>(
+    pub(crate) fn map<T2, S2, F2, Ft: FnOnce(T) -> T2, Fs: FnOnce(S) -> S2, Ff: FnOnce(F) -> F2>(
         self,
-        fs: InferenceDomainMap<(Ft,)>,
-    ) -> InferenceDomainMap<(T2,)> {
+        fs: InferenceDomainMap<(Ft, Fs, Ff)>,
+    ) -> InferenceDomainMap<(T2, S2, F2)> {
         let talk = (fs.talk)(self.talk);
-        InferenceDomainMap { talk }
+        let singing_teacher = (fs.singing_teacher)(self.singing_teacher);
+        let frame_decode = (fs.frame_decode)(self.frame_decode);
+        InferenceDomainMap {
+            talk,
+            singing_teacher,
+            frame_decode,
+        }
     }
 }
 
-impl<T, E> InferenceDomainMap<(Result<T, E>,)> {
-    pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T,)>, E> {
+impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)> {
+    pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, S, F)>, E> {
         let talk = self.talk?;
-        Ok(InferenceDomainMap { talk })
+        let singing_teacher = self.singing_teacher?;
+        let frame_decode = self.frame_decode?;
+        Ok(InferenceDomainMap {
+            talk,
+            singing_teacher,
+            frame_decode,
+        })
     }
 }
 
 impl<'de, V: InferenceDomainMapValues + ?Sized> Deserialize<'de> for InferenceDomainMap<V>
 where
     V::Talk: Deserialize<'de>,
+    V::SingingTeacher: Deserialize<'de>,
+    V::FrameDecode: Deserialize<'de>,
 {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
     where
         D: Deserializer<'de>,
     {
-        let Repr { talk } = Repr::deserialize(deserializer)?;
-        return Ok(Self { talk });
+        let Repr {
+            talk,
+            singing_teacher,
+            frame_decode,
+        } = Repr::deserialize(deserializer)?;
+        return Ok(Self {
+            talk,
+            singing_teacher,
+            frame_decode,
+        });
 
         #[derive(Deserialize)]
-        struct Repr<T> {
+        struct Repr<T, S, F> {
             talk: T,
+            singing_teacher: S,
+            frame_decode: F,
         }
     }
 }
 
 pub(crate) trait InferenceDomainMapValues {
     type Talk;
+    type SingingTeacher;
+    type FrameDecode;
 }
 
-impl<T> InferenceDomainMapValues for (T,) {
+impl<T, S, F> InferenceDomainMapValues for (T, S, F) {
     type Talk = T;
+    type SingingTeacher = S;
+    type FrameDecode = F;
 }
 
 macro_rules! inference_domain_map_values {
     (for<$arg:ident> $body:ty) => {
-        (::macros::substitute_type!(
-            $body where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain
-        ),)
+        (
+            ::macros::substitute_type!(
+                $body
+                where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain
+            ),
+            ::macros::substitute_type!(
+                $body
+                where $arg = crate::infer::domains::SingingTeacherDomain as crate::infer::InferenceDomain
+            ),
+            ::macros::substitute_type!(
+                $body
+                where $arg = crate::infer::domains::FrameDecodeDomain as crate::infer::InferenceDomain
+            ),
+        )
     };
 }
 pub(crate) use inference_domain_map_values;
diff --git a/crates/voicevox_core/src/infer/domains/frame_decode.rs b/crates/voicevox_core/src/infer/domains/frame_decode.rs
@@ -0,0 +1,52 @@
+use std::{collections::BTreeSet, sync::LazyLock};
+
+use enum_map::Enum;
+use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature};
+use ndarray::{Array1, Array2};
+
+use crate::{manifest::FrameDecodeManifest, StyleType};
+
+use super::super::{
+    InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor,
+};
+
+pub(crate) enum FrameDecodeDomain {}
+
+impl InferenceDomain for FrameDecodeDomain {
+    type Operation = FrameDecodeOperation;
+    type Manifest = FrameDecodeManifest;
+
+    fn style_types() -> &'static BTreeSet<StyleType> {
+        static STYLE_TYPES: LazyLock<BTreeSet<StyleType>> =
+            LazyLock::new(|| [StyleType::FrameDecode, StyleType::Sing].into());
+        &STYLE_TYPES
+    }
+}
+
+#[derive(Clone, Copy, Enum, InferenceOperation)]
+#[inference_operation(
+    type Domain = FrameDecodeDomain;
+)]
+pub(crate) enum FrameDecodeOperation {
+    #[inference_operation(
+        type Input = SfDecodeInput;
+        type Output = SfDecodeOutput;
+    )]
+    SfDecode,
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = SfDecode;
+)]
+pub(crate) struct SfDecodeInput {
+    pub(crate) frame_phonemes: Array2<i64>,
+    pub(crate) frame_f0s: Array2<f32>,
+    pub(crate) frame_volumes: Array2<f32>,
+    pub(crate) speaker_id: Array1<i64>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct SfDecodeOutput {
+    pub(crate) wav: Array2<f32>,
+}
diff --git a/crates/voicevox_core/src/infer/domains/singing_teacher.rs b/crates/voicevox_core/src/infer/domains/singing_teacher.rs
@@ -0,0 +1,95 @@
+use std::{collections::BTreeSet, sync::LazyLock};
+
+use enum_map::Enum;
+use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature};
+use ndarray::{Array1, Array2};
+
+use crate::{manifest::SingingTeacherManifest, StyleType};
+
+use super::super::{
+    InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor,
+};
+
+pub(crate) enum SingingTeacherDomain {}
+
+impl InferenceDomain for SingingTeacherDomain {
+    type Operation = SingingTeacherOperation;
+    type Manifest = SingingTeacherManifest;
+
+    fn style_types() -> &'static BTreeSet<StyleType> {
+        static STYLE_TYPES: LazyLock<BTreeSet<StyleType>> =
+            LazyLock::new(|| [StyleType::SingingTeacher, StyleType::Sing].into());
+        &STYLE_TYPES
+    }
+}
+
+#[derive(Clone, Copy, Enum, InferenceOperation)]
+#[inference_operation(
+    type Domain = SingingTeacherDomain;
+)]
+pub(crate) enum SingingTeacherOperation {
+    #[inference_operation(
+        type Input = PredictSingConsonantLengthInput;
+        type Output = PredictSingConsonantLengthOutput;
+    )]
+    PredictSingConsonantLength,
+
+    #[inference_operation(
+        type Input = PredictSingF0Input;
+        type Output = PredictSingF0Output;
+    )]
+    PredictSingF0,
+
+    #[inference_operation(
+        type Input = PredictSingVolumeInput;
+        type Output = PredictSingVolumeOutput;
+    )]
+    PredictSingVolume,
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = PredictSingConsonantLength;
+)]
+pub(crate) struct PredictSingConsonantLengthInput {
+    pub(crate) consonants: Array2<i64>,
+    pub(crate) vowels: Array2<i64>,
+    pub(crate) note_durations: Array2<i64>,
+    pub(crate) speaker_id: Array1<i64>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct PredictSingConsonantLengthOutput {
+    pub(crate) consonant_lengths: Array2<i64>,
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = PredictSingF0;
+)]
+pub(crate) struct PredictSingF0Input {
+    pub(crate) phonemes: Array2<i64>,
+    pub(crate) notes: Array2<i64>,
+    pub(crate) speaker_id: Array1<i64>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct PredictSingF0Output {
+    pub(crate) f0s: Array2<f32>,
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = PredictSingVolume;
+)]
+pub(crate) struct PredictSingVolumeInput {
+    pub(crate) phonemes: Array2<i64>,
+    pub(crate) notes: Array2<i64>,
+    pub(crate) frame_f0s: Array2<f32>,
+    pub(crate) speaker_id: Array1<i64>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct PredictSingVolumeOutput {
+    pub(crate) volumes: Array2<f32>,
+}
diff --git a/crates/voicevox_core/src/infer/runtimes/onnxruntime.rs b/crates/voicevox_core/src/infer/runtimes/onnxruntime.rs
@@ -158,7 +158,7 @@ impl InferenceRuntime for self::blocking::Onnxruntime {
                     TensorElementType::Uint16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16"),
                     TensorElementType::Int16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16"),
                     TensorElementType::Int32 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32"),
-                    TensorElementType::Int64 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64"),
+                    TensorElementType::Int64 => Ok(OutputScalarKind::Int64),
                     TensorElementType::String => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING"),
                     TensorElementType::Bfloat16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16"),
                     TensorElementType::Float16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16"),
@@ -253,6 +253,10 @@ fn extract_outputs(outputs: &ort::SessionOutputs<'_, '_>) -> anyhow::Result<Vec<
             };
 
             match ty {
+                TensorElementType::Int64 => {
+                    let output = output.try_extract_tensor::<i64>()?;
+                    Ok(OutputTensor::Int64(output.into_owned()))
+                }
                 TensorElementType::Float32 => {
                     let output = output.try_extract_tensor::<f32>()?;
                     Ok(OutputTensor::Float32(output.into_owned()))