Skip to content

Commit

Permalink
Merge commit '0864f106085e91a90ca60241f1f375db4ea4dfcc' into HEAD
Browse files Browse the repository at this point in the history
以下のようにしてマージ。

- ソングのモデルもsample.vvmの中に入れて一つにする。それに伴い、
  `release-0.15`で増えた一部のテストは削除。
- モデルのロードに関しては、現行のVVMを前提にした処理に置き換える。
- 推論の実行に関しては`ndarray`を前提とした処理に置き換える。
  • Loading branch information
qryxip committed Dec 18, 2024
2 parents 66582bd + 0864f10 commit a2452a1
Show file tree
Hide file tree
Showing 18 changed files with 1,092 additions and 68 deletions.
2 changes: 1 addition & 1 deletion _typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ NdArray="NdArray" # onnxruntime::session::NdArray
[default.extend-words]

[files]
extend-exclude = ["*.svg"]
extend-exclude = ["*.svg", "*.onnx"]
20 changes: 17 additions & 3 deletions crates/voicevox_core/src/infer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,23 +197,33 @@ pub(crate) trait OutputScalar: Sized {
fn extract(tensor: OutputTensor) -> std::result::Result<ArrayD<Self>, ExtractError>;
}

impl OutputScalar for f32 {
const KIND: OutputScalarKind = OutputScalarKind::Float32;
#[duplicate_item(
T Kind;
[ i64 ] [ Int64 ];
[ f32 ] [ Float32 ];
)]
impl OutputScalar for T {
const KIND: OutputScalarKind = OutputScalarKind::Kind;

fn extract(tensor: OutputTensor) -> std::result::Result<ArrayD<Self>, ExtractError> {
match tensor {
OutputTensor::Float32(tensor) => Ok(tensor),
OutputTensor::Kind(tensor) => Ok(tensor),
_ => Err(ExtractError::Datatype),
}
}
}

/// Identifies the element type of an inference output tensor.
///
/// The `Display` form is `int64_t` for `Int64` and `float` for `Float32`.
#[derive(Clone, Copy, PartialEq, derive_more::Display)]
pub(crate) enum OutputScalarKind {
/// 64-bit signed integer elements (`i64`).
#[display("int64_t")]
Int64,

/// 32-bit floating-point elements (`f32`).
#[display("float")]
Float32,
}

/// An output tensor produced by inference, tagged with its element type.
///
/// Each variant holds an `ArrayD` (dynamic-dimensional array) of the matching scalar type.
pub(crate) enum OutputTensor {
/// Tensor with `i64` elements.
Int64(ArrayD<i64>),
/// Tensor with `f32` elements.
Float32(ArrayD<f32>),
}

Expand Down Expand Up @@ -246,8 +256,12 @@ pub(crate) struct InferenceSessionOptions {
pub(crate) device: DeviceSpec,
}

// TODO: Should we handle `ShapeError` directly and make a datatype mismatch a panic instead?
/// Error from extracting a typed array out of an output tensor.
#[derive(Error, Debug)]
pub(crate) enum ExtractError {
/// The tensor's element type is not the one that was requested.
#[error("wrong datatype")]
Datatype,

/// Wraps a `ShapeError`; its message is forwarded unchanged (`transparent`).
#[error(transparent)]
Shape(#[from] ShapeError),
}
Expand Down
103 changes: 81 additions & 22 deletions crates/voicevox_core/src/infer/domains.rs
Original file line number Diff line number Diff line change
@@ -1,75 +1,134 @@
mod frame_decode;
mod singing_teacher;
mod talk;

use educe::Educe;
use serde::{Deserialize, Deserializer};

pub(crate) use self::talk::{
GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
pub(crate) use self::{
frame_decode::{FrameDecodeDomain, FrameDecodeOperation, SfDecodeInput, SfDecodeOutput},
singing_teacher::{
PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, PredictSingF0Input,
PredictSingF0Output, PredictSingVolumeInput, PredictSingVolumeOutput, SingingTeacherDomain,
SingingTeacherOperation,
},
talk::{
GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
},
};

/// Holds one value for each inference domain: talk, singing teacher, and frame decode.
///
/// The type of each field is selected by `V` through `InferenceDomainMapValues`.
#[derive(Educe)]
// TODO: Trying to also put `V: ?Sized` into `bounds` gets rejected for reasons that are unclear.
// Is that still the case with the latest version of educe? And if it is, why is it rejected?
#[educe(Clone(bound = "V: InferenceDomainMapValues, V::Talk: Clone"))]
#[educe(Clone(
bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone"
))]
pub(crate) struct InferenceDomainMap<V: InferenceDomainMapValues + ?Sized> {
/// Value associated with the talk domain.
pub(crate) talk: V::Talk,
/// Value associated with the singing-teacher domain.
pub(crate) singing_teacher: V::SingingTeacher,
/// Value associated with the frame-decode domain.
pub(crate) frame_decode: V::FrameDecode,
}

impl<T> InferenceDomainMap<(T,)> {
pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T,)> {
impl<T, S, F> InferenceDomainMap<(T, S, F)> {
pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &S, &F)> {
let talk = &self.talk;
InferenceDomainMap { talk }
let singing_teacher = &self.singing_teacher;
let frame_decode = &self.frame_decode;
InferenceDomainMap {
talk,
singing_teacher,
frame_decode,
}
}

pub(crate) fn map<T2, Ft: FnOnce(T) -> T2>(
pub(crate) fn map<T2, S2, F2, Ft: FnOnce(T) -> T2, Fs: FnOnce(S) -> S2, Ff: FnOnce(F) -> F2>(
self,
fs: InferenceDomainMap<(Ft,)>,
) -> InferenceDomainMap<(T2,)> {
fs: InferenceDomainMap<(Ft, Fs, Ff)>,
) -> InferenceDomainMap<(T2, S2, F2)> {
let talk = (fs.talk)(self.talk);
InferenceDomainMap { talk }
let singing_teacher = (fs.singing_teacher)(self.singing_teacher);
let frame_decode = (fs.frame_decode)(self.frame_decode);
InferenceDomainMap {
talk,
singing_teacher,
frame_decode,
}
}
}

impl<T, E> InferenceDomainMap<(Result<T, E>,)> {
pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T,)>, E> {
impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)> {
pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, S, F)>, E> {
let talk = self.talk?;
Ok(InferenceDomainMap { talk })
let singing_teacher = self.singing_teacher?;
let frame_decode = self.frame_decode?;
Ok(InferenceDomainMap {
talk,
singing_teacher,
frame_decode,
})
}
}

impl<'de, V: InferenceDomainMapValues + ?Sized> Deserialize<'de> for InferenceDomainMap<V>
where
V::Talk: Deserialize<'de>,
V::SingingTeacher: Deserialize<'de>,
V::FrameDecode: Deserialize<'de>,
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let Repr { talk } = Repr::deserialize(deserializer)?;
return Ok(Self { talk });
let Repr {
talk,
singing_teacher,
frame_decode,
} = Repr::deserialize(deserializer)?;
return Ok(Self {
talk,
singing_teacher,
frame_decode,
});

#[derive(Deserialize)]
struct Repr<T> {
struct Repr<T, S, F> {
talk: T,
singing_teacher: S,
frame_decode: F,
}
}
}

/// Supplies the per-domain value types used by `InferenceDomainMap`.
///
/// Each associated type is the type stored in the field of the same (snake-case) name.
pub(crate) trait InferenceDomainMapValues {
/// Type stored for the talk domain.
type Talk;
/// Type stored for the singing-teacher domain.
type SingingTeacher;
/// Type stored for the frame-decode domain.
type FrameDecode;
}

impl<T> InferenceDomainMapValues for (T,) {
impl<T, S, F> InferenceDomainMapValues for (T, S, F) {
type Talk = T;
type SingingTeacher = S;
type FrameDecode = F;
}

macro_rules! inference_domain_map_values {
(for<$arg:ident> $body:ty) => {
(::macros::substitute_type!(
$body where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain
),)
(
::macros::substitute_type!(
$body
where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain
),
::macros::substitute_type!(
$body
where $arg = crate::infer::domains::SingingTeacherDomain as crate::infer::InferenceDomain
),
::macros::substitute_type!(
$body
where $arg = crate::infer::domains::FrameDecodeDomain as crate::infer::InferenceDomain
),
)
};
}
pub(crate) use inference_domain_map_values;
52 changes: 52 additions & 0 deletions crates/voicevox_core/src/infer/domains/frame_decode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
use std::{collections::BTreeSet, sync::LazyLock};

use enum_map::Enum;
use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature};
use ndarray::{Array1, Array2};

use crate::{manifest::FrameDecodeManifest, StyleType};

use super::super::{
InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor,
};

/// Type-level marker for the frame-decode inference domain.
///
/// The enum has no variants, so no value of it can exist; it is used only as a type.
pub(crate) enum FrameDecodeDomain {}

impl InferenceDomain for FrameDecodeDomain {
    type Operation = FrameDecodeOperation;
    type Manifest = FrameDecodeManifest;

    /// Returns the style types this domain is associated with: `FrameDecode` and `Sing`.
    ///
    /// The set is constructed on first access and the same instance is returned afterwards.
    fn style_types() -> &'static BTreeSet<StyleType> {
        static SUPPORTED: LazyLock<BTreeSet<StyleType>> =
            LazyLock::new(|| BTreeSet::from([StyleType::FrameDecode, StyleType::Sing]));
        &SUPPORTED
    }
}

/// Operations belonging to the frame-decode domain (`FrameDecodeDomain`).
#[derive(Clone, Copy, Enum, InferenceOperation)]
#[inference_operation(
type Domain = FrameDecodeDomain;
)]
pub(crate) enum FrameDecodeOperation {
/// Takes `SfDecodeInput` as input and yields `SfDecodeOutput`.
#[inference_operation(
type Input = SfDecodeInput;
type Output = SfDecodeOutput;
)]
SfDecode,
}

/// Input tensors for the `SfDecode` operation.
///
/// NOTE(review): the field order presumably determines how the derive maps fields to model
/// inputs — confirm against `InferenceInputSignature` before reordering anything here.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = SfDecode;
)]
pub(crate) struct SfDecodeInput {
// 2-D `i64` array; presumably per-frame phoneme IDs — TODO confirm axis layout.
pub(crate) frame_phonemes: Array2<i64>,
// 2-D `f32` array; presumably per-frame F0 values — TODO confirm axis layout and units.
pub(crate) frame_f0s: Array2<f32>,
// 2-D `f32` array; presumably per-frame volume values — TODO confirm axis layout.
pub(crate) frame_volumes: Array2<f32>,
// 1-D `i64` array carrying the speaker ID.
pub(crate) speaker_id: Array1<i64>,
}

/// Output of the `SfDecode` operation.
#[derive(InferenceOutputSignature)]
pub(crate) struct SfDecodeOutput {
// 2-D `f32` array; presumably the decoded waveform samples — TODO confirm axis layout.
pub(crate) wav: Array2<f32>,
}
95 changes: 95 additions & 0 deletions crates/voicevox_core/src/infer/domains/singing_teacher.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
use std::{collections::BTreeSet, sync::LazyLock};

use enum_map::Enum;
use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature};
use ndarray::{Array1, Array2};

use crate::{manifest::SingingTeacherManifest, StyleType};

use super::super::{
InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor,
};

/// Type-level marker for the singing-teacher inference domain.
///
/// The enum has no variants, so no value of it can exist; it is used only as a type.
pub(crate) enum SingingTeacherDomain {}

impl InferenceDomain for SingingTeacherDomain {
    type Operation = SingingTeacherOperation;
    type Manifest = SingingTeacherManifest;

    /// Returns the style types this domain is associated with: `SingingTeacher` and `Sing`.
    ///
    /// The set is constructed on first access and the same instance is returned afterwards.
    fn style_types() -> &'static BTreeSet<StyleType> {
        static SUPPORTED: LazyLock<BTreeSet<StyleType>> =
            LazyLock::new(|| BTreeSet::from([StyleType::SingingTeacher, StyleType::Sing]));
        &SUPPORTED
    }
}

/// Operations belonging to the singing-teacher domain (`SingingTeacherDomain`).
#[derive(Clone, Copy, Enum, InferenceOperation)]
#[inference_operation(
type Domain = SingingTeacherDomain;
)]
pub(crate) enum SingingTeacherOperation {
/// Takes `PredictSingConsonantLengthInput` and yields `PredictSingConsonantLengthOutput`.
#[inference_operation(
type Input = PredictSingConsonantLengthInput;
type Output = PredictSingConsonantLengthOutput;
)]
PredictSingConsonantLength,

/// Takes `PredictSingF0Input` and yields `PredictSingF0Output`.
#[inference_operation(
type Input = PredictSingF0Input;
type Output = PredictSingF0Output;
)]
PredictSingF0,

/// Takes `PredictSingVolumeInput` and yields `PredictSingVolumeOutput`.
#[inference_operation(
type Input = PredictSingVolumeInput;
type Output = PredictSingVolumeOutput;
)]
PredictSingVolume,
}

/// Input tensors for the `PredictSingConsonantLength` operation.
///
/// All fields are integer (`i64`) arrays.
/// NOTE(review): the field order presumably determines how the derive maps fields to model
/// inputs — confirm against `InferenceInputSignature` before reordering anything here.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = PredictSingConsonantLength;
)]
pub(crate) struct PredictSingConsonantLengthInput {
// 2-D array; presumably consonant phoneme IDs — TODO confirm.
pub(crate) consonants: Array2<i64>,
// 2-D array; presumably vowel phoneme IDs — TODO confirm.
pub(crate) vowels: Array2<i64>,
// 2-D array; presumably note durations (unit not visible here) — TODO confirm.
pub(crate) note_durations: Array2<i64>,
// 1-D array carrying the speaker ID.
pub(crate) speaker_id: Array1<i64>,
}

/// Output of the `PredictSingConsonantLength` operation.
#[derive(InferenceOutputSignature)]
pub(crate) struct PredictSingConsonantLengthOutput {
// 2-D `i64` array of predicted consonant lengths (unit not visible here — TODO confirm).
pub(crate) consonant_lengths: Array2<i64>,
}

/// Input tensors for the `PredictSingF0` operation.
///
/// NOTE(review): the field order presumably determines how the derive maps fields to model
/// inputs — confirm against `InferenceInputSignature` before reordering anything here.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = PredictSingF0;
)]
pub(crate) struct PredictSingF0Input {
// 2-D `i64` array; presumably phoneme IDs — TODO confirm axis layout.
pub(crate) phonemes: Array2<i64>,
// 2-D `i64` array; presumably note numbers — TODO confirm axis layout.
pub(crate) notes: Array2<i64>,
// 1-D `i64` array carrying the speaker ID.
pub(crate) speaker_id: Array1<i64>,
}

/// Output of the `PredictSingF0` operation.
#[derive(InferenceOutputSignature)]
pub(crate) struct PredictSingF0Output {
// 2-D `f32` array of predicted F0 values (units not visible here — TODO confirm).
pub(crate) f0s: Array2<f32>,
}

/// Input tensors for the `PredictSingVolume` operation.
///
/// NOTE(review): the field order presumably determines how the derive maps fields to model
/// inputs — confirm against `InferenceInputSignature` before reordering anything here.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = PredictSingVolume;
)]
pub(crate) struct PredictSingVolumeInput {
// 2-D `i64` array; presumably phoneme IDs — TODO confirm axis layout.
pub(crate) phonemes: Array2<i64>,
// 2-D `i64` array; presumably note numbers — TODO confirm axis layout.
pub(crate) notes: Array2<i64>,
// 2-D `f32` array; presumably per-frame F0 values — TODO confirm axis layout and units.
pub(crate) frame_f0s: Array2<f32>,
// 1-D `i64` array carrying the speaker ID.
pub(crate) speaker_id: Array1<i64>,
}

/// Output of the `PredictSingVolume` operation.
#[derive(InferenceOutputSignature)]
pub(crate) struct PredictSingVolumeOutput {
// 2-D `f32` array of predicted volume values (scale not visible here — TODO confirm).
pub(crate) volumes: Array2<f32>,
}
6 changes: 5 additions & 1 deletion crates/voicevox_core/src/infer/runtimes/onnxruntime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ impl InferenceRuntime for self::blocking::Onnxruntime {
TensorElementType::Uint16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16"),
TensorElementType::Int16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16"),
TensorElementType::Int32 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32"),
TensorElementType::Int64 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64"),
TensorElementType::Int64 => Ok(OutputScalarKind::Int64),
TensorElementType::String => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING"),
TensorElementType::Bfloat16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16"),
TensorElementType::Float16 => Err("ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16"),
Expand Down Expand Up @@ -253,6 +253,10 @@ fn extract_outputs(outputs: &ort::SessionOutputs<'_, '_>) -> anyhow::Result<Vec<
};

match ty {
TensorElementType::Int64 => {
let output = output.try_extract_tensor::<i64>()?;
Ok(OutputTensor::Int64(output.into_owned()))
}
TensorElementType::Float32 => {
let output = output.try_extract_tensor::<f32>()?;
Ok(OutputTensor::Float32(output.into_owned()))
Expand Down
Loading

0 comments on commit a2452a1

Please sign in to comment.