-
Notifications
You must be signed in to change notification settings - Fork 120
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge commit '0864f106085e91a90ca60241f1f375db4ea4dfcc' into HEAD
以下のようにしてマージ。 - ソングのモデルもsample.vvmの中に入れて一つにする。それに伴い、 `release-0.15`で増えた一部のテストは削除。 - モデルのロードに関しては、現行のVVMを前提にした処理に置き換える。 - 推論の実行に関しては`ndarray`を前提とした処理に置き換える。
- Loading branch information
Showing
18 changed files
with
1,092 additions
and
68 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,75 +1,134 @@ | ||
mod frame_decode; | ||
mod singing_teacher; | ||
mod talk; | ||
|
||
use educe::Educe; | ||
use serde::{Deserialize, Deserializer}; | ||
|
||
pub(crate) use self::talk::{ | ||
GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput, | ||
PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, | ||
RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation, | ||
pub(crate) use self::{ | ||
frame_decode::{FrameDecodeDomain, FrameDecodeOperation, SfDecodeInput, SfDecodeOutput}, | ||
singing_teacher::{ | ||
PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, PredictSingF0Input, | ||
PredictSingF0Output, PredictSingVolumeInput, PredictSingVolumeOutput, SingingTeacherDomain, | ||
SingingTeacherOperation, | ||
}, | ||
talk::{ | ||
GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput, | ||
PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, | ||
RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation, | ||
}, | ||
}; | ||
|
||
/// Holds one value per inference domain: `talk`, `singing_teacher` and `frame_decode`. | ||
/// | ||
/// The type of each field is the matching associated type of `V: InferenceDomainMapValues`. | ||
/// `Clone` is derived through `educe` with explicit bounds on those associated types. | ||
#[derive(Educe)] | ||
// TODO: Trying to also put `V: ?Sized` into `bounds` is rejected for reasons that are unclear. | ||
// Is this still the case with the latest educe? And if it still fails there, what is the | ||
// reason it is being rejected? | ||
#[educe(Clone(bound = "V: InferenceDomainMapValues, V::Talk: Clone"))] | ||
#[educe(Clone( | ||
bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone" | ||
))] | ||
pub(crate) struct InferenceDomainMap<V: InferenceDomainMapValues + ?Sized> { | ||
pub(crate) talk: V::Talk, | ||
pub(crate) singing_teacher: V::SingingTeacher, | ||
pub(crate) frame_decode: V::FrameDecode, | ||
} | ||
|
||
// Field-wise helpers for a map whose value types are given by a tuple. | ||
impl<T> InferenceDomainMap<(T,)> { | ||
// Returns a new map whose fields are shared references to the fields of `self`. | ||
pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T,)> { | ||
impl<T, S, F> InferenceDomainMap<(T, S, F)> { | ||
pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &S, &F)> { | ||
let talk = &self.talk; | ||
InferenceDomainMap { talk } | ||
let singing_teacher = &self.singing_teacher; | ||
let frame_decode = &self.frame_decode; | ||
InferenceDomainMap { | ||
talk, | ||
singing_teacher, | ||
frame_decode, | ||
} | ||
} | ||
|
||
// Consumes `self` and `fs`, calling the closure stored in each field of `fs` on the field | ||
// of `self` with the same name, and returns a map of the results. | ||
pub(crate) fn map<T2, Ft: FnOnce(T) -> T2>( | ||
pub(crate) fn map<T2, S2, F2, Ft: FnOnce(T) -> T2, Fs: FnOnce(S) -> S2, Ff: FnOnce(F) -> F2>( | ||
self, | ||
fs: InferenceDomainMap<(Ft,)>, | ||
) -> InferenceDomainMap<(T2,)> { | ||
fs: InferenceDomainMap<(Ft, Fs, Ff)>, | ||
) -> InferenceDomainMap<(T2, S2, F2)> { | ||
let talk = (fs.talk)(self.talk); | ||
InferenceDomainMap { talk } | ||
let singing_teacher = (fs.singing_teacher)(self.singing_teacher); | ||
let frame_decode = (fs.frame_decode)(self.frame_decode); | ||
InferenceDomainMap { | ||
talk, | ||
singing_teacher, | ||
frame_decode, | ||
} | ||
} | ||
} | ||
|
||
// Helper for a map whose every field is a `Result` sharing the same error type `E`. | ||
impl<T, E> InferenceDomainMap<(Result<T, E>,)> { | ||
// Unwraps each field with `?`, so the first `Err` encountered is returned as-is; | ||
// when every field is `Ok`, returns `Ok` with a map of the unwrapped values. | ||
pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T,)>, E> { | ||
impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)> { | ||
pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, S, F)>, E> { | ||
let talk = self.talk?; | ||
Ok(InferenceDomainMap { talk }) | ||
let singing_teacher = self.singing_teacher?; | ||
let frame_decode = self.frame_decode?; | ||
Ok(InferenceDomainMap { | ||
talk, | ||
singing_teacher, | ||
frame_decode, | ||
}) | ||
} | ||
} | ||
|
||
// Deserializes the map through a private helper struct `Repr` that has the same field names, | ||
// then moves the fields into `Self`. Each associated type of `V` must itself be `Deserialize`. | ||
impl<'de, V: InferenceDomainMapValues + ?Sized> Deserialize<'de> for InferenceDomainMap<V> | ||
where | ||
V::Talk: Deserialize<'de>, | ||
V::SingingTeacher: Deserialize<'de>, | ||
V::FrameDecode: Deserialize<'de>, | ||
{ | ||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> | ||
where | ||
D: Deserializer<'de>, | ||
{ | ||
let Repr { talk } = Repr::deserialize(deserializer)?; | ||
return Ok(Self { talk }); | ||
let Repr { | ||
talk, | ||
singing_teacher, | ||
frame_decode, | ||
} = Repr::deserialize(deserializer)?; | ||
// Explicit `return` so that the `Repr` item can be declared below it in this body. | ||
return Ok(Self { | ||
talk, | ||
singing_teacher, | ||
frame_decode, | ||
}); | ||
|
||
// Generic mirror of the map's fields; serde derives the actual deserialization logic. | ||
#[derive(Deserialize)] | ||
struct Repr<T> { | ||
struct Repr<T, S, F> { | ||
talk: T, | ||
singing_teacher: S, | ||
frame_decode: F, | ||
} | ||
} | ||
} | ||
|
||
/// Supplies the field types of `InferenceDomainMap`: one associated type per field. | ||
pub(crate) trait InferenceDomainMapValues { | ||
// Type of the `talk` field. | ||
type Talk; | ||
// Type of the `singing_teacher` field. | ||
type SingingTeacher; | ||
// Type of the `frame_decode` field. | ||
type FrameDecode; | ||
} | ||
|
||
// A tuple provides the associated types positionally: | ||
// first element -> `Talk`, second -> `SingingTeacher`, third -> `FrameDecode`. | ||
impl<T> InferenceDomainMapValues for (T,) { | ||
impl<T, S, F> InferenceDomainMapValues for (T, S, F) { | ||
type Talk = T; | ||
type SingingTeacher = S; | ||
type FrameDecode = F; | ||
} | ||
|
||
/// Expands `for<$arg> $body` into a tuple type with one `::macros::substitute_type!` | ||
/// invocation per domain, binding `$arg` to `TalkDomain`, `SingingTeacherDomain` and | ||
/// `FrameDecodeDomain` (each `as crate::infer::InferenceDomain`) in that order. | ||
/// | ||
/// NOTE(review): presumably `substitute_type!` replaces `$arg` inside `$body`; its | ||
/// definition is not visible here, so confirm against the `macros` crate. | ||
macro_rules! inference_domain_map_values { | ||
(for<$arg:ident> $body:ty) => { | ||
(::macros::substitute_type!( | ||
$body where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain | ||
),) | ||
( | ||
::macros::substitute_type!( | ||
$body | ||
where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain | ||
), | ||
::macros::substitute_type!( | ||
$body | ||
where $arg = crate::infer::domains::SingingTeacherDomain as crate::infer::InferenceDomain | ||
), | ||
::macros::substitute_type!( | ||
$body | ||
where $arg = crate::infer::domains::FrameDecodeDomain as crate::infer::InferenceDomain | ||
), | ||
) | ||
}; | ||
} | ||
pub(crate) use inference_domain_map_values; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
use std::{collections::BTreeSet, sync::LazyLock}; | ||
|
||
use enum_map::Enum; | ||
use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature}; | ||
use ndarray::{Array1, Array2}; | ||
|
||
use crate::{manifest::FrameDecodeManifest, StyleType}; | ||
|
||
use super::super::{ | ||
InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor, | ||
}; | ||
|
||
/// Variant-less enum used purely as a type-level marker for the frame-decode domain. | ||
pub(crate) enum FrameDecodeDomain {} | ||
|
||
// Binds the frame-decode domain to its operation enum and its manifest type. | ||
impl InferenceDomain for FrameDecodeDomain { | ||
type Operation = FrameDecodeOperation; | ||
type Manifest = FrameDecodeManifest; | ||
|
||
// Returns a lazily initialized static set containing `StyleType::FrameDecode` | ||
// and `StyleType::Sing`. | ||
fn style_types() -> &'static BTreeSet<StyleType> { | ||
static STYLE_TYPES: LazyLock<BTreeSet<StyleType>> = | ||
LazyLock::new(|| [StyleType::FrameDecode, StyleType::Sing].into()); | ||
&STYLE_TYPES | ||
} | ||
} | ||
|
||
/// Operations belonging to `FrameDecodeDomain`. | ||
/// | ||
/// The `inference_operation` attributes declare the owning domain and, for each variant, | ||
/// the input and output signature types. | ||
#[derive(Clone, Copy, Enum, InferenceOperation)] | ||
#[inference_operation( | ||
type Domain = FrameDecodeDomain; | ||
)] | ||
pub(crate) enum FrameDecodeOperation { | ||
// Takes `SfDecodeInput` and produces `SfDecodeOutput`. | ||
#[inference_operation( | ||
type Input = SfDecodeInput; | ||
type Output = SfDecodeOutput; | ||
)] | ||
SfDecode, | ||
} | ||
|
||
/// Input tensors for `FrameDecodeOperation::SfDecode` (see the `Signature` attribute). | ||
/// | ||
/// NOTE(review): the axis layout of the 2-D arrays is not visible in this file; | ||
/// presumably (batch, frame) - confirm against the caller. | ||
#[derive(InferenceInputSignature)] | ||
#[inference_input_signature( | ||
type Signature = SfDecode; | ||
)] | ||
pub(crate) struct SfDecodeInput { | ||
pub(crate) frame_phonemes: Array2<i64>, | ||
pub(crate) frame_f0s: Array2<f32>, | ||
pub(crate) frame_volumes: Array2<f32>, | ||
pub(crate) speaker_id: Array1<i64>, | ||
} | ||
|
||
/// Output of `FrameDecodeOperation::SfDecode`: a single 2-D `f32` array named `wav`. | ||
#[derive(InferenceOutputSignature)] | ||
pub(crate) struct SfDecodeOutput { | ||
pub(crate) wav: Array2<f32>, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
use std::{collections::BTreeSet, sync::LazyLock}; | ||
|
||
use enum_map::Enum; | ||
use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature}; | ||
use ndarray::{Array1, Array2}; | ||
|
||
use crate::{manifest::SingingTeacherManifest, StyleType}; | ||
|
||
use super::super::{ | ||
InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor, | ||
}; | ||
|
||
/// Variant-less enum used purely as a type-level marker for the singing-teacher domain. | ||
pub(crate) enum SingingTeacherDomain {} | ||
|
||
// Binds the singing-teacher domain to its operation enum and its manifest type. | ||
impl InferenceDomain for SingingTeacherDomain { | ||
type Operation = SingingTeacherOperation; | ||
type Manifest = SingingTeacherManifest; | ||
|
||
// Returns a lazily initialized static set containing `StyleType::SingingTeacher` | ||
// and `StyleType::Sing`. | ||
fn style_types() -> &'static BTreeSet<StyleType> { | ||
static STYLE_TYPES: LazyLock<BTreeSet<StyleType>> = | ||
LazyLock::new(|| [StyleType::SingingTeacher, StyleType::Sing].into()); | ||
&STYLE_TYPES | ||
} | ||
} | ||
|
||
/// Operations belonging to `SingingTeacherDomain`. | ||
/// | ||
/// The `inference_operation` attributes declare the owning domain and, for each variant, | ||
/// the input and output signature types. | ||
#[derive(Clone, Copy, Enum, InferenceOperation)] | ||
#[inference_operation( | ||
type Domain = SingingTeacherDomain; | ||
)] | ||
pub(crate) enum SingingTeacherOperation { | ||
// Takes `PredictSingConsonantLengthInput`, produces `PredictSingConsonantLengthOutput`. | ||
#[inference_operation( | ||
type Input = PredictSingConsonantLengthInput; | ||
type Output = PredictSingConsonantLengthOutput; | ||
)] | ||
PredictSingConsonantLength, | ||
|
||
// Takes `PredictSingF0Input`, produces `PredictSingF0Output`. | ||
#[inference_operation( | ||
type Input = PredictSingF0Input; | ||
type Output = PredictSingF0Output; | ||
)] | ||
PredictSingF0, | ||
|
||
// Takes `PredictSingVolumeInput`, produces `PredictSingVolumeOutput`. | ||
#[inference_operation( | ||
type Input = PredictSingVolumeInput; | ||
type Output = PredictSingVolumeOutput; | ||
)] | ||
PredictSingVolume, | ||
} | ||
|
||
/// Input tensors for `SingingTeacherOperation::PredictSingConsonantLength`. | ||
/// | ||
/// NOTE(review): the axis layout of the 2-D arrays is not visible in this file - confirm | ||
/// against the caller. | ||
#[derive(InferenceInputSignature)] | ||
#[inference_input_signature( | ||
type Signature = PredictSingConsonantLength; | ||
)] | ||
pub(crate) struct PredictSingConsonantLengthInput { | ||
pub(crate) consonants: Array2<i64>, | ||
pub(crate) vowels: Array2<i64>, | ||
pub(crate) note_durations: Array2<i64>, | ||
pub(crate) speaker_id: Array1<i64>, | ||
} | ||
|
||
/// Output of `SingingTeacherOperation::PredictSingConsonantLength`: a single 2-D `i64` | ||
/// array named `consonant_lengths`. | ||
#[derive(InferenceOutputSignature)] | ||
pub(crate) struct PredictSingConsonantLengthOutput { | ||
pub(crate) consonant_lengths: Array2<i64>, | ||
} | ||
|
||
/// Input tensors for `SingingTeacherOperation::PredictSingF0`. | ||
/// | ||
/// NOTE(review): the axis layout of the 2-D arrays is not visible in this file - confirm | ||
/// against the caller. | ||
#[derive(InferenceInputSignature)] | ||
#[inference_input_signature( | ||
type Signature = PredictSingF0; | ||
)] | ||
pub(crate) struct PredictSingF0Input { | ||
pub(crate) phonemes: Array2<i64>, | ||
pub(crate) notes: Array2<i64>, | ||
pub(crate) speaker_id: Array1<i64>, | ||
} | ||
|
||
/// Output of `SingingTeacherOperation::PredictSingF0`: a single 2-D `f32` array named `f0s`. | ||
#[derive(InferenceOutputSignature)] | ||
pub(crate) struct PredictSingF0Output { | ||
pub(crate) f0s: Array2<f32>, | ||
} | ||
|
||
/// Input tensors for `SingingTeacherOperation::PredictSingVolume`. | ||
/// | ||
/// NOTE(review): the axis layout of the 2-D arrays is not visible in this file - confirm | ||
/// against the caller. | ||
#[derive(InferenceInputSignature)] | ||
#[inference_input_signature( | ||
type Signature = PredictSingVolume; | ||
)] | ||
pub(crate) struct PredictSingVolumeInput { | ||
pub(crate) phonemes: Array2<i64>, | ||
pub(crate) notes: Array2<i64>, | ||
pub(crate) frame_f0s: Array2<f32>, | ||
pub(crate) speaker_id: Array1<i64>, | ||
} | ||
|
||
/// Output of `SingingTeacherOperation::PredictSingVolume`: a single 2-D `f32` array | ||
/// named `volumes`. | ||
#[derive(InferenceOutputSignature)] | ||
pub(crate) struct PredictSingVolumeOutput { | ||
pub(crate) volumes: Array2<f32>, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.