Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Streaming decoder for compatible engine #875

Merged
merged 6 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions crates/test_util/compatible_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,10 @@ bool decode_forward(int64_t length, int64_t phoneme_size, float *f0,
float *phoneme, int64_t *speaker_id, float *output);

bool generate_full_intermediate(int64_t length, int64_t phoneme_size,
int64_t margin_width, int64_t feature_dim,
float *f0, float *phoneme, int64_t *speaker_id,
float *output);

bool render_audio_segment(int64_t length, int64_t feature_dim, float *audio_feature,
int64_t *speaker_id, float *output);
bool render_audio_segment(int64_t length, int64_t margin_width, int64_t feature_size,
float *audio_feature, int64_t *speaker_id, float *output);

const char *last_error_message();
19 changes: 9 additions & 10 deletions crates/voicevox_core_c_api/src/compatible_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,13 +364,11 @@ pub unsafe extern "C" fn decode_forward(
/// - `f0`はRustの`&[f32; length as usize]`として解釈できなければならない。
/// - `phoneme`はRustの`&[f32; phoneme_size * length as usize]`として解釈できなければならない。
/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。
/// - `output`はRustの`&mut [MaybeUninit<f32>; ((length + 2 * margin_width) * feature_dim) as usize]`として解釈できなければならない。
/// - `output`はRustの`&mut [MaybeUninit<f32>; ((length + 2 * 14) * 80) as usize]`として解釈できなければならない。
#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない
pub unsafe extern "C" fn generate_full_intermediate(
length: i64,
phoneme_size: i64,
margin_width: i64,
feature_dim: i64,
f0: *mut f32,
phoneme: *mut f32,
speaker_id: *mut i64,
Expand All @@ -383,8 +381,8 @@ pub unsafe extern "C" fn generate_full_intermediate(
assert_aligned(output);
let length = length as usize;
let phoneme_size = phoneme_size as usize;
let margin_width = margin_width as usize;
let feature_dim = feature_dim as usize;
const MARGIN_WIDTH: usize = 14;
const FEATURE_SIZE: usize = 80;
Copy link
Member

@Hiroshiba Hiroshiba Nov 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

こことその下に書かれているアサート処理はこの関数に独自に追加された処理ですが、個人的にはそのままでも消してもどちらでも良さそうに思っています。
@qryxip さんにお任せできれば!

Copy link
Member

@qryxip qryxip Nov 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

私がsuggestしたやつですね。他の関数も後でこれで統一しようかなと。

ちなみにassert!じゃなくif … { panic!(…); }としたのは、私の理解では出力サイズはONNXモデル次第だからですね。Rust API側での保証はしてなかったはず?

let synthesizer = &*lock_synthesizer();
let result = ensure_initialized!(synthesizer).generate_full_intermediate(
length,
Expand All @@ -396,7 +394,7 @@ pub unsafe extern "C" fn generate_full_intermediate(
);
match result {
Ok(output_arr) => {
let output_len = (length + 2 * margin_width) * feature_dim;
let output_len = (length + 2 * MARGIN_WIDTH) * FEATURE_SIZE;
if output_arr.len() != output_len {
panic!("expected {}, got {}", output_len, output_arr.len());
}
Expand All @@ -418,13 +416,14 @@ pub unsafe extern "C" fn generate_full_intermediate(

/// # Safety
///
/// - `audio_feature`はRustの`&[f32; (length * feature_dim) as usize]`として解釈できなければならない。
/// - `audio_feature`はRustの`&[f32; (length * feature_size) as usize]`として解釈できなければならない。
/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。
/// - `output`はRustの`&mut [MaybeUninit<f32>; length as usize * 256]`として解釈できなければならない。
#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない
pub unsafe extern "C" fn render_audio_segment(
length: i64,
feature_dim: i64,
_margin_width: i64,
feature_size: i64,
audio_feature: *mut f32,
speaker_id: *mut i64,
output: *mut f32,
Expand All @@ -434,12 +433,12 @@ pub unsafe extern "C" fn render_audio_segment(
assert_aligned(speaker_id);
assert_aligned(output);
let length = length as usize;
let feature_dim = feature_dim as usize;
let feature_size = feature_size as usize;
let synthesizer = &*lock_synthesizer();
let result = ensure_initialized!(synthesizer).render_audio_segment(
// SAFETY: The safety contract must be upheld by the caller.
unsafe {
ndarray::ArrayView2::from_shape_ptr([length, feature_dim], audio_feature).to_owned()
ndarray::ArrayView2::from_shape_ptr([length, feature_size], audio_feature).to_owned()
},
StyleId::new(unsafe { *speaker_id as u32 }),
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,15 +93,14 @@ impl assert_cdylib::TestCase for TestCase {
assert!(lib.generate_full_intermediate(
EXAMPLE_DATA.intermediate.f0_length,
EXAMPLE_DATA.intermediate.phoneme_size,
EXAMPLE_DATA.intermediate.margin_width,
EXAMPLE_DATA.intermediate.feature_dim,
EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32,
EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
audio_feature.as_mut_ptr(),
));
assert!(lib.render_audio_segment(
length_with_margin,
EXAMPLE_DATA.intermediate.margin_width,
EXAMPLE_DATA.intermediate.feature_dim,
audio_feature.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
Expand All @@ -122,8 +121,6 @@ impl assert_cdylib::TestCase for TestCase {
assert!(lib.generate_full_intermediate(
EXAMPLE_DATA.intermediate.f0_length,
EXAMPLE_DATA.intermediate.phoneme_size,
EXAMPLE_DATA.intermediate.margin_width,
EXAMPLE_DATA.intermediate.feature_dim,
EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32,
EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
Expand All @@ -141,6 +138,7 @@ impl assert_cdylib::TestCase for TestCase {
let mut wave_segment_with_margin = vec![0.; 256 * slice_length];
assert!(lib.render_audio_segment(
slice_length as i64,
EXAMPLE_DATA.intermediate.margin_width,
pitch as i64,
feature_segment.as_ptr() as *mut f32,
&mut { EXAMPLE_DATA.speaker_id } as *mut i64,
Expand All @@ -163,8 +161,8 @@ impl assert_cdylib::TestCase for TestCase {
assert!(wave.iter().copied().all(f32::is_normal));
assert!(wave2.iter().copied().all(f32::is_normal));
assert!(wave3.iter().copied().all(f32::is_normal));
float_assert::close_l1(&wave2, &wave, 0.01);
float_assert::close_l1(&wave3, &wave, 0.01);
float_assert::close_l1(&wave2, &wave, 0.001);
float_assert::close_l1(&wave3, &wave, 0.001);

lib.finalize();
Ok(())
Expand Down
Loading