Skip to content

Commit

Permalink
Streaming decoder for compatible engine (#875)
Browse files Browse the repository at this point in the history
この本文は @qryxip が記述している。

ストリーミング処理を`compatible_engine`に実装する。testcaseとして一括変
換・ストリーム変換の二つの生成結果を追加し、元の生成音声と比較して十分近
いことを確かめた。

`render_audio_segment`には将来の互換性のため、未使用引数
`int64_t margin_width`を入れる。
#875 (comment)

またRust APIの出力サイズをチェックしてパニックする仕組みも入れる。これに
ついては他の関数にも後で導入することとする。
#875 (comment)

Refs: #866
  • Loading branch information
Yosshi999 authored Nov 26, 2024
1 parent c3c0580 commit a5745c2
Show file tree
Hide file tree
Showing 6 changed files with 228 additions and 1 deletion.
29 changes: 29 additions & 0 deletions crates/test_util/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,35 @@ fn generate_example_data_json(dist: &Path) -> anyhow::Result<()> {
phoneme.to_vec()
},
},
// Example input for the intermediate-feature test case
// (`generate_full_intermediate` followed by `render_audio_segment`).
intermediate: typing::IntermediateExampleData {
// Number of frames; also the length of `f0_vector` below.
f0_length: 69,
// Width of one row of `phoneme_vector` below.
phoneme_size: 45,
// These two values equal the FEATURE_SIZE (80) and MARGIN_WIDTH (14) constants
// hard-coded in the C API's `generate_full_intermediate`.
feature_dim: 80,
margin_width: 14,
f0_vector: {
// 69 frames, all zero except for two ranges filled with a constant value.
let mut f0 = [0.; 69];
f0[9..24].fill(5.905218);
f0[37..60].fill(5.565851);
f0.to_vec()
},
phoneme_vector: {
// Flattened 69 x 45 table (row `i` starts at offset `i * 45`), initially all zero.
let mut phoneme = [0.; 45 * 69];
// Writes 1.0 into column `index` of every row in `range`.
let mut set_one = |index, range| {
for i in range {
phoneme[(i * 45 + index) as usize] = 1.;
}
};
// The ranges below are contiguous and cover 0..69, so every row ends up with
// exactly one entry set to 1.0.
set_one(0, 0..9);
set_one(37, 9..13);
set_one(14, 13..24);
set_one(35, 24..30);
set_one(6, 30..37);
set_one(37, 37..45);
set_one(30, 45..60);
set_one(0, 60..69);
phoneme.to_vec()
},
},
};

fs_err::write(
Expand Down
7 changes: 7 additions & 0 deletions crates/test_util/compatible_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,11 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list,
bool decode_forward(int64_t length, int64_t phoneme_size, float *f0,
float *phoneme, int64_t *speaker_id, float *output);

// Computes the intermediate audio feature for a whole utterance.
// Per the Rust implementation's safety contract:
//   - `f0` points to `length` floats.
//   - `phoneme` points to `phoneme_size * length` floats.
//   - `speaker_id` points to a single int64_t.
//   - `output` must have room for `(length + 2 * 14) * 80` floats.
// Returns true on success and false on failure.
bool generate_full_intermediate(int64_t length, int64_t phoneme_size,
float *f0, float *phoneme, int64_t *speaker_id,
float *output);

// Renders a segment of intermediate audio feature into `output`.
// Per the Rust implementation's safety contract:
//   - `audio_feature` points to `length * feature_size` floats.
//   - `speaker_id` points to a single int64_t.
//   - `output` must have room for `length * 256` floats.
// `margin_width` is accepted for future compatibility; the current Rust
// implementation does not read it.
// Returns true on success and false on failure.
bool render_audio_segment(int64_t length, int64_t margin_width, int64_t feature_size,
float *audio_feature, int64_t *speaker_id, float *output);

const char *last_error_message();
11 changes: 11 additions & 0 deletions crates/test_util/src/typing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,22 @@ pub struct DecodeExampleData {
pub phoneme_vector: Vec<f32>,
}

/// Example input for the test case that goes through the intermediate audio feature
/// (`generate_full_intermediate` followed by `render_audio_segment`).
#[derive(Debug, Serialize, Deserialize)]
pub struct IntermediateExampleData {
/// Passed as `length` to `generate_full_intermediate` in the C API test.
pub f0_length: i64,
/// Passed as `phoneme_size` to `generate_full_intermediate` in the C API test.
pub phoneme_size: i64,
/// Passed as `feature_size` to `render_audio_segment` in the C API test.
pub feature_dim: i64,
/// Margin the C API test adds on both sides of `f0_length` and later trims from the output.
pub margin_width: i64,
/// Passed as the `f0` buffer to `generate_full_intermediate`.
pub f0_vector: Vec<f32>,
/// Passed as the `phoneme` buffer to `generate_full_intermediate`.
pub phoneme_vector: Vec<f32>,
}

/// Bundle of all example data shared by the test cases.
#[derive(Debug, Serialize, Deserialize)]
pub struct ExampleData {
/// Speaker ID that the C API test passes to every inference call shown here.
pub speaker_id: i64,

/// Example data for the duration test case.
pub duration: DurationExampleData,
/// Example data for the intonation test case.
pub intonation: IntonationExampleData,
/// Example data for the one-shot decode test case.
pub decode: DecodeExampleData,
/// Example data for the test case that goes through the intermediate audio feature.
pub intermediate: IntermediateExampleData,
}
1 change: 1 addition & 0 deletions crates/voicevox_core_c_api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ easy-ext.workspace = true
educe.workspace = true
itertools.workspace = true
libc.workspace = true
ndarray.workspace = true
parking_lot = { workspace = true, features = ["arc_lock"] }
process_path.workspace = true
ref-cast.workspace = true
Expand Down
105 changes: 105 additions & 0 deletions crates/voicevox_core_c_api/src/compatible_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,111 @@ pub unsafe extern "C" fn decode_forward(
}
}

/// Compatible-engine entry point that computes the intermediate audio feature for a whole
/// utterance and copies it into `output`.
///
/// Returns `true` on success. When the synthesizer reports an error, its message is recorded
/// with `set_message` and `false` is returned.
///
/// # Panics
///
/// Panics if the synthesizer returns a number of elements other than
/// `(length + 2 * 14) * 80`.
///
/// # Safety
///
/// - `f0` must be interpretable as a Rust `&[f32; length as usize]`.
/// - `phoneme` must be interpretable as a Rust `&[f32; phoneme_size * length as usize]`.
/// - `speaker_id` must be interpretable as a Rust `&[i64; 1]`.
/// - `output` must be interpretable as a Rust `&mut [MaybeUninit<f32>; ((length + 2 * 14) * 80) as usize]`.
#[unsafe(no_mangle)] // SAFETY: no other symbol with this name exists among the libraries that make up voicevox_core_c_api
pub unsafe extern "C" fn generate_full_intermediate(
    length: i64,
    phoneme_size: i64,
    f0: *mut f32,
    phoneme: *mut f32,
    speaker_id: *mut i64,
    output: *mut f32,
) -> bool {
    const MARGIN_WIDTH: usize = 14;
    const FEATURE_SIZE: usize = 80;

    init_logger_once();
    assert_aligned(f0);
    assert_aligned(phoneme);
    assert_aligned(speaker_id);
    assert_aligned(output);

    // NOTE: plain `as` casts; negative inputs are not rejected here.
    let length = length as usize;
    let phoneme_size = phoneme_size as usize;

    let synthesizer = &*lock_synthesizer();
    // NOTE(review): `ensure_initialized!` presumably returns early when no synthesizer
    // has been set up yet — confirm against the macro definition.
    let result = ensure_initialized!(synthesizer).generate_full_intermediate(
        length,
        phoneme_size,
        // SAFETY: The safety contract must be upheld by the caller.
        unsafe { std::slice::from_raw_parts(f0, length) },
        // SAFETY: The safety contract must be upheld by the caller.
        unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) },
        // SAFETY: The safety contract must be upheld by the caller.
        StyleId::new(unsafe { *speaker_id as u32 }),
    );

    let feature = match result {
        Ok(feature) => feature,
        Err(err) => {
            set_message(&format!("{err}"));
            return false;
        }
    };

    // The caller sized `output` from this formula, so a mismatch must never be copied out.
    let expected_len = (length + 2 * MARGIN_WIDTH) * FEATURE_SIZE;
    let actual_len = feature.len();
    if actual_len != expected_len {
        panic!("expected {}, got {}", expected_len, actual_len);
    }

    // Make sure the elements are laid out contiguously in standard order before the raw copy.
    let contiguous = feature.as_standard_layout();
    // SAFETY: The safety contract must be upheld by the caller.
    unsafe {
        std::ptr::copy_nonoverlapping(contiguous.as_ptr(), output, expected_len);
    }
    true
}

/// Compatible-engine entry point that renders a segment of intermediate audio feature and
/// copies the result into `output`.
///
/// `_margin_width` is accepted but not read by this function.
///
/// Returns `true` on success. When the synthesizer reports an error, its message is recorded
/// with `set_message` and `false` is returned.
///
/// # Panics
///
/// Panics if the synthesizer returns a number of elements other than `length * 256`.
///
/// # Safety
///
/// - `audio_feature` must be interpretable as a Rust `&[f32; (length * feature_size) as usize]`.
/// - `speaker_id` must be interpretable as a Rust `&[i64; 1]`.
/// - `output` must be interpretable as a Rust `&mut [MaybeUninit<f32>; length as usize * 256]`.
#[unsafe(no_mangle)] // SAFETY: no other symbol with this name exists among the libraries that make up voicevox_core_c_api
pub unsafe extern "C" fn render_audio_segment(
    length: i64,
    _margin_width: i64,
    feature_size: i64,
    audio_feature: *mut f32,
    speaker_id: *mut i64,
    output: *mut f32,
) -> bool {
    // Number of output elements expected for each of the `length` feature rows.
    const OUTPUT_LEN_PER_ROW: usize = 256;

    init_logger_once();
    assert_aligned(audio_feature);
    assert_aligned(speaker_id);
    assert_aligned(output);

    // NOTE: plain `as` casts; negative inputs are not rejected here.
    let length = length as usize;
    let feature_size = feature_size as usize;

    let synthesizer = &*lock_synthesizer();
    // NOTE(review): `ensure_initialized!` presumably returns early when no synthesizer
    // has been set up yet — confirm against the macro definition.
    let result = ensure_initialized!(synthesizer).render_audio_segment(
        // SAFETY: The safety contract must be upheld by the caller.
        unsafe {
            ndarray::ArrayView2::from_shape_ptr([length, feature_size], audio_feature).to_owned()
        },
        // SAFETY: The safety contract must be upheld by the caller.
        StyleId::new(unsafe { *speaker_id as u32 }),
    );

    let rendered = match result {
        Ok(rendered) => rendered,
        Err(err) => {
            set_message(&format!("{err}"));
            return false;
        }
    };

    // The caller sized `output` from this formula, so a mismatch must never be copied out.
    let expected_len = length * OUTPUT_LEN_PER_ROW;
    let actual_len = rendered.len();
    if actual_len != expected_len {
        panic!("expected {}, got {}", expected_len, actual_len);
    }

    // Make sure the elements are laid out contiguously in standard order before the raw copy.
    let contiguous = rendered.as_standard_layout();
    // SAFETY: The safety contract must be upheld by the caller.
    unsafe {
        std::ptr::copy_nonoverlapping(contiguous.as_ptr(), output, expected_len);
    }
    true
}

#[track_caller]
fn assert_aligned(ptr: *mut impl Sized) {
assert!(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// エンジンを起動してyukarin_s・yukarin_sa・decodeの推論を行う

use std::ffi::CStr;
use std::sync::LazyLock;
use std::{cmp::min, ffi::CStr};

use assert_cmd::assert::AssertResult;
use libloading::Library;
Expand Down Expand Up @@ -83,12 +83,86 @@ impl assert_cdylib::TestCase for TestCase {
wave
};

// Audio generated by going through the intermediate representation in one shot
let wave2 = {
    let intermediate = &EXAMPLE_DATA.intermediate;
    let padded_length = intermediate.f0_length + 2 * intermediate.margin_width;
    // One row of `feature_dim` values for every frame, margins included.
    let mut audio_feature = vec![0.; (padded_length * intermediate.feature_dim) as usize];
    // `render_audio_segment` writes 256 output values per input row.
    let mut padded_wave = vec![0.; 256 * padded_length as usize];

    assert!(lib.generate_full_intermediate(
        intermediate.f0_length,
        intermediate.phoneme_size,
        intermediate.f0_vector.as_ptr() as *mut f32,
        intermediate.phoneme_vector.as_ptr() as *mut f32,
        &mut { EXAMPLE_DATA.speaker_id } as *mut i64,
        audio_feature.as_mut_ptr(),
    ));
    assert!(lib.render_audio_segment(
        padded_length,
        intermediate.margin_width,
        intermediate.feature_dim,
        audio_feature.as_ptr() as *mut f32,
        &mut { EXAMPLE_DATA.speaker_id } as *mut i64,
        padded_wave.as_mut_ptr(),
    ));

    // Drop the output that corresponds to the margin on each side.
    let trim = 256 * intermediate.margin_width as usize;
    padded_wave[trim..padded_wave.len() - trim].to_vec()
};

// Audio generated by going through the intermediate representation and then
// rendering it chunk by chunk
let wave3 = {
    // Number of frames rendered per `render_audio_segment` call (margins excluded).
    const CHUNK_FRAMES: usize = 10;
    // `render_audio_segment` writes 256 output values per input row.
    const OUTPUT_PER_ROW: usize = 256;

    let intermediate = &EXAMPLE_DATA.intermediate;
    let padded_length = intermediate.f0_length + 2 * intermediate.margin_width;
    let mut audio_feature = vec![0.; (padded_length * intermediate.feature_dim) as usize];
    let mut wave = vec![0.; OUTPUT_PER_ROW * intermediate.f0_length as usize];

    assert!(lib.generate_full_intermediate(
        intermediate.f0_length,
        intermediate.phoneme_size,
        intermediate.f0_vector.as_ptr() as *mut f32,
        intermediate.phoneme_vector.as_ptr() as *mut f32,
        &mut { EXAMPLE_DATA.speaker_id } as *mut i64,
        audio_feature.as_mut_ptr(),
    ));

    let total_frames = intermediate.f0_length as usize;
    let row_stride = intermediate.feature_dim as usize;
    let margin = intermediate.margin_width as usize;
    for chunk_start in (0..total_frames).step_by(CHUNK_FRAMES) {
        // Obtain the audio for frames chunk_start .. chunk_end
        let chunk_end = min(chunk_start + CHUNK_FRAMES, total_frames);
        // `audio_feature` already starts with a leading margin, so rows
        // chunk_start .. chunk_end + 2 * margin are the chunk plus a margin on each side.
        let row_count = chunk_end - chunk_start + 2 * margin;
        let feature_rows =
            &audio_feature[chunk_start * row_stride..(chunk_end + 2 * margin) * row_stride];
        let mut padded_chunk = vec![0.; OUTPUT_PER_ROW * row_count];
        assert!(lib.render_audio_segment(
            row_count as i64,
            intermediate.margin_width,
            row_stride as i64,
            feature_rows.as_ptr() as *mut f32,
            &mut { EXAMPLE_DATA.speaker_id } as *mut i64,
            padded_chunk.as_mut_ptr(),
        ));
        // Drop the output that corresponds to the margin on each side.
        let trim = OUTPUT_PER_ROW * margin;
        let chunk_wave = &padded_chunk[trim..padded_chunk.len() - trim];
        wave[chunk_start * OUTPUT_PER_ROW..chunk_end * OUTPUT_PER_ROW]
            .clone_from_slice(chunk_wave);
    }
    wave
};

std::assert_eq!(SNAPSHOTS.metas, metas_json);

float_assert::close_l1(&phoneme_length, &EXAMPLE_DATA.duration.result, 0.01);
float_assert::close_l1(&intonation_list, &EXAMPLE_DATA.intonation.result, 0.01);

assert!(wave.iter().copied().all(f32::is_normal));
assert!(wave2.iter().copied().all(f32::is_normal));
assert!(wave3.iter().copied().all(f32::is_normal));
float_assert::close_l1(&wave2, &wave, 0.001);
float_assert::close_l1(&wave3, &wave, 0.001);

lib.finalize();
Ok(())
Expand Down

0 comments on commit a5745c2

Please sign in to comment.