From add2db7399c912c0f314cbcfa3f109f217ea8a3c Mon Sep 17 00:00:00 2001
From: Dener Stassun
Date: Tue, 12 Mar 2024 09:53:59 -0300
Subject: [PATCH] dtw: cleanup

---
 tests/test-dtw.py |  53 ------------------
 whisper.cpp       | 135 ++--------------------------------------------
 whisper.h         |  10 ++--
 3 files changed, 8 insertions(+), 190 deletions(-)
 delete mode 100644 tests/test-dtw.py

diff --git a/tests/test-dtw.py b/tests/test-dtw.py
deleted file mode 100644
index c0d9cd85a55..00000000000
--- a/tests/test-dtw.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Needs "pip install -U openai-whisper"
-from whisper.timing import dtw_cpu
-import numpy as np
-from ctypes import *
-import sys
-
-# Load whisper.cpp
-if len(sys.argv) != 2:
-    print("Usage: python test-dtw ")
-wcpp = CDLL(sys.argv[1])
-
-# Generate test data
-np.random.seed(0)
-IN_DINS=[(1,1), (5,5,), (50, 200), (200, 1500), (1500, 200), (200, 50), (1,250), (250, 1)]
-pairs=[]
-for d in IN_DINS:
-    x = np.random.standard_normal((d[0], d[1])).astype('float32')
-    dtw = dtw_cpu(x)
-    pairs.append((x,dtw))
-
-# Run whisper.cpp dtw
-for idx, p in enumerate(pairs):
-    print("Running test {}...".format(idx), file=sys.stderr, end="")
-
-    # Prepare types
-    in_size = IN_DINS[idx][0]*IN_DINS[idx][1]
-    in_type = c_float * in_size
-    out_type = POINTER(POINTER(c_int32))
-    out_size_type = POINTER(c_size_t)
-
-    wcpp_test_dtw = wcpp.whisper_test_dtw
-    wcpp_test_dtw.argtypes = (in_type, c_size_t, c_size_t, out_type, out_size_type, out_size_type)
-    wcpp_test_dtw.restype = None
-
-    # Create args as ctypes
-    in_data_py = p[0].flatten().tolist()
-    in_data = in_type(*in_data_py)
-    out = POINTER(c_int32)()
-    out_ne0 = c_size_t()
-    out_ne1 = c_size_t()
-
-    # Call whisper_test_dtw, retrieve output
-    wcpp_test_dtw(in_data, IN_DINS[idx][0], IN_DINS[idx][1], byref(out), byref(out_ne0), byref(out_ne1))
-    out_np = np.empty((out_ne0.value, out_ne1.value), dtype=np.int32)
-    for i in range (0, out_ne0.value):
-        for j in range(0, out_ne1.value):
-            out_np[i][j] = out[j + i*out_ne1.value]
-
-    # Test
-    if (np.array_equal(out_np, p[1])):
-        print(" OK!", file=sys.stderr)
-    else:
-        print(" Failed!", file=sys.stderr)
diff --git a/whisper.cpp b/whisper.cpp
index a86b8f2dde9..2ad84b7864b 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1068,10 +1068,10 @@ static void whisper_kv_cache_seq_cp(
 
 // [EXPERIMENTAL] Token-level timestamps with DTW
 static bool aheads_masks_init(
-    const whisper_context_params & cparams,
-    const whisper_hparams & hparams,
-    struct whisper_aheads_masks & aheads_masks,
-    ggml_backend_t backend) {
+        const whisper_context_params & cparams,
+        const whisper_hparams & hparams,
+        struct whisper_aheads_masks & aheads_masks,
+        ggml_backend_t backend) {
     const int32_t n_text_layer = hparams.n_text_layer;
     const int32_t n_head = hparams.n_text_head;
 
@@ -6983,40 +6983,6 @@ static void median_filter(struct ggml_tensor * dst , const struct ggml_tensor *
     }
 }
 
-/*static ggml_tensor * median_filter(ggml_context * ctx, ggml_tensor * x, int filter_width) {
-    WHISPER_ASSERT(filter_width < x->ne[2]);
-    WHISPER_ASSERT(filter_width % 2);
-    WHISPER_ASSERT(ggml_n_dims(x) == 3);
-    WHISPER_ASSERT(x->type == GGML_TYPE_F32);
-
-    std::vector<float> filter;
-    filter.reserve(filter_width);
-    ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, x->ne[0], x->ne[1], x->ne[2]);
-
-    for (int64_t i = 0; i < x->ne[0]; ++i) {
-        for (int64_t j = 0; j < x->ne[1]; ++j) {
-            for (int64_t k = 0; k < x->ne[2]; ++k) {
-                for (int64_t off = -filter_width/2; off <= filter_width/2; ++off) {
-                    // "reflect" padding
-                    int64_t idx = k + off;
-                    if (idx < 0)
-                        idx = -idx;
-                    else if (idx >= x->ne[2])
-                        idx = 2*(x->ne[2] - 1) - idx;
-
-                    filter.push_back(ggml_get_f32_nd(x, i, j, idx, 0));
-                }
-                std::sort(filter.begin(), filter.end());
-                const float v = filter[filter.size()/2];
-                ggml_set_f32_nd(r, i, j, k, 0, v);
-                filter.clear();
-            }
-        }
-    }
-
-    return r;
-}*/
-
 static void whisper_exp_compute_token_level_timestamps_dtw(
         struct whisper_context * ctx,
         struct whisper_state * state,
@@ -7082,7 +7048,7 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
     const auto n_tokens = state->aheads_cross_QKs->ne[0];
     const auto n_heads = state->aheads_cross_QKs->ne[2];
 
-    // Copy data from decoder buffer to a local CPU tensor, discarding unused audio 
+    // Copy data from decoder buffer to a local CPU tensor, discarding unused audio
     // tokens (i.e. discarding rows at the end of tensor)
     // IN: Tensor with N_TOKENS*audio_ctx*N_ALIGNMENT_HEADS dims
     // OUT: Tensor with N_TOKENS*N_AUDIO_TOKENS*N_ALIGNMENT_HEADS dims
@@ -7177,97 +7143,6 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
     ggml_free(gctx);
 }
 
-//void whisper_test_dtw(float* in, size_t in_ne0, size_t in_ne1, int32_t **out, size_t *out_ne0, size_t *out_ne1) {
-//    struct ggml_init_params params = {
-//        /*.mem_size   =*/ 32*1024*1024,
-//        /*.mem_buffer =*/ NULL,
-//        /*.no_alloc   =*/ false,
-//    };
-/*  struct ggml_context * ctx = ggml_init(params);
-
-    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, in_ne0, in_ne1);
-    for (size_t i = 0; i < in_ne0; i++) {
-        for (size_t j = 0; j < in_ne1; j++) {
-            ggml_set_f32_nd(x, i, j, 0, 0, in[j + i * in_ne1]);
-        }
-    }
-    struct ggml_tensor * r = dtw_and_backtrace(ctx, x);
-
-    *out = (int32_t*) malloc(sizeof(int32_t) * r->ne[0] * r->ne[1]);
-    for (int i = 0; i < r->ne[0]; ++i) {
-        for (int j = 0; j < r->ne[1]; ++j) {
-            (*out)[j + i * r->ne[1]] = ggml_get_i32_nd(r, i, j, 0, 0);
-        }
-    }
-    *out_ne0 = r->ne[0];
-    *out_ne1 = r->ne[1];
-    ggml_free(ctx);
-}*/
-
-//void whisper_test_dtw_timestamp_funcs(float* in, size_t in_ne0, size_t in_ne1, size_t in_ne2, float **out, size_t *out_ne0, size_t *out_ne1, size_t *out_ne2) {
-//    struct ggml_init_params params = {
-//        /*.mem_size   =*/ 32*1024*1024,
-//        /*.mem_buffer =*/ NULL,
-//        /*.no_alloc   =*/ false,
-//    };
-/*  struct ggml_context * ctx = ggml_init(params);
-
-    struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, in_ne0, in_ne1, in_ne2);
-    for (int64_t idx = 0; idx < in_ne0*in_ne1*in_ne2; ++idx) {
-        int64_t k = idx % in_ne2;
-        int64_t j = (idx / in_ne2) % in_ne1;
-        int64_t i = idx / (in_ne1*in_ne2);
-        //fprintf(stderr, "idx=%ld i=%ld j=%ld k=%ld\n", idx, i, j, k);
-        ggml_set_f32_nd(x, i, j, k, 0, in[idx]);
-    }
-
-    // Testing normalization
-    // Change dimensions so first is N_TOKENS to normalize over that
-    // Permute to correct shape for next computations
-    ggml_tensor * w = ggml_permute(ctx, x, 1, 0, 2, 3); // N_TOKENS*N_AUDIO_TOKENS*ALIGMENT_HEADS
-    w = ggml_cont(ctx, w);
-    w = ggml_norm(ctx, w, 0);
-    w = ggml_permute(ctx, w, 2, 1, 0, 3);
-    w = ggml_permute(ctx, w, 0, 2, 1, 3);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf, w);
-    ggml_graph_compute_with_ctx(ctx, gf, 4);
-
-    // Pass median filter, dimensions unchanged
-    struct ggml_context * gctx2 = ggml_init(params);
-    ggml_tensor * w_medfilt = median_filter(gctx2, w, 7);
-
-    // - Take mean over rows (matrix = weights.mean(axis=0))
-    //
-    // Out dimension is N_TOKENS*N_AUDIO_TOKENS
-    ggml_tensor * w_mean = ggml_mean(gctx2, w_medfilt);
-    ggml_tensor * scale = ggml_new_tensor_1d(gctx2, GGML_TYPE_F32, 1);
-    ggml_set_f32_1d(scale, 0, -1);
-    ggml_tensor * w_negative = ggml_scale(gctx2, w_mean, scale);
-    ggml_tensor * w_reshape = ggml_reshape_2d(gctx2, w_negative, w_negative->ne[1], w_negative->ne[2]);
-    struct ggml_cgraph * gf2 = ggml_new_graph(gctx2);
-    ggml_build_forward_expand(gf2, w_reshape);
-    ggml_graph_compute_with_ctx(gctx2, gf2, 4);
-
-    // Find alignment
-    ggml_tensor * alignment = dtw_and_backtrace(gctx2, w_reshape);
-
-    // Copy output
-    ggml_tensor * r = alignment;
-    *out = (float*) malloc(sizeof(float) * r->ne[0] * r->ne[1] * r->ne[2]);
-    for (int idx = 0; idx < r->ne[0] * r->ne[1] * r->ne[2]; ++idx) {
-        int64_t k = idx % r->ne[2];
-        int64_t j = (idx / r->ne[2]) % r->ne[1];
-        int64_t i = idx / (r->ne[2]*r->ne[1]);
-        (*out)[idx] = ggml_get_f32_nd(r, i, j, k, 0);
-    }
-    *out_ne0 = r->ne[0];
-    *out_ne1 = r->ne[1];
-    *out_ne2 = r->ne[2];
-    ggml_free(ctx);
-}*/
-
-
 void whisper_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : whisper_log_callback_default;
     g_state.log_callback_user_data = user_data;
diff --git a/whisper.h b/whisper.h
index eb56f51a9f1..702c3927d75 100644
--- a/whisper.h
+++ b/whisper.h
@@ -115,8 +115,8 @@ extern "C" {
         bool  use_gpu;
         int   gpu_device;  // CUDA device
 
+        // [EXPERIMENTAL] Token-level timestamps with DTW
         // FIXME: not sure if the way dtw_n_top_most and dtw_custom are structured is comfortable?
-        // [EXPERIMENTAL] DTW-based token-level timestamps
         bool dtw_token_timestamps;
         enum whisper_alignment_heads_preset dtw_aheads_preset;
         struct {
@@ -142,9 +142,9 @@ extern "C" {
        int64_t t0;        // start time of the token
        int64_t t1;        //   end time of the token
 
-        // dtw token-level timestamp data
+        // [EXPERIMENTAL] Token-level timestamps with DTW
         // do not use if you haven't computed token-level timestamps with dtw
-        // (I think) roughly corresponds to the moment in audio in which the token was output
+        // Roughly corresponds to the moment in audio in which the token was output
        int64_t t_dtw;
 
        float vlen;        // voice length of the token
@@ -659,10 +659,6 @@ extern "C" {
 
     WHISPER_API void whisper_log_set(ggml_log_callback log_callback, void * user_data);
 
-    // test dtw
-    //WHISPER_API void whisper_test_dtw(float* in, size_t in_ne0, size_t in_ne1, int32_t **out, size_t *out_ne0, size_t *out_ne1);
-    //WHISPER_API void whisper_test_dtw_timestamp_funcs(float* in, size_t in_ne0, size_t in_ne1, size_t in_ne2, float **out, size_t *out_ne0, size_t *out_ne1, size_t *out_ne2);
-
 #ifdef __cplusplus
 }
 #endif
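
For readers who reach this cleanup without the original DTW change: the whisper.h hunks above touch the experimental fields dtw_token_timestamps, dtw_aheads_preset and t_dtw. Below is a minimal usage sketch (not part of the patch). It assumes a "ggml-base.en.bin" model file and that a matching WHISPER_AHEADS_BASE_EN value exists in enum whisper_alignment_heads_preset; adjust both to whatever model is actually loaded.

// Sketch only: enable the experimental DTW token timestamps at context creation,
// then read t_dtw from each token after a whisper_full() run.
#include <stdio.h>
#include "whisper.h"

int main(void) {
    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.dtw_token_timestamps = true;                   // enable the DTW timestamp pass
    cparams.dtw_aheads_preset    = WHISPER_AHEADS_BASE_EN; // assumed preset; must match the model

    struct whisper_context * ctx = whisper_init_from_file_with_params("ggml-base.en.bin", cparams);
    if (ctx == NULL) {
        return 1;
    }

    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    wparams.token_timestamps = true;

    // pcm / n_samples: 16 kHz mono float PCM, omitted here for brevity
    // whisper_full(ctx, wparams, pcm, n_samples);

    // After whisper_full(), each token carries t_dtw (same units as t0/t1;
    // it should stay at -1 when the DTW pass did not run for that token).
    for (int s = 0; s < whisper_full_n_segments(ctx); ++s) {
        for (int t = 0; t < whisper_full_n_tokens(ctx, s); ++t) {
            const whisper_token_data td = whisper_full_get_token_data(ctx, s, t);
            printf("%s -> t_dtw = %lld\n",
                    whisper_full_get_token_text(ctx, s, t), (long long) td.t_dtw);
        }
    }

    whisper_free(ctx);
    return 0;
}

Note that dtw_token_timestamps lives in whisper_context_params rather than whisper_full_params: the alignment-head masks are built by aheads_masks_init() at context-creation time, so the option has to be chosen before the model is loaded.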