whisper.cpp: impl dtw algo
denersc committed Nov 21, 2023
1 parent 46f5b6c commit 3c9969e
Showing 3 changed files with 225 additions and 0 deletions.
53 changes: 53 additions & 0 deletions tests/test-dtw.py
@@ -0,0 +1,53 @@
# Needs "pip install -U openai-whisper"
from whisper.timing import dtw_cpu
import numpy as np
from ctypes import *
import sys

# Load whisper.cpp
if len(sys.argv) != 2:
    print("Usage: python test-dtw.py <PATH_TO_LIBWHISPER.SO>")
    sys.exit(1)
wcpp = CDLL(sys.argv[1])

# Generate test data
np.random.seed(0)
IN_DIMS = [(1, 1), (5, 5), (50, 200), (200, 1500), (1500, 200), (200, 50), (1, 250), (250, 1)]
pairs = []
for d in IN_DIMS:
    x = np.random.standard_normal((d[0], d[1])).astype('float32')
    dtw = dtw_cpu(x)
    pairs.append((x, dtw))

# Run whisper.cpp dtw
for idx, p in enumerate(pairs):
    print("Running test {}...".format(idx), file=sys.stderr, end="")

    # Prepare types
    in_size = IN_DIMS[idx][0] * IN_DIMS[idx][1]
    in_type = c_float * in_size
    out_type = POINTER(POINTER(c_int32))
    out_size_type = POINTER(c_size_t)

    wcpp_test_dtw = wcpp.whisper_test_dtw
    wcpp_test_dtw.argtypes = (in_type, c_size_t, c_size_t, out_type, out_size_type, out_size_type)
    wcpp_test_dtw.restype = None

    # Create args as ctypes
    in_data_py = p[0].flatten().tolist()
    in_data = in_type(*in_data_py)
    out = POINTER(c_int32)()
    out_ne0 = c_size_t()
    out_ne1 = c_size_t()

    # Call whisper_test_dtw, retrieve output
    wcpp_test_dtw(in_data, IN_DIMS[idx][0], IN_DIMS[idx][1], byref(out), byref(out_ne0), byref(out_ne1))
    out_np = np.empty((out_ne0.value, out_ne1.value), dtype=np.int32)
    for i in range(0, out_ne0.value):
        for j in range(0, out_ne1.value):
            out_np[i][j] = out[j + i * out_ne1.value]

    # Test
    if np.array_equal(out_np, p[1]):
        print(" OK!", file=sys.stderr)
    else:
        print(" Failed!", file=sys.stderr)
169 changes: 169 additions & 0 deletions whisper.cpp
@@ -6532,6 +6532,175 @@ static void whisper_exp_compute_token_level_timestamps(
//}
}

//
// token level timestamps - dtw version
//

// dtw + backtrace to return found path
// based on
// https://github.com/openai/whisper/blob/main/whisper/timing.py#L83
static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
    WHISPER_ASSERT(x->n_dims == 2);

    int64_t N = x->ne[0];
    int64_t M = x->ne[1];
    struct ggml_tensor * cost = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, N + 1, M + 1);
    struct ggml_tensor * trace = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N + 1, M + 1);

    cost = ggml_set_f32(cost, INFINITY);
    trace = ggml_set_f32(trace, -1);
    ggml_set_f32_nd(cost, 0, 0, 0, 0, 0.0);

    // dtw
    // supposedly can be optimized by computing diagonals in parallel ?
    // Not sure it is worth it since x will be GENERATED_TOKENS*1500 in size at most.
    for (int64_t j = 1; j < M + 1; ++j) {
        for (int64_t i = 1; i < N + 1; ++i) {
            float c0 = ggml_get_f32_nd(cost, i - 1, j - 1, 0, 0);
            float c1 = ggml_get_f32_nd(cost, i - 1, j, 0, 0);
            float c2 = ggml_get_f32_nd(cost, i, j - 1, 0, 0);

            float c;
            int32_t t;
            if (c0 < c1 && c0 < c2) {
                c = c0;
                t = 0;
            } else if (c1 < c0 && c1 < c2) {
                c = c1;
                t = 1;
            } else {
                c = c2;
                t = 2;
            }

            c = ggml_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
            ggml_set_f32_nd(cost, i, j, 0, 0, c);
            ggml_set_i32_nd(trace, i, j, 0, 0, t);
        }
    }

    // Backtrace
    const int64_t BT_MAX_ROWS = N + M - 1;
    struct ggml_tensor * bt = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2);
    // trace[0, :] = 2
    for (int64_t i = 0; i < M + 1; ++i)
        ggml_set_i32_nd(trace, 0, i, 0, 0, 2);
    // trace[:, 0] = 1
    for (int64_t i = 0; i < N + 1; ++i)
        ggml_set_i32_nd(trace, i, 0, 0, 0, 1);
    int64_t bt_row_idx = BT_MAX_ROWS - 1;
    int64_t i = N;
    int64_t j = M;
    while (i > 0 || j > 0) {
        ggml_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
        ggml_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
        --bt_row_idx;

        int32_t t = ggml_get_i32_nd(trace, i, j, 0, 0);
        if (t == 0) {
            --i;
            --j;
        } else if (t == 1) {
            --i;
        } else if (t == 2) {
            --j;
        } else {
            WHISPER_ASSERT(0);
        }
    }

    // Clip + transpose
    // This might not be entirely necessary for our case, but leaving it for now so the output
    // matrix is identical to dtw on OpenAI timing.py
    const int64_t result_n_cols = BT_MAX_ROWS - bt_row_idx - 1;
    ggml_tensor * r = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 2, result_n_cols);
    for (int64_t i = 0; i < 2; ++i) {
        for (int64_t j = 0; j < result_n_cols; ++j) {
            int32_t v = ggml_get_i32_nd(bt, j + bt_row_idx + 1, i, 0, 0);
            ggml_set_i32_nd(r, i, j, 0, 0, v);
        }
    }

    return r;
}

void whisper_test_dtw(float* in, size_t in_ne0, size_t in_ne1, int32_t **out, size_t *out_ne0, size_t *out_ne1) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 32*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, in_ne0, in_ne1);
    for (size_t i = 0; i < in_ne0; i++) {
        for (size_t j = 0; j < in_ne1; j++) {
            ggml_set_f32_nd(x, i, j, 0, 0, in[j + i * in_ne1]);
        }
    }
    struct ggml_tensor * r = dtw_and_backtrace(ctx, x);

    // caller owns this buffer and is expected to free() it
    *out = (int32_t *) malloc(sizeof(int32_t) * r->ne[0] * r->ne[1]);
    for (int64_t i = 0; i < r->ne[0]; ++i) {
        for (int64_t j = 0; j < r->ne[1]; ++j) {
            (*out)[j + i * r->ne[1]] = ggml_get_i32_nd(r, i, j, 0, 0);
        }
    }
    *out_ne0 = r->ne[0];
    *out_ne1 = r->ne[1];
    ggml_free(ctx);
}

static void whisper_exp_compute_token_level_timestamps_dtw(
        struct whisper_context & ctx,
        struct whisper_state & state,
        int n_frames,
        int medfilt_width,
        float qk_scale)
{

    // - Get and stack QKs from alignment heads
    //   Suppose we produced 15 tokens.
    //   This should yield an N_HEADS*15*FRAMES tensor,
    //   FRAMES=1500 with max segment length = 30s (30/FRAME_SIZE = 30/0.02 = 1500)

    // - Discard the parts of the third dimension that are audio padding
    //   e.g. if the actual audio is 10 seconds, 1000 frames are padding and only 500 contain audio,
    //   so the output would be an N_HEADS*15*500 tensor (see the sketch below)
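
    // The block below is a minimal sketch of this clipping step on a toy 2D token*frame
    // matrix for a single head. All sizes and the n_content count are made up for
    // illustration; the real attention weights are not wired up yet.
    {
        const int64_t n_tokens = 15, n_frames_pad = 1500, n_content = 500;
        std::vector<float> padded(n_tokens * n_frames_pad, 0.0f);
        std::vector<float> clipped(n_tokens * n_content);
        for (int64_t t = 0; t < n_tokens; ++t) {
            for (int64_t f = 0; f < n_content; ++f) {
                // keep only the frames that contain actual audio
                clipped[t * n_content + f] = padded[t * n_frames_pad + f];
            }
        }
    }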

    // - Scale matrix by qk_scale, then apply softmax
    //   Output is still N_HEADS*15*500
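
    // A hedged sketch of the scale + softmax step for one token row of scores. The buffer
    // and its size are hypothetical; qk_scale comes from the function arguments.
    {
        const int64_t n_frames = 500;
        std::vector<float> row(n_frames, 0.0f); // one token's QK scores
        float max_val = -INFINITY;
        for (int64_t f = 0; f < n_frames; ++f) {
            row[f] *= qk_scale;                  // scale first...
            max_val = std::max(max_val, row[f]);
        }
        float sum = 0.0f;
        for (int64_t f = 0; f < n_frames; ++f) {
            row[f] = expf(row[f] - max_val);     // ...then a max-subtracted softmax for stability
            sum += row[f];
        }
        for (int64_t f = 0; f < n_frames; ++f) {
            row[f] /= sum;
        }
    }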

    // - Normalize - subtract the mean, divide by the std (not sure how to; the original code
    //   takes mean and std with dim=-2, torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False))
    //   Still N_HEADS*15*500
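
    // A possible reading of that dim=-2 normalization, sketched for a single head:
    // statistics are taken over the token axis, separately per frame column.
    // Toy buffer again; unbiased=False means the variance is divided by N.
    {
        const int64_t n_tokens = 15, n_frames = 500;
        std::vector<float> w(n_tokens * n_frames, 0.0f);
        for (int64_t f = 0; f < n_frames; ++f) {
            float mean = 0.0f;
            for (int64_t t = 0; t < n_tokens; ++t) {
                mean += w[t * n_frames + f];
            }
            mean /= n_tokens;
            float var = 0.0f;
            for (int64_t t = 0; t < n_tokens; ++t) {
                const float d = w[t * n_frames + f] - mean;
                var += d * d;
            }
            const float std_dev = sqrtf(var / n_tokens);
            for (int64_t t = 0; t < n_tokens; ++t) {
                // guard against a zero std on degenerate columns
                w[t * n_frames + f] = (w[t * n_frames + f] - mean) / (std_dev > 0.0f ? std_dev : 1.0f);
            }
        }
    }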

    // - Pass a median filter
    //   Still N_HEADS*15*500
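
    // A sketch of a 1D median filter along the frame axis for one token row.
    // medfilt_width comes from the function arguments and is assumed odd; edge handling
    // here is simple replication, whereas OpenAI's median_filter reflects at the borders.
    {
        const int64_t n_frames = 500;
        const int     half     = medfilt_width / 2;
        std::vector<float> row(n_frames, 0.0f);
        std::vector<float> filtered(n_frames);
        std::vector<float> win(medfilt_width);
        for (int64_t f = 0; f < n_frames; ++f) {
            for (int k = -half; k <= half; ++k) {
                const int64_t idx = std::min(std::max(f + k, (int64_t) 0), n_frames - 1);
                win[k + half] = row[idx];
            }
            std::sort(win.begin(), win.end());
            filtered[f] = win[half]; // median of the window
        }
    }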

    // - Take the mean over heads (matrix = weights.mean(axis=0))
    //   Output now is 15*500
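
    // Sketch of the mean over heads, on made-up flat buffers:
    {
        const int64_t n_heads = 6, n_tokens = 15, n_frames = 500;
        std::vector<float> heads(n_heads * n_tokens * n_frames, 0.0f);
        std::vector<float> matrix(n_tokens * n_frames, 0.0f);
        for (int64_t h = 0; h < n_heads; ++h) {
            for (int64_t i = 0; i < n_tokens * n_frames; ++i) {
                matrix[i] += heads[h * n_tokens * n_frames + i] / n_heads;
            }
        }
    }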

    // - Skip the start of sentence sequence (matrix = matrix[len(tokenizer.sot_sequence) : -1])
    //   Discard the first len(tokenizer.sot_sequence) tokens along the first dimension
    //   (the slice also drops the last token). Suppose len(tokenizer.sot_sequence) = 3:
    //   output now is 11*500

    // - Multiply by -1, pass to dtw to get text and time indices
    //   Output will map each token index to a time index. Each time index corresponds to 20 ms
    //   (the audio frame size). From here, it is trivial to place a timestamp on each token.
    //   This timestamp is more like a "start of token" timestamp, roughly the audio moment at
    //   which the model output a certain token.
    //   Heuristics are needed to extrapolate an "end of token" time by using the start time of
    //   the next token.
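
    // Sketch of this last step, reusing dtw_and_backtrace from above: negate the matrix,
    // run DTW + backtrace, then map each (token, time) pair to seconds at 20 ms per frame.
    // Sizes are made up and the matrix is only zero-filled here.
    {
        struct ggml_init_params sketch_params = {
            /*.mem_size   =*/ 32*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * sketch_ctx = ggml_init(sketch_params);
        const int64_t n_tokens = 11, n_content = 500;
        struct ggml_tensor * m = ggml_new_tensor_2d(sketch_ctx, GGML_TYPE_F32, n_tokens, n_content);
        m = ggml_set_f32(m, 0.0f); // in reality: fill with -1 * the processed attention weights
        struct ggml_tensor * alignment = dtw_and_backtrace(sketch_ctx, m);
        for (int64_t k = 0; k < alignment->ne[1]; ++k) {
            const int32_t token_idx = ggml_get_i32_nd(alignment, 0, k, 0, 0);
            const int32_t time_idx  = ggml_get_i32_nd(alignment, 1, k, 0, 0);
            const float   t_start   = time_idx * 0.02f; // seconds, 20 ms per frame
            (void) token_idx; (void) t_start;
        }
        ggml_free(sketch_ctx);
    }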

    // After this point, OpenAI code extends this with heuristics to place start/end times
    // on each word instead of each token. I find this to be a sort of decoupled second step.
    // Without it, whisper users can still retrieve start times for each token and come up
    // with heuristics that better serve their case.

}

void whisper_log_set(ggml_log_callback log_callback, void * user_data) {
g_state.log_callback = log_callback ? log_callback : whisper_log_callback_default;
g_state.log_callback_user_data = user_data;
3 changes: 3 additions & 0 deletions whisper.h
@@ -615,6 +615,9 @@ extern "C" {

WHISPER_API void whisper_log_set(ggml_log_callback log_callback, void * user_data);

// test dtw
WHISPER_API void whisper_test_dtw(float* in, size_t in_ne0, size_t in_ne1, int32_t **out, size_t *out_ne0, size_t *out_ne1);

#ifdef __cplusplus
}
#endif
