Whisper pipeline: add perf metrics #971

Merged

@@ -35,6 +35,7 @@ int main(int argc, char* argv[]) try {
for (auto& chunk : *result.chunks) {
std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
}

} catch (const std::exception& error) {
try {
std::cerr << error.what() << '\n';
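
For orientation, a minimal sketch of what this PR means for callers of the sample above: the decoded result now carries perf_metrics. This is not part of the PR itself; the silent audio buffer is a placeholder, and the getter names mirror the PerfMetrics API bound to Python later in this diff.

#include <cstdlib>
#include <iostream>

#include "openvino/genai/whisper_pipeline.hpp"

int main(int argc, char* argv[]) try {
    // Placeholder input: one second of silence at 16 kHz instead of a real WAV file.
    ov::genai::RawSpeechInput raw_speech(16000, 0.0f);

    // Model directory expected as the first command-line argument.
    ov::genai::WhisperPipeline pipeline(argv[1], "CPU");
    auto result = pipeline.generate(raw_speech);

    // New in this PR: performance metrics attached to the decoded result.
    auto& metrics = result.perf_metrics;
    std::cout << "Load time:          " << metrics.get_load_time() << " ms\n";
    std::cout << "Generate duration:  " << metrics.get_generate_duration().mean << " ms\n";
    std::cout << "Inference duration: " << metrics.get_inference_duration().mean << " ms\n";
    std::cout << "TTFT:               " << metrics.get_ttft().mean << " ms\n";
    std::cout << "TPOT:               " << metrics.get_tpot().mean << " ms/token\n";
    std::cout << "Throughput:         " << metrics.get_throughput().mean << " tokens/s\n";
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return EXIT_FAILURE;
}
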
2 changes: 1 addition & 1 deletion src/cpp/src/perf_metrics.cpp
@@ -97,7 +97,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
if (m_evaluated){
return;
}
// If start_tiem is specified then recalcualte durations according to start times and calculate statistics only after that.
// If start_time is specified then recalculate durations according to start times and calculate statistics only after that.
if (start_time.has_value()) {
auto start_time_val = *start_time;
auto& tok_times = raw_metrics.m_new_token_times;
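
As context for the comment fixed above, a self-contained sketch of the recalculation it describes: when a start time is supplied, per-token durations are rebuilt as deltas between consecutive new-token timestamps, seeded with that start time. The helper below is hypothetical and only illustrates the idea; it is not the code in this file.

#include <chrono>
#include <vector>

using TimePoint = std::chrono::steady_clock::time_point;
using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;

// Hypothetical helper: rebuild per-token durations from raw new-token timestamps.
std::vector<MicroSeconds> durations_from_token_times(TimePoint start_time,
                                                     const std::vector<TimePoint>& new_token_times) {
    std::vector<MicroSeconds> durations;
    durations.reserve(new_token_times.size());
    auto prev = start_time;
    for (const auto& tok_time : new_token_times) {
        durations.push_back(std::chrono::duration_cast<MicroSeconds>(tok_time - prev));
        prev = tok_time;
    }
    return durations;
}
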
5 changes: 3 additions & 2 deletions src/cpp/src/whisper/timestamps.cpp
@@ -72,8 +72,9 @@ ov::genai::ExtractedSegments extract_segments(const std::vector<int64_t>& tokens
tokens.end());
}

// last timestamps generated in pairs <ts><ts><eos> -> speech segment continuation to the next chunk -> token_start will have value
// single ending timestamp <ts><eos> -> no more speech till the end of current chunk -> set offset to the end of frame
// last timestamps generated in pairs <ts><ts><eos> -> speech segment continuation to the next chunk ->
// token_start will have value
// single ending timestamp <ts><eos> -> no more speech till the end of current chunk -> set offset to the end of frame
if (!token_start.has_value()) {
extracted_segments.last_offset = nb_max_frames;
}
78 changes: 61 additions & 17 deletions src/cpp/src/whisper/whisper.cpp
@@ -10,6 +10,7 @@

#include "../utils.hpp"
#include "logit_processor.hpp"
#include "openvino/genai/perf_metrics.hpp"
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/whisper_generation_config.hpp"
#include "openvino/genai/whisper_pipeline.hpp"
@@ -18,12 +19,15 @@
#include "whisper_feature_extractor.hpp"
#include "whisper_models.hpp"

using ov::genai::MicroSeconds;

namespace {

ov::Tensor encode(ov::InferRequest& request,
std::vector<float>& mel_data,
const size_t feature_size,
const size_t nb_max_frames) {
const size_t nb_max_frames,
ov::genai::RawPerfMetrics& raw_metrics) {
OPENVINO_ASSERT(mel_data.size() == feature_size * nb_max_frames,
"Mel spectrogram required size: ",
feature_size,
@@ -37,7 +41,10 @@ ov::Tensor encode(ov::InferRequest& request,

request.set_tensor("input_features", input_tensor);

const auto infer_start = std::chrono::steady_clock::now();
request.infer();
const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);

// reset input tensor
request.set_tensor("input_features", ov::Tensor(ov::element::f32, {0, feature_size, nb_max_frames}));
@@ -72,18 +79,30 @@ void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) {
}
}

void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
const auto infer_start = std::chrono::steady_clock::now();
request.infer();
const auto infer_end = std::chrono::steady_clock::now();
const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
raw_metrics.m_new_token_times.emplace_back(infer_end);
raw_metrics.m_batch_sizes.emplace_back(1);
}

int64_t decode(ov::Tensor& encoder_hidden_state,
ov::InferRequest& decoder,
std::vector<int64_t>& input_ids,
const ov::genai::WhisperGenerationConfig& config,
ov::genai::RawPerfMetrics& raw_metrics,
const bool apply_logit_processors = true,
const bool return_timestamps = false) {
decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});

ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data());
decoder.set_tensor("input_ids", input_ids_tensor);

decoder.infer();
infer_with_perf_metrics(decoder, raw_metrics);

auto output_tensor = decoder.get_tensor("logits");

@@ -106,6 +125,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
int64_t input_id,
const size_t cache_position,
const ov::genai::WhisperGenerationConfig& config,
ov::genai::RawPerfMetrics& raw_metrics,
const bool return_timestamps,
const std::vector<int64_t>& generated_tokens) {
decoder_with_past.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});
@@ -118,7 +138,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
cache_position_tensor.set_shape({1});
cache_position_tensor.data<int64_t>()[0] = cache_position;

decoder_with_past.infer();
infer_with_perf_metrics(decoder_with_past, raw_metrics);

auto output_tensor = decoder_with_past.get_tensor("logits");

@@ -137,7 +157,17 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state,
ov::InferRequest decoder,
const ov::genai::WhisperGenerationConfig& config) {
std::vector<int64_t> input_ids{config.decoder_start_token_id};
int64_t output_token = decode(encoder_hidden_state, decoder, input_ids, config, false, false);

decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});

ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data());
decoder.set_tensor("input_ids", input_ids_tensor);

decoder.infer();

auto output_tensor = decoder.get_tensor("logits");

int64_t output_token = ov::genai::utils::argmax(output_tensor, 0);

return output_token;
}
@@ -181,8 +211,10 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
std::vector<int64_t> init_ids,
const size_t max_new_tokens,
const bool return_timestamps,
ov::genai::RawPerfMetrics& raw_metrics,
const std::shared_ptr<ov::genai::StreamerBase> streamer) {
int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps);
int64_t output_token =
decode(encoder_hidden_state, models.decoder, init_ids, config, raw_metrics, true, return_timestamps);

std::vector<int64_t> output_tokens{output_token};

@@ -203,6 +235,7 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
output_tokens.back(),
init_ids.size() + output_tokens.size() - 1,
config,
raw_metrics,
return_timestamps,
output_tokens);

@@ -230,23 +263,30 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
namespace ov {
namespace genai {

std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_generate(
const ov::genai::WhisperGenerationConfig& config,
const ov::genai::WhisperConfig& model_config,
const RawSpeechInput& raw_speech,
ov::genai::WhisperInitializedModels& models,
WhisperFeatureExtractor& feature_extractor,
const std::shared_ptr<StreamerBase> streamer) {
WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config,
const ov::genai::WhisperConfig& model_config,
const RawSpeechInput& raw_speech,
ov::genai::WhisperInitializedModels& models,
WhisperFeatureExtractor& feature_extractor,
const std::shared_ptr<StreamerBase> streamer) {
auto input_features = feature_extractor.extract(raw_speech);

const bool is_shortform = input_features.n_frames <= feature_extractor.nb_max_frames;
// long-form audio processing requires timestamps to be enabled
const bool return_timestamps = config.return_timestamps || !is_shortform;

std::vector<int64_t> init_ids;
std::vector<int64_t> output_tokens;
size_t max_new_tokens = config.get_max_new_tokens();

WhisperGenerateResult result;
RawPerfMetrics& raw_metrics = result.perf_metrics.raw_metrics;
result.perf_metrics.num_input_tokens = 0;
raw_metrics.m_new_token_times.reserve(max_new_tokens);
raw_metrics.m_batch_sizes.reserve(max_new_tokens);
raw_metrics.m_token_infer_durations.reserve(max_new_tokens);
raw_metrics.m_inference_durations = {{MicroSeconds(0.0f)}};

std::vector<int64_t> init_ids;
std::vector<int64_t>& output_tokens = result.output_tokens;
std::vector<Segment> segments;

// 0.02 by default
@@ -263,7 +303,8 @@ std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_gen
ov::Tensor hidden_state_tensor = encode(models.encoder,
input_features_chunk,
feature_extractor.feature_size,
feature_extractor.nb_max_frames);
feature_extractor.nb_max_frames,
raw_metrics);

// prepare init_ids just once for whole input
if (init_ids.empty()) {
@@ -276,6 +317,7 @@ std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_gen
init_ids,
max_new_tokens - output_tokens.size(),
return_timestamps,
raw_metrics,
streamer);

if (return_timestamps) {
@@ -310,10 +352,12 @@ std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_gen

// if return_timestamps wasn't enabled by user
if (!config.return_timestamps) {
return {output_tokens, std::nullopt};
return result;
}

return {output_tokens, segments};
result.segments = segments;

return result;
}
} // namespace genai
} // namespace ov
19 changes: 12 additions & 7 deletions src/cpp/src/whisper/whisper.hpp
@@ -20,13 +20,18 @@ struct Segment {
std::vector<int64_t> m_tokens;
};

std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_generate(
const ov::genai::WhisperGenerationConfig& config,
const ov::genai::WhisperConfig& model_config,
const ov::genai::RawSpeechInput& raw_speech,
ov::genai::WhisperInitializedModels& models,
ov::genai::WhisperFeatureExtractor& feature_extractor,
const std::shared_ptr<StreamerBase> streamer);
struct WhisperGenerateResult {
std::vector<int64_t> output_tokens;
std::optional<std::vector<Segment>> segments = std::nullopt;
PerfMetrics perf_metrics;
};

WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config,
const ov::genai::WhisperConfig& model_config,
const ov::genai::RawSpeechInput& raw_speech,
ov::genai::WhisperInitializedModels& models,
ov::genai::WhisperFeatureExtractor& feature_extractor,
const std::shared_ptr<StreamerBase> streamer);

} // namespace genai
} // namespace ov
53 changes: 34 additions & 19 deletions src/cpp/src/whisper_pipeline.cpp
@@ -93,28 +93,43 @@ class WhisperPipeline::Impl {
streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
}

auto [output_tokens, segments] = ov::genai::whisper_generate(config,
m_model_config,
raw_speech_input,
m_models,
m_feature_extractor,
streamer_ptr);

WhisperDecodedResults decoded_results{std::vector{m_tokenizer.decode(output_tokens)}, std::vector{1.f}};
if (!segments.has_value()) {
return decoded_results;
auto generate_result = ov::genai::whisper_generate(config,
m_model_config,
raw_speech_input,
m_models,
m_feature_extractor,
streamer_ptr);
auto decode_start_time = std::chrono::steady_clock::now();
WhisperDecodedResults result{std::vector{m_tokenizer.decode(generate_result.output_tokens)}, std::vector{1.f}};
generate_result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));

result.perf_metrics = generate_result.perf_metrics;
auto& segments = generate_result.segments;

if (segments.has_value()) {
std::vector<WhisperDecodedResultChunk> chunks;
chunks.reserve((*segments).size());

for (auto& segment : *segments) {
decode_start_time = std::chrono::steady_clock::now();
chunks.push_back(
WhisperDecodedResultChunk{segment.m_start, segment.m_end, m_tokenizer.decode(segment.m_tokens)});
result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
}

result.chunks = chunks;
}

std::vector<WhisperDecodedResultChunk> chunks;
chunks.reserve((*segments).size());
auto& metrics = result.perf_metrics;
metrics.load_time = this->m_load_time_ms;
auto stop_time = std::chrono::steady_clock::now();
metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
result.perf_metrics.raw_metrics.tokenization_durations.emplace_back(MicroSeconds(0.0f));
metrics.evaluate_statistics(start_time);

for (auto& segment : *segments) {
chunks.push_back(
WhisperDecodedResultChunk{segment.m_start, segment.m_end, m_tokenizer.decode(segment.m_tokens)});
}

decoded_results.chunks = chunks;
return decoded_results;
return result;
}
};

2 changes: 2 additions & 0 deletions src/python/py_generate_pipeline.cpp
@@ -637,8 +637,10 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
.def("get_num_input_tokens", &PerfMetrics::get_num_input_tokens)
.def("get_ttft", &PerfMetrics::get_ttft)
.def("get_tpot", &PerfMetrics::get_tpot)
.def("get_ipot", &PerfMetrics::get_ipot)
.def("get_throughput", &PerfMetrics::get_throughput)
.def("get_generate_duration", &PerfMetrics::get_generate_duration)
.def("get_inference_duration", &PerfMetrics::get_inference_duration)
.def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration)
.def("get_detokenization_duration", &PerfMetrics::get_detokenization_duration)
.def("__add__", &PerfMetrics::operator+)
44 changes: 37 additions & 7 deletions tests/python_tests/test_whisper_generate_api.py
@@ -131,6 +131,25 @@ def test_whisper_on_hf_dataset(model_descr, dataset_id):
compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id)


@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@pytest.mark.parametrize(
"test_sample",
get_samples_from_dataset(language="en", length=1),
)
@pytest.mark.precommit
def test_smoke(model_descr, test_sample):
model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)

expected = opt_pipe(test_sample)

genai_result = pipe.generate(test_sample)

assert genai_result.texts[0] == expected["text"]

assert "chunks" not in expected
assert genai_result.chunks == None


@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@pytest.mark.precommit
def test_whisper_config_constructor(model_descr):
@@ -509,17 +528,28 @@ def test_longform_audio(model_descr, test_sample):
@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@pytest.mark.parametrize(
"test_sample",
get_samples_from_dataset(language="en", length=1),
[
*get_samples_from_dataset(language="en", length=1),
],
)
@pytest.mark.precommit
def test_smoke(model_descr, test_sample):
def test_perf_metrics(model_descr, test_sample):
model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)

expected = opt_pipe(test_sample)
result = pipe.generate(test_sample)

genai_result = pipe.generate(test_sample)
perf_metrics = result.perf_metrics

assert genai_result.texts[0] == expected["text"]
assert perf_metrics is not None

assert "chunks" not in expected
assert genai_result.chunks == None
assert perf_metrics.get_load_time() > 0
assert perf_metrics.get_num_generated_tokens() > 0
assert perf_metrics.get_num_input_tokens() == 0
assert perf_metrics.get_ttft().mean > 0
assert perf_metrics.get_tpot().mean > 0
assert perf_metrics.get_ipot().mean > 0
assert perf_metrics.get_throughput().mean > 0
assert perf_metrics.get_inference_duration().mean > 0
assert perf_metrics.get_generate_duration().mean > 0
assert perf_metrics.get_tokenization_duration().mean == 0
assert perf_metrics.get_detokenization_duration().mean > 0