Add perf metrics support for WhisperStaticPipeline (#1337)
eshiryae authored Dec 23, 2024
1 parent c09207c commit 3496d45
Showing 4 changed files with 131 additions and 44 deletions.
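This commit wires performance-metric collection into the static (NPU) Whisper pipeline, mirroring what the dynamic pipeline already reports: per-token inference timings, encoder and language-detection inference time, feature-extraction time, detokenization time, and an overall generate duration. A minimal usage sketch for reading the metrics off the result, assuming the usual ov::genai perf-metrics accessors (the model directory is a placeholder):

#include <iostream>
#include <vector>

#include "openvino/genai/whisper_pipeline.hpp"

int main() {
    // Placeholder model directory; the static pipeline is selected for "NPU".
    ov::genai::WhisperPipeline pipeline("whisper-base", "NPU");

    // 30 s of silence at 16 kHz, standing in for real audio samples.
    ov::genai::RawSpeechInput raw_speech(16000 * 30, 0.0f);

    auto result = pipeline.generate(raw_speech);

    // Populated for the static pipeline by this commit.
    auto& pm = result.perf_metrics;
    std::cout << "TTFT, ms:      " << pm.get_ttft().mean << '\n'
              << "TPOT, ms:      " << pm.get_tpot().mean << '\n'
              << "Inference, ms: " << pm.get_inference_duration().mean << '\n'
              << "Generate, ms:  " << pm.get_generate_duration().mean << '\n';
}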
37 changes: 4 additions & 33 deletions src/cpp/src/whisper/whisper.cpp
@@ -18,6 +18,7 @@
#include "whisper_config.hpp"
#include "whisper_feature_extractor.hpp"
#include "whisper_models.hpp"
#include "whisper_utils.hpp"

using ov::genai::MicroSeconds;

@@ -79,17 +80,6 @@ void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) {
}
}

void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
const auto infer_start = std::chrono::steady_clock::now();
request.infer();
const auto infer_end = std::chrono::steady_clock::now();
const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
raw_metrics.m_new_token_times.emplace_back(infer_end);
raw_metrics.m_batch_sizes.emplace_back(1);
}

int64_t decode(ov::Tensor& encoder_hidden_state,
ov::InferRequest& decoder,
std::vector<int64_t>& input_ids,
@@ -102,7 +92,7 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data());
decoder.set_tensor("input_ids", input_ids_tensor);

infer_with_perf_metrics(decoder, raw_metrics);
ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);

auto output_tensor = decoder.get_tensor("logits");

@@ -138,7 +128,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
cache_position_tensor.set_shape({1});
cache_position_tensor.data<int64_t>()[0] = cache_position;

infer_with_perf_metrics(decoder_with_past, raw_metrics);
ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);

auto output_tensor = decoder_with_past.get_tensor("logits");

@@ -265,25 +255,6 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
return {false, output_tokens};
}

template <typename T>
void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
std::vector<T> result{value.begin(), value.begin() + offset};
for (auto [start, end] : ranges) {
result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
}

value = result;
}

void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
size_t offset,
std::vector<std::pair<size_t, size_t>>& ranges) {
filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
}

} // namespace

namespace ov {
@@ -362,7 +333,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig&
feature_extractor.nb_max_frames,
time_precision);

filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);

segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());

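Note that the helpers deleted above, infer_with_perf_metrics and the filter_by_ranges/filter_non_segment_metrics pair, are not dropped: they move verbatim into the new whisper_utils translation unit below, so the static pipeline can reuse the same timing logic instead of duplicating it.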
46 changes: 46 additions & 0 deletions src/cpp/src/whisper/whisper_utils.cpp
@@ -0,0 +1,46 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "whisper_utils.hpp"

namespace {

template <typename T>
void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
std::vector<T> result{value.begin(), value.begin() + offset};
for (auto [start, end] : ranges) {
result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
}

value = result;
}

} // namespace

namespace ov {
namespace genai {
namespace utils {

void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
const auto infer_start = std::chrono::steady_clock::now();
request.infer();
const auto infer_end = std::chrono::steady_clock::now();
const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
raw_metrics.m_new_token_times.emplace_back(infer_end);
raw_metrics.m_batch_sizes.emplace_back(1);
}

void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
size_t offset,
std::vector<std::pair<size_t, size_t>>& ranges) {
filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
}

} // namespace utils
} // namespace genai
} // namespace ov
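filter_by_ranges keeps the first offset entries of a metric vector untouched (tokens produced by earlier chunks) and then appends only the entries whose chunk-relative indices fall inside the half-open [start, end) segment ranges; per-token timings for tokens discarded during segment extraction are dropped, keeping the metrics aligned with the surviving tokens. A standalone restatement with a worked example (illustrative only, not part of the commit):

#include <cassert>
#include <utility>
#include <vector>

// Restatement of the helper above, for illustration.
template <typename T>
void filter_by_ranges(std::vector<T>& value, size_t offset,
                      const std::vector<std::pair<size_t, size_t>>& ranges) {
    std::vector<T> result{value.begin(), value.begin() + offset};
    for (auto [start, end] : ranges)
        result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
    value = result;
}

int main() {
    // Two entries predate this chunk (offset = 2); of the chunk's six
    // entries, only those at chunk-relative [0, 2) and [4, 6) survive.
    std::vector<int> durations{10, 11, 20, 21, 22, 23, 24, 25};
    std::vector<std::pair<size_t, size_t>> ranges{{0, 2}, {4, 6}};
    filter_by_ranges(durations, 2, ranges);
    assert((durations == std::vector<int>{10, 11, 20, 21, 24, 25}));
}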
22 changes: 22 additions & 0 deletions src/cpp/src/whisper/whisper_utils.hpp
@@ -0,0 +1,22 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <openvino/openvino.hpp>

#include "openvino/genai/perf_metrics.hpp"

namespace ov {
namespace genai {
namespace utils {

void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics);

void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
size_t offset,
std::vector<std::pair<size_t, size_t>>& ranges);

} // namespace utils
} // namespace genai
} // namespace ov
70 changes: 59 additions & 11 deletions src/cpp/src/whisper_pipeline_static.cpp
@@ -14,6 +14,7 @@
#include "whisper/timestamps.hpp"
#include "whisper/whisper.hpp"
#include "whisper/whisper_config.hpp"
#include "whisper/whisper_utils.hpp"

#include "openvino/core/layout.hpp"
#include "openvino/core/preprocess/pre_post_process.hpp"
@@ -26,6 +27,8 @@
#include "openvino/op/convert.hpp"
#include "openvino/op/parameter.hpp"

using ov::genai::MicroSeconds;

namespace {

template <typename T>
@@ -44,7 +47,8 @@ void copy_to_tensor(const std::vector<T>& src_vec, ov::Tensor dst_tensor) {
ov::Tensor encode(ov::InferRequest& request,
std::vector<float>& mel_data,
const size_t feature_size,
const size_t nb_max_frames) {
const size_t nb_max_frames,
ov::genai::RawPerfMetrics& raw_metrics) {
OPENVINO_ASSERT(mel_data.size() == feature_size * nb_max_frames,
"Mel spectrogram required size: ",
feature_size,
@@ -54,7 +58,12 @@ ov::Tensor encode(ov::InferRequest& request,
mel_data.size(),
".");
copy_to_tensor(mel_data, request.get_tensor("input_features"));

const auto infer_start = std::chrono::steady_clock::now();
request.infer();
const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);

return request.get_tensor("last_hidden_state");
}

@@ -140,13 +149,14 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
ov::InferRequest& decoder,
const std::vector<int32_t>& init_ids,
const ov::genai::WhisperGenerationConfig& config,
ov::genai::RawPerfMetrics& raw_metrics,
const bool apply_logit_processors = true,
const bool return_timestamps = false) {
// NB: Fill decoder inputs
encoder_hidden_state.copy_to(decoder.get_tensor("encoder_hidden_states"));
set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);

decoder.infer();
ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);

auto output_tensor = decoder.get_tensor("logits");

@@ -167,6 +177,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
const int64_t input_id,
const int64_t position_id,
const ov::genai::WhisperGenerationConfig& config,
ov::genai::RawPerfMetrics& raw_metrics,
const bool return_timestamps,
const std::vector<int64_t>& generated_tokens) {
// FIXME: Avoid this cast to i32. Why isn't it i64 precision in the model?
@@ -175,7 +186,7 @@
// FIXME: Is "attention_mask" supposed to be f16?
decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 0u;

decoder_with_past.infer();
ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);

auto output_tensor = decoder_with_past.get_tensor("logits");
ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens);
@@ -217,13 +228,17 @@ void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferReq

int64_t detect_language(ov::Tensor& encoder_hidden_state,
ov::InferRequest decoder,
const ov::genai::WhisperGenerationConfig& config) {
const ov::genai::WhisperGenerationConfig& config,
ov::genai::RawPerfMetrics& raw_metrics) {
decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});

std::vector<int32_t> init_ids{static_cast<int32_t>(config.decoder_start_token_id)};
set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);

const auto infer_start = std::chrono::steady_clock::now();
decoder.infer();
const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);

auto output_tensor = decoder.get_tensor("logits");

@@ -246,7 +261,8 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state,
std::vector<int32_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
ov::InferRequest& decoder,
const ov::genai::WhisperGenerationConfig& config,
const bool return_timestamps) {
const bool return_timestamps,
ov::genai::RawPerfMetrics& raw_metrics) {
if (!config.is_multilingual) {
if (return_timestamps) {
return std::vector<int32_t>{static_cast<int32_t>(config.decoder_start_token_id)};
Expand All @@ -263,7 +279,7 @@ std::vector<int32_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
language_token_id = static_cast<int32_t>(config.lang_to_id.at(language));
}
} else {
language_token_id = detect_language(encoder_hidden_state, decoder, config);
language_token_id = detect_language(encoder_hidden_state, decoder, config, raw_metrics);
}

int32_t task_token_id = static_cast<int32_t>(config.transcribe_token_id);
@@ -289,8 +305,9 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
std::vector<int32_t> init_ids,
const size_t max_new_tokens,
const bool return_timestamps,
ov::genai::RawPerfMetrics& raw_metrics,
const std::shared_ptr<ov::genai::ChunkStreamerBase> streamer) {
int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps);
int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, raw_metrics, true, return_timestamps);
std::vector<int64_t> output_tokens{output_token};

if (!return_timestamps && streamer && streamer->put(output_token)) {
Expand All @@ -308,6 +325,7 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
output_tokens.back(),
i + init_ids.size(),
config,
raw_metrics,
return_timestamps,
output_tokens);
update_past_key_value(models.decoder_with_past, models.decoder_with_past, i + init_ids.size());
@@ -576,6 +594,7 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
const RawSpeechInput& raw_speech_input,
OptionalWhisperGenerationConfig generation_config,
ChunkStreamerVariant streamer) {
auto start_time = std::chrono::steady_clock::now();
WhisperGenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
config.validate();

@@ -591,14 +610,25 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
streamer_ptr = std::make_shared<ChunkTextCallbackStreamer>(m_tokenizer, *callback);
}

size_t max_new_tokens = config.get_max_new_tokens();

WhisperPerfMetrics perf_metrics;
perf_metrics.num_input_tokens = 0;
RawPerfMetrics& raw_metrics = perf_metrics.raw_metrics;
raw_metrics.m_new_token_times.reserve(max_new_tokens);
raw_metrics.m_batch_sizes.reserve(max_new_tokens);
raw_metrics.m_token_infer_durations.reserve(max_new_tokens);
raw_metrics.m_inference_durations = {{MicroSeconds(0.0f)}};

const auto extract_start = std::chrono::steady_clock::now();
auto input_features = m_feature_extractor.extract(raw_speech_input);
const auto extract_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - extract_start);
perf_metrics.whisper_raw_metrics.features_extraction_durations.emplace_back(extract_ms);

const bool is_shortform = input_features.n_frames <= m_feature_extractor.nb_max_frames;
// long-form audio processing requires timestamps to be enabled
const bool return_timestamps = config.return_timestamps || !is_shortform;

size_t max_new_tokens = config.get_max_new_tokens();

std::vector<int32_t> init_ids;
std::vector<int64_t> output_tokens;
std::vector<Segment> segments;
@@ -619,11 +649,12 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
ov::Tensor hidden_state_tensor = encode(m_models.encoder,
input_features_chunk,
m_feature_extractor.feature_size,
m_feature_extractor.nb_max_frames);
m_feature_extractor.nb_max_frames,
raw_metrics);

// prepare init_ids just once for the whole input
if (init_ids.empty()) {
init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps);
init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps, raw_metrics);
}

auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor,
@@ -632,6 +663,7 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
init_ids,
max_new_tokens - output_tokens.size(),
return_timestamps,
raw_metrics,
streamer_ptr);

if (return_timestamps) {
@@ -640,6 +672,8 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
m_feature_extractor.nb_max_frames,
time_precision);

ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);

segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());

output_tokens.insert(output_tokens.end(),
@@ -669,7 +703,11 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
streamer_ptr->end();
}

auto decode_start_time = std::chrono::steady_clock::now();
WhisperDecodedResults result{std::vector{m_tokenizer.decode(output_tokens)}, std::vector{1.f}};
result.perf_metrics = perf_metrics;
result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));

// if return_timestamps wasn't enabled by the user
if (!config.return_timestamps) {
Expand All @@ -681,13 +719,23 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
chunks.reserve(segments.size());

for (auto& segment : segments) {
decode_start_time = std::chrono::steady_clock::now();
chunks.push_back(
WhisperDecodedResultChunk{segment.m_start, segment.m_end, m_tokenizer.decode(segment.m_tokens)});
result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
}

result.chunks = chunks;
}

auto& metrics = result.perf_metrics;
metrics.load_time = this->m_load_time_ms;
auto stop_time = std::chrono::steady_clock::now();
metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
metrics.raw_metrics.tokenization_durations.emplace_back(MicroSeconds(0.0f));
metrics.evaluate_statistics(start_time);

return result;
}
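With the raw timings recorded above, evaluate_statistics(start_time) can derive the aggregate numbers: TTFT from the first entry in m_new_token_times, TPOT from the per-token infer durations, and overall throughput from the generate duration, so the static pipeline now reports the same statistics as the dynamic Whisper pipeline. Tokenization is logged as zero because the input is raw audio rather than text, and load time is carried over from pipeline construction via m_load_time_ms.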

Expand Down

0 comments on commit 3496d45

Please sign in to comment.