Add perf metrics support for WhisperStaticPipeline #1337

Merged · 7 commits · Dec 23, 2024

Changes from all commits
37 changes: 4 additions & 33 deletions src/cpp/src/whisper/whisper.cpp
@@ -18,6 +18,7 @@
#include "whisper_config.hpp"
#include "whisper_feature_extractor.hpp"
#include "whisper_models.hpp"
#include "whisper_utils.hpp"

using ov::genai::MicroSeconds;

@@ -79,17 +80,6 @@ void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) {
}
}

void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
const auto infer_start = std::chrono::steady_clock::now();
request.infer();
const auto infer_end = std::chrono::steady_clock::now();
const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
raw_metrics.m_new_token_times.emplace_back(infer_end);
raw_metrics.m_batch_sizes.emplace_back(1);
}

int64_t decode(ov::Tensor& encoder_hidden_state,
ov::InferRequest& decoder,
std::vector<int64_t>& input_ids,
@@ -102,7 +92,7 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data());
decoder.set_tensor("input_ids", input_ids_tensor);

infer_with_perf_metrics(decoder, raw_metrics);
ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);

auto output_tensor = decoder.get_tensor("logits");

@@ -138,7 +128,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
cache_position_tensor.set_shape({1});
cache_position_tensor.data<int64_t>()[0] = cache_position;

infer_with_perf_metrics(decoder_with_past, raw_metrics);
ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);

auto output_tensor = decoder_with_past.get_tensor("logits");

@@ -265,25 +255,6 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_state,
return {false, output_tokens};
}

template <typename T>
void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
std::vector<T> result{value.begin(), value.begin() + offset};
for (auto [start, end] : ranges) {
result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
}

value = result;
}

void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
size_t offset,
std::vector<std::pair<size_t, size_t>>& ranges) {
filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
}

} // namespace

namespace ov {
@@ -362,7 +333,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig&
feature_extractor.nb_max_frames,
time_precision);

filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);

segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());

46 changes: 46 additions & 0 deletions src/cpp/src/whisper/whisper_utils.cpp
@@ -0,0 +1,46 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "whisper_utils.hpp"

namespace {

template <typename T>
Collaborator: Well, it doesn't make sense to put a templated function into an anonymous namespace, does it?
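(A minimal, hypothetical sketch, not part of this PR, of the point under discussion: a template defined in an anonymous namespace gets internal linkage and is instantiated separately in each translation unit, which is harmless when the helper is only ever used from the same .cpp file, as filter_by_ranges is below.)

#include <vector>

namespace {
// Internal linkage: only this translation unit can name or instantiate
// the template, which is the usual intent for a file-local helper.
template <typename T>
T sum(const std::vector<T>& v) {
    T acc{};
    for (const auto& x : v)
        acc += x;
    return acc;
}
}  // namespace

int main() {
    return sum(std::vector<int>{1, 2, 3}) == 6 ? 0 : 1;  // template instantiated locally
}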

void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
std::vector<T> result{value.begin(), value.begin() + offset};
for (auto [start, end] : ranges) {
result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
}

value = result;
}

} // namespace

namespace ov {
namespace genai {
namespace utils {

void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
const auto infer_start = std::chrono::steady_clock::now();
request.infer();
const auto infer_end = std::chrono::steady_clock::now();
const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
raw_metrics.m_new_token_times.emplace_back(infer_end);
raw_metrics.m_batch_sizes.emplace_back(1);
}

void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
size_t offset,
std::vector<std::pair<size_t, size_t>>& ranges) {
filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
}

} // namespace utils
} // namespace genai
} // namespace ov
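For illustration, a self-contained sketch (with made-up values, not taken from the PR) of what filter_non_segment_metrics does through filter_by_ranges: entries before offset are kept as-is, and of the remaining entries only those covered by each segment's [start, end) range relative to offset survive.

#include <cstddef>
#include <utility>
#include <vector>

int main() {
    // Per-token metric entries: 2 recorded before this chunk, 5 for this chunk.
    std::vector<int> per_token{10, 11, 20, 21, 22, 23, 24};
    const std::size_t offset = 2;
    // Segment ranges relative to `offset`, as produced by timestamp extraction.
    std::vector<std::pair<std::size_t, std::size_t>> ranges{{0, 2}, {3, 4}};

    std::vector<int> result{per_token.begin(), per_token.begin() + offset};
    for (auto [start, end] : ranges) {
        result.insert(result.end(),
                      per_token.begin() + offset + start,
                      per_token.begin() + offset + end);
    }
    // result == {10, 11, 20, 21, 23}: entries 22 and 24 fall outside the
    // segment ranges and are dropped, mirroring filter_by_ranges above.
    return result.size() == 5 ? 0 : 1;
}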
22 changes: 22 additions & 0 deletions src/cpp/src/whisper/whisper_utils.hpp
@@ -0,0 +1,22 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <openvino/openvino.hpp>

#include "openvino/genai/perf_metrics.hpp"

namespace ov {
namespace genai {
namespace utils {

void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics);

void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
size_t offset,
std::vector<std::pair<size_t, size_t>>& ranges);

} // namespace utils
} // namespace genai
} // namespace ov
70 changes: 59 additions & 11 deletions src/cpp/src/whisper_pipeline_static.cpp
@@ -14,6 +14,7 @@
#include "whisper/timestamps.hpp"
#include "whisper/whisper.hpp"
#include "whisper/whisper_config.hpp"
#include "whisper/whisper_utils.hpp"

#include "openvino/core/layout.hpp"
#include "openvino/core/preprocess/pre_post_process.hpp"
@@ -26,6 +27,8 @@
#include "openvino/op/convert.hpp"
#include "openvino/op/parameter.hpp"

using ov::genai::MicroSeconds;

namespace {

template <typename T>
@@ -44,7 +47,8 @@ void copy_to_tensor(const std::vector<T>& src_vec, ov::Tensor dst_tensor) {
ov::Tensor encode(ov::InferRequest& request,
std::vector<float>& mel_data,
const size_t feature_size,
const size_t nb_max_frames) {
const size_t nb_max_frames,
ov::genai::RawPerfMetrics& raw_metrics) {
OPENVINO_ASSERT(mel_data.size() == feature_size * nb_max_frames,
"Mel spectrogram required size: ",
feature_size,
@@ -54,7 +58,12 @@ ov::Tensor encode(ov::InferRequest& request,
mel_data.size(),
".");
copy_to_tensor(mel_data, request.get_tensor("input_features"));

const auto infer_start = std::chrono::steady_clock::now();
request.infer();
const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);

return request.get_tensor("last_hidden_state");
}

@@ -140,13 +149,14 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
ov::InferRequest& decoder,
const std::vector<int32_t>& init_ids,
const ov::genai::WhisperGenerationConfig& config,
ov::genai::RawPerfMetrics& raw_metrics,
const bool apply_logit_processors = true,
const bool return_timestamps = false) {
// NB: Fill decoder inputs
encoder_hidden_state.copy_to(decoder.get_tensor("encoder_hidden_states"));
set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);

decoder.infer();
ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);

auto output_tensor = decoder.get_tensor("logits");

@@ -167,6 +177,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
const int64_t input_id,
const int64_t position_id,
const ov::genai::WhisperGenerationConfig& config,
ov::genai::RawPerfMetrics& raw_metrics,
const bool return_timestamps,
const std::vector<int64_t>& generated_tokens) {
// FIXME: Avoid this cast to i32. Why it's not i64 precision in model?
@@ -175,7 +186,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
// FIXME: Is "attention_mask" supposed to be f16?
decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 0u;

decoder_with_past.infer();
ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);

auto output_tensor = decoder_with_past.get_tensor("logits");
ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens);
@@ -217,13 +228,17 @@ void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferRequest& decoder) {

int64_t detect_language(ov::Tensor& encoder_hidden_state,
ov::InferRequest decoder,
const ov::genai::WhisperGenerationConfig& config) {
const ov::genai::WhisperGenerationConfig& config,
ov::genai::RawPerfMetrics& raw_metrics) {
decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});

std::vector<int32_t> init_ids{static_cast<int32_t>(config.decoder_start_token_id)};
set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);

const auto infer_start = std::chrono::steady_clock::now();
decoder.infer();
const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);

auto output_tensor = decoder.get_tensor("logits");

@@ -246,7 +261,8 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state,
std::vector<int32_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
ov::InferRequest& decoder,
const ov::genai::WhisperGenerationConfig& config,
const bool return_timestamps) {
const bool return_timestamps,
ov::genai::RawPerfMetrics& raw_metrics) {
if (!config.is_multilingual) {
if (return_timestamps) {
return std::vector<int32_t>{static_cast<int32_t>(config.decoder_start_token_id)};
@@ -263,7 +279,7 @@ std::vector<int32_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
language_token_id = static_cast<int32_t>(config.lang_to_id.at(language));
}
} else {
language_token_id = detect_language(encoder_hidden_state, decoder, config);
language_token_id = detect_language(encoder_hidden_state, decoder, config, raw_metrics);
}

int32_t task_token_id = static_cast<int32_t>(config.transcribe_token_id);
Expand All @@ -289,8 +305,9 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
std::vector<int32_t> init_ids,
const size_t max_new_tokens,
const bool return_timestamps,
ov::genai::RawPerfMetrics& raw_metrics,
const std::shared_ptr<ov::genai::ChunkStreamerBase> streamer) {
int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps);
int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, raw_metrics, true, return_timestamps);
std::vector<int64_t> output_tokens{output_token};

if (!return_timestamps && streamer && streamer->put(output_token)) {
@@ -308,6 +325,7 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_state,
output_tokens.back(),
i + init_ids.size(),
config,
raw_metrics,
return_timestamps,
output_tokens);
update_past_key_value(models.decoder_with_past, models.decoder_with_past, i + init_ids.size());
@@ -576,6 +594,7 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
const RawSpeechInput& raw_speech_input,
OptionalWhisperGenerationConfig generation_config,
ChunkStreamerVariant streamer) {
auto start_time = std::chrono::steady_clock::now();
WhisperGenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
config.validate();

@@ -591,14 +610,25 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
streamer_ptr = std::make_shared<ChunkTextCallbackStreamer>(m_tokenizer, *callback);
}

size_t max_new_tokens = config.get_max_new_tokens();

WhisperPerfMetrics perf_metrics;
perf_metrics.num_input_tokens = 0;
RawPerfMetrics& raw_metrics = perf_metrics.raw_metrics;
raw_metrics.m_new_token_times.reserve(max_new_tokens);
raw_metrics.m_batch_sizes.reserve(max_new_tokens);
raw_metrics.m_token_infer_durations.reserve(max_new_tokens);
raw_metrics.m_inference_durations = {{MicroSeconds(0.0f)}};

const auto extract_start = std::chrono::steady_clock::now();
auto input_features = m_feature_extractor.extract(raw_speech_input);
const auto extract_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - extract_start);
perf_metrics.whisper_raw_metrics.features_extraction_durations.emplace_back(extract_ms);

const bool is_shortform = input_features.n_frames <= m_feature_extractor.nb_max_frames;
// long-form audio processing requires timestamps to be enabled
const bool return_timestamps = config.return_timestamps || !is_shortform;

size_t max_new_tokens = config.get_max_new_tokens();

std::vector<int32_t> init_ids;
std::vector<int64_t> output_tokens;
std::vector<Segment> segments;
@@ -619,11 +649,12 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
ov::Tensor hidden_state_tensor = encode(m_models.encoder,
input_features_chunk,
m_feature_extractor.feature_size,
m_feature_extractor.nb_max_frames);
m_feature_extractor.nb_max_frames,
raw_metrics);

// prepare init_ids just once for whole input
if (init_ids.empty()) {
init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps);
init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps, raw_metrics);
}

auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor,
@@ -632,6 +663,7 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
init_ids,
max_new_tokens - output_tokens.size(),
return_timestamps,
raw_metrics,
streamer_ptr);

if (return_timestamps) {
@@ -640,6 +672,8 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
m_feature_extractor.nb_max_frames,
time_precision);

ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);

segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());

output_tokens.insert(output_tokens.end(),
@@ -669,7 +703,11 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
streamer_ptr->end();
}

auto decode_start_time = std::chrono::steady_clock::now();
WhisperDecodedResults result{std::vector{m_tokenizer.decode(output_tokens)}, std::vector{1.f}};
result.perf_metrics = perf_metrics;
result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));

// if return_timestamps wasn't enabled by user
if (!config.return_timestamps) {
@@ -681,13 +719,23 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
chunks.reserve(segments.size());

for (auto& segment : segments) {
decode_start_time = std::chrono::steady_clock::now();
chunks.push_back(
WhisperDecodedResultChunk{segment.m_start, segment.m_end, m_tokenizer.decode(segment.m_tokens)});
result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
}

result.chunks = chunks;
}

auto& metrics = result.perf_metrics;
metrics.load_time = this->m_load_time_ms;
auto stop_time = std::chrono::steady_clock::now();
metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
metrics.raw_metrics.tokenization_durations.emplace_back(MicroSeconds(0.0f));
metrics.evaluate_statistics(start_time);

return result;
}
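As a rough usage sketch of what this PR enables for the static pipeline, the collected metrics could be read back as below. The getter names follow the existing ov::genai::PerfMetrics interface and are assumptions here, not verified against this diff; the model path and device are placeholders.

#include <iostream>

#include "openvino/genai/whisper_pipeline.hpp"

int main() {
    ov::genai::WhisperPipeline pipeline("whisper-base-static", "NPU");  // hypothetical model dir/device
    ov::genai::RawSpeechInput raw_speech(16000, 0.0f);  // one second of 16 kHz silence as a stand-in

    auto result = pipeline.generate(raw_speech);
    const auto& metrics = result.perf_metrics;

    std::cout << "Load time, ms:          " << metrics.get_load_time() << "\n";
    std::cout << "Generate duration, ms:  " << metrics.get_generate_duration().mean << "\n";
    std::cout << "TTFT, ms:               " << metrics.get_ttft().mean << "\n";
    std::cout << "TPOT, ms/token:         " << metrics.get_tpot().mean << "\n";
    std::cout << "Inference duration, ms: " << metrics.get_inference_duration().mean << "\n";
    return 0;
}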
