diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 9d6aa698ce..04993f288c 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -18,6 +18,7 @@ #include "whisper_config.hpp" #include "whisper_feature_extractor.hpp" #include "whisper_models.hpp" +#include "whisper_utils.hpp" using ov::genai::MicroSeconds; @@ -79,17 +80,6 @@ void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { } } -void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) { - const auto infer_start = std::chrono::steady_clock::now(); - request.infer(); - const auto infer_end = std::chrono::steady_clock::now(); - const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start); - raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); - raw_metrics.m_token_infer_durations.emplace_back(infer_ms); - raw_metrics.m_new_token_times.emplace_back(infer_end); - raw_metrics.m_batch_sizes.emplace_back(1); -} - int64_t decode(ov::Tensor& encoder_hidden_state, ov::InferRequest& decoder, std::vector& input_ids, @@ -102,7 +92,7 @@ int64_t decode(ov::Tensor& encoder_hidden_state, ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data()); decoder.set_tensor("input_ids", input_ids_tensor); - infer_with_perf_metrics(decoder, raw_metrics); + ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics); auto output_tensor = decoder.get_tensor("logits"); @@ -138,7 +128,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state, cache_position_tensor.set_shape({1}); cache_position_tensor.data()[0] = cache_position; - infer_with_perf_metrics(decoder_with_past, raw_metrics); + ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics); auto output_tensor = decoder_with_past.get_tensor("logits"); @@ -265,25 +255,6 @@ std::pair> full_decode(ov::Tensor& encoder_hidden_sta return {false, output_tokens}; } -template -void filter_by_ranges(std::vector& value, size_t offset, std::vector>& ranges) { - OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second)); - std::vector result{value.begin(), value.begin() + offset}; - for (auto [start, end] : ranges) { - result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end); - } - - value = result; -} - -void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, - size_t offset, - std::vector>& ranges) { - filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges); - filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges); - filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges); -} - } // namespace namespace ov { @@ -362,7 +333,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& feature_extractor.nb_max_frames, time_precision); - filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges); + ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges); segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end()); diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp new file mode 100644 index 0000000000..6e56a1439d --- /dev/null +++ b/src/cpp/src/whisper/whisper_utils.cpp @@ -0,0 +1,46 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "whisper_utils.hpp" + +namespace { + +template +void filter_by_ranges(std::vector& value, size_t offset, std::vector>& ranges) { + OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second)); + std::vector result{value.begin(), value.begin() + offset}; + for (auto [start, end] : ranges) { + result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end); + } + + value = result; +} + +} // namespace + +namespace ov { +namespace genai { +namespace utils { + +void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) { + const auto infer_start = std::chrono::steady_clock::now(); + request.infer(); + const auto infer_end = std::chrono::steady_clock::now(); + const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start); + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); + raw_metrics.m_token_infer_durations.emplace_back(infer_ms); + raw_metrics.m_new_token_times.emplace_back(infer_end); + raw_metrics.m_batch_sizes.emplace_back(1); +} + +void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, + size_t offset, + std::vector>& ranges) { + filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges); + filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges); + filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges); +} + +} // namespace utils +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp new file mode 100644 index 0000000000..234feed6a8 --- /dev/null +++ b/src/cpp/src/whisper/whisper_utils.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "openvino/genai/perf_metrics.hpp" + +namespace ov { +namespace genai { +namespace utils { + +void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics); + +void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, + size_t offset, + std::vector>& ranges); + +} // namespace utils +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp index dc26789846..cc61eb0659 100644 --- a/src/cpp/src/whisper_pipeline_static.cpp +++ b/src/cpp/src/whisper_pipeline_static.cpp @@ -14,6 +14,7 @@ #include "whisper/timestamps.hpp" #include "whisper/whisper.hpp" #include "whisper/whisper_config.hpp" +#include "whisper/whisper_utils.hpp" #include "openvino/core/layout.hpp" #include "openvino/core/preprocess/pre_post_process.hpp" @@ -26,6 +27,8 @@ #include "openvino/op/convert.hpp" #include "openvino/op/parameter.hpp" +using ov::genai::MicroSeconds; + namespace { template @@ -44,7 +47,8 @@ void copy_to_tensor(const std::vector& src_vec, ov::Tensor dst_tensor) { ov::Tensor encode(ov::InferRequest& request, std::vector& mel_data, const size_t feature_size, - const size_t nb_max_frames) { + const size_t nb_max_frames, + ov::genai::RawPerfMetrics& raw_metrics) { OPENVINO_ASSERT(mel_data.size() == feature_size * nb_max_frames, "Mel spectrogram required size: ", feature_size, @@ -54,7 +58,12 @@ ov::Tensor encode(ov::InferRequest& request, mel_data.size(), "."); copy_to_tensor(mel_data, request.get_tensor("input_features")); + + const auto infer_start = std::chrono::steady_clock::now(); request.infer(); + const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); + return request.get_tensor("last_hidden_state"); } @@ -140,13 +149,14 @@ int64_t decode(ov::Tensor& encoder_hidden_state, ov::InferRequest& decoder, const std::vector& init_ids, const ov::genai::WhisperGenerationConfig& config, + ov::genai::RawPerfMetrics& raw_metrics, const bool apply_logit_processors = true, const bool return_timestamps = false) { // NB: Fill decoder inputs encoder_hidden_state.copy_to(decoder.get_tensor("encoder_hidden_states")); set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id); - decoder.infer(); + ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics); auto output_tensor = decoder.get_tensor("logits"); @@ -167,6 +177,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past, const int64_t input_id, const int64_t position_id, const ov::genai::WhisperGenerationConfig& config, + ov::genai::RawPerfMetrics& raw_metrics, const bool return_timestamps, const std::vector& generated_tokens) { // FIXME: Avoid this cast to i32. Why it's not i64 precision in model? @@ -175,7 +186,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past, // FIXME: Is "attention_mask" supposed to be f16? decoder_with_past.get_tensor("attention_mask").data()[position_id - 1] = 0u; - decoder_with_past.infer(); + ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics); auto output_tensor = decoder_with_past.get_tensor("logits"); ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); @@ -217,13 +228,17 @@ void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferReq int64_t detect_language(ov::Tensor& encoder_hidden_state, ov::InferRequest decoder, - const ov::genai::WhisperGenerationConfig& config) { + const ov::genai::WhisperGenerationConfig& config, + ov::genai::RawPerfMetrics& raw_metrics) { decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); std::vector init_ids{static_cast(config.decoder_start_token_id)}; set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id); + const auto infer_start = std::chrono::steady_clock::now(); decoder.infer(); + const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); auto output_tensor = decoder.get_tensor("logits"); @@ -246,7 +261,8 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state, std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, ov::InferRequest& decoder, const ov::genai::WhisperGenerationConfig& config, - const bool return_timestamps) { + const bool return_timestamps, + ov::genai::RawPerfMetrics& raw_metrics) { if (!config.is_multilingual) { if (return_timestamps) { return std::vector{static_cast(config.decoder_start_token_id)}; @@ -263,7 +279,7 @@ std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, language_token_id = static_cast(config.lang_to_id.at(language)); } } else { - language_token_id = detect_language(encoder_hidden_state, decoder, config); + language_token_id = detect_language(encoder_hidden_state, decoder, config, raw_metrics); } int32_t task_token_id = static_cast(config.transcribe_token_id); @@ -289,8 +305,9 @@ std::pair> full_decode(ov::Tensor& encoder_hidden_sta std::vector init_ids, const size_t max_new_tokens, const bool return_timestamps, + ov::genai::RawPerfMetrics& raw_metrics, const std::shared_ptr streamer) { - int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps); + int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, raw_metrics, true, return_timestamps); std::vector output_tokens{output_token}; if (!return_timestamps && streamer && streamer->put(output_token)) { @@ -308,6 +325,7 @@ std::pair> full_decode(ov::Tensor& encoder_hidden_sta output_tokens.back(), i + init_ids.size(), config, + raw_metrics, return_timestamps, output_tokens); update_past_key_value(models.decoder_with_past, models.decoder_with_past, i + init_ids.size()); @@ -576,6 +594,7 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate( const RawSpeechInput& raw_speech_input, OptionalWhisperGenerationConfig generation_config, ChunkStreamerVariant streamer) { + auto start_time = std::chrono::steady_clock::now(); WhisperGenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; config.validate(); @@ -591,14 +610,25 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate( streamer_ptr = std::make_shared(m_tokenizer, *callback); } + size_t max_new_tokens = config.get_max_new_tokens(); + + WhisperPerfMetrics perf_metrics; + perf_metrics.num_input_tokens = 0; + RawPerfMetrics& raw_metrics = perf_metrics.raw_metrics; + raw_metrics.m_new_token_times.reserve(max_new_tokens); + raw_metrics.m_batch_sizes.reserve(max_new_tokens); + raw_metrics.m_token_infer_durations.reserve(max_new_tokens); + raw_metrics.m_inference_durations = {{MicroSeconds(0.0f)}}; + + const auto extract_start = std::chrono::steady_clock::now(); auto input_features = m_feature_extractor.extract(raw_speech_input); + const auto extract_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - extract_start); + perf_metrics.whisper_raw_metrics.features_extraction_durations.emplace_back(extract_ms); const bool is_shortform = input_features.n_frames <= m_feature_extractor.nb_max_frames; // long-form audio processing requires timestamps to be enabled const bool return_timestamps = config.return_timestamps || !is_shortform; - size_t max_new_tokens = config.get_max_new_tokens(); - std::vector init_ids; std::vector output_tokens; std::vector segments; @@ -619,11 +649,12 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate( ov::Tensor hidden_state_tensor = encode(m_models.encoder, input_features_chunk, m_feature_extractor.feature_size, - m_feature_extractor.nb_max_frames); + m_feature_extractor.nb_max_frames, + raw_metrics); // prepare init_ids just once for whole input if (init_ids.empty()) { - init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps); + init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps, raw_metrics); } auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor, @@ -632,6 +663,7 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate( init_ids, max_new_tokens - output_tokens.size(), return_timestamps, + raw_metrics, streamer_ptr); if (return_timestamps) { @@ -640,6 +672,8 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate( m_feature_extractor.nb_max_frames, time_precision); + ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges); + segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end()); output_tokens.insert(output_tokens.end(), @@ -669,7 +703,11 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate( streamer_ptr->end(); } + auto decode_start_time = std::chrono::steady_clock::now(); WhisperDecodedResults result{std::vector{m_tokenizer.decode(output_tokens)}, std::vector{1.f}}; + result.perf_metrics = perf_metrics; + result.perf_metrics.raw_metrics.detokenization_durations.emplace_back( + PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time)); // if return_timestamps wasn't enabled by user if (!config.return_timestamps) { @@ -681,13 +719,23 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate( chunks.reserve(segments.size()); for (auto& segment : segments) { + decode_start_time = std::chrono::steady_clock::now(); chunks.push_back( WhisperDecodedResultChunk{segment.m_start, segment.m_end, m_tokenizer.decode(segment.m_tokens)}); + result.perf_metrics.raw_metrics.detokenization_durations.emplace_back( + PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time)); } result.chunks = chunks; } + auto& metrics = result.perf_metrics; + metrics.load_time = this->m_load_time_ms; + auto stop_time = std::chrono::steady_clock::now(); + metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + metrics.raw_metrics.tokenization_durations.emplace_back(MicroSeconds(0.0f)); + metrics.evaluate_statistics(start_time); + return result; }