From 67d6cd3e591629b45175456ce2296fa318f673c2 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Mon, 13 Jan 2025 13:38:39 +0100 Subject: [PATCH] Whisper pipeline: support stateful decoder (#1474) Ticket: 159473 Optimum-intel PR: https://github.com/huggingface/optimum-intel/pull/1078 This PR switches optimum-intel in tests to stateful seq2seq branch. Tests check both stateful and with past decoders. Once optimum-intel PR is merged I'll switch version back to master. --- .github/workflows/windows.yml | 3 +- src/README.md | 2 +- src/cpp/src/logger.hpp | 17 ++ src/cpp/src/whisper/models/decoder.cpp | 26 +++ src/cpp/src/whisper/models/decoder.hpp | 29 +++ .../src/whisper/models/statefull_decoder.cpp | 60 ++++++ .../src/whisper/models/statefull_decoder.hpp | 29 +++ .../src/whisper/models/with_past_decoder.cpp | 107 ++++++++++ .../src/whisper/models/with_past_decoder.hpp | 32 +++ src/cpp/src/whisper/whisper.cpp | 166 ++++------------ src/cpp/src/whisper/whisper.hpp | 8 +- src/cpp/src/whisper/whisper_models.hpp | 2 +- src/cpp/src/whisper_pipeline.cpp | 23 ++- tests/python_tests/requirements.txt | 2 +- tests/python_tests/test_whisper_pipeline.py | 185 +++++++++++------- 15 files changed, 477 insertions(+), 214 deletions(-) create mode 100644 src/cpp/src/logger.hpp create mode 100644 src/cpp/src/whisper/models/decoder.cpp create mode 100644 src/cpp/src/whisper/models/decoder.hpp create mode 100644 src/cpp/src/whisper/models/statefull_decoder.cpp create mode 100644 src/cpp/src/whisper/models/statefull_decoder.hpp create mode 100644 src/cpp/src/whisper/models/with_past_decoder.cpp create mode 100644 src/cpp/src/whisper/models/with_past_decoder.hpp diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index ea07316942..f3ff07b641 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -311,10 +311,9 @@ jobs: python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels - # will install transformers 4.46.3 version # transformers 4.46.3 will enable return_timestamps tests # this check enabled for windows only. Ticket: 160205. - python -m pip install git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631 + python -m pip install transformers==4.46.3 python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke" diff --git a/src/README.md b/src/README.md index 5d18d0b67b..af4953f98a 100644 --- a/src/README.md +++ b/src/README.md @@ -179,7 +179,7 @@ int main(int argc, char* argv[]) { Streaming with a custom class: -C++ template for a stremer. +C++ template for a streamer. ```cpp #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/llm_pipeline.hpp" diff --git a/src/cpp/src/logger.hpp b/src/cpp/src/logger.hpp new file mode 100644 index 0000000000..503a419e5e --- /dev/null +++ b/src/cpp/src/logger.hpp @@ -0,0 +1,17 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include + +namespace ov::genai { + +class Logger { +public: + static void warn(std::string message) { + std::cout << "[WARN] " << message << '\n'; + }; +}; + +} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp new file mode 100644 index 0000000000..32a8f2eff6 --- /dev/null +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "decoder.hpp" + +#include + +#include "statefull_decoder.hpp" +#include "utils.hpp" +#include "with_past_decoder.hpp" + +namespace ov::genai { +std::shared_ptr WhisperDecoder::from_path(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties) { + bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml"); + + if (has_decoder_with_past) { + return std::make_shared(models_path, device, properties); + } + + return std::make_shared(models_path, device, properties); +} + +WhisperDecoder::~WhisperDecoder() = default; +} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp new file mode 100644 index 0000000000..cd58e54729 --- /dev/null +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "openvino/genai/whisper_generation_config.hpp" +#include "openvino/runtime/core.hpp" + +namespace ov::genai { +class WhisperDecoder { +public: + static std::shared_ptr from_path(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties); + + virtual std::pair detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) = 0; + + virtual std::pair decode(const ov::Tensor& encoder_hidden_state, + const std::vector& input_ids, + const size_t cache_position) = 0; + + virtual void reset_state() = 0; + + virtual ~WhisperDecoder(); +}; +} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp new file mode 100644 index 0000000000..bc2c91c91f --- /dev/null +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -0,0 +1,60 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "statefull_decoder.hpp" + +#include "utils.hpp" + +namespace ov::genai { +WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties) { + ov::Core core = utils::singleton_core(); + + auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); + + utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); + m_request = compiled_model.create_infer_request(); +} + +std::pair WhisperStatefullDecoder::detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) { + auto [output_tensor, infer_ms] = decode(encoder_hidden_state, {decoder_start_token_id}, 0); + + int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); + + reset_state(); + + return {output_token, infer_ms}; +} + +std::pair WhisperStatefullDecoder::decode(const ov::Tensor& encoder_hidden_state, + const std::vector& input_ids, + const size_t cache_position) { + m_request.set_tensor("encoder_hidden_states", encoder_hidden_state); + + ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data()); + m_request.set_tensor("input_ids", input_ids_tensor); + + ov::Tensor cache_position_tensor = m_request.get_tensor("cache_position"); + cache_position_tensor.set_shape({input_ids.size()}); + + auto cache_data = cache_position_tensor.data(); + std::iota(cache_data, cache_data + cache_position_tensor.get_size(), cache_position); + + m_request.get_tensor("beam_idx").set_shape({1}); + m_request.get_tensor("beam_idx").data()[0] = 0; + + const auto infer_start = std::chrono::steady_clock::now(); + m_request.infer(); + const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); + + auto output_tensor = m_request.get_tensor("logits"); + + return {output_tensor, infer_ms}; +}; + +void WhisperStatefullDecoder::reset_state() { + m_request.reset_state(); +} +} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp new file mode 100644 index 0000000000..6f1c9eb002 --- /dev/null +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "decoder.hpp" +#include "openvino/runtime/core.hpp" + +namespace ov::genai { + +class WhisperStatefullDecoder : public WhisperDecoder { +public: + WhisperStatefullDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties); + + std::pair detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) override; + + std::pair decode(const ov::Tensor& encoder_hidden_state, + const std::vector& input_ids, + const size_t cache_position) override; + + void reset_state() override; + +private: + ov::InferRequest m_request; +}; +} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp new file mode 100644 index 0000000000..7f62ea5657 --- /dev/null +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -0,0 +1,107 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "with_past_decoder.hpp" + +#include + +#include "logger.hpp" +#include "utils.hpp" + +namespace { +void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { + // source outputs: + // present.0.decoder.key + // present.0.decoder.value + // present.0.encoder.key + // present.0.encoder.value + + // dest inputs: + // past_key_values.0.decoder.key + // past_key_values.0.decoder.value + // past_key_values.0.encoder.key + // past_key_values.0.encoder.value + + for (auto& source_output : source.get_compiled_model().outputs()) { + std::string source_output_name = source_output.get_any_name(); + if (source_output_name.find("logits") != std::string::npos) { + continue; + } + + std::string with_past_input_name = + std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); + + auto kv_tensor = source.get_tensor(source_output_name); + dest.set_tensor(with_past_input_name, ov::Tensor{kv_tensor}); + } +} +} // namespace + +namespace ov::genai { +WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties) { + Logger::warn("Whisper decoder models with past is deprecated. Support will be removed in 2026.0.0 release.\n" + "To obtain stateful decoder model use latest `optimum-intel` package:\n" + "pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git\n" + "optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny"); + ov::Core core = utils::singleton_core(); + + auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); + utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); + m_request_decoder = compiled_model.create_infer_request(); + + compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); + utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); + m_request_decoder_with_past = compiled_model.create_infer_request(); +} + +std::pair WhisperWithPastDecoder::detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) { + auto [output_tensor, infer_ms] = decode(encoder_hidden_state, {decoder_start_token_id}, 0); + + int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); + + reset_state(); + + return {output_token, infer_ms}; +} + +std::pair WhisperWithPastDecoder::decode(const ov::Tensor& encoder_hidden_state, + const std::vector& input_ids, + const size_t cache_position) { + const bool initial_step = cache_position == 0; + ov::InferRequest& request = initial_step ? m_request_decoder : m_request_decoder_with_past; + + request.set_tensor("encoder_hidden_states", encoder_hidden_state); + + const ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data()); + request.set_tensor("input_ids", input_ids_tensor); + + if (!initial_step) { + ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); + cache_position_tensor.set_shape({1}); + cache_position_tensor.data()[0] = cache_position; + } + + const auto infer_start = std::chrono::steady_clock::now(); + request.infer(); + const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); + + auto output_tensor = request.get_tensor("logits"); + + if (initial_step) { + set_past_key_value(m_request_decoder, m_request_decoder_with_past); + } else if (!m_decoder_with_past_kv_value_set) { + set_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past); + m_decoder_with_past_kv_value_set = true; + } + + return {output_tensor, infer_ms}; +} + +void WhisperWithPastDecoder::reset_state() { + m_request_decoder_with_past.reset_state(); + m_decoder_with_past_kv_value_set = false; +} +} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp new file mode 100644 index 0000000000..c7af1cdaa2 --- /dev/null +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "decoder.hpp" +#include "openvino/runtime/core.hpp" + +namespace ov::genai { + +class WhisperWithPastDecoder : public WhisperDecoder { +public: + WhisperWithPastDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties); + + std::pair detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) override; + + std::pair decode(const ov::Tensor& encoder_hidden_state, + const std::vector& input_ids, + const size_t cache_position) override; + + void reset_state() override; + +private: + ov::InferRequest m_request_decoder; + ov::InferRequest m_request_decoder_with_past; + bool m_decoder_with_past_kv_value_set = false; +}; + +} // namespace ov::genai diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 04993f288c..3ab873609d 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -10,6 +10,7 @@ #include "context_tokens.hpp" #include "logit_processor.hpp" +#include "models/decoder.hpp" #include "openvino/genai/perf_metrics.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" @@ -53,89 +54,34 @@ ov::Tensor encode(ov::InferRequest& request, return request.get_tensor("last_hidden_state"); } -void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { - // source outputs: - // present.0.decoder.key - // present.0.decoder.value - // present.0.encoder.key - // present.0.encoder.value - - // dest inputs: - // past_key_values.0.decoder.key - // past_key_values.0.decoder.value - // past_key_values.0.encoder.key - // past_key_values.0.encoder.value - - for (auto& source_output : source.get_compiled_model().outputs()) { - std::string source_output_name = source_output.get_any_name(); - if (source_output_name.find("logits") != std::string::npos) { - continue; - } - - std::string with_past_input_name = - std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); - - auto kv_tensor = source.get_tensor(source_output_name); - dest.set_tensor(with_past_input_name, ov::Tensor{kv_tensor}); - } -} - int64_t decode(ov::Tensor& encoder_hidden_state, - ov::InferRequest& decoder, - std::vector& input_ids, + std::shared_ptr decoder, + const std::vector& input_ids, + const size_t cache_position, const ov::genai::WhisperGenerationConfig& config, ov::genai::RawPerfMetrics& raw_metrics, - const bool apply_logit_processors = true, - const bool return_timestamps = false) { - decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); - - ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data()); - decoder.set_tensor("input_ids", input_ids_tensor); - - ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics); - - auto output_tensor = decoder.get_tensor("logits"); + const bool return_timestamps, + const bool initial_step, + const std::vector& generated_tokens) { + auto [output_tensor, infer_ms] = decoder->decode(encoder_hidden_state, input_ids, cache_position); + const auto infer_end = std::chrono::steady_clock::now(); + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); + raw_metrics.m_token_infer_durations.emplace_back(infer_ms); + raw_metrics.m_new_token_times.emplace_back(infer_end); + raw_metrics.m_batch_sizes.emplace_back(1); - if (apply_logit_processors) { + if (initial_step) { ov::genai::do_suppress_tokens(output_tensor, 0, config.begin_suppress_tokens); - ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); - - if (return_timestamps) { - ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true); - } } - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - return output_token; -} - -int64_t decode_with_past(ov::Tensor& encoder_hidden_state, - ov::InferRequest& decoder_with_past, - int64_t input_id, - const size_t cache_position, - const ov::genai::WhisperGenerationConfig& config, - ov::genai::RawPerfMetrics& raw_metrics, - const bool return_timestamps, - const std::vector& generated_tokens) { - decoder_with_past.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); - - std::vector input_ids = {input_id}; - ov::Tensor input_ids_tensor(ov::element::i64, {1, 1}, input_ids.data()); - decoder_with_past.set_tensor("input_ids", input_ids_tensor); - - ov::Tensor cache_position_tensor = decoder_with_past.get_tensor("cache_position"); - cache_position_tensor.set_shape({1}); - cache_position_tensor.data()[0] = cache_position; - - ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics); - - auto output_tensor = decoder_with_past.get_tensor("logits"); - ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); if (return_timestamps) { - ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens); + if (initial_step) { + ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true); + } else { + ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens); + } } int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); @@ -143,31 +89,8 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state, return output_token; } -int64_t detect_language(ov::Tensor& encoder_hidden_state, - ov::InferRequest& decoder, - const ov::genai::WhisperGenerationConfig& config, - ov::genai::RawPerfMetrics& raw_metrics) { - std::vector input_ids{config.decoder_start_token_id}; - - decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); - - ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data()); - decoder.set_tensor("input_ids", input_ids_tensor); - - const auto infer_start = std::chrono::steady_clock::now(); - decoder.infer(); - const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); - raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); - - auto output_tensor = decoder.get_tensor("logits"); - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - return output_token; -} - std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, - ov::InferRequest decoder, + std::shared_ptr decoder, const ov::genai::WhisperGenerationConfig& config, const bool return_timestamps, ov::genai::RawPerfMetrics& raw_metrics) { @@ -186,7 +109,9 @@ std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, language_token_id = config.lang_to_id.at(language); } } else { - language_token_id = detect_language(encoder_hidden_state, decoder, config, raw_metrics); + auto [language_token, infer_ms] = decoder->detect_language(encoder_hidden_state, config.decoder_start_token_id); + language_token_id = language_token; + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); } int64_t task_token_id = config.transcribe_token_id; @@ -206,14 +131,14 @@ std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, std::pair> full_decode(ov::Tensor& encoder_hidden_state, const ov::genai::WhisperGenerationConfig& config, - ov::genai::WhisperInitializedModels& models, - std::vector init_ids, + std::shared_ptr decoder, + const std::vector& init_tokens, const size_t max_new_tokens, const bool return_timestamps, ov::genai::RawPerfMetrics& raw_metrics, const std::shared_ptr streamer) { int64_t output_token = - decode(encoder_hidden_state, models.decoder, init_ids, config, raw_metrics, true, return_timestamps); + decode(encoder_hidden_state, decoder, init_tokens, 0, config, raw_metrics, return_timestamps, true, {}); std::vector output_tokens{output_token}; @@ -225,21 +150,16 @@ std::pair> full_decode(ov::Tensor& encoder_hidden_sta return {false, output_tokens}; } - set_past_key_value(models.decoder, models.decoder_with_past); - for (size_t i = 0; i < max_new_tokens - 1; i++) { - auto output_token = decode_with_past(encoder_hidden_state, - models.decoder_with_past, - output_tokens.back(), - init_ids.size() + i, - config, - raw_metrics, - return_timestamps, - output_tokens); - - if (i == 0) { - set_past_key_value(models.decoder_with_past, models.decoder_with_past); - } + auto output_token = decode(encoder_hidden_state, + decoder, + {output_tokens.back()}, + init_tokens.size() + i, + config, + raw_metrics, + return_timestamps, + false, + output_tokens); if (output_token == config.eos_token_id) { break; @@ -264,7 +184,8 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& const ov::genai::WhisperConfig& model_config, const WhisperContextTokens& context_tokens, const RawSpeechInput& raw_speech, - ov::genai::WhisperInitializedModels& models, + ov::InferRequest& encoder, + std::shared_ptr decoder, WhisperFeatureExtractor& feature_extractor, const std::shared_ptr streamer) { size_t max_new_tokens = config.get_max_new_tokens(); @@ -301,16 +222,15 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& auto input_features_chunk = input_features.get_data_with_offset(chunk_offset, feature_extractor.nb_max_frames); - ov::Tensor hidden_state_tensor = encode(models.encoder, + ov::Tensor hidden_state_tensor = encode(encoder, input_features_chunk, feature_extractor.feature_size, feature_extractor.nb_max_frames, raw_metrics); - // prepare init_ids just once for whole input + // prepare init_tokens just once for whole input if (init_tokens.empty()) { - init_tokens = - prepare_init_tokens(hidden_state_tensor, models.decoder, config, return_timestamps, raw_metrics); + init_tokens = prepare_init_tokens(hidden_state_tensor, decoder, config, return_timestamps, raw_metrics); } std::vector chunk_init_tokens = ov::genai::get_prompt_tokens(context_tokens, config, chunk_offset); @@ -318,14 +238,14 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor, config, - models, + decoder, chunk_init_tokens, max_new_tokens - output_tokens.size(), return_timestamps, raw_metrics, streamer); - models.decoder_with_past.reset_state(); + decoder->reset_state(); if (return_timestamps) { auto extracted_segments = ov::genai::extract_segments(chunk_output_tokens, @@ -333,7 +253,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& feature_extractor.nb_max_frames, time_precision); - ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges); + utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges); segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end()); diff --git a/src/cpp/src/whisper/whisper.hpp b/src/cpp/src/whisper/whisper.hpp index 81f559db9f..fbdf56d171 100644 --- a/src/cpp/src/whisper/whisper.hpp +++ b/src/cpp/src/whisper/whisper.hpp @@ -6,6 +6,7 @@ #include #include "context_tokens.hpp" +#include "models/decoder.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" #include "whisper_config.hpp" @@ -30,9 +31,10 @@ struct WhisperGenerateResult { WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config, const ov::genai::WhisperConfig& model_config, const WhisperContextTokens& context_tokens, - const ov::genai::RawSpeechInput& raw_speech, - ov::genai::WhisperInitializedModels& models, - ov::genai::WhisperFeatureExtractor& feature_extractor, + const RawSpeechInput& raw_speech, + ov::InferRequest& encoder, + std::shared_ptr decoder, + WhisperFeatureExtractor& feature_extractor, const std::shared_ptr streamer); } // namespace genai diff --git a/src/cpp/src/whisper/whisper_models.hpp b/src/cpp/src/whisper/whisper_models.hpp index 576bdb9dc7..9a915e92f4 100644 --- a/src/cpp/src/whisper/whisper_models.hpp +++ b/src/cpp/src/whisper/whisper_models.hpp @@ -3,7 +3,7 @@ #pragma once -#include +#include namespace ov { namespace genai { diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index 70dbc48507..ffd792c889 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ b/src/cpp/src/whisper_pipeline.cpp @@ -10,6 +10,7 @@ #include "utils.hpp" #include "whisper/context_tokens.hpp" +#include "whisper/models/decoder.hpp" #include "whisper/streamer.hpp" #include "whisper/whisper.hpp" #include "whisper/whisper_config.hpp" @@ -47,25 +48,18 @@ namespace genai { class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::WhisperPipelineImplBase { public: - ov::genai::WhisperInitializedModels m_models; - WhisperPipelineStatefulImpl(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties) : WhisperPipelineImplBase{models_path} { ov::Core core = utils::singleton_core(); - ov::CompiledModel compiled_model = core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); + ov::CompiledModel compiled_model = + core.compile_model(models_path / "openvino_encoder_model.xml", device, properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper encoder model"); - m_models.encoder = compiled_model.create_infer_request(); - - compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); - ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); - m_models.decoder = compiled_model.create_infer_request(); + m_encoder = compiled_model.create_infer_request(); - compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); - m_models.decoder_with_past = compiled_model.create_infer_request(); - ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); + m_decoder = WhisperDecoder::from_path(models_path, device, properties); // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) { @@ -99,7 +93,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi m_model_config, context_tokens, raw_speech_input, - m_models, + m_encoder, + m_decoder, m_feature_extractor, streamer_ptr); auto decode_start_time = std::chrono::steady_clock::now(); @@ -136,6 +131,10 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi return result; } + +private: + ov::InferRequest m_encoder; + std::shared_ptr m_decoder; }; std::pair streamer(ChunkStreamerVariant func) { diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index c851c71ee5..78cacd61ae 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu diffusers==0.32.1 -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +optimum-intel @ git+https://github.com/eaidova/optimum-intel@ea/stateful_seq2seq numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64" onnx==1.17.0 pytest diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index c046d1ae2c..06d5e56b3c 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -52,50 +52,25 @@ def get_whisper_models_list(tiny_only=False): # used whisper models are relatively small # cache them in memory to speedup tests @functools.lru_cache() -def read_whisper_model(params, **tokenizer_kwargs): +def read_whisper_model(params, stateful=True): model_id, path = params + if not stateful: + path = pathlib.Path(f"{path}_with_past") - processor = WhisperProcessor.from_pretrained(model_id, trust_remote_code=True) - - if (path / "openvino_encoder_model.xml").exists(): - opt_model = OVModelForSpeechSeq2Seq.from_pretrained( - path, - trust_remote_code=True, - compile=False, - device="CPU", - load_in_8bit=False, - ) - else: - - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer( - tokenizer, - with_detokenizer=True, - clean_up_tokenization_spaces=False, - **tokenizer_kwargs, - ) + if not (path / "openvino_encoder_model.xml").exists(): + save_model(model_id=model_id, tmp_path=path, stateful=stateful) - openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") - openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") - - # to store tokenizer config jsons with special tokens - tokenizer.save_pretrained(path) + opt_model = OVModelForSpeechSeq2Seq.from_pretrained( + path, + trust_remote_code=True, + compile=False, + device="CPU", + load_in_8bit=False, + ) - opt_model = OVModelForSpeechSeq2Seq.from_pretrained( - model_id, - export=True, - trust_remote_code=True, - stateful=False, - compile=False, - device="CPU", - load_in_8bit=False, - ) - opt_model.generation_config.save_pretrained(path) - opt_model.config.save_pretrained(path) - opt_model.save_pretrained(path) - processor.save_pretrained(path) + processor = WhisperProcessor.from_pretrained(model_id, trust_remote_code=True) - opt_pipe = pipeline( + hf_pipe = pipeline( "automatic-speech-recognition", model=opt_model, tokenizer=processor.tokenizer, @@ -105,11 +80,42 @@ def read_whisper_model(params, **tokenizer_kwargs): return ( model_id, path, - opt_pipe, + hf_pipe, ov_genai.WhisperPipeline(path, "CPU", **{"ENABLE_MMAP": False}), ) +def save_model(model_id: str, tmp_path: pathlib.Path, stateful=True): + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer( + tokenizer, + with_detokenizer=True, + clean_up_tokenization_spaces=False, + ) + + openvino.save_model(ov_tokenizer, tmp_path / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, tmp_path / "openvino_detokenizer.xml") + + # to store tokenizer config jsons with special tokens + tokenizer.save_pretrained(tmp_path) + + opt_model = OVModelForSpeechSeq2Seq.from_pretrained( + model_id, + export=True, + trust_remote_code=True, + stateful=stateful, + compile=False, + device="CPU", + load_in_8bit=False, + ) + opt_model.generation_config.save_pretrained(tmp_path) + opt_model.config.save_pretrained(tmp_path) + opt_model.save_pretrained(tmp_path) + + processor = WhisperProcessor.from_pretrained(model_id, trust_remote_code=True) + processor.save_pretrained(tmp_path) + + def run_huggingface( pipeline, sample, @@ -179,6 +185,9 @@ def run_pipeline_with_ref( streamer: typing.Callable[[str], bool] | None = None, ): _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path)) + _, _, hf_with_past_pipe, genai_with_past_pipe = read_whisper_model( + (model_id, tmp_path), stateful=False + ) if type(sample) is np.ndarray and len(sample.shape) == 1: sample = np.expand_dims(sample, 0) @@ -189,6 +198,12 @@ def run_pipeline_with_ref( compare_results(hf_result, genai_result) + genai_with_past_result = run_genai( + genai_with_past_pipe, _sample, generation_config, streamer + ) + + compare_results(hf_result, genai_with_past_result) + def compare_results(hf_result, genai_result): assert genai_result.texts[0] == hf_result["text"] @@ -274,9 +289,9 @@ def test_whisper_config_constructor(model_descr): @pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_whisper_constructors(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - expected = opt_pipe(test_sample)["text"] + expected = hf_pipe(test_sample)["text"] genai_result = ov_genai.WhisperPipeline( models_path=path, device="CPU", **{"ENABLE_MMAP": False} @@ -294,17 +309,17 @@ def test_whisper_constructors(model_descr, test_sample): @pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_max_new_tokens(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - expected = opt_pipe(test_sample, max_new_tokens=10) + expected = hf_pipe(test_sample, max_new_tokens=10) - genai_result = pipe.generate(test_sample, max_new_tokens=10) + genai_result = genai_pipe.generate(test_sample, max_new_tokens=10) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 10 - genai_result = pipe.generate(test_sample, config) + genai_result = genai_pipe.generate(test_sample, config) compare_results(expected, genai_result) @@ -318,23 +333,23 @@ def test_max_new_tokens(model_descr, test_sample): ) @pytest.mark.precommit def test_language_mode(model_descr, test_samples): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) samples, language = test_samples - expected = opt_pipe( + expected = hf_pipe( samples[0], max_new_tokens=30, generate_kwargs={"language": language} ) - genai_result = pipe.generate( + genai_result = genai_pipe.generate( samples[0], max_new_tokens=30, language=f"<|{language}|>" ) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 30 config.language = f"<|{language}|>" - genai_result = pipe.generate(samples[0], config) + genai_result = genai_pipe.generate(samples[0], config) compare_results(expected, genai_result) @@ -345,46 +360,46 @@ def test_language_mode(model_descr, test_samples): ) @pytest.mark.precommit def test_task_mode(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - expected = opt_pipe( + expected = hf_pipe( test_sample, max_new_tokens=30, generate_kwargs={"language": "fr", "task": "translate"}, ) - genai_result = pipe.generate( + genai_result = genai_pipe.generate( test_sample, max_new_tokens=30, language="<|fr|>", task="translate" ) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 30 config.language = "<|fr|>" config.task = "translate" - genai_result = pipe.generate(test_sample, config) + genai_result = genai_pipe.generate(test_sample, config) compare_results(expected, genai_result) # seems to be equivalent to translate task - expected = opt_pipe( + expected = hf_pipe( test_sample, max_new_tokens=30, generate_kwargs={"language": "en", "task": "transcribe"}, ) - genai_result = pipe.generate( + genai_result = genai_pipe.generate( test_sample, max_new_tokens=30, language="<|en|>", task="transcribe" ) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 30 config.language = "<|en|>" config.task = "transcribe" - genai_result = pipe.generate(test_sample, config) + genai_result = genai_pipe.generate(test_sample, config) compare_results(expected, genai_result) @@ -400,12 +415,12 @@ def test_task_mode(model_descr, test_sample): ) @pytest.mark.precommit def test_language_autodetect(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - input_features = opt_pipe.feature_extractor(test_sample) - language_id = opt_pipe.model.detect_language(input_features["input_features"])[0] + input_features = hf_pipe.feature_extractor(test_sample) + language_id = hf_pipe.model.detect_language(input_features["input_features"])[0] # ensure detected language us not english - assert language_id != pipe.get_generation_config().lang_to_id["<|en|>"] + assert language_id != genai_pipe.get_generation_config().lang_to_id["<|en|>"] run_pipeline_with_ref( model_id=model_descr[0], @@ -469,6 +484,34 @@ def test_longform_audio(model_descr, test_sample): assert "".join(streamer_result) == hf_result["text"] +@pytest.mark.parametrize("model_descr", get_whisper_models_list()) +@pytest.mark.parametrize( + "test_sample", get_samples_from_dataset(length=10, long_form=True) +) +@pytest.mark.precommit +def test_longform_audio_with_past(model_descr, test_sample): + _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr, stateful=True) + + streamer_result = [] + + genai_result = run_genai( + genai_pipe, + test_sample, + config=ov_genai.WhisperGenerationConfig(return_timestamps=True), + streamer=lambda x: streamer_result.append(x), + ) + + hf_result = run_huggingface( + hf_pipe, + test_sample, + config=ov_genai.WhisperGenerationConfig(return_timestamps=True), + ) + + compare_results(hf_result, genai_result) + + assert "".join(streamer_result) == hf_result["text"] + + @pytest.mark.parametrize("model_descr", get_whisper_models_list()) @pytest.mark.precommit def test_shortform(model_descr): @@ -494,19 +537,19 @@ def test_shortform(model_descr): ) @pytest.mark.precommit def test_initial_prompt_hotwords(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - result = pipe.generate(test_sample) + result = genai_pipe.generate(test_sample) assert "Joel Keaton" in result.texts[0] assert "Joel Kyton" not in result.texts[0] - result = pipe.generate(test_sample, initial_prompt="Joel Kyton") + result = genai_pipe.generate(test_sample, initial_prompt="Joel Kyton") assert "Joel Keaton" not in result.texts[0] assert "Joel Kyton" in result.texts[0] - result = pipe.generate(test_sample, hotwords="Joel Kyton") + result = genai_pipe.generate(test_sample, hotwords="Joel Kyton") assert "Joel Keaton" not in result.texts[0] assert "Joel Kyton" in result.texts[0] @@ -521,9 +564,9 @@ def test_initial_prompt_hotwords(model_descr, test_sample): ) @pytest.mark.precommit def test_perf_metrics(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - result = pipe.generate(test_sample) + result = genai_pipe.generate(test_sample) perf_metrics = result.perf_metrics