From 2bcc94064f57943d8c3755d29bed903ddf31bf6c Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Wed, 13 Dec 2023 16:15:00 +0400 Subject: [PATCH] [GPU] Fixed stateful KV cache issues (#21618) --- .../intel_gpu/primitives/broadcast.hpp | 8 +- src/plugins/intel_gpu/src/graph/broadcast.cpp | 2 +- .../mark_shape_of_subgraphs.cpp | 5 + .../src/graph/include/read_value_inst.h | 7 +- .../intel_gpu/src/graph/primitive_inst.cpp | 44 +++++---- .../intel_gpu/src/plugin/ops/broadcast.cpp | 2 + .../intel_gpu/src/plugin/variable_state.cpp | 2 + .../tests/common/subgraphs_builders.hpp | 52 +++++++++- .../subgraph_tests/dynamic/kv_cache.cpp | 98 +++++++++++++------ 9 files changed, 163 insertions(+), 57 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/broadcast.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/broadcast.hpp index 70fcb1ee44cde7..c86c0ed2a7da97 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/broadcast.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/broadcast.hpp @@ -4,6 +4,7 @@ #pragma once +#include "openvino/core/partial_shape.hpp" #include "openvino/op/broadcast.hpp" #include "primitive.hpp" @@ -131,6 +132,8 @@ struct broadcast : public primitive_base { /// along which broadcast should happen. 
std::vector broadcast_axes; + ov::PartialShape output_pshape = ov::PartialShape::dynamic(); + size_t hash() const override { size_t seed = primitive::hash(); seed = hash_range(seed, broadcast_axes.begin(), broadcast_axes.end()); @@ -146,7 +149,8 @@ struct broadcast : public primitive_base { return axes_mapping == rhs_casted.axes_mapping && broadcast_mode == rhs_casted.broadcast_mode && - broadcast_sizes == rhs_casted.broadcast_sizes; + broadcast_sizes == rhs_casted.broadcast_sizes && + output_pshape == rhs_casted.output_pshape; } void save(BinaryOutputBuffer& ob) const override { @@ -156,6 +160,7 @@ struct broadcast : public primitive_base { ob << make_data(&broadcast_mode, sizeof(ov::op::BroadcastModeSpec)); ob << broadcast_sizes; ob << broadcast_axes; + ob << output_pshape; } void load(BinaryInputBuffer& ib) override { @@ -165,6 +170,7 @@ struct broadcast : public primitive_base { ib >> make_data(&broadcast_mode, sizeof(ov::op::BroadcastModeSpec)); ib >> broadcast_sizes; ib >> broadcast_axes; + ib >> output_pshape; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/broadcast.cpp b/src/plugins/intel_gpu/src/graph/broadcast.cpp index e4e5b369dabd0c..7af4434b3c7e06 100644 --- a/src/plugins/intel_gpu/src/graph/broadcast.cpp +++ b/src/plugins/intel_gpu/src/graph/broadcast.cpp @@ -90,7 +90,7 @@ std::vector broadcast_inst::calc_output_layouts(broadcast_node const& /* if (input1.is_static()) { output_rank = input1.get_dim(0); // target shape rank is set as second input. } - output_shapes[0] = ShapeType::dynamic(std::max(static_cast(output_rank), 1)); + output_shapes[0] = desc->output_pshape.rank().is_static() ? 
desc->output_pshape : ShapeType::dynamic(std::max(static_cast(output_rank), 1)); } format output_format = format::adjust_to_rank(input0_layout.format, output_shapes[0].size()); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp index 4d7e61a7e4eff7..d6d365e0d1f94b 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp @@ -3,6 +3,7 @@ // #include "shape_of_inst.h" +#include "read_value_inst.h" #include "reshape_inst.h" #include "eltwise_inst.h" #include "pass_manager.h" @@ -43,6 +44,10 @@ bool mark_shape_of_subgraphs::can_mark_node(const program_node& node) { if (node.has_fused_primitives()) return false; + // read_value may have initializer which is shape_of sub-graph, but read_value itself is not a part of such sub-graph + if (node.is_type()) + return false; + if (node.is_type()) return true; diff --git a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h index a84be19aae2651..7209e8756fbf76 100644 --- a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h @@ -34,11 +34,8 @@ class typed_primitive_inst : public typed_primitive_inst_base calc_output_layouts(read_value_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); const auto default_layout = desc->output_layout; - auto out_layout = impl_param.state_layout.value_or(default_layout); - if (out_layout.is_dynamic() && desc->input_size() > 0) { - out_layout = impl_param.get_input_layout(0); - } - return { out_layout }; + + return { impl_param.state_layout.value_or(default_layout) }; } static layout calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param); diff --git 
a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index cf4b84fee6f2a5..e967a5b7f152e9 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -265,8 +265,30 @@ void primitive_inst::update_shape() { } if (get_node().is_type()) { - const auto& variable_id = get_node().as().get_primitive()->variable_id; - auto new_layout = get_network().get_variable(variable_id).get_layout(); + auto prim = get_node().as().get_primitive(); + const auto& variable_id = prim->variable_id; + auto& variable = get_network().get_variable(variable_id); + // Initial variable shape is taken from variable itself + auto new_layout = variable.get_layout(); + + // If variable is not set and we have an initializer - use its shape as shape of variable + if (!variable.is_set() && _impl_params->input_layouts.size() == 1) { + new_layout = _impl_params->get_input_layout(0); + } + + // If we still have a dynamic dimension, which basically means that we don't have an initializer, then replace dynamic dims with 0 + if (new_layout.is_dynamic()) { + auto pshape = new_layout.get_partial_shape(); + for (auto& d : pshape) { + if (d.is_dynamic()) { + d = 0; + } + } + new_layout.set_partial_shape(pshape); + } + + variable.set_layout(new_layout); + if (!_impl_params->state_layout.has_value() || _impl_params->state_layout.value() != new_layout) { _impl_params->state_layout = new_layout; input_shape_changed = true; @@ -299,7 +321,7 @@ void primitive_inst::update_shape() { } } if (!subgraph_input_changed) { - GPU_DEBUG_TRACE_DETAIL << id() << ": skip shape_update, because it is in shape_of_subgrap and input shape is not changed\n"; + GPU_DEBUG_TRACE_DETAIL << id() << ": skip shape_update, because it is in shape_of_subgraph and input shape is not changed\n"; reset_shape_change(); return; } @@ -402,20 +424,6 @@ void primitive_inst::update_shape() { 
get_network().get_variable(desc->variable_id).set_layout(_impl_params->get_output_layout()); _impl_params->state_layout = _impl_params->get_output_layout(); } - - if (get_node().is_type()) { - auto desc = get_node().as().get_primitive(); - if (_impl_params->output_layouts[0].is_dynamic()) { - auto pshape = _impl_params->output_layouts[0].get_partial_shape(); - for (auto& d : pshape) { - if (d.is_dynamic()) { - d = 0; - } - } - _impl_params->output_layouts[0].set_partial_shape(pshape); - } - get_network().get_variable(desc->variable_id).set_layout(_impl_params->get_output_layout()); - } } event::ptr primitive_inst::realloc_if_needed() { @@ -448,7 +456,7 @@ event::ptr primitive_inst::realloc_if_needed() { // read_value/assign nodes are supposed to always use variable memory if (auto stateful_prim = dynamic_cast(this)) { std::string variable_id = stateful_prim->variable_id(); - auto variable = get_network().get_variable(variable_id); + auto& variable = get_network().get_variable(variable_id); variable.set_layout(actual_layout); GPU_DEBUG_TRACE_DETAIL << id() << ": use variable memory " << variable.get_memory() << " (size=" << variable.get_memory()->size() << ")" << std::endl; diff --git a/src/plugins/intel_gpu/src/plugin/ops/broadcast.cpp b/src/plugins/intel_gpu/src/plugin/ops/broadcast.cpp index 3e208bd2c99063..aa764774275e73 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/broadcast.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/broadcast.cpp @@ -90,6 +90,8 @@ static void CreateCommonBroadcastOp(ProgramBuilder& p, const std::shared_ptroutput_pshape = op->get_output_partial_shape(0); + p.add_primitive(*op, broadcast_prim); } diff --git a/src/plugins/intel_gpu/src/plugin/variable_state.cpp b/src/plugins/intel_gpu/src/plugin/variable_state.cpp index 80a393e506747c..cdd551b5ca82ed 100644 --- a/src/plugins/intel_gpu/src/plugin/variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/variable_state.cpp @@ -9,6 +9,7 @@ #include "intel_gpu/plugin/variable_state.hpp" 
#include "intel_gpu/runtime/memory_caps.hpp" #include "intel_gpu/runtime/layout.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" #include @@ -45,6 +46,7 @@ void VariableState::set() { void VariableState::set_layout(const cldnn::layout& new_layout) { m_layout = new_layout; + GPU_DEBUG_TRACE_DETAIL << "Update state layout to " << new_layout.to_short_string() << std::endl; update_device_buffer(); } diff --git a/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp b/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp index 12fc37cac187d6..7696d547ea1c22 100644 --- a/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp +++ b/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp @@ -7,13 +7,20 @@ #include #include "openvino/core/dimension.hpp" #include "openvino/core/model.hpp" +#include "openvino/core/node_vector.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/op/broadcast.hpp" #include "openvino/op/constant.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/read_value.hpp" +#include "openvino/op/shape_of.hpp" #include "openvino/op/transpose.hpp" #include "openvino/op/result.hpp" #include "openvino/op/parameter.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/concat.hpp" +#include "openvino/op/util/read_value_base.hpp" #include "openvino/pass/make_stateful.hpp" namespace tests { @@ -22,7 +29,9 @@ inline std::shared_ptr make_llm_kv_cache_pattern(ov::Dimension batch ov::Dimension n_heads = ov::Dimension::dynamic(), ov::Dimension n_features = ov::Dimension::dynamic(), ov::element::Type_t element_type = ov::element::f32, - bool stateful = false) { + bool stateful = false, + bool fuse_cache_reorder = false, + bool build_state_initializer = false) { ov::PartialShape kv_cache_size = {batch, n_heads, -1, n_features}; ov::PartialShape new_token_size = {batch, -1, n_heads, n_features}; ov::PartialShape matmul_in_size = {batch, n_heads, -1, -1}; @@ -34,9 +43,37 @@ inline 
std::shared_ptr make_llm_kv_cache_pattern(ov::Dimension batch auto in_matmul = std::make_shared(element_type, matmul_in_size); in_matmul->set_friendly_name("in_matmul"); + ov::ParameterVector params{in_kv_prev, in_new_token, in_matmul}; + std::shared_ptr concat_input = in_kv_prev; + if (fuse_cache_reorder) { + auto in_beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{batch}); + in_beam_idx->set_friendly_name("beam_idx"); + params.push_back(in_beam_idx); + auto axis = std::make_shared(ov::element::i32, ov::Shape{}, 0); + auto gather = std::make_shared(in_kv_prev, in_beam_idx, axis, 0); + concat_input = gather; + } + + std::shared_ptr state_initializer = nullptr; + if (stateful && build_state_initializer) { + auto shapeof = std::make_shared(in_new_token, ov::element::i32); + + auto indices = std::make_shared(ov::element::i32, ov::Shape{1}, 0); + auto axis = std::make_shared(ov::element::i32, ov::Shape{}, 0); + auto gather = std::make_shared(shapeof, indices, axis, 0); + + auto bcast_value = std::make_shared(element_type, ov::Shape{}, 0.0f); + ov::NodeVector dims = {gather}; + for (size_t i = 1; i < kv_cache_size.size(); i++) { + dims.push_back(std::make_shared(ov::element::i32, ov::Shape{1}, static_cast(kv_cache_size[i].get_min_length()))); + } + auto shape = std::make_shared(dims, 0); + state_initializer = std::make_shared(bcast_value, shape); + } + auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, {new_token_size.size()}, {0, 2, 1, 3}); auto transpose = std::make_shared(in_new_token, transpose_const); - auto concat = std::make_shared(ov::OutputVector{in_kv_prev, transpose}, 2); + auto concat = std::make_shared(ov::OutputVector{concat_input, transpose}, 2); auto convert = std::make_shared(concat, element_type); auto kv_present = std::make_shared(convert); kv_present->set_friendly_name("present_key_values"); @@ -44,13 +81,22 @@ inline std::shared_ptr make_llm_kv_cache_pattern(ov::Dimension batch auto matmul_out = 
std::make_shared(matmul); matmul_out->set_friendly_name("matmul_out"); - ov::ParameterVector params{in_kv_prev, in_new_token, in_matmul}; ov::ResultVector results{kv_present, matmul_out}; auto model = std::make_shared(results, params, "LLM-KV-Cache"); if (stateful) { ov::pass::MakeStateful({{in_kv_prev, kv_present}}).run_on_model(model); } + if (state_initializer) { + for (auto op : model->get_ops()) { + if (auto read_value = std::dynamic_pointer_cast(op)) { + read_value->set_arguments(ov::OutputVector{state_initializer}); + break; + } + } + } + model->validate_nodes_and_infer_types(); + return model; } diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp index a347261d756d12..2d949cbebcd677 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp @@ -78,7 +78,7 @@ TEST_P(KVCacheTest, Inference_cached) { ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); ov::test::utils::removeDir(cacheDirName); - core->set_property(ov::cache_dir(cacheDirName)); + configuration.insert(ov::cache_dir(cacheDirName)); compile_model(); } { @@ -112,7 +112,9 @@ class KVCacheTests: public ::testing::Test { GTEST_SKIP(); #endif auto core = ov::test::utils::PluginCache::get().core(); - + ov::AnyMap properties = { + ov::hint::inference_precision(ov::element::f16) + }; std::string cacheDirName; if (is_caching_test) { std::stringstream ss; @@ -123,7 +125,7 @@ class KVCacheTests: public ::testing::Test { ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); ov::test::utils::removeDir(cacheDirName); - core->set_property(ov::cache_dir(cacheDirName)); + properties.insert(ov::cache_dir(cacheDirName)); } const size_t batch = 1; @@ -136,9 +138,9 
@@ class KVCacheTests: public ::testing::Test { auto model = tests::make_llm_kv_cache_pattern(batch, n_heads, n_features, element_type); if (is_caching_test) { - core->compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16)); + core->compile_model(model, ov::test::utils::DEVICE_GPU, properties); } - auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16)); + auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, properties); auto input0 = model->get_parameters().at(0); auto input1 = model->get_parameters().at(1); @@ -249,12 +251,16 @@ class KVCacheTests: public ::testing::Test { } } - void test_smoke_multipleIterations_stateful(bool is_caching_test) { + void test_smoke_multipleIterations_stateful(bool is_caching_test, bool fuse_cache_reorder, bool build_state_initializer) { #if defined(ANDROID) GTEST_SKIP(); #endif auto core = ov::test::utils::PluginCache::get().core(); + ov::AnyMap properties = { + ov::hint::inference_precision(ov::element::f16) + }; + std::string cacheDirName; if (is_caching_test) { std::stringstream ss; @@ -265,7 +271,7 @@ class KVCacheTests: public ::testing::Test { ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); ov::test::utils::removeDir(cacheDirName); - core->set_property(ov::cache_dir(cacheDirName)); + properties.insert(ov::cache_dir(cacheDirName)); } const size_t batch = 1; @@ -276,34 +282,58 @@ class KVCacheTests: public ::testing::Test { ov::element::Type element_type = ov::element::f16; - auto model = tests::make_llm_kv_cache_pattern(batch, n_heads, n_features, element_type, true); - auto ref_model = tests::make_llm_kv_cache_pattern(batch, n_heads, n_features, element_type, false); + const bool stateful = true; + + auto model = tests::make_llm_kv_cache_pattern(build_state_initializer ? 
ov::Dimension::dynamic() : batch, + n_heads, + n_features, + element_type, + stateful, + fuse_cache_reorder, + build_state_initializer && stateful); + auto ref_model = tests::make_llm_kv_cache_pattern(build_state_initializer ? ov::Dimension::dynamic() : batch, + n_heads, + n_features, + element_type, + !stateful, + fuse_cache_reorder, + build_state_initializer && !stateful); if (is_caching_test) { - core->compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16)); + core->compile_model(model, ov::test::utils::DEVICE_GPU, properties); } - auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16)); + auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, properties); auto input0 = model->get_parameters().at(0); auto input1 = model->get_parameters().at(1); + auto input2 = fuse_cache_reorder ? model->get_parameters().at(2) : nullptr; auto output0 = model->get_results().at(0); - auto get_ref_results = [&](const ov::Tensor& kv_cache, const ov::Tensor& new_token_data, const ov::Tensor& matmul_data) { + auto beam_idx_shape = ov::Shape{batch}; + auto beam_idx_data = ov::Tensor(ov::element::i32, beam_idx_shape); + for (size_t i = 0; i < batch; i++) { + beam_idx_data.data()[i] = i; + } + + auto get_ref_results = [&ref_model, fuse_cache_reorder, &beam_idx_shape, &beam_idx_data](const ov::Tensor& kv_cache, + const ov::Tensor& new_token_data, + const ov::Tensor& matmul_data) { auto input0 = ref_model->get_parameters().at(0); auto input1 = ref_model->get_parameters().at(1); auto input2 = ref_model->get_parameters().at(2); - ngraph::helpers::resize_function(ref_model, {kv_cache.get_shape(), new_token_data.get_shape(), matmul_data.get_shape()}); - - auto compiled_model_ref = core->compile_model(ref_model, ov::test::utils::DEVICE_TEMPLATE); - auto inf_req_ref = compiled_model_ref.create_infer_request(); - inf_req_ref.set_tensor(input0, kv_cache); - 
inf_req_ref.set_tensor(input1, new_token_data); - inf_req_ref.set_tensor(input2, matmul_data); - inf_req_ref.infer(); - std::vector results_ref; - for (auto&& output : ref_model->get_results()) { - results_ref.push_back(inf_req_ref.get_tensor(output)); + auto input3 = fuse_cache_reorder ? ref_model->get_parameters().at(3) : nullptr; + std::vector input_shapes = {kv_cache.get_shape(), new_token_data.get_shape(), matmul_data.get_shape()}; + std::map, ov::Tensor> inputs = { + {input0, kv_cache}, + {input1, new_token_data}, + {input2, matmul_data} + }; + if (fuse_cache_reorder) { + input_shapes.push_back(beam_idx_shape); + inputs.emplace(input3, beam_idx_data); } - return results_ref; + + ngraph::helpers::resize_function(ref_model, input_shapes); + return ngraph::helpers::interpretFunction(ref_model, inputs); }; auto compare_tensors = [&model](const std::vector expected, const std::vector& actual) { @@ -335,7 +365,9 @@ class KVCacheTests: public ::testing::Test { infer_request.set_tensor(input0, new_token_input); infer_request.set_tensor(input1, matmul_input); - + if (fuse_cache_reorder) { + infer_request.set_tensor(input2, beam_idx_data); + } ov::Tensor ref_kv_cache; { @@ -401,11 +433,19 @@ TEST_F(KVCacheTests, smoke_multipleIterations_cached) { this->test_smoke_multipleIterations(true); } -TEST_F(KVCacheTests, smoke_multipleIterations_stateful) { - this->test_smoke_multipleIterations_stateful(false); +TEST_F(KVCacheTests, smoke_multipleIterations_stateful_no_gather_no_initializer) { + this->test_smoke_multipleIterations_stateful(false, false, false); +} + +TEST_F(KVCacheTests, smoke_multipleIterations_stateful_no_gather_no_initializer_cached) { + this->test_smoke_multipleIterations_stateful(true, false, false); +} + +TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer) { + this->test_smoke_multipleIterations_stateful(false, true, true); } -TEST_F(KVCacheTests, smoke_multipleIterations_stateful_cached) { - 
this->test_smoke_multipleIterations_stateful(true); +TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer_cached) { + this->test_smoke_multipleIterations_stateful(true, true, true); } } // namespace