From 2bcc94064f57943d8c3755d29bed903ddf31bf6c Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Wed, 13 Dec 2023 16:15:00 +0400 Subject: [PATCH] [GPU] Fixed stateful KV cache issues (#21618) --- .../intel_gpu/primitives/broadcast.hpp | 8 +- src/plugins/intel_gpu/src/graph/broadcast.cpp | 2 +- .../mark_shape_of_subgraphs.cpp | 5 + .../src/graph/include/read_value_inst.h | 7 +- .../intel_gpu/src/graph/primitive_inst.cpp | 44 +++++---- .../intel_gpu/src/plugin/ops/broadcast.cpp | 2 + .../intel_gpu/src/plugin/variable_state.cpp | 2 + .../tests/common/subgraphs_builders.hpp | 52 +++++++++- .../subgraph_tests/dynamic/kv_cache.cpp | 98 +++++++++++++------ 9 files changed, 163 insertions(+), 57 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/broadcast.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/broadcast.hpp index 70fcb1ee44cde7..c86c0ed2a7da97 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/broadcast.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/broadcast.hpp @@ -4,6 +4,7 @@ #pragma once +#include "openvino/core/partial_shape.hpp" #include "openvino/op/broadcast.hpp" #include "primitive.hpp" @@ -131,6 +132,8 @@ struct broadcast : public primitive_base { /// along which broadcast should happen. 
std::vector broadcast_axes; + ov::PartialShape output_pshape = ov::PartialShape::dynamic(); + size_t hash() const override { size_t seed = primitive::hash(); seed = hash_range(seed, broadcast_axes.begin(), broadcast_axes.end()); @@ -146,7 +149,8 @@ struct broadcast : public primitive_base { return axes_mapping == rhs_casted.axes_mapping && broadcast_mode == rhs_casted.broadcast_mode && - broadcast_sizes == rhs_casted.broadcast_sizes; + broadcast_sizes == rhs_casted.broadcast_sizes && + output_pshape == rhs_casted.output_pshape; } void save(BinaryOutputBuffer& ob) const override { @@ -156,6 +160,7 @@ struct broadcast : public primitive_base { ob << make_data(&broadcast_mode, sizeof(ov::op::BroadcastModeSpec)); ob << broadcast_sizes; ob << broadcast_axes; + ob << output_pshape; } void load(BinaryInputBuffer& ib) override { @@ -165,6 +170,7 @@ struct broadcast : public primitive_base { ib >> make_data(&broadcast_mode, sizeof(ov::op::BroadcastModeSpec)); ib >> broadcast_sizes; ib >> broadcast_axes; + ib >> output_pshape; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/broadcast.cpp b/src/plugins/intel_gpu/src/graph/broadcast.cpp index e4e5b369dabd0c..7af4434b3c7e06 100644 --- a/src/plugins/intel_gpu/src/graph/broadcast.cpp +++ b/src/plugins/intel_gpu/src/graph/broadcast.cpp @@ -90,7 +90,7 @@ std::vector broadcast_inst::calc_output_layouts(broadcast_node const& /* if (input1.is_static()) { output_rank = input1.get_dim(0); // target shape rank is set as second input. } - output_shapes[0] = ShapeType::dynamic(std::max(static_cast(output_rank), 1)); + output_shapes[0] = desc->output_pshape.rank().is_static() ? 
desc->output_pshape : ShapeType::dynamic(std::max(static_cast(output_rank), 1)); } format output_format = format::adjust_to_rank(input0_layout.format, output_shapes[0].size()); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp index 4d7e61a7e4eff7..d6d365e0d1f94b 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp @@ -3,6 +3,7 @@ // #include "shape_of_inst.h" +#include "read_value_inst.h" #include "reshape_inst.h" #include "eltwise_inst.h" #include "pass_manager.h" @@ -43,6 +44,10 @@ bool mark_shape_of_subgraphs::can_mark_node(const program_node& node) { if (node.has_fused_primitives()) return false; + // read_value may have initializer which is shape_of sub-graph, but read_value itself is not a part of such sub-graph + if (node.is_type()) + return false; + if (node.is_type()) return true; diff --git a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h index a84be19aae2651..7209e8756fbf76 100644 --- a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h @@ -34,11 +34,8 @@ class typed_primitive_inst : public typed_primitive_inst_base calc_output_layouts(read_value_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); const auto default_layout = desc->output_layout; - auto out_layout = impl_param.state_layout.value_or(default_layout); - if (out_layout.is_dynamic() && desc->input_size() > 0) { - out_layout = impl_param.get_input_layout(0); - } - return { out_layout }; + + return { impl_param.state_layout.value_or(default_layout) }; } static layout calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param); diff --git 
a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index cf4b84fee6f2a5..e967a5b7f152e9 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -265,8 +265,30 @@ void primitive_inst::update_shape() { } if (get_node().is_type()) { - const auto& variable_id = get_node().as().get_primitive()->variable_id; - auto new_layout = get_network().get_variable(variable_id).get_layout(); + auto prim = get_node().as().get_primitive(); + const auto& variable_id = prim->variable_id; + auto& variable = get_network().get_variable(variable_id); + // Initial variable shape is taken from variable itself + auto new_layout = variable.get_layout(); + + // If variable is not set and we have an initializer - use its shape as shape of variable + if (!variable.is_set() && _impl_params->input_layouts.size() == 1) { + new_layout = _impl_params->get_input_layout(0); + } + + // If we still have a dynamic dimension, which basically means that we don't have an initializer, then replace dynamic dims with 0 + if (new_layout.is_dynamic()) { + auto pshape = new_layout.get_partial_shape(); + for (auto& d : pshape) { + if (d.is_dynamic()) { + d = 0; + } + } + new_layout.set_partial_shape(pshape); + } + + variable.set_layout(new_layout); + if (!_impl_params->state_layout.has_value() || _impl_params->state_layout.value() != new_layout) { _impl_params->state_layout = new_layout; input_shape_changed = true; @@ -299,7 +321,7 @@ void primitive_inst::update_shape() { } } if (!subgraph_input_changed) { - GPU_DEBUG_TRACE_DETAIL << id() << ": skip shape_update, because it is in shape_of_subgrap and input shape is not changed\n"; + GPU_DEBUG_TRACE_DETAIL << id() << ": skip shape_update, because it is in shape_of_subgraph and input shape is not changed\n"; reset_shape_change(); return; } @@ -402,20 +424,6 @@ void primitive_inst::update_shape() { 
get_network().get_variable(desc->variable_id).set_layout(_impl_params->get_output_layout()); _impl_params->state_layout = _impl_params->get_output_layout(); } - - if (get_node().is_type()) { - auto desc = get_node().as().get_primitive(); - if (_impl_params->output_layouts[0].is_dynamic()) { - auto pshape = _impl_params->output_layouts[0].get_partial_shape(); - for (auto& d : pshape) { - if (d.is_dynamic()) { - d = 0; - } - } - _impl_params->output_layouts[0].set_partial_shape(pshape); - } - get_network().get_variable(desc->variable_id).set_layout(_impl_params->get_output_layout()); - } } event::ptr primitive_inst::realloc_if_needed() { @@ -448,7 +456,7 @@ event::ptr primitive_inst::realloc_if_needed() { // read_value/assign nodes are supposed to always use variable memory if (auto stateful_prim = dynamic_cast(this)) { std::string variable_id = stateful_prim->variable_id(); - auto variable = get_network().get_variable(variable_id); + auto& variable = get_network().get_variable(variable_id); variable.set_layout(actual_layout); GPU_DEBUG_TRACE_DETAIL << id() << ": use variable memory " << variable.get_memory() << " (size=" << variable.get_memory()->size() << ")" << std::endl; diff --git a/src/plugins/intel_gpu/src/plugin/ops/broadcast.cpp b/src/plugins/intel_gpu/src/plugin/ops/broadcast.cpp index 3e208bd2c99063..aa764774275e73 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/broadcast.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/broadcast.cpp @@ -90,6 +90,8 @@ static void CreateCommonBroadcastOp(ProgramBuilder& p, const std::shared_ptroutput_pshape = op->get_output_partial_shape(0); + p.add_primitive(*op, broadcast_prim); } diff --git a/src/plugins/intel_gpu/src/plugin/variable_state.cpp b/src/plugins/intel_gpu/src/plugin/variable_state.cpp index 80a393e506747c..cdd551b5ca82ed 100644 --- a/src/plugins/intel_gpu/src/plugin/variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/variable_state.cpp @@ -9,6 +9,7 @@ #include "intel_gpu/plugin/variable_state.hpp" 
#include "intel_gpu/runtime/memory_caps.hpp" #include "intel_gpu/runtime/layout.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" #include @@ -45,6 +46,7 @@ void VariableState::set() { void VariableState::set_layout(const cldnn::layout& new_layout) { m_layout = new_layout; + GPU_DEBUG_TRACE_DETAIL << "Update state layout to " << new_layout.to_short_string() << std::endl; update_device_buffer(); } diff --git a/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp b/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp index 12fc37cac187d6..7696d547ea1c22 100644 --- a/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp +++ b/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp @@ -7,13 +7,20 @@ #include #include "openvino/core/dimension.hpp" #include "openvino/core/model.hpp" +#include "openvino/core/node_vector.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/op/broadcast.hpp" #include "openvino/op/constant.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/read_value.hpp" +#include "openvino/op/shape_of.hpp" #include "openvino/op/transpose.hpp" #include "openvino/op/result.hpp" #include "openvino/op/parameter.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/concat.hpp" +#include "openvino/op/util/read_value_base.hpp" #include "openvino/pass/make_stateful.hpp" namespace tests { @@ -22,7 +29,9 @@ inline std::shared_ptr make_llm_kv_cache_pattern(ov::Dimension batch ov::Dimension n_heads = ov::Dimension::dynamic(), ov::Dimension n_features = ov::Dimension::dynamic(), ov::element::Type_t element_type = ov::element::f32, - bool stateful = false) { + bool stateful = false, + bool fuse_cache_reorder = false, + bool build_state_initializer = false) { ov::PartialShape kv_cache_size = {batch, n_heads, -1, n_features}; ov::PartialShape new_token_size = {batch, -1, n_heads, n_features}; ov::PartialShape matmul_in_size = {batch, n_heads, -1, -1}; @@ -34,9 +43,37 @@ inline 
std::shared_ptr make_llm_kv_cache_pattern(ov::Dimension batch auto in_matmul = std::make_shared(element_type, matmul_in_size); in_matmul->set_friendly_name("in_matmul"); + ov::ParameterVector params{in_kv_prev, in_new_token, in_matmul}; + std::shared_ptr concat_input = in_kv_prev; + if (fuse_cache_reorder) { + auto in_beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{batch}); + in_beam_idx->set_friendly_name("beam_idx"); + params.push_back(in_beam_idx); + auto axis = std::make_shared(ov::element::i32, ov::Shape{}, 0); + auto gather = std::make_shared(in_kv_prev, in_beam_idx, axis, 0); + concat_input = gather; + } + + std::shared_ptr state_initializer = nullptr; + if (stateful && build_state_initializer) { + auto shapeof = std::make_shared(in_new_token, ov::element::i32); + + auto indices = std::make_shared(ov::element::i32, ov::Shape{1}, 0); + auto axis = std::make_shared(ov::element::i32, ov::Shape{}, 0); + auto gather = std::make_shared(shapeof, indices, axis, 0); + + auto bcast_value = std::make_shared(element_type, ov::Shape{}, 0.0f); + ov::NodeVector dims = {gather}; + for (size_t i = 1; i < kv_cache_size.size(); i++) { + dims.push_back(std::make_shared(ov::element::i32, ov::Shape{1}, static_cast(kv_cache_size[i].get_min_length()))); + } + auto shape = std::make_shared(dims, 0); + state_initializer = std::make_shared(bcast_value, shape); + } + auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, {new_token_size.size()}, {0, 2, 1, 3}); auto transpose = std::make_shared(in_new_token, transpose_const); - auto concat = std::make_shared(ov::OutputVector{in_kv_prev, transpose}, 2); + auto concat = std::make_shared(ov::OutputVector{concat_input, transpose}, 2); auto convert = std::make_shared(concat, element_type); auto kv_present = std::make_shared(convert); kv_present->set_friendly_name("present_key_values"); @@ -44,13 +81,22 @@ inline std::shared_ptr make_llm_kv_cache_pattern(ov::Dimension batch auto matmul_out = 
std::make_shared(matmul); matmul_out->set_friendly_name("matmul_out"); - ov::ParameterVector params{in_kv_prev, in_new_token, in_matmul}; ov::ResultVector results{kv_present, matmul_out}; auto model = std::make_shared(results, params, "LLM-KV-Cache"); if (stateful) { ov::pass::MakeStateful({{in_kv_prev, kv_present}}).run_on_model(model); } + if (state_initializer) { + for (auto op : model->get_ops()) { + if (auto read_value = std::dynamic_pointer_cast(op)) { + read_value->set_arguments(ov::OutputVector{state_initializer}); + break; + } + } + } + model->validate_nodes_and_infer_types(); + return model; } diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp index a347261d756d12..2d949cbebcd677 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp @@ -78,7 +78,7 @@ TEST_P(KVCacheTest, Inference_cached) { ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); ov::test::utils::removeDir(cacheDirName); - core->set_property(ov::cache_dir(cacheDirName)); + configuration.insert(ov::cache_dir(cacheDirName)); compile_model(); } { @@ -112,7 +112,9 @@ class KVCacheTests: public ::testing::Test { GTEST_SKIP(); #endif auto core = ov::test::utils::PluginCache::get().core(); - + ov::AnyMap properties = { + ov::hint::inference_precision(ov::element::f16) + }; std::string cacheDirName; if (is_caching_test) { std::stringstream ss; @@ -123,7 +125,7 @@ class KVCacheTests: public ::testing::Test { ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); ov::test::utils::removeDir(cacheDirName); - core->set_property(ov::cache_dir(cacheDirName)); + properties.insert(ov::cache_dir(cacheDirName)); } const size_t batch = 1; @@ -136,9 +138,9 
@@ class KVCacheTests: public ::testing::Test { auto model = tests::make_llm_kv_cache_pattern(batch, n_heads, n_features, element_type); if (is_caching_test) { - core->compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16)); + core->compile_model(model, ov::test::utils::DEVICE_GPU, properties); } - auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16)); + auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, properties); auto input0 = model->get_parameters().at(0); auto input1 = model->get_parameters().at(1); @@ -249,12 +251,16 @@ class KVCacheTests: public ::testing::Test { } } - void test_smoke_multipleIterations_stateful(bool is_caching_test) { + void test_smoke_multipleIterations_stateful(bool is_caching_test, bool fuse_cache_reorder, bool build_state_initializer) { #if defined(ANDROID) GTEST_SKIP(); #endif auto core = ov::test::utils::PluginCache::get().core(); + ov::AnyMap properties = { + ov::hint::inference_precision(ov::element::f16) + }; + std::string cacheDirName; if (is_caching_test) { std::stringstream ss; @@ -265,7 +271,7 @@ class KVCacheTests: public ::testing::Test { ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); ov::test::utils::removeDir(cacheDirName); - core->set_property(ov::cache_dir(cacheDirName)); + properties.insert(ov::cache_dir(cacheDirName)); } const size_t batch = 1; @@ -276,34 +282,58 @@ class KVCacheTests: public ::testing::Test { ov::element::Type element_type = ov::element::f16; - auto model = tests::make_llm_kv_cache_pattern(batch, n_heads, n_features, element_type, true); - auto ref_model = tests::make_llm_kv_cache_pattern(batch, n_heads, n_features, element_type, false); + const bool stateful = true; + + auto model = tests::make_llm_kv_cache_pattern(build_state_initializer ? 
ov::Dimension::dynamic() : batch, + n_heads, + n_features, + element_type, + stateful, + fuse_cache_reorder, + build_state_initializer && stateful); + auto ref_model = tests::make_llm_kv_cache_pattern(build_state_initializer ? ov::Dimension::dynamic() : batch, + n_heads, + n_features, + element_type, + !stateful, + fuse_cache_reorder, + build_state_initializer && !stateful); if (is_caching_test) { - core->compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16)); + core->compile_model(model, ov::test::utils::DEVICE_GPU, properties); } - auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16)); + auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, properties); auto input0 = model->get_parameters().at(0); auto input1 = model->get_parameters().at(1); + auto input2 = fuse_cache_reorder ? model->get_parameters().at(2) : nullptr; auto output0 = model->get_results().at(0); - auto get_ref_results = [&](const ov::Tensor& kv_cache, const ov::Tensor& new_token_data, const ov::Tensor& matmul_data) { + auto beam_idx_shape = ov::Shape{batch}; + auto beam_idx_data = ov::Tensor(ov::element::i32, beam_idx_shape); + for (size_t i = 0; i < batch; i++) { + beam_idx_data.data()[i] = i; + } + + auto get_ref_results = [&ref_model, fuse_cache_reorder, &beam_idx_shape, &beam_idx_data](const ov::Tensor& kv_cache, + const ov::Tensor& new_token_data, + const ov::Tensor& matmul_data) { auto input0 = ref_model->get_parameters().at(0); auto input1 = ref_model->get_parameters().at(1); auto input2 = ref_model->get_parameters().at(2); - ngraph::helpers::resize_function(ref_model, {kv_cache.get_shape(), new_token_data.get_shape(), matmul_data.get_shape()}); - - auto compiled_model_ref = core->compile_model(ref_model, ov::test::utils::DEVICE_TEMPLATE); - auto inf_req_ref = compiled_model_ref.create_infer_request(); - inf_req_ref.set_tensor(input0, kv_cache); - 
inf_req_ref.set_tensor(input1, new_token_data); - inf_req_ref.set_tensor(input2, matmul_data); - inf_req_ref.infer(); - std::vector results_ref; - for (auto&& output : ref_model->get_results()) { - results_ref.push_back(inf_req_ref.get_tensor(output)); + auto input3 = fuse_cache_reorder ? ref_model->get_parameters().at(3) : nullptr; + std::vector input_shapes = {kv_cache.get_shape(), new_token_data.get_shape(), matmul_data.get_shape()}; + std::map, ov::Tensor> inputs = { + {input0, kv_cache}, + {input1, new_token_data}, + {input2, matmul_data} + }; + if (fuse_cache_reorder) { + input_shapes.push_back(beam_idx_shape); + inputs.emplace(input3, beam_idx_data); } - return results_ref; + + ngraph::helpers::resize_function(ref_model, input_shapes); + return ngraph::helpers::interpretFunction(ref_model, inputs); }; auto compare_tensors = [&model](const std::vector expected, const std::vector& actual) { @@ -335,7 +365,9 @@ class KVCacheTests: public ::testing::Test { infer_request.set_tensor(input0, new_token_input); infer_request.set_tensor(input1, matmul_input); - + if (fuse_cache_reorder) { + infer_request.set_tensor(input2, beam_idx_data); + } ov::Tensor ref_kv_cache; { @@ -401,11 +433,19 @@ TEST_F(KVCacheTests, smoke_multipleIterations_cached) { this->test_smoke_multipleIterations(true); } -TEST_F(KVCacheTests, smoke_multipleIterations_stateful) { - this->test_smoke_multipleIterations_stateful(false); +TEST_F(KVCacheTests, smoke_multipleIterations_stateful_no_gather_no_initializer) { + this->test_smoke_multipleIterations_stateful(false, false, false); +} + +TEST_F(KVCacheTests, smoke_multipleIterations_stateful_no_gather_no_initializer_cached) { + this->test_smoke_multipleIterations_stateful(true, false, false); +} + +TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer) { + this->test_smoke_multipleIterations_stateful(false, true, true); } -TEST_F(KVCacheTests, smoke_multipleIterations_stateful_cached) { - 
this->test_smoke_multipleIterations_stateful(true); +TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer_cached) { + this->test_smoke_multipleIterations_stateful(true, true, true); } } // namespace