From f0cffc4cf0d806028f78b6d875662a04e43a8f79 Mon Sep 17 00:00:00 2001 From: Paul Youngsoo Ahn Date: Thu, 4 Jan 2024 19:36:51 +0900 Subject: [PATCH] [GPU] Fix issue for skipping gather (#21887) * [GPU] Fix issue for skipping gather - check whether the input layout count is zero before calling dep_memory() - add a case for when the input layout count is zero - in gather_inst, run build_deps before checking is_the_same_buffer(output_memory(), input_memory()) * Follow-up code review * check can_skip_execution before running do_runtime_skip_reorder, do_runtime_skip_gather, and do_runtime_in_place_kv_cache * Set output memory using input memory if output memory is nullptr when current node is skipped for empty output tensor * fix ci failures * [do_runtime_skip_gather] set can_be_optimized to false when input is empty and indices is empty * Fix func KVCache test failure - move do_runtime_in_place_kv_cache in front of checking skip execution --- src/plugins/intel_gpu/src/graph/network.cpp | 4 +++ .../intel_gpu/src/graph/primitive_inst.cpp | 27 +++++++++++++------ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 5735deb88b25dc..457cfcfd5fd65a 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -731,6 +731,10 @@ void network::allocate_primitives() { for (auto const& node : po) { if (node->can_be_optimized() && !node->is_dynamic()) { auto opt_inst = _primitives.at(node->id()); + // build deps when prim_inst does not update dependencies yet. 
+ if (!node->get_dependencies().empty() && opt_inst->dependencies().empty()) { + opt_inst->build_deps(); + } opt_inst->update_output_memory(); } } diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 803477b78f481b..4bf1383ec64f26 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -508,7 +508,8 @@ event::ptr primitive_inst::realloc_if_needed() { } // Clear out memory if if was previously reused, but now primitive can't be optimized - if (_node->is_type() && !can_be_optimized() && _outputs[0] && _network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) { + if (_node->is_type() && !can_be_optimized() && _outputs[0] + && dep_memory_ptr(0) && _network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) { _outputs[0] = nullptr; max_output_layout_size = 0; } @@ -918,6 +919,7 @@ void primitive_inst::do_runtime_in_place_kv_cache() { void primitive_inst::do_runtime_skip_gather() { // Check pattern if (!get_node().is_type() + || !get_node().can_be_optimized() || _impl_params->has_fused_primitives() || _impl_params->get_input_layout(0).data_type != _impl_params->get_output_layout().data_type || get_node().get_dependency(1).is_constant() || get_node().get_dependency(1).is_type()) @@ -930,8 +932,15 @@ void primitive_inst::do_runtime_skip_gather() { auto idx_shape = _impl_params->get_input_layout(1).get_shape(); auto idx_rank = idx_shape.size(); - if (idx_rank > 1) { - GPU_DEBUG_TRACE_DETAIL << "-- Cannot optimize becuase of its indices rank " << idx_shape.size() << std::endl; + if (_impl_params->get_input_layout(0).count() == 0) { + GPU_DEBUG_TRACE_DETAIL << "-- Cannot optimize becuase of input is empty " << _impl_params->get_input_layout(0).to_short_string() << std::endl; + set_can_be_optimized(false); + return; + } + + if (idx_rank != 1) { + GPU_DEBUG_TRACE_DETAIL << "-- Cannot optimize becuase of its 
indices rank " << idx_rank << std::endl; + set_can_be_optimized(false); return; } @@ -1052,12 +1061,8 @@ event::ptr primitive_inst::execute(const std::vector& events) { OPENVINO_ASSERT(_node != nullptr, "[GPU] Invalid primitive_inst object for dynamic shapes case: program_node can't be null"); update_shape(); - // Check successor reorder if layouts are same - // Need to set can_be_optimized for user reorder at predecessor because - // if the user is can_be_optimized and output node then current nodes' output should be allocated to host. - do_runtime_skip_reorder(); - do_runtime_skip_gather(); do_runtime_in_place_kv_cache(); + bool can_skip_execution = false; if (_impl_params->output_layouts[0].count() == 0) { GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping because output data is empty " << std::endl; @@ -1084,6 +1089,12 @@ event::ptr primitive_inst::execute(const std::vector& events) { return ev; } + // Check successor reorder if layouts are same + // Need to set can_be_optimized for user reorder at predecessor because + // if the user is can_be_optimized and output node then current nodes' output should be allocated to host. + do_runtime_skip_reorder(); + do_runtime_skip_gather(); + if (!is_valid_fusion()) { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("unfused_subgraph_exec: " + id())); auto subgraph = get_unfused_subgraph();