Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU] Fix issue for skipping gather #21887

Merged
merged 7 commits into from
Jan 4, 2024
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/graph/network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,10 @@ void network::allocate_primitives() {
for (auto const& node : po) {
if (node->can_be_optimized() && !node->is_dynamic()) {
auto opt_inst = _primitives.at(node->id());
// build deps when prim_inst does not update dependencies yet.
if (!node->get_dependencies().empty() && opt_inst->dependencies().empty()) {
opt_inst->build_deps();
}
opt_inst->update_output_memory();
}
}
Expand Down
27 changes: 19 additions & 8 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,8 @@ event::ptr primitive_inst::realloc_if_needed() {
}

// Clear out memory if it was previously reused, but now primitive can't be optimized
if (_node->is_type<gather>() && !can_be_optimized() && _outputs[0] && _network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) {
if (_node->is_type<gather>() && !can_be_optimized() && _outputs[0]
&& dep_memory_ptr(0) && _network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) {
yeonbok marked this conversation as resolved.
Show resolved Hide resolved
_outputs[0] = nullptr;
max_output_layout_size = 0;
}
Expand Down Expand Up @@ -918,6 +919,7 @@ void primitive_inst::do_runtime_in_place_kv_cache() {
void primitive_inst::do_runtime_skip_gather() {
// Check pattern
if (!get_node().is_type<gather>()
|| !get_node().can_be_optimized()
|| _impl_params->has_fused_primitives()
|| _impl_params->get_input_layout(0).data_type != _impl_params->get_output_layout().data_type
|| get_node().get_dependency(1).is_constant() || get_node().get_dependency(1).is_type<data>())
Expand All @@ -930,8 +932,15 @@ void primitive_inst::do_runtime_skip_gather() {
auto idx_shape = _impl_params->get_input_layout(1).get_shape();
yeonbok marked this conversation as resolved.
Show resolved Hide resolved
auto idx_rank = idx_shape.size();

if (idx_rank > 1) {
GPU_DEBUG_TRACE_DETAIL << "-- Cannot optimize becuase of its indices rank " << idx_shape.size() << std::endl;
if (_impl_params->get_input_layout(0).count() == 0) {
GPU_DEBUG_TRACE_DETAIL << "-- Cannot optimize becuase of input is empty " << _impl_params->get_input_layout(0).to_short_string() << std::endl;
set_can_be_optimized(false);
return;
}

if (idx_rank != 1) {
GPU_DEBUG_TRACE_DETAIL << "-- Cannot optimize becuase of its indices rank " << idx_rank << std::endl;
set_can_be_optimized(false);
return;
}

Expand Down Expand Up @@ -1052,12 +1061,8 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
OPENVINO_ASSERT(_node != nullptr, "[GPU] Invalid primitive_inst object for dynamic shapes case: program_node can't be null");
update_shape();

// Check successor reorder if layouts are same
// Need to set can_be_optimized for user reorder at predecessor because
// if the user is can_be_optimized and output node then current nodes' output should be allocated to host.
do_runtime_skip_reorder();
do_runtime_skip_gather();
do_runtime_in_place_kv_cache();

bool can_skip_execution = false;
if (_impl_params->output_layouts[0].count() == 0) {
GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping because output data is empty " << std::endl;
Expand All @@ -1084,6 +1089,12 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
return ev;
}

// Check successor reorder if layouts are same
// Need to set can_be_optimized for user reorder at predecessor because
// if the user is can_be_optimized and output node then current nodes' output should be allocated to host.
do_runtime_skip_reorder();
do_runtime_skip_gather();

if (!is_valid_fusion()) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("unfused_subgraph_exec: " + id()));
auto subgraph = get_unfused_subgraph();
Expand Down
Loading