diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp index 3873c543df9e5a..d9fab79d76ab2e 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp @@ -9,18 +9,47 @@ #include "strided_slice_inst.h" #include "kv_cache_inst.h" #include "gemm_inst.h" +#include "shape_of_inst.h" #include "broadcast_inst.h" +#include "non_zero_inst.h" +#include "non_max_suppression_inst.h" +#include "unique_inst.hpp" #include "program_helpers.h" using namespace cldnn; void mark_runtime_skippable_nodes::run(program& p) { auto itr = p.get_processing_order().begin(); + while (itr != p.get_processing_order().end()) { auto& node = *itr++; // Set gathers that might be skipped at runtime as can_be_optimized. // If not set, memory dependency will not work for the nodes that are skipped at runtime - program_helpers::do_for_types(*node, [](gather_node& node){ + if (node->is_type() || node->is_constant()) + continue; + + std::function all_users_are_shape_of = [&](const program_node& node) { + if (node.is_input() || node.is_output()) + return false; + for (auto& u : node.get_users()) { + if (!u->is_type()) + return false; + } + return true; + }; + + if (all_users_are_shape_of(*node) && + // primitives that should be executed to know output shapes + !node->is_type() && !node->is_type() && + !node->is_type()) { + // always to skip, no runtime execution + node->can_be_optimized(true); + GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node->id() << " has only shape_of as users. Set can_be_optimized always" + << std::endl; + continue; + } + + program_helpers::do_for_types(*node, [](gather_node& node) { // Check pattern auto impl_params = node.get_kernel_impl_params(); if (node.has_fused_primitives() || diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp index e9532f28b17c61..3990224c1b8518 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp @@ -50,7 +50,7 @@ void post_input_reorder::run(program& p) { // add a reorder if primitive's input format doesn't match implementation's input format if (node->is_type()) { const auto fc_impl = dynamic_cast*>(impl); - if (!fc_impl) + if (!fc_impl || node->can_be_optimized()) continue; const auto& fc_params = *static_cast(fc_impl->_kernel_data.params.get());