From 820448bda5af427681a3985dd1b64f7a2540071e Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Tue, 23 Jan 2024 15:04:17 -0800 Subject: [PATCH] [GPU] Several optimization to reduce runtime host overhead (#22249) * Several optimization to reduce runtime host overhead * Reduced set_arg more * Applied review comments * Exclude mem_changed check for condition and loop because their memory is to be allocated during execution --- .../src/graph/impls/ocl/primitive_base.hpp | 8 ++--- .../src/graph/include/primitive_inst.h | 2 ++ src/plugins/intel_gpu/src/graph/network.cpp | 7 ++-- .../intel_gpu/src/graph/primitive_inst.cpp | 33 +++++++++++++++---- .../kernel_selector/kernel_selector_common.h | 2 ++ .../concatenation_kernel_base.cpp | 1 + 6 files changed, 38 insertions(+), 15 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index 6909681c99aaa2..d4e4927e200f1a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -259,8 +259,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> { for (size_t kd_idx = 0; kd_idx < _kernel_data.kernels.size(); ++kd_idx) { if (_kernel_data.kernels[kd_idx].skip_execution) continue; - std::vector<event::ptr> new_events; - // If any user of the prim's users is CPU implementation or network's output, set prim as a output event (event won't be nullptr) bool needs_completion_event = instance.needs_completion_event(); @@ -280,10 +278,10 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> { << (needs_completion_event ? 
" has_completion_event=true" : "") << std::endl; auto ev = stream.enqueue_kernel(*_kernels[kd_idx], params, args, tmp_events, needs_completion_event); - new_events.push_back(ev); + if (_kernel_data.needs_sub_kernels_sync) { + tmp_events = {ev}; + } all_events.push_back(ev); - - tmp_events = new_events; } if ((all_events.size() == 0) && (tmp_events.size() > 0)) diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index e2973d892e9fdd..ebe58eb8eef97f 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -225,6 +225,7 @@ class primitive_inst { void reset_output_change() { _output_changed = false; } bool shape_changed() const { return _shape_changed; } + bool mem_changed() const { return _mem_changed; } void reset_shape_change() { _shape_changed = false; } void set_shape_change() { _shape_changed = true; } @@ -351,6 +352,7 @@ class primitive_inst { bool _output_changed; // todo: implement output reuse if neither of inputs has changed bool _shape_changed = false; + bool _mem_changed = false; bool _has_valid_input = true; // by default all primitives has valid inputs, exception is input_layout (see input_layout_inst) bool _has_mutable_input = false; diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 1464ee4523c074..885732ca7c346e 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -852,10 +852,10 @@ void network::execute_impl(const std::vector& events) { // Wait for previous execution completion reset_execution(false); GPU_DEBUG_IF(debug_config->dump_runtime_memory_pool > 0) { - GPU_DEBUG_COUT << "----------------------------------------------" << std::endl; + GPU_DEBUG_COUT << "============================================================================" << std::endl; GPU_DEBUG_COUT << "Start network 
execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl; } else { - GPU_DEBUG_TRACE << "----------------------------------------------" << std::endl; + GPU_DEBUG_COUT << "============================================================================" << std::endl; GPU_DEBUG_TRACE << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl; } @@ -1035,7 +1035,6 @@ void network::execute_impl(const std::vector<event::ptr>& events) { execute_primitive(inst, events); executed_prims++; - if (needs_flushing && executed_prims % flush_frequency == 0) get_stream().flush(); @@ -1297,7 +1296,7 @@ void network::allocate_primitive_instance(program_node const& node) { std::function<bool(const program_node&)> is_mutable_input = [&is_mutable_input](const program_node& node) { for (auto& dep : node.get_dependencies()) { const auto dep_node = dep.first; - if (dep_node->is_type<input_layout>() || dep_node->is_type<mutable_data>() || dep_node->is_type<read_value>()) { + if (dep_node->is_type<input_layout>() || dep_node->is_type<mutable_data>() || (dep_node->is_type<read_value>() && !dep_node->can_be_optimized())) { return true; } if (dep_node->can_be_optimized()) { diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index dd5d9641196c20..500615bd249c8d 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -544,7 +544,7 @@ event::ptr primitive_inst::realloc_if_needed() { // If we allocated too large memory, reclaim the memory. - if (updated_layout.count() * 10 < _max_output_layout_count) { + if (updated_layout.get_buffer_size().count() * 10 < _max_output_layout_count) { GPU_DEBUG_TRACE_DETAIL << id() << ": Updated output size " << updated_layout.count() << " is much smaller than current memory size! 
" << _max_output_layout_count << "Reset memory" << std::endl; @@ -584,6 +584,7 @@ event::ptr primitive_inst::realloc_if_needed() { _outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout); } if (need_reset_output_memory() && !can_be_optimized()) { + GPU_DEBUG_TRACE_DETAIL << id() << " : Need reset output memory considering user" << std::endl; ev = _outputs[0]->fill(_network.get_stream()); } } else { @@ -1098,7 +1099,15 @@ event::ptr primitive_inst::execute(const std::vector& events) { const auto primitive_id = id(); OPENVINO_ASSERT(_has_valid_input, primitive_id, " has invalid/unset input"); GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_TRACE_DETAIL << "-----------------------------------------------------------------" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "Execute " << id() << " (type: " << _impl_params->desc->type_string() << ") " << std::endl; + for (size_t i = 0; i < _deps.size(); ++i) { + GPU_DEBUG_TRACE_DETAIL << "- inputs[" << i << "] : " << _deps[i].first->id() << std::endl; + } + GPU_DEBUG_TRACE_DETAIL << "-----------------------------------------------------------------" << std::endl; bool need_args_update = false; + _mem_changed = false; + const auto orig_outputs = _outputs; std::vector dependencies; if (is_dynamic() && !has_inner_networks()) { do_runtime_in_place_concat(); @@ -1173,9 +1182,8 @@ event::ptr primitive_inst::execute(const std::vector& events) { // Try update impl if current impl is dynamic because opt kernel may be added to impl cache through async compilation. // Only try update weight and realloc when impl is updated. 
if (shape_changed() || !_impl || (!shape_changed() && _impl->is_dynamic())) { - need_args_update = true; - if (update_impl()) { + need_args_update = true; auto ev = update_weights(); if (ev) dependencies.push_back(ev); @@ -1194,16 +1202,29 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) { // Dynamic insts may reallocate its' output buffer, so we need to update kernel's args respectively bool has_dynamic_dependencies_insts = std::any_of(_deps.begin(), _deps.end(), [](const std::pair<primitive_inst*, int32_t>& dep) { - return dep.first->is_dynamic(); + return dep.first->mem_changed(); }); // Output buffer may be changed under the following conditions, so we need to set args to kernel on each iteration - if ((is_dynamic() && need_args_update) || has_mutable_input() || is_output() || (!is_dynamic() && has_dynamic_dependencies_insts)) { + if ((is_dynamic() && need_args_update) || has_mutable_input() || is_output() || has_dynamic_dependencies_insts) { set_arguments(); } on_execute(); - GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << " (is_dynamic=" << _impl->is_dynamic() << ", " + if (!_node->is_type<condition>() && !_node->is_type<loop>()) { + for (size_t i = 0; i < _outputs.size(); ++i) { + if ((!orig_outputs[i] && _outputs[i]) || (orig_outputs[i] && !_outputs[i])) { + _mem_changed = true; + break; + } + if (!_network.get_engine().is_the_same_buffer(*orig_outputs[i], *_outputs[i])) { + _mem_changed = true; + break; + } + } + } + GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << " (is_dynamic=" << _impl->is_dynamic() + << ", " << "can_be_optimized=" << can_be_optimized() << ")" << std::endl; const bool out_of_order_queue = get_network().get_stream().get_queue_type() == QueueTypes::out_of_order; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h index c08e1b78e7bb60..363946afa0f704 100644 --- 
a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h @@ -96,6 +96,7 @@ struct KernelData { int autoTuneIndex = -1; bool can_reuse_memory = true; + bool needs_sub_kernels_sync = true; static bool SkipKernelExecution(const base_params& params, size_t kernel_id = 0) { for (const auto& input : params.inputs) { @@ -125,6 +126,7 @@ struct KernelData { kd.reorderInput = false; // for KW kd.autoTuneIndex = -1; kd.can_reuse_memory = true; + kd.needs_sub_kernels_sync = true; for (auto& kernel : kd.kernels) { kernel.skip_execution = SkipKernelExecution(orgParams); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/concatenation/concatenation_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/concatenation/concatenation_kernel_base.cpp index 297476ea6c83e5..7663aee8fae401 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/concatenation/concatenation_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/concatenation/concatenation_kernel_base.cpp @@ -141,6 +141,7 @@ KernelsData ConcatenationKernelBase::GetCommonKernelsData(const Params& params, const concatenation_params& orgParams = static_cast<const concatenation_params&>(params); KernelData kd = KernelData::Default<concatenation_params>(params, orgParams.inputs.size()); + kd.needs_sub_kernels_sync = false; GetUpdateDispatchDataFunc(kd); bool is_dynamic = orgParams.has_dynamic_tensors();