Commit
[GPU] Several optimizations to reduce runtime host overhead (openvinotoolkit#22249)

* Several optimizations to reduce runtime host overhead

* Further reduced set_arg calls

* Applied review comments

* Exclude the mem_changed check for condition and loop primitives because their memory is allocated during execution
yeonbok authored Jan 23, 2024
1 parent 76e75aa commit 820448b
Showing 6 changed files with 38 additions and 15 deletions.
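
The commit message above boils down to one idea: skip the costly set_arguments path on iterations where a primitive's output memory has not moved, and track that with a per-instance mem_changed flag. Below is a minimal, self-contained C++ sketch of that idea; PrimitiveInst, Buffer, and set_arguments_stub are illustrative stand-ins, not the actual intel_gpu types.

```cpp
#include <iostream>
#include <memory>
#include <vector>

struct Buffer { int id; };          // stand-in for a GPU memory allocation

struct PrimitiveInst {
    std::vector<std::shared_ptr<Buffer>> outputs;
    bool mem_changed = false;       // mirrors the new _mem_changed flag
    bool args_set = false;

    void set_arguments_stub() {     // models the costly kernel-argument update
        args_set = true;
        std::cout << "set_arguments called\n";
    }

    // One execution iteration: remember the old output buffers, (maybe) reallocate,
    // and only refresh kernel arguments when a buffer actually changed.
    void execute_iteration(bool realloc) {
        auto orig_outputs = outputs;
        if (realloc)
            outputs[0] = std::make_shared<Buffer>(Buffer{outputs[0]->id + 1});

        mem_changed = false;
        for (size_t i = 0; i < outputs.size(); ++i)
            if (orig_outputs[i].get() != outputs[i].get())
                mem_changed = true;

        if (mem_changed || !args_set)
            set_arguments_stub();   // skipped on iterations where memory is stable
    }
};

int main() {
    PrimitiveInst inst;
    inst.outputs.push_back(std::make_shared<Buffer>(Buffer{0}));
    inst.execute_iteration(false); // first run: args must be set
    inst.execute_iteration(false); // stable memory: set_arguments skipped
    inst.execute_iteration(true);  // reallocated output: set_arguments again
}
```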
8 changes: 3 additions & 5 deletions src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -259,8 +259,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
for (size_t kd_idx = 0; kd_idx < _kernel_data.kernels.size(); ++kd_idx) {
if (_kernel_data.kernels[kd_idx].skip_execution)
continue;
std::vector<event::ptr> new_events;

// If any user of the prim's users is CPU implementation or network's output, set the prim as an output event (event won't be nullptr)
bool needs_completion_event = instance.needs_completion_event();

@@ -280,10 +278,10 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
<< (needs_completion_event ? " has_completion_event=true" : "") << std::endl;

auto ev = stream.enqueue_kernel(*_kernels[kd_idx], params, args, tmp_events, needs_completion_event);
new_events.push_back(ev);
if (_kernel_data.needs_sub_kernels_sync) {
tmp_events = {ev};
}
all_events.push_back(ev);

tmp_events = new_events;
}

if ((all_events.size() == 0) && (tmp_events.size() > 0))
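The hunk above removes the per-kernel new_events vector: every enqueued event now goes straight into all_events, and sub-kernels only wait on each other when the new needs_sub_kernels_sync flag is set. A minimal sketch of that control flow, with Event, Stream, and KernelData as simplified stand-ins for the real cldnn types:

```cpp
#include <cstdio>
#include <memory>
#include <vector>

struct Event { int id; };
using EventPtr = std::shared_ptr<Event>;

struct Stream {
    int next_id = 0;
    // deps lists the events this kernel must wait for before it may start.
    EventPtr enqueue(const std::vector<EventPtr>& deps) {
        std::printf("kernel %d waits on %zu event(s)\n", next_id, deps.size());
        return std::make_shared<Event>(Event{next_id++});
    }
};

struct KernelData { bool needs_sub_kernels_sync = true; };

std::vector<EventPtr> run_kernels(Stream& stream, const KernelData& kd,
                                  size_t kernel_count, std::vector<EventPtr> tmp_events) {
    std::vector<EventPtr> all_events;
    for (size_t i = 0; i < kernel_count; ++i) {
        auto ev = stream.enqueue(tmp_events);
        if (kd.needs_sub_kernels_sync)
            tmp_events = {ev};      // chain: the next sub-kernel waits for this one
        all_events.push_back(ev);   // every event is still reported to the caller
    }
    return all_events;
}

int main() {
    Stream stream;
    run_kernels(stream, KernelData{true},  3, {});  // serialized sub-kernels
    run_kernels(stream, KernelData{false}, 3, {});  // independent sub-kernels (e.g. concat)
}
```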
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -225,6 +225,7 @@ class primitive_inst {
void reset_output_change() { _output_changed = false; }

bool shape_changed() const { return _shape_changed; }
bool mem_changed() const { return _mem_changed; }
void reset_shape_change() { _shape_changed = false; }
void set_shape_change() { _shape_changed = true; }

@@ -351,6 +352,7 @@ class primitive_inst {

bool _output_changed; // todo: implement output reuse if neither of inputs has changed
bool _shape_changed = false;
bool _mem_changed = false;
bool _has_valid_input =
true; // by default all primitives have valid inputs; the exception is input_layout (see input_layout_inst)
bool _has_mutable_input = false;
7 changes: 3 additions & 4 deletions src/plugins/intel_gpu/src/graph/network.cpp
@@ -852,10 +852,10 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
// Wait for previous execution completion
reset_execution(false);
GPU_DEBUG_IF(debug_config->dump_runtime_memory_pool > 0) {
GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
GPU_DEBUG_COUT << "============================================================================" << std::endl;
GPU_DEBUG_COUT << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
} else {
GPU_DEBUG_TRACE << "----------------------------------------------" << std::endl;
GPU_DEBUG_COUT << "============================================================================" << std::endl;
GPU_DEBUG_TRACE << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
}

@@ -1035,7 +1035,6 @@ void network::execute_impl(const std::vector<event::ptr>& events) {

execute_primitive(inst, events);
executed_prims++;

if (needs_flushing && executed_prims % flush_frequency == 0)
get_stream().flush();

@@ -1297,7 +1296,7 @@ void network::allocate_primitive_instance(program_node const& node) {
std::function<bool(const program_node&)> is_mutable_input = [&is_mutable_input](const program_node& node) {
for (auto& dep : node.get_dependencies()) {
const auto dep_node = dep.first;
if (dep_node->is_type<input_layout>() || dep_node->is_type<mutable_data>() || dep_node->is_type<read_value>()) {
if (dep_node->is_type<input_layout>() || dep_node->is_type<mutable_data>() || (dep_node->is_type<read_value>() && !dep_node->can_be_optimized())) {
return true;
}
if (dep_node->can_be_optimized()) {
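The change above stops treating an optimized-out read_value as a mutable input, so its users no longer fall into the set_arguments-on-every-iteration path. A rough, self-contained sketch of the dependency walk; Node stands in for program_node, and the recursion into optimized-out dependencies is an assumption, since the lambda body is truncated in the hunk:

```cpp
#include <vector>

struct Node {
    bool is_input_layout = false;
    bool is_mutable_data = false;
    bool is_read_value   = false;
    bool optimized_out   = false;          // models can_be_optimized()
    std::vector<const Node*> deps;
};

bool is_mutable_input(const Node& node) {
    for (const Node* dep : node.deps) {
        // An optimized-out read_value no longer forces per-iteration set_arguments.
        if (dep->is_input_layout || dep->is_mutable_data ||
            (dep->is_read_value && !dep->optimized_out))
            return true;
        if (dep->optimized_out && is_mutable_input(*dep))
            return true;                   // look through optimized-out nodes
    }
    return false;
}

int main() {
    Node state;  state.is_read_value = true; state.optimized_out = true;
    Node user;   user.deps.push_back(&state);
    return is_mutable_input(user) ? 1 : 0; // 0: optimized-out read_value is not a mutable input
}
```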
33 changes: 27 additions & 6 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -544,7 +544,7 @@ event::ptr primitive_inst::realloc_if_needed() {


// If we allocated too large memory, reclaim the memory.
if (updated_layout.count() * 10 < _max_output_layout_count) {
if (updated_layout.get_buffer_size().count() * 10 < _max_output_layout_count) {
GPU_DEBUG_TRACE_DETAIL << id() << ": Updated output size " << updated_layout.count()
<< " is much smaller than current memory size! " << _max_output_layout_count
<< "Reset memory" << std::endl;
@@ -584,6 +584,7 @@ event::ptr primitive_inst::realloc_if_needed() {
_outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
}
if (need_reset_output_memory() && !can_be_optimized()) {
GPU_DEBUG_TRACE_DETAIL << id() << " : Need reset output memory considering user" << std::endl;
ev = _outputs[0]->fill(_network.get_stream());
}
} else {
@@ -1098,7 +1099,15 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
const auto primitive_id = id();
OPENVINO_ASSERT(_has_valid_input, primitive_id, " has invalid/unset input");
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_TRACE_DETAIL << "-----------------------------------------------------------------" << std::endl;
GPU_DEBUG_TRACE_DETAIL << "Execute " << id() << " (type: " << _impl_params->desc->type_string() << ") " << std::endl;
for (size_t i = 0; i < _deps.size(); ++i) {
GPU_DEBUG_TRACE_DETAIL << "- inputs[" << i << "] : " << _deps[i].first->id() << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << "-----------------------------------------------------------------" << std::endl;
bool need_args_update = false;
_mem_changed = false;
const auto orig_outputs = _outputs;
std::vector<event::ptr> dependencies;
if (is_dynamic() && !has_inner_networks()) {
do_runtime_in_place_concat();
@@ -1173,9 +1182,8 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
// Try update impl if current impl is dynamic because opt kernel may be added to impl cache through async compilation.
// Only try update weight and realloc when impl is updated.
if (shape_changed() || !_impl || (!shape_changed() && _impl->is_dynamic())) {
need_args_update = true;

if (update_impl()) {
need_args_update = true;
auto ev = update_weights();
if (ev)
dependencies.push_back(ev);
@@ -1194,16 +1202,29 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
// Dynamic insts may reallocate their output buffers, so we need to update the kernel's args accordingly
bool has_dynamic_dependencies_insts = std::any_of(_deps.begin(), _deps.end(),
[](const std::pair<primitive_inst*, int32_t>& dep) {
return dep.first->is_dynamic();
return dep.first->mem_changed();
});

// Output buffer may be changed under the following conditions, so we need to set args to kernel on each iteration
if ((is_dynamic() && need_args_update) || has_mutable_input() || is_output() || (!is_dynamic() && has_dynamic_dependencies_insts)) {
if ((is_dynamic() && need_args_update) || has_mutable_input() || is_output() || has_dynamic_dependencies_insts) {
set_arguments();
}
on_execute();

GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << " (is_dynamic=" << _impl->is_dynamic() << ", "
if (!_node->is_type<condition>() && !_node->is_type<loop>()) {
for (size_t i = 0; i < _outputs.size(); ++i) {
if ((!orig_outputs[i] && _outputs[i]) || (orig_outputs[i] && !_outputs[i])) {
_mem_changed = true;
break;
}
if (!_network.get_engine().is_the_same_buffer(*orig_outputs[i], *_outputs[i])) {
_mem_changed = true;
break;
}
}
}
GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << " (is_dynamic=" << _impl->is_dynamic()
<< ", "
<< "can_be_optimized=" << can_be_optimized() << ")" << std::endl;

const bool out_of_order_queue = get_network().get_stream().get_queue_type() == QueueTypes::out_of_order;
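The execute() changes above snapshot the output buffers at the start of the iteration, compare them against the possibly reallocated outputs afterwards, and record the result in _mem_changed; consumers then use mem_changed() instead of a blanket is_dynamic() test to decide whether set_arguments is needed. A self-contained sketch of that comparison, with Memory and is_the_same_buffer as simplified stand-ins for the engine's memory objects:

```cpp
#include <memory>
#include <vector>

struct Memory { void* handle; };
using MemPtr = std::shared_ptr<Memory>;

// The real engine compares underlying device buffers; here we compare the stand-in handle.
static bool is_the_same_buffer(const Memory& a, const Memory& b) {
    return a.handle == b.handle;
}

// Returns true when any output buffer was (re)allocated or dropped during this iteration.
// The real code skips this check for condition/loop nodes, whose memory is
// allocated while the inner body executes.
bool compute_mem_changed(const std::vector<MemPtr>& orig_outputs,
                         const std::vector<MemPtr>& outputs,
                         bool is_condition_or_loop) {
    if (is_condition_or_loop)
        return false;
    for (size_t i = 0; i < outputs.size(); ++i) {
        const bool one_side_missing =
            (!orig_outputs[i] && outputs[i]) || (orig_outputs[i] && !outputs[i]);
        if (one_side_missing)
            return true;
        if (orig_outputs[i] && outputs[i] &&
            !is_the_same_buffer(*orig_outputs[i], *outputs[i]))
            return true;
    }
    return false;
}

int main() {
    int a = 0, b = 0;
    auto m1 = std::make_shared<Memory>(Memory{&a});
    auto m2 = std::make_shared<Memory>(Memory{&b});
    bool changed_same    = compute_mem_changed({m1}, {m1}, false);  // false: buffer unchanged
    bool changed_realloc = compute_mem_changed({m1}, {m2}, false);  // true: buffer reallocated
    return (!changed_same && changed_realloc) ? 0 : 1;
}
```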
@@ -96,6 +96,7 @@ struct KernelData {
int autoTuneIndex = -1;

bool can_reuse_memory = true;
bool needs_sub_kernels_sync = true;

static bool SkipKernelExecution(const base_params& params, size_t kernel_id = 0) {
for (const auto& input : params.inputs) {
@@ -125,6 +126,7 @@ struct KernelData {
kd.reorderInput = false; // for KW
kd.autoTuneIndex = -1;
kd.can_reuse_memory = true;
kd.needs_sub_kernels_sync = true;

for (auto& kernel : kd.kernels) {
kernel.skip_execution = SkipKernelExecution(orgParams);
@@ -141,6 +141,7 @@ KernelsData ConcatenationKernelBase::GetCommonKernelsData(const Params& params,

const concatenation_params& orgParams = static_cast<const concatenation_params&>(params);
KernelData kd = KernelData::Default<concatenation_params>(params, orgParams.inputs.size());
kd.needs_sub_kernels_sync = false;
GetUpdateDispatchDataFunc(kd);

bool is_dynamic = orgParams.has_dynamic_tensors();
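KernelData now carries a needs_sub_kernels_sync flag that defaults to true, and ConcatenationKernelBase clears it, presumably because concatenation's sub-kernels write disjoint regions of the output and need no ordering between them. A minimal sketch of the kernel-selector side, using a trimmed stand-in KernelData:

```cpp
#include <cstddef>
#include <vector>

struct KernelData {
    std::vector<int> kernels;            // one entry per generated sub-kernel
    bool needs_sub_kernels_sync = true;  // default: serialize sub-kernels
};

// Concatenation-style primitives write disjoint regions of one output buffer,
// so their sub-kernels can be enqueued without waiting on each other.
KernelData make_concat_like_kernels_data(size_t num_inputs) {
    KernelData kd;
    kd.kernels.resize(num_inputs);
    kd.needs_sub_kernels_sync = false;   // mirrors the concatenation change above
    return kd;
}

int main() {
    auto kd = make_concat_like_kernels_data(4);
    return kd.needs_sub_kernels_sync ? 1 : 0;  // 0: sub-kernels run independently
}
```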

0 comments on commit 820448b
