[GPU] Added more information for memory allocation in DumpProfilingData (openvinotoolkit#22974)

### Details:
- Added more information about memory allocation to the DumpProfilingData output
  (new_alloc/reuse_buffer/can_be_optimized/from_pool); a brief usage sketch follows the Tickets section below.

### Tickets:
 - *ticket-id*
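
For orientation, below is a self-contained sketch of the pattern this commit extends (not OpenVINO code): a scoped stage object collects a memory-allocation tag while the stage runs and reports it together with the timing when the scope ends, mirroring `profiled_stage::add_memalloc_info()` and the `GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO` macro in the diff. The class and names in the sketch are illustrative stand-ins.

```cpp
// Minimal stand-alone mimic (not plugin code) of the RAII pattern extended by this
// commit: the scoped object accumulates a memalloc tag and reports it, together
// with the stage duration, when the stage goes out of scope.
#include <chrono>
#include <iostream>
#include <string>

class scoped_stage {
public:
    explicit scoped_stage(std::string name)
        : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
    // Mirrors profiled_stage::add_memalloc_info(): tags are appended as plain strings.
    void add_memalloc_info(const std::string& info) { memalloc_info_ += info; }
    ~scoped_stage() {
        auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                      std::chrono::steady_clock::now() - start_).count();
        // In the real plugin this information ends up as a "(tag)" suffix on the
        // stage column of the profiling dump.
        std::cout << name_ << ": " << us << " us (" << memalloc_info_ << ")\n";
    }
private:
    std::string name_;
    std::string memalloc_info_;
    std::chrono::steady_clock::time_point start_;
};

int main() {
    scoped_stage stage("memory_allocation");  // stage name is illustrative
    const bool reused = true;                 // stand-in for the real reuse decision
    stage.add_memalloc_info(reused ? "reuse_buffer" : "new_alloc");
}   // destructor prints e.g. "memory_allocation: 3 us (reuse_buffer)"
```

In the plugin itself the tags are only recorded when profiling dumps are enabled through the GPU debug configuration (`dump_profiling_data`, available in builds with debug capabilities); how that option is set is outside the scope of this diff.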
yeonbok authored Feb 22, 2024
1 parent 942f23b commit 30cc6bb
Showing 7 changed files with 45 additions and 10 deletions.
@@ -57,6 +57,7 @@ enum class LogLevel : int8_t {
auto stage_prof = cldnn::instrumentation::profiled_stage<primitive_inst>(\
!cldnn::debug_configuration::get_instance()->dump_profiling_data.empty(), *this, stage)
#define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val) stage_prof.set_cache_hit(val)
#define GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO(info) stage_prof.add_memalloc_info(info)

#define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \
((cldnn::debug_configuration::get_instance()->verbose_color == 0) ? GPU_DEBUG_LOG_PREFIX : GPU_DEBUG_LOG_COLOR_PREFIX)
@@ -75,6 +76,7 @@ enum class LogLevel : int8_t {
#define GPU_DEBUG_DEFINE_MEM_LOGGER(stage)
#define GPU_DEBUG_PROFILED_STAGE(stage)
#define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val)
#define GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO(info)
#define GPU_DEBUG_LOG_RAW(min_verbose_level) if (0) std::cout << cldnn::debug_configuration::prefix
#endif

1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
@@ -92,6 +92,7 @@ struct memory {
#endif

std::shared_ptr<MemoryTracker> get_mem_tracker() const { return m_mem_tracker; }
GPU_DEBUG_CODE(bool from_memory_pool = false);

protected:
engine* _engine;
@@ -121,6 +121,7 @@ struct perf_counter_key {
pipeline_stage stage;
int64_t iteration_num;
bool cache_hit;
std::string memalloc_info;
};

struct perf_counter_hash {
@@ -170,10 +171,11 @@ class profiled_stage {
auto custom_stage_duration = std::chrono::duration_cast<us>(custom_duration).count();
auto total_duration = custom_stage_duration == 0 ? stage_duration
: custom_stage_duration;
_obj.add_profiling_data(_stage, cache_hit, total_duration, _per_iter_mode);
_obj.add_profiling_data(_stage, cache_hit, memalloc_info, total_duration, _per_iter_mode);
}
}
void set_cache_hit(bool val = true) { cache_hit = val; }
void add_memalloc_info(std::string info = "") { memalloc_info += info; }
void set_custom_stage_duration(std::chrono::nanoseconds duration) { custom_duration = duration; }

private:
@@ -185,6 +187,7 @@ class profiled_stage {
instrumentation::pipeline_stage _stage;
bool _per_iter_mode = false;
bool cache_hit = false;
std::string memalloc_info = "";
};

class mem_usage_logger {
9 changes: 6 additions & 3 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -279,7 +279,7 @@ class primitive_inst {
void rebuild_exec_deps(std::unordered_map<primitive_id, primitive_inst*> const& primitives);
std::string get_implementation_name() const;

void add_profiling_data(instrumentation::pipeline_stage stage, bool cache_hit, int64_t time, bool per_iter_mode = false);
void add_profiling_data(instrumentation::pipeline_stage stage, bool cache_hit, std::string memalloc_info, int64_t time, bool per_iter_mode = false);
const std::unordered_map<size_t, std::tuple<int64_t, size_t>>& get_profiling_data() const { return _profiling_data; }
const std::unordered_map<size_t, instrumentation::perf_counter_key>& get_profiling_info() const { return _profiling_info; }

@@ -377,9 +377,12 @@ class primitive_inst {
size_t _max_output_layout_count = 0;
std::vector<size_t> max_intermediates_memory_sizes;

std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true, bool runtime_alloc = false);
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr,
bool reset_mem = true,
bool runtime_alloc = false);
memory::ptr allocate_internal_buffer(size_t idx, bool reset = true);
static std::vector<primitive_inst*> build_exec_deps(std::vector<std::pair<primitive_inst*, int32_t>> const& mem_deps);
static std::vector<primitive_inst*> build_exec_deps(
std::vector<std::pair<primitive_inst*, int32_t>> const& mem_deps);
int32_t get_index_in_deps(memory::cptr arg) const;

// event function called by primitive_inst::execute after checking if primitive should rerun and before calling
10 changes: 9 additions & 1 deletion src/plugins/intel_gpu/src/graph/network.cpp
@@ -105,6 +105,9 @@ void dump_perf_data_raw(std::string dump_path, const std::list<std::shared_ptr<p
if (a_info.cache_hit != b_info.cache_hit)
return a_info.cache_hit;

if (a_info.memalloc_info != b_info.memalloc_info)
return a_info.memalloc_info.length() < b_info.memalloc_info.length();

size_t total_out_size_a = 0;
size_t total_out_size_b = 0;
for (auto& ol : a_info.output_layouts) {
@@ -124,9 +127,14 @@
std::string net_in_l_str = layouts_to_str(key.network_input_layouts);
std::string in_l_str = layouts_to_str(key.input_layouts);
std::string out_l_str = layouts_to_str(key.output_layouts);
std::string stage_suffix = "";
if (key.cache_hit)
stage_suffix += " (cache_hit) ";
if (key.memalloc_info != "")
stage_suffix += " (" + key.memalloc_info + ") ";
of << prim_id << ","
<< inst->desc()->type_string() << ","
<< key.stage << (key.cache_hit ? " (cache_hit)" : "") << ","
<< key.stage << stage_suffix << ","
<< net_in_l_str << ","
<< in_l_str << ","
<< out_l_str << ","
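
As a standalone illustration of the stage cell assembled above (made-up values, not plugin code): with both a cache hit and a reused output buffer, the stage column of the dumped CSV carries both suffixes.

```cpp
// Reproduces the suffix construction above with assumed inputs; prints
// "inference (cache_hit)  (reuse_buffer) ," (the stage name is made up, and the
// remaining CSV columns come from code outside this hunk).
#include <iostream>
#include <string>

int main() {
    const bool cache_hit = true;
    const std::string memalloc_info = "reuse_buffer";

    std::string stage_suffix = "";
    if (cache_hit)
        stage_suffix += " (cache_hit) ";
    if (!memalloc_info.empty())
        stage_suffix += " (" + memalloc_info + ") ";

    std::cout << "inference" << stage_suffix << "," << std::endl;
}
```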
24 changes: 20 additions & 4 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -512,6 +512,7 @@ event::ptr primitive_inst::realloc_if_needed() {
// so there is no need for output memory reallocation
if (can_be_optimized()) {
_max_output_layout_count = variable.get_actual_mem_size() / (dt_size / 8);
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized");
return ev;
}
}
@@ -540,6 +541,7 @@ event::ptr primitive_inst::realloc_if_needed() {
if (_node->is_type<gather>() || _node->is_type<permute>() || _node->is_type<reshape>() || _node->is_type<reorder>() || _node->is_type<strided_slice>()) {
if (can_be_optimized()) {
_max_output_layout_count = _deps[0].first->_max_output_layout_count;
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized");
return ev;
} else if (_outputs[0] && dep_memory_ptr(0) &&
_network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) {
@@ -569,6 +571,7 @@ event::ptr primitive_inst::realloc_if_needed() {
// Handle runtime dynamic concat optimization
if (_node->is_type<concatenation>() && can_be_optimized() && allocation_done_by_other) {
allocation_done_by_other = false;
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("concat_alloc_by_other");
return ev;
}

@@ -589,7 +592,6 @@ event::ptr primitive_inst::realloc_if_needed() {

if (updated_params.output_layouts[0].get_buffer_size().count() < updated_layout.get_buffer_size().count())
updated_params.output_layouts[0] = updated_layout;

if (can_reuse_buffer) {
GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer - "
<< actual_layout.count() << "/" << _max_output_layout_count
@@ -601,11 +603,19 @@ event::ptr primitive_inst::realloc_if_needed() {
GPU_DEBUG_TRACE_DETAIL << id() << " : Need reset output memory considering user" << std::endl;
ev = _outputs[0]->fill(_network.get_stream());
}
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("reuse_buffer");
} else {
GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. "
<< " Current buffer_size=" << _max_output_layout_count
<< " Requested buffer_size=" << updated_layout.count() << std::endl;
_outputs = allocate_outputs(&updated_params, need_reset_output_memory(), true);
GPU_DEBUG_CODE(std::string memalloc_info = "");
GPU_DEBUG_CODE(for (size_t out_idx = 0; out_idx < _outputs.size(); ++out_idx) {
memalloc_info += (((_outputs.size() > 1) ? ("o" + to_string(out_idx) + ":") : "") +
(_outputs[out_idx]->from_memory_pool ? "from_pool" : "new_alloc"));
})
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO(memalloc_info);

// TODO : need to handle multiple outputs
_max_output_layout_count = updated_params.output_layouts[0].get_buffer_size().count();
}
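
A note on the resulting tag format from the loop above: for a multi-output primitive the per-output tags are prefixed with `o<idx>:` and concatenated without a separator. The standalone snippet below (assumed allocation results, not plugin code) reproduces the string building.

```cpp
// Reproduces the per-output tag assembly above for a two-output primitive whose
// first output was served from the memory pool and whose second was newly allocated.
#include <iostream>
#include <string>
#include <vector>

int main() {
    const std::vector<bool> from_memory_pool = {true, false};  // assumed results
    std::string memalloc_info;
    for (size_t out_idx = 0; out_idx < from_memory_pool.size(); ++out_idx) {
        memalloc_info += ((from_memory_pool.size() > 1) ? ("o" + std::to_string(out_idx) + ":") : std::string()) +
                         (from_memory_pool[out_idx] ? "from_pool" : "new_alloc");
    }
    std::cout << memalloc_info << "\n";  // prints "o0:from_poolo1:new_alloc"
}
```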
@@ -672,11 +682,12 @@ event::ptr primitive_inst::realloc_if_needed() {
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
if (ibuf_layouts.empty())
return ev;

GPU_DEBUG_CODE(std::string memalloc_info = "");
for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
if (i < _intermediates_memory.size() && ibuf_layouts[i].bytes_count() <= max_intermediates_memory_sizes[i]) {
// can reuse
_intermediates_memory[i] = _network.get_engine().reinterpret_buffer(*_intermediates_memory[i], ibuf_layouts[i]);
GPU_DEBUG_CODE(memalloc_info += ((_intermediates_memory.size() > 1) ? ("i" + to_string(i) + ":") : "") + "reuse_buffer");
} else {
// TODO: If there is a kernel which requires reset internal buffer in the future,
// we'll need additional handle for that purpose like need_reset_output_memory
@@ -689,8 +700,12 @@ event::ptr primitive_inst::realloc_if_needed() {
_intermediates_memory.push_back(allocate_internal_buffer(i, need_reset));
max_intermediates_memory_sizes.push_back(_intermediates_memory[i]->size());
}
GPU_DEBUG_CODE(memalloc_info +=
(((_intermediates_memory.size() > 1) ? ("i" + to_string(i) + ":") : "") +
(_intermediates_memory[i]->from_memory_pool ? "from_pool" : "new_alloc")));
}
}
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO(memalloc_info);
}
return ev;
}
@@ -2056,7 +2071,7 @@ bool primitive_inst::is_valid_fusion() const {
return true;
}

void primitive_inst::add_profiling_data(instrumentation::pipeline_stage stage, bool cache_hit, int64_t time, bool per_iter_mode) {
void primitive_inst::add_profiling_data(instrumentation::pipeline_stage stage, bool cache_hit, std::string memalloc_info, int64_t time, bool per_iter_mode) {
instrumentation::perf_counter_key key {
_network.get_input_layouts(),
_impl_params->input_layouts,
@@ -2068,7 +2083,8 @@ void primitive_inst::add_profiling_data(instrumentation::pipeline_stage stage, b
#else
0,
#endif
cache_hit
cache_hit,
memalloc_info
};

auto hash = instrumentation::perf_counter_hash()(key);
4 changes: 3 additions & 1 deletion src/plugins/intel_gpu/src/runtime/memory_pool.cpp
@@ -132,6 +132,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
!has_conflict(it->second._users, restrictions, network_id)) {
it->second._users.insert(memory_user(id, network_id));
auto ret_mem = _engine->reinterpret_buffer(*it->second._memory, layout);
GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
return ret_mem;
} else {
++it;
@@ -153,7 +154,6 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
const std::set<primitive_id>& restrictions,
allocation_type type) {
auto first_level_cache = _padded_pool.find(layout);

if (first_level_cache != _padded_pool.end()) {
for (auto& rec_list : first_level_cache->second) {
if (rec_list._network_id == network_id &&
@@ -168,6 +168,7 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
!has_conflict(rec_list._users, restrictions, network_id)) {
rec_list._users.insert({id, network_id});
auto ret_mem = _engine->reinterpret_buffer(*(rec_list._memory), layout);
GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
return ret_mem;
}
}
Expand Down Expand Up @@ -199,6 +200,7 @@ memory::ptr memory_pool::get_from_across_networks_pool(const layout& layout,
if (!has_conflict(it->second._users, {}, network_id)) {
it->second._users.insert(memory_user(id, network_id));
auto ret_mem = _engine->reinterpret_buffer(*it->second._memory, layout);
GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
return ret_mem;
}
}