[GPU] Added more information for memory allocation in DumpProfilingData (openvinotoolkit#22974)

### Details:
- Added more information about memory allocation to the DumpProfilingData output
  (new_alloc/reuse_buffer/can_be_optimized/from_pool); a brief usage sketch follows the Tickets section below.

### Tickets:
 - *ticket-id*
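
For orientation, below is a self-contained sketch of the pattern this commit extends (not OpenVINO code): a scoped stage object collects a memory-allocation tag while the stage runs and reports it together with the timing when the scope ends, mirroring `profiled_stage::add_memalloc_info()` and the `GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO` macro in the diff. The class and names in the sketch are illustrative stand-ins.

```cpp
// Minimal stand-alone mimic (not plugin code) of the RAII pattern extended by this
// commit: the scoped object accumulates a memalloc tag and reports it, together
// with the stage duration, when the stage goes out of scope.
#include <chrono>
#include <iostream>
#include <string>

class scoped_stage {
public:
    explicit scoped_stage(std::string name)
        : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
    // Mirrors profiled_stage::add_memalloc_info(): tags are appended as plain strings.
    void add_memalloc_info(const std::string& info) { memalloc_info_ += info; }
    ~scoped_stage() {
        auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                      std::chrono::steady_clock::now() - start_).count();
        // In the real plugin this information ends up as a "(tag)" suffix on the
        // stage column of the profiling dump.
        std::cout << name_ << ": " << us << " us (" << memalloc_info_ << ")\n";
    }
private:
    std::string name_;
    std::string memalloc_info_;
    std::chrono::steady_clock::time_point start_;
};

int main() {
    scoped_stage stage("memory_allocation");  // stage name is illustrative
    const bool reused = true;                 // stand-in for the real reuse decision
    stage.add_memalloc_info(reused ? "reuse_buffer" : "new_alloc");
}   // destructor prints e.g. "memory_allocation: 3 us (reuse_buffer)"
```

In the plugin itself the tags are only recorded when profiling dumps are enabled through the GPU debug configuration (`dump_profiling_data`, available in builds with debug capabilities); how that option is set is outside the scope of this diff.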
yeonbok authored Feb 22, 2024
1 parent 942f23b commit 30cc6bb
Showing 7 changed files with 45 additions and 10 deletions.
@@ -57,6 +57,7 @@ enum class LogLevel : int8_t {
auto stage_prof = cldnn::instrumentation::profiled_stage<primitive_inst>(\
!cldnn::debug_configuration::get_instance()->dump_profiling_data.empty(), *this, stage)
#define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val) stage_prof.set_cache_hit(val)
#define GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO(info) stage_prof.add_memalloc_info(info)

#define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \
((cldnn::debug_configuration::get_instance()->verbose_color == 0) ? GPU_DEBUG_LOG_PREFIX : GPU_DEBUG_LOG_COLOR_PREFIX)
@@ -75,6 +76,7 @@ enum class LogLevel : int8_t {
#define GPU_DEBUG_DEFINE_MEM_LOGGER(stage)
#define GPU_DEBUG_PROFILED_STAGE(stage)
#define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val)
#define GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO(info)
#define GPU_DEBUG_LOG_RAW(min_verbose_level) if (0) std::cout << cldnn::debug_configuration::prefix
#endif

1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
@@ -92,6 +92,7 @@ struct memory {
#endif

std::shared_ptr<MemoryTracker> get_mem_tracker() const { return m_mem_tracker; }
GPU_DEBUG_CODE(bool from_memory_pool = false);

protected:
engine* _engine;
@@ -121,6 +121,7 @@ struct perf_counter_key {
pipeline_stage stage;
int64_t iteration_num;
bool cache_hit;
std::string memalloc_info;
};

struct perf_counter_hash {
@@ -170,10 +171,11 @@ class profiled_stage {
auto custom_stage_duration = std::chrono::duration_cast<us>(custom_duration).count();
auto total_duration = custom_stage_duration == 0 ? stage_duration
: custom_stage_duration;
_obj.add_profiling_data(_stage, cache_hit, total_duration, _per_iter_mode);
_obj.add_profiling_data(_stage, cache_hit, memalloc_info, total_duration, _per_iter_mode);
}
}
void set_cache_hit(bool val = true) { cache_hit = val; }
void add_memalloc_info(std::string info = "") { memalloc_info += info; }
void set_custom_stage_duration(std::chrono::nanoseconds duration) { custom_duration = duration; }

private:
@@ -185,6 +187,7 @@ class profiled_stage {
instrumentation::pipeline_stage _stage;
bool _per_iter_mode = false;
bool cache_hit = false;
std::string memalloc_info = "";
};

class mem_usage_logger {
9 changes: 6 additions & 3 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -279,7 +279,7 @@ class primitive_inst {
void rebuild_exec_deps(std::unordered_map<primitive_id, primitive_inst*> const& primitives);
std::string get_implementation_name() const;

void add_profiling_data(instrumentation::pipeline_stage stage, bool cache_hit, int64_t time, bool per_iter_mode = false);
void add_profiling_data(instrumentation::pipeline_stage stage, bool cache_hit, std::string memalloc_info, int64_t time, bool per_iter_mode = false);
const std::unordered_map<size_t, std::tuple<int64_t, size_t>>& get_profiling_data() const { return _profiling_data; }
const std::unordered_map<size_t, instrumentation::perf_counter_key>& get_profiling_info() const { return _profiling_info; }

@@ -377,9 +377,12 @@ class primitive_inst {
size_t _max_output_layout_count = 0;
std::vector<size_t> max_intermediates_memory_sizes;

std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true, bool runtime_alloc = false);
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr,
bool reset_mem = true,
bool runtime_alloc = false);
memory::ptr allocate_internal_buffer(size_t idx, bool reset = true);
static std::vector<primitive_inst*> build_exec_deps(std::vector<std::pair<primitive_inst*, int32_t>> const& mem_deps);
static std::vector<primitive_inst*> build_exec_deps(
std::vector<std::pair<primitive_inst*, int32_t>> const& mem_deps);
int32_t get_index_in_deps(memory::cptr arg) const;

// event function called by primitive_inst::execute after checking if primitive should rerun and before calling
10 changes: 9 additions & 1 deletion src/plugins/intel_gpu/src/graph/network.cpp
@@ -105,6 +105,9 @@ void dump_perf_data_raw(std::string dump_path, const std::list<std::shared_ptr<p
if (a_info.cache_hit != b_info.cache_hit)
return a_info.cache_hit;

if (a_info.memalloc_info != b_info.memalloc_info)
return a_info.memalloc_info.length() < b_info.memalloc_info.length();

size_t total_out_size_a = 0;
size_t total_out_size_b = 0;
for (auto& ol : a_info.output_layouts) {
@@ -124,9 +127,14 @@
std::string net_in_l_str = layouts_to_str(key.network_input_layouts);
std::string in_l_str = layouts_to_str(key.input_layouts);
std::string out_l_str = layouts_to_str(key.output_layouts);
std::string stage_suffix = "";
if (key.cache_hit)
stage_suffix += " (cache_hit) ";
if (key.memalloc_info != "")
stage_suffix += " (" + key.memalloc_info + ") ";
of << prim_id << ","
<< inst->desc()->type_string() << ","
<< key.stage << (key.cache_hit ? " (cache_hit)" : "") << ","
<< key.stage << stage_suffix << ","
<< net_in_l_str << ","
<< in_l_str << ","
<< out_l_str << ","
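
As a standalone illustration of the stage cell assembled above (made-up values, not plugin code): with both a cache hit and a reused output buffer, the stage column of the dumped CSV carries both suffixes.

```cpp
// Reproduces the suffix construction above with assumed inputs; prints
// "inference (cache_hit)  (reuse_buffer) ," (the stage name is made up, and the
// remaining CSV columns come from code outside this hunk).
#include <iostream>
#include <string>

int main() {
    const bool cache_hit = true;
    const std::string memalloc_info = "reuse_buffer";

    std::string stage_suffix = "";
    if (cache_hit)
        stage_suffix += " (cache_hit) ";
    if (!memalloc_info.empty())
        stage_suffix += " (" + memalloc_info + ") ";

    std::cout << "inference" << stage_suffix << "," << std::endl;
}
```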
24 changes: 20 additions & 4 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -512,6 +512,7 @@ event::ptr primitive_inst::realloc_if_needed() {
// so there is no need for output memory reallocation
if (can_be_optimized()) {
_max_output_layout_count = variable.get_actual_mem_size() / (dt_size / 8);
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized");
return ev;
}
}
@@ -540,6 +541,7 @@ event::ptr primitive_inst::realloc_if_needed() {
if (_node->is_type<gather>() || _node->is_type<permute>() || _node->is_type<reshape>() || _node->is_type<reorder>() || _node->is_type<strided_slice>()) {
if (can_be_optimized()) {
_max_output_layout_count = _deps[0].first->_max_output_layout_count;
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized");
return ev;
} else if (_outputs[0] && dep_memory_ptr(0) &&
_network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) {
@@ -569,6 +571,7 @@ event::ptr primitive_inst::realloc_if_needed() {
// Handle runtime dynamic concat optimization
if (_node->is_type<concatenation>() && can_be_optimized() && allocation_done_by_other) {
allocation_done_by_other = false;
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("concat_alloc_by_other");
return ev;
}

@@ -589,7 +592,6 @@ event::ptr primitive_inst::realloc_if_needed() {

if (updated_params.output_layouts[0].get_buffer_size().count() < updated_layout.get_buffer_size().count())
updated_params.output_layouts[0] = updated_layout;

if (can_reuse_buffer) {
GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer - "
<< actual_layout.count() << "/" << _max_output_layout_count
@@ -601,11 +603,19 @@ event::ptr primitive_inst::realloc_if_needed() {
GPU_DEBUG_TRACE_DETAIL << id() << " : Need reset output memory considering user" << std::endl;
ev = _outputs[0]->fill(_network.get_stream());
}
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("reuse_buffer");
} else {
GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. "
<< " Current buffer_size=" << _max_output_layout_count
<< " Requested buffer_size=" << updated_layout.count() << std::endl;
_outputs = allocate_outputs(&updated_params, need_reset_output_memory(), true);
GPU_DEBUG_CODE(std::string memalloc_info = "");
GPU_DEBUG_CODE(for (size_t out_idx = 0; out_idx < _outputs.size(); ++out_idx) {
memalloc_info += (((_outputs.size() > 1) ? ("o" + to_string(out_idx) + ":") : "") +
(_outputs[out_idx]->from_memory_pool ? "from_pool" : "new_alloc"));
})
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO(memalloc_info);

// TODO : need to handle multiple outputs
_max_output_layout_count = updated_params.output_layouts[0].get_buffer_size().count();
}
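
A note on the resulting tag format from the loop above: for a multi-output primitive the per-output tags are prefixed with `o<idx>:` and concatenated without a separator. The standalone snippet below (assumed allocation results, not plugin code) reproduces the string building.

```cpp
// Reproduces the per-output tag assembly above for a two-output primitive whose
// first output was served from the memory pool and whose second was newly allocated.
#include <iostream>
#include <string>
#include <vector>

int main() {
    const std::vector<bool> from_memory_pool = {true, false};  // assumed results
    std::string memalloc_info;
    for (size_t out_idx = 0; out_idx < from_memory_pool.size(); ++out_idx) {
        memalloc_info += ((from_memory_pool.size() > 1) ? ("o" + std::to_string(out_idx) + ":") : std::string()) +
                         (from_memory_pool[out_idx] ? "from_pool" : "new_alloc");
    }
    std::cout << memalloc_info << "\n";  // prints "o0:from_poolo1:new_alloc"
}
```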
@@ -672,11 +682,12 @@ event::ptr primitive_inst::realloc_if_needed() {
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
if (ibuf_layouts.empty())
return ev;

GPU_DEBUG_CODE(std::string memalloc_info = "");
for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
if (i < _intermediates_memory.size() && ibuf_layouts[i].bytes_count() <= max_intermediates_memory_sizes[i]) {
// can reuse
_intermediates_memory[i] = _network.get_engine().reinterpret_buffer(*_intermediates_memory[i], ibuf_layouts[i]);
GPU_DEBUG_CODE(memalloc_info += ((_intermediates_memory.size() > 1) ? ("i" + to_string(i) + ":") : "") + "reuse_buffer");
} else {
// TODO: If there is a kernel which requires reset internal buffer in the future,
// we'll need additional handle for that purpose like need_reset_output_memory
@@ -689,8 +700,12 @@ event::ptr primitive_inst::realloc_if_needed() {
_intermediates_memory.push_back(allocate_internal_buffer(i, need_reset));
max_intermediates_memory_sizes.push_back(_intermediates_memory[i]->size());
}
GPU_DEBUG_CODE(memalloc_info +=
(((_intermediates_memory.size() > 1) ? ("i" + to_string(i) + ":") : "") +
(_intermediates_memory[i]->from_memory_pool ? "from_pool" : "new_alloc")));
}
}
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO(memalloc_info);
}
return ev;
}
@@ -2056,7 +2071,7 @@ bool primitive_inst::is_valid_fusion() const {
return true;
}

void primitive_inst::add_profiling_data(instrumentation::pipeline_stage stage, bool cache_hit, int64_t time, bool per_iter_mode) {
void primitive_inst::add_profiling_data(instrumentation::pipeline_stage stage, bool cache_hit, std::string memalloc_info, int64_t time, bool per_iter_mode) {
instrumentation::perf_counter_key key {
_network.get_input_layouts(),
_impl_params->input_layouts,
@@ -2068,7 +2083,8 @@ void primitive_inst::add_profiling_data(instrumentation::pipeline_stage stage, b
#else
0,
#endif
cache_hit
cache_hit,
memalloc_info
};

auto hash = instrumentation::perf_counter_hash()(key);
4 changes: 3 additions & 1 deletion src/plugins/intel_gpu/src/runtime/memory_pool.cpp
@@ -132,6 +132,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
!has_conflict(it->second._users, restrictions, network_id)) {
it->second._users.insert(memory_user(id, network_id));
auto ret_mem = _engine->reinterpret_buffer(*it->second._memory, layout);
GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
return ret_mem;
} else {
++it;
@@ -153,7 +154,6 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
const std::set<primitive_id>& restrictions,
allocation_type type) {
auto first_level_cache = _padded_pool.find(layout);

if (first_level_cache != _padded_pool.end()) {
for (auto& rec_list : first_level_cache->second) {
if (rec_list._network_id == network_id &&
@@ -168,6 +168,7 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
!has_conflict(rec_list._users, restrictions, network_id)) {
rec_list._users.insert({id, network_id});
auto ret_mem = _engine->reinterpret_buffer(*(rec_list._memory), layout);
GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
return ret_mem;
}
}
Expand Down Expand Up @@ -199,6 +200,7 @@ memory::ptr memory_pool::get_from_across_networks_pool(const layout& layout,
if (!has_conflict(it->second._users, {}, network_id)) {
it->second._users.insert(memory_user(id, network_id));
auto ret_mem = _engine->reinterpret_buffer(*it->second._memory, layout);
GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
return ret_mem;
}
}