From 44919c995a8b3aab7c11ff7f4539aa9faa9d2734 Mon Sep 17 00:00:00 2001 From: Sajina Kandy Date: Sun, 15 Dec 2024 19:54:00 -0500 Subject: [PATCH] Test GPU metrics API in rocm smi library Fetch the GPU metrics using the rsmi_dev_gpu_metrics_info_get and test the values. Signed-off-by: Sajina P Kandy --- source/lib/core/categories.hpp | 2 ++ source/lib/core/components/fwd.hpp | 13 ++++++++ source/lib/core/config.cpp | 9 +++--- .../rocprofiler-systems/categories.h | 1 + source/lib/rocprof-sys/library/CMakeLists.txt | 3 ++ source/lib/rocprof-sys/library/rocm_smi.cpp | 30 ++++++++++++++++++- source/lib/rocprof-sys/library/rocm_smi.hpp | 30 ++++++++++++------- source/lib/rocprof-sys/library/sampling.cpp | 7 +++++ 8 files changed, 80 insertions(+), 15 deletions(-) diff --git a/source/lib/core/categories.hpp b/source/lib/core/categories.hpp index 0f09f4f1..cdf122ba 100644 --- a/source/lib/core/categories.hpp +++ b/source/lib/core/categories.hpp @@ -105,6 +105,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_busy, ROCPROFSYS_CATEGORY_ROCM_SMI ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_temp, ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, "device_temp", "Temperature of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_power, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, "device_power", "Power consumption of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_memory_usage, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, "device_memory_usage", "Memory usage of a GPU device") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_vcn_activity, ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY, "device_vcn_activity", "VCN Activity of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl, ROCPROFSYS_CATEGORY_ROCM_RCCL, "rccl", "ROCm Communication Collectives Library (RCCL) regions") ROCPROFSYS_DEFINE_CATEGORY(category, pthread, ROCPROFSYS_CATEGORY_PTHREAD, "pthread", "POSIX threading functions") ROCPROFSYS_DEFINE_CATEGORY(category, kokkos, ROCPROFSYS_CATEGORY_KOKKOS, "kokkos", "KokkosTools 
regions") @@ -167,6 +168,7 @@ using name = perfetto_category; ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_temp), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_power), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_memory_usage), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_vcn_activity), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl), \ ROCPROFSYS_PERFETTO_CATEGORY(category::pthread), \ ROCPROFSYS_PERFETTO_CATEGORY(category::kokkos), \ diff --git a/source/lib/core/components/fwd.hpp b/source/lib/core/components/fwd.hpp index 8e9343d9..272ecb0f 100644 --- a/source/lib/core/components/fwd.hpp +++ b/source/lib/core/components/fwd.hpp @@ -82,6 +82,8 @@ struct backtrace_gpu_power {}; struct backtrace_gpu_memory {}; +struct backtrace_gpu_vcn +{}; using sampling_wall_clock = data_tracker; using sampling_cpu_clock = data_tracker; using sampling_percent = data_tracker; @@ -89,6 +91,7 @@ using sampling_gpu_busy = data_tracker; using sampling_gpu_temp = data_tracker; using sampling_gpu_power = data_tracker; using sampling_gpu_memory = data_tracker; +using sampling_gpu_vcn = data_tracker; template @@ -121,6 +124,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_busy, fal ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_temp, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_power, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_memory, false_type) +ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_vcn, false_type) #endif TIMEMORY_SET_COMPONENT_API(rocprofsys::component::roctracer, project::rocprofsys, @@ -152,6 +156,9 @@ TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_temp, project::ro tpls::rocm, device::gpu, os::supports_linux, category::temperature, category::sampling, category::process_sampling) +TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_vcn, project::rocprofsys, + tpls::rocm, 
device::gpu, os::supports_linux, + category::sampling, category::process_sampling) TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::roctracer, "roctracer", "High-precision ROCm API and kernel tracing", "") @@ -180,6 +187,10 @@ TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_power, TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_temp, "sampling_gpu_temp", "GPU Temperature via ROCm-SMI", "Derived from sampling") +TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_vcn, + "sampling_gpu_vcn", + "VCN Activity (% activity) via ROCm-SMI", + "Derived from sampling") // statistics type TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_wall_clock, double) @@ -188,6 +199,7 @@ TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_busy, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_temp, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_power, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_memory, double) +TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_vcn, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::comm_data_tracker_t, float) // enable timing units @@ -219,6 +231,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_busy, false ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_temp, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_power, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_memory, false_type) +ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_vcn, false_type) // reporting categories (mean) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_mean, component::sampling_percent, false_type) diff --git a/source/lib/core/config.cpp b/source/lib/core/config.cpp index b5f249d4..2acf8cae 100644 --- a/source/lib/core/config.cpp +++ b/source/lib/core/config.cpp @@ -626,10 +626,11 @@ 
configure_settings(bool _init) rocprofiler_sdk::config_settings(_config); - ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_SMI_METRICS", - "rocm-smi metrics to collect: busy, temp, power, mem_usage", - "busy,temp,power,mem_usage", "backend", "rocm_smi", "rocm", - "process_sampling", "advanced"); + ROCPROFSYS_CONFIG_SETTING( + std::string, "ROCPROFSYS_ROCM_SMI_METRICS", + "rocm-smi metrics to collect: busy, temp, power, mem_usage, vcn_activity", + "busy,temp,power,mem_usage,vcn_activity", "backend", "rocm_smi", "rocm", + "process_sampling", "advanced"); ROCPROFSYS_CONFIG_SETTING(size_t, "ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB", "Hint for shared-memory buffer size in perfetto (in KB)", diff --git a/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h b/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h index fbc17bc7..82db6199 100644 --- a/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h +++ b/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h @@ -57,6 +57,7 @@ extern "C" ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, + ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY, ROCPROFSYS_CATEGORY_ROCM_RCCL, ROCPROFSYS_CATEGORY_SAMPLING, ROCPROFSYS_CATEGORY_PTHREAD, diff --git a/source/lib/rocprof-sys/library/CMakeLists.txt b/source/lib/rocprof-sys/library/CMakeLists.txt index 5084c439..277fad80 100644 --- a/source/lib/rocprof-sys/library/CMakeLists.txt +++ b/source/lib/rocprof-sys/library/CMakeLists.txt @@ -34,6 +34,9 @@ set(library_headers target_sources(rocprofiler-systems-object-library PRIVATE ${library_sources} ${library_headers}) +target_include_directories(rocprofiler-systems-core-library BEFORE + PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + if(ROCPROFSYS_USE_RCCL) target_sources(rocprofiler-systems-object-library PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp) diff --git a/source/lib/rocprof-sys/library/rocm_smi.cpp b/source/lib/rocprof-sys/library/rocm_smi.cpp index 
202a8cd8..b86b892d 100644 --- a/source/lib/rocprof-sys/library/rocm_smi.cpp +++ b/source/lib/rocprof-sys/library/rocm_smi.cpp @@ -159,6 +159,8 @@ data::sample(uint32_t _dev_id) &m_power, &power_type) ROCPROFSYS_RSMI_GET(get_settings(m_dev_id).mem_usage, rsmi_dev_memory_usage_get, _dev_id, RSMI_MEM_TYPE_VRAM, &m_mem_usage); + ROCPROFSYS_RSMI_GET(get_settings(m_dev_id).vcn_activity, + rsmi_dev_gpu_metrics_info_get, _dev_id, &m_gpu_metrics); #undef ROCPROFSYS_RSMI_GET } @@ -257,6 +259,7 @@ data::post_process(uint32_t _dev_id) using component::sampling_gpu_memory; using component::sampling_gpu_power; using component::sampling_gpu_temp; + using component::sampling_gpu_vcn; if(device_count < _dev_id) return; @@ -273,7 +276,7 @@ data::post_process(uint32_t _dev_id) auto _settings = get_settings(_dev_id); auto _process_perfetto = [&]() { - auto _idx = std::array{}; + auto _idx = std::array{}; { _idx.fill(_idx.size()); uint64_t nidx = 0; @@ -281,6 +284,7 @@ data::post_process(uint32_t _dev_id) if(_settings.temp) _idx.at(1) = nidx++; if(_settings.power) _idx.at(2) = nidx++; if(_settings.mem_usage) _idx.at(3) = nidx++; + if(_settings.vcn_activity) _idx.at(4) = nidx++; } for(auto& itr : _rocm_smi) @@ -301,6 +305,15 @@ data::post_process(uint32_t _dev_id) if(_settings.mem_usage) counter_track::emplace(_dev_id, addendum("Memory Usage"), "megabytes"); + if(_settings.vcn_activity) + { + for(std::size_t i = 0; i < std::size(itr.m_gpu_metrics.vcn_activity); + ++i) + counter_track::emplace( + _dev_id, + addendum(("VCN Activity on " + std::to_string(i)).c_str()), + "%"); + } } uint64_t _ts = itr.m_ts; if(!_thread_info->is_valid_time(_ts)) continue; @@ -322,6 +335,16 @@ data::post_process(uint32_t _dev_id) if(_settings.mem_usage) TRACE_COUNTER("device_memory_usage", counter_track::at(_dev_id, _idx.at(3)), _ts, _usage); + if(_settings.vcn_activity) + { + uint64_t idx = _idx.at(4); + for(const auto& temp : itr.m_gpu_metrics.vcn_activity) + { + TRACE_COUNTER("device_vcn_activity", 
counter_track::at(_dev_id, idx), + _ts, temp); + ++idx; + } + } } }; @@ -411,6 +434,7 @@ setup() key_pair_t{ "temp", get_settings(dev_id).temp }, key_pair_t{ "power", get_settings(dev_id).power }, key_pair_t{ "mem_usage", get_settings(dev_id).mem_usage }, + key_pair_t{ "vcn_activity", get_settings(dev_id).vcn_activity }, }; - get_settings(dev_id) = { false, false, false, false }; + get_settings(dev_id) = { false, false, false, false, false }; @@ -491,3 +515,7 @@ ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) diff --git a/source/lib/rocprof-sys/library/rocm_smi.hpp b/source/lib/rocprof-sys/library/rocm_smi.hpp index ef1b3d43..14b039f4 100644 --- a/source/lib/rocprof-sys/library/rocm_smi.hpp +++ b/source/lib/rocprof-sys/library/rocm_smi.hpp @@ -34,6 +34,8 @@ #include "core/state.hpp" #include "library/thread_data.hpp" +#include + #include #include #include @@ -45,6 +47,8 @@ #include #include +//struct rsmi_gpu_metrics_t; + namespace rocprofsys { namespace rocm_smi @@ -71,10 +75,11 @@ device_count(); struct settings { - bool busy = true; - bool temp = true; - bool power = true; - bool mem_usage = true; + bool busy = true; + bool temp = true; + bool power = true; + bool mem_usage = true; + bool vcn_activity = true; }; struct data @@ -99,12 +104,13 @@ struct data static void post_process(uint32_t _dev_id); - uint32_t m_dev_id = std::numeric_limits::max(); - timestamp_t m_ts = 0; - busy_perc_t m_busy_perc = 0; - temp_t m_temp = 0; - power_t m_power = 0; - mem_usage_t m_mem_usage = 0; + uint32_t m_dev_id = std::numeric_limits::max(); + timestamp_t m_ts = 0; + busy_perc_t m_busy_perc = 0; + temp_t m_temp = 0; + power_t m_power = 0; + mem_usage_t m_mem_usage = 0; + rsmi_gpu_metrics_t m_gpu_metrics = {}; friend std::ostream& operator<<(std::ostream& _os, const data& _v) { @@ -179,5 +185,9 @@ ROCPROFSYS_DECLARE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) 
+ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + # endif #endif diff --git a/source/lib/rocprof-sys/library/sampling.cpp b/source/lib/rocprof-sys/library/sampling.cpp index 6615fcd1..8bd394cd 100644 --- a/source/lib/rocprof-sys/library/sampling.cpp +++ b/source/lib/rocprof-sys/library/sampling.cpp @@ -129,6 +129,7 @@ using component::sampling_gpu_busy; using component::sampling_gpu_memory; using component::sampling_gpu_power; using component::sampling_gpu_temp; +using component::sampling_gpu_vcn; using component::sampling_percent; using component::sampling_wall_clock; } // namespace sampling @@ -1572,6 +1573,12 @@ struct sampling_initialization sampling_gpu_temp::display_unit() = "degC"; sampling_gpu_temp::set_precision(1); sampling_gpu_temp::set_format_flags(sampling_gpu_temp::get_format_flags()); + + sampling_gpu_vcn::label() = "sampling_gpu_vcn_percent"; + sampling_gpu_vcn::description() = "Utilization of VCN(s)"; + sampling_gpu_vcn::set_precision(0); + sampling_gpu_vcn::set_format_flags(sampling_gpu_vcn::get_format_flags() & + std::ios_base::showpoint); } }; } // namespace