Adds a VCN-activity metric to the ROCm-SMI sampling backend: new category, sampling component, `vcn_activity` setting, and perfetto counter tracks. NOTE(review): template arguments and #include targets look stripped in this copy of the patch - confirm against the real diff before applying.
diff --git a/source/lib/core/categories.hpp b/source/lib/core/categories.hpp index 0f09f4f1..cdf122ba 100644 --- a/source/lib/core/categories.hpp +++ b/source/lib/core/categories.hpp @@ -105,6 +105,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_busy, ROCPROFSYS_CATEGORY_ROCM_SMI ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_temp, ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, "device_temp", "Temperature of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_power, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, "device_power", "Power consumption of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_memory_usage, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, "device_memory_usage", "Memory usage of a GPU device") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_vcn_activity, ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY, "device_vcn_activity", "VCN Activity of a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl, ROCPROFSYS_CATEGORY_ROCM_RCCL, "rccl", "ROCm Communication Collectives Library (RCCL) regions") ROCPROFSYS_DEFINE_CATEGORY(category, pthread, ROCPROFSYS_CATEGORY_PTHREAD, "pthread", "POSIX threading functions") ROCPROFSYS_DEFINE_CATEGORY(category, kokkos, ROCPROFSYS_CATEGORY_KOKKOS, "kokkos", "KokkosTools regions") @@ -167,6 +168,7 @@ using name = perfetto_category; ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_temp), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_power), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_memory_usage), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_vcn_activity), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl), \ ROCPROFSYS_PERFETTO_CATEGORY(category::pthread), \ ROCPROFSYS_PERFETTO_CATEGORY(category::kokkos), \ diff --git a/source/lib/core/components/fwd.hpp b/source/lib/core/components/fwd.hpp index 8e9343d9..272ecb0f 100644 --- a/source/lib/core/components/fwd.hpp +++ b/source/lib/core/components/fwd.hpp @@ -82,6 +82,8 @@ struct backtrace_gpu_power {}; struct backtrace_gpu_memory {}; +struct backtrace_gpu_vcn 
+{}; using sampling_wall_clock = data_tracker; using sampling_cpu_clock = data_tracker; using sampling_percent = data_tracker; @@ -89,6 +91,7 @@ using sampling_gpu_busy = data_tracker; using sampling_gpu_temp = data_tracker; using sampling_gpu_power = data_tracker; using sampling_gpu_memory = data_tracker; +using sampling_gpu_vcn = data_tracker; template @@ -121,6 +124,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_busy, fal ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_temp, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_power, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_memory, false_type) +ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_vcn, false_type) #endif TIMEMORY_SET_COMPONENT_API(rocprofsys::component::roctracer, project::rocprofsys, @@ -152,6 +156,9 @@ TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_temp, project::ro tpls::rocm, device::gpu, os::supports_linux, category::temperature, category::sampling, category::process_sampling) +TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_vcn, project::rocprofsys, + tpls::rocm, device::gpu, os::supports_linux, + category::sampling, category::process_sampling) TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::roctracer, "roctracer", "High-precision ROCm API and kernel tracing", "") @@ -180,6 +187,10 @@ TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_power, TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_temp, "sampling_gpu_temp", "GPU Temperature via ROCm-SMI", "Derived from sampling") +TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_vcn, + "sampling_gpu_vcn", + "VCN Activity (% activity) via ROCm-SMI", + "Derived from sampling") // statistics type TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_wall_clock, double) @@ -188,6 +199,7 @@ 
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_busy, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_temp, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_power, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_memory, double) +TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_vcn, double) TIMEMORY_STATISTICS_TYPE(rocprofsys::component::comm_data_tracker_t, float) // enable timing units @@ -219,6 +231,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_busy, false ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_temp, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_power, false_type) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_memory, false_type) +ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_vcn, false_type) // reporting categories (mean) ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_mean, component::sampling_percent, false_type) diff --git a/source/lib/core/config.cpp b/source/lib/core/config.cpp index b5f249d4..67df2a46 100644 --- a/source/lib/core/config.cpp +++ b/source/lib/core/config.cpp @@ -627,8 +627,8 @@ configure_settings(bool _init) rocprofiler_sdk::config_settings(_config); ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_SMI_METRICS", - "rocm-smi metrics to collect: busy, temp, power, mem_usage", - "busy,temp,power,mem_usage", "backend", "rocm_smi", "rocm", + "rocm-smi metrics to collect: busy, temp, power, mem_usage,vcn_activity", + "busy,temp,power,mem_usage,vcn_activity", "backend", "rocm_smi", "rocm", "process_sampling", "advanced"); ROCPROFSYS_CONFIG_SETTING(size_t, "ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB", diff --git a/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h b/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h index fbc17bc7..82db6199 100644 --- a/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h 
+++ b/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h @@ -57,6 +57,7 @@ extern "C" ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, + ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY, ROCPROFSYS_CATEGORY_ROCM_RCCL, ROCPROFSYS_CATEGORY_SAMPLING, ROCPROFSYS_CATEGORY_PTHREAD, diff --git a/source/lib/rocprof-sys/library/rocm_smi.cpp b/source/lib/rocprof-sys/library/rocm_smi.cpp index 202a8cd8..9b9aa914 100644 --- a/source/lib/rocprof-sys/library/rocm_smi.cpp +++ b/source/lib/rocprof-sys/library/rocm_smi.cpp @@ -159,6 +159,8 @@ data::sample(uint32_t _dev_id) &m_power, &power_type) ROCPROFSYS_RSMI_GET(get_settings(m_dev_id).mem_usage, rsmi_dev_memory_usage_get, _dev_id, RSMI_MEM_TYPE_VRAM, &m_mem_usage); + ROCPROFSYS_RSMI_GET(get_settings(m_dev_id).vcn_activity, rsmi_dev_gpu_metrics_info_get, + _dev_id, &m_gpu_metrics); #undef ROCPROFSYS_RSMI_GET } @@ -168,7 +170,7 @@ data::print(std::ostream& _os) const { std::stringstream _ss{}; _ss << "device: " << m_dev_id << ", busy = " << m_busy_perc << "%, temp = " << m_temp - << ", power = " << m_power << ", memory usage = " << m_mem_usage; + << ", power = " << m_power << ", memory usage = " << m_mem_usage ; _os << _ss.str(); } @@ -257,6 +259,7 @@ data::post_process(uint32_t _dev_id) using component::sampling_gpu_memory; using component::sampling_gpu_power; using component::sampling_gpu_temp; + using component::sampling_gpu_vcn; if(device_count < _dev_id) return; @@ -273,7 +276,7 @@ data::post_process(uint32_t _dev_id) auto _settings = get_settings(_dev_id); auto _process_perfetto = [&]() { - auto _idx = std::array{}; + auto _idx = std::array{}; { _idx.fill(_idx.size()); uint64_t nidx = 0; @@ -281,6 +284,7 @@ data::post_process(uint32_t _dev_id) if(_settings.temp) _idx.at(1) = nidx++; if(_settings.power) _idx.at(2) = nidx++; if(_settings.mem_usage) _idx.at(3) = nidx++; + if(_settings.vcn_activity) _idx.at(4) = nidx++; } for(auto& itr : _rocm_smi) @@ -293,14 
+297,18 @@ data::post_process(uint32_t _dev_id) return JOIN(" ", "GPU", _v, JOIN("", '[', _dev_id, ']'), "(S)"); }; - if(_settings.busy) counter_track::emplace(_dev_id, addendum("Busy"), "%"); + if(_settings.busy) + counter_track::emplace(_dev_id, addendum("Busy"), "%"); if(_settings.temp) counter_track::emplace(_dev_id, addendum("Temperature"), "deg C"); if(_settings.power) counter_track::emplace(_dev_id, addendum("Power"), "watts"); if(_settings.mem_usage) - counter_track::emplace(_dev_id, addendum("Memory Usage"), - "megabytes"); + counter_track::emplace(_dev_id, addendum("Memory Usage"), "megabytes"); + if(_settings.vcn_activity) { + for (std::size_t i = 0; i < std::size(itr.m_gpu_metrics.vcn_activity); ++i) + counter_track::emplace(_dev_id, addendum(("VCN Activity on " + std::to_string(i)).c_str()), "%"); + } } uint64_t _ts = itr.m_ts; if(!_thread_info->is_valid_time(_ts)) continue; @@ -320,8 +328,18 @@ data::post_process(uint32_t _dev_id) TRACE_COUNTER("device_power", counter_track::at(_dev_id, _idx.at(2)), _ts, _power); if(_settings.mem_usage) - TRACE_COUNTER("device_memory_usage", - counter_track::at(_dev_id, _idx.at(3)), _ts, _usage); + TRACE_COUNTER("device_memory_usage", counter_track::at(_dev_id, _idx.at(3)), _ts, + _usage); + if(_settings.vcn_activity) + { + uint64_t idx = _idx.at(4); /* VCN tracks are emplaced last, one per vcn_activity entry, so they occupy consecutive track slots starting at this index */ + for (const auto& temp : itr.m_gpu_metrics.vcn_activity) + { + TRACE_COUNTER("device_vcn_activity", counter_track::at(_dev_id, idx), _ts, + temp); + ++idx; + } + } } }; @@ -411,6 +429,7 @@ setup() key_pair_t{ "temp", get_settings(dev_id).temp }, key_pair_t{ "power", get_settings(dev_id).power }, key_pair_t{ "mem_usage", get_settings(dev_id).mem_usage }, + key_pair_t{ "vcn_activity", get_settings(dev_id).vcn_activity }, }; - get_settings(dev_id) = { false, false, false, false }; + get_settings(dev_id) = { false, false, false, false, false }; /* one 
`false` per settings member: an omitted trailing member would keep its default member initializer (`vcn_activity = true`) and stay enabled after this reset */ @@ -491,3 +510,7 @@ ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + 
TIMEMORY_ESC(data_tracker), true, + double) diff --git a/source/lib/rocprof-sys/library/rocm_smi.hpp b/source/lib/rocprof-sys/library/rocm_smi.hpp index ef1b3d43..fac91b8d 100644 --- a/source/lib/rocprof-sys/library/rocm_smi.hpp +++ b/source/lib/rocprof-sys/library/rocm_smi.hpp @@ -34,6 +34,8 @@ #include "core/state.hpp" #include "library/thread_data.hpp" +#include "rocm_smi/rocm_smi.h" + #include #include #include @@ -75,6 +77,7 @@ struct settings bool temp = true; bool power = true; bool mem_usage = true; + bool vcn_activity = true; }; struct data @@ -105,6 +108,7 @@ struct data temp_t m_temp = 0; power_t m_power = 0; mem_usage_t m_mem_usage = 0; + rsmi_gpu_metrics_t m_gpu_metrics = {}; /* value-initialized, matching the zero-initialized scalar members above, so vcn_activity is never read with an indeterminate value */ friend std::ostream& operator<<(std::ostream& _os, const data& _v) { @@ -179,5 +183,9 @@ ROCPROFSYS_DECLARE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) +ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + # endif #endif diff --git a/source/lib/rocprof-sys/library/sampling.cpp b/source/lib/rocprof-sys/library/sampling.cpp index 6615fcd1..8ad6a98b 100644 --- a/source/lib/rocprof-sys/library/sampling.cpp +++ b/source/lib/rocprof-sys/library/sampling.cpp @@ -129,6 +129,7 @@ using component::sampling_gpu_busy; using component::sampling_gpu_memory; using component::sampling_gpu_power; using component::sampling_gpu_temp; +using component::sampling_gpu_vcn; using component::sampling_percent; using component::sampling_wall_clock; } // namespace sampling @@ -1572,6 +1573,12 @@ struct sampling_initialization sampling_gpu_temp::display_unit() = "degC"; sampling_gpu_temp::set_precision(1); sampling_gpu_temp::set_format_flags(sampling_gpu_temp::get_format_flags()); + + sampling_gpu_vcn::label() = "sampling_gpu_vcn_percent"; + sampling_gpu_vcn::description() = "Utilization of VCN(s)"; + sampling_gpu_vcn::set_precision(0); + sampling_gpu_vcn::set_format_flags(sampling_gpu_vcn::get_format_flags() & + std::ios_base::showpoint); } }; } // namespace