Skip to content

Commit

Permalink
Test GPU metrics API in rocm smi library
Browse files Browse the repository at this point in the history
Fetch the GPU metrics using rsmi_dev_gpu_metrics_info_get
and test the values.

Signed-off-by: Sajina P Kandy <[email protected]>
  • Loading branch information
Sajina Kandy authored and sputhala-amd committed Dec 16, 2024
1 parent 88aa2d3 commit 61adbfe
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 9 deletions.
2 changes: 2 additions & 0 deletions source/lib/core/categories.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_busy, ROCPROFSYS_CATEGORY_ROCM_SMI
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_temp, ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP, "device_temp", "Temperature of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_power, ROCPROFSYS_CATEGORY_ROCM_SMI_POWER, "device_power", "Power consumption of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_memory_usage, ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE, "device_memory_usage", "Memory usage of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_smi_vcn_activity, ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY, "device_vcn_activity", "VCN Activity of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl, ROCPROFSYS_CATEGORY_ROCM_RCCL, "rccl", "ROCm Communication Collectives Library (RCCL) regions")
ROCPROFSYS_DEFINE_CATEGORY(category, pthread, ROCPROFSYS_CATEGORY_PTHREAD, "pthread", "POSIX threading functions")
ROCPROFSYS_DEFINE_CATEGORY(category, kokkos, ROCPROFSYS_CATEGORY_KOKKOS, "kokkos", "KokkosTools regions")
Expand Down Expand Up @@ -167,6 +168,7 @@ using name = perfetto_category<Tp...>;
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_temp), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_power), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_memory_usage), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_smi_vcn_activity), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl), \
ROCPROFSYS_PERFETTO_CATEGORY(category::pthread), \
ROCPROFSYS_PERFETTO_CATEGORY(category::kokkos), \
Expand Down
13 changes: 13 additions & 0 deletions source/lib/core/components/fwd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,16 @@ struct backtrace_gpu_power
{};
struct backtrace_gpu_memory
{};
// Empty tag type used as the second template argument of data_tracker to
// form the VCN-activity tracker (see the sampling_gpu_vcn alias below).
struct backtrace_gpu_vcn
{};
using sampling_wall_clock = data_tracker<double, backtrace_wall_clock>;
using sampling_cpu_clock = data_tracker<double, backtrace_cpu_clock>;
using sampling_percent = data_tracker<double, backtrace_fraction>;
using sampling_gpu_busy = data_tracker<double, backtrace_gpu_busy>;
using sampling_gpu_temp = data_tracker<double, backtrace_gpu_temp>;
using sampling_gpu_power = data_tracker<double, backtrace_gpu_power>;
using sampling_gpu_memory = data_tracker<double, backtrace_gpu_memory>;
using sampling_gpu_vcn = data_tracker<double, backtrace_gpu_vcn>;

template <typename ApiT, typename StartFuncT = default_functor_t,
typename StopFuncT = default_functor_t>
Expand Down Expand Up @@ -121,6 +124,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_busy, fal
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_temp, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_power, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_memory, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_vcn, false_type)
#endif

TIMEMORY_SET_COMPONENT_API(rocprofsys::component::roctracer, project::rocprofsys,
Expand Down Expand Up @@ -152,6 +156,9 @@ TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_temp, project::ro
tpls::rocm, device::gpu, os::supports_linux,
category::temperature, category::sampling,
category::process_sampling)
TIMEMORY_SET_COMPONENT_API(rocprofsys::component::sampling_gpu_vcn, project::rocprofsys,
tpls::rocm, device::gpu, os::supports_linux,
category::sampling, category::process_sampling)

TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::roctracer, "roctracer",
"High-precision ROCm API and kernel tracing", "")
Expand Down Expand Up @@ -180,6 +187,10 @@ TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_power,
TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_temp,
"sampling_gpu_temp", "GPU Temperature via ROCm-SMI",
"Derived from sampling")
TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_vcn,
"sampling_gpu_vcn",
"VCN Activity (% activity) via ROCm-SMI",
"Derived from sampling")

// statistics type
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_wall_clock, double)
Expand All @@ -188,6 +199,7 @@ TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_busy, double)
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_temp, double)
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_power, double)
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_memory, double)
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::sampling_gpu_vcn, double)
TIMEMORY_STATISTICS_TYPE(rocprofsys::component::comm_data_tracker_t, float)

// enable timing units
Expand Down Expand Up @@ -219,6 +231,7 @@ ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_busy, false
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_temp, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_power, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_memory, false_type)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_vcn, false_type)

// reporting categories (mean)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(report_mean, component::sampling_percent, false_type)
Expand Down
4 changes: 2 additions & 2 deletions source/lib/core/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -627,8 +627,8 @@ configure_settings(bool _init)
rocprofiler_sdk::config_settings(_config);

ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_ROCM_SMI_METRICS",
"rocm-smi metrics to collect: busy, temp, power, mem_usage",
"busy,temp,power,mem_usage", "backend", "rocm_smi", "rocm",
"rocm-smi metrics to collect: busy, temp, power, mem_usage,vcn_activity",
"busy,temp,power,mem_usage,vcn_activity", "backend", "rocm_smi", "rocm",
"process_sampling", "advanced");

ROCPROFSYS_CONFIG_SETTING(size_t, "ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ extern "C"
ROCPROFSYS_CATEGORY_ROCM_SMI_TEMP,
ROCPROFSYS_CATEGORY_ROCM_SMI_POWER,
ROCPROFSYS_CATEGORY_ROCM_SMI_MEMORY_USAGE,
ROCPROFSYS_CATEGORY_ROCM_SMI_VCN_ACTIVITY,
ROCPROFSYS_CATEGORY_ROCM_RCCL,
ROCPROFSYS_CATEGORY_SAMPLING,
ROCPROFSYS_CATEGORY_PTHREAD,
Expand Down
37 changes: 30 additions & 7 deletions source/lib/rocprof-sys/library/rocm_smi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ data::sample(uint32_t _dev_id)
&m_power, &power_type)
ROCPROFSYS_RSMI_GET(get_settings(m_dev_id).mem_usage, rsmi_dev_memory_usage_get,
_dev_id, RSMI_MEM_TYPE_VRAM, &m_mem_usage);
ROCPROFSYS_RSMI_GET(get_settings(m_dev_id).vcn_activity, rsmi_dev_gpu_metrics_info_get,
_dev_id, &m_gpu_metrics);

#undef ROCPROFSYS_RSMI_GET
}
Expand All @@ -168,7 +170,7 @@ data::print(std::ostream& _os) const
{
std::stringstream _ss{};
_ss << "device: " << m_dev_id << ", busy = " << m_busy_perc << "%, temp = " << m_temp
<< ", power = " << m_power << ", memory usage = " << m_mem_usage;
<< ", power = " << m_power << ", memory usage = " << m_mem_usage ;
_os << _ss.str();
}

Expand Down Expand Up @@ -257,6 +259,7 @@ data::post_process(uint32_t _dev_id)
using component::sampling_gpu_memory;
using component::sampling_gpu_power;
using component::sampling_gpu_temp;
using component::sampling_gpu_vcn;

if(device_count < _dev_id) return;

Expand All @@ -273,14 +276,15 @@ data::post_process(uint32_t _dev_id)
auto _settings = get_settings(_dev_id);

auto _process_perfetto = [&]() {
auto _idx = std::array<uint64_t, 4>{};
auto _idx = std::array<uint64_t, 5>{};
{
_idx.fill(_idx.size());
uint64_t nidx = 0;
if(_settings.busy) _idx.at(0) = nidx++;
if(_settings.temp) _idx.at(1) = nidx++;
if(_settings.power) _idx.at(2) = nidx++;
if(_settings.mem_usage) _idx.at(3) = nidx++;
if(_settings.vcn_activity) _idx.at(4) = nidx++;
}

for(auto& itr : _rocm_smi)
Expand All @@ -293,14 +297,18 @@ data::post_process(uint32_t _dev_id)
return JOIN(" ", "GPU", _v, JOIN("", '[', _dev_id, ']'), "(S)");
};

if(_settings.busy) counter_track::emplace(_dev_id, addendum("Busy"), "%");
if(_settings.busy)
counter_track::emplace(_dev_id, addendum("Busy"), "%");
if(_settings.temp)
counter_track::emplace(_dev_id, addendum("Temperature"), "deg C");
if(_settings.power)
counter_track::emplace(_dev_id, addendum("Power"), "watts");
if(_settings.mem_usage)
counter_track::emplace(_dev_id, addendum("Memory Usage"),
"megabytes");
counter_track::emplace(_dev_id, addendum("Memory Usage"), "megabytes");
if(_settings.vcn_activity) {
for (std::size_t i = 0; i < std::size(itr.m_gpu_metrics.vcn_activity); ++i)
counter_track::emplace(_dev_id, addendum(("VCN Activity on " + std::to_string(i)).c_str()), "%");
}
}
uint64_t _ts = itr.m_ts;
if(!_thread_info->is_valid_time(_ts)) continue;
Expand All @@ -320,8 +328,18 @@ data::post_process(uint32_t _dev_id)
TRACE_COUNTER("device_power", counter_track::at(_dev_id, _idx.at(2)), _ts,
_power);
if(_settings.mem_usage)
TRACE_COUNTER("device_memory_usage",
counter_track::at(_dev_id, _idx.at(3)), _ts, _usage);
TRACE_COUNTER("device_memory_usage", counter_track::at(_dev_id, _idx.at(3)), _ts,
_usage);
if(_settings.vcn_activity)
{
uint64_t idx = _idx.at(4);
for (const auto& temp : itr.m_gpu_metrics.vcn_activity)
{
TRACE_COUNTER("device_vcn_activity", counter_track::at(_dev_id, idx), _ts,
temp);
++idx;
}
}
}
};

Expand Down Expand Up @@ -411,6 +429,7 @@ setup()
key_pair_t{ "temp", get_settings(dev_id).temp },
key_pair_t{ "power", get_settings(dev_id).power },
key_pair_t{ "mem_usage", get_settings(dev_id).mem_usage },
key_pair_t{ "vcn_activity", get_settings(dev_id).vcn_activity },
};

// Clear all five metric flags (busy, temp, power, mem_usage, vcn_activity).
// One initializer per member is required: an omitted trailing member is
// filled from its default member initializer, which is `true` for
// vcn_activity, so a four-element list would leave that flag enabled.
get_settings(dev_id) = { false, false, false, false, false };
Expand Down Expand Up @@ -491,3 +510,7 @@ ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(
ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(
TIMEMORY_ESC(data_tracker<double, rocprofsys::component::backtrace_gpu_memory>), true,
double)

ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT(
TIMEMORY_ESC(data_tracker<double, rocprofsys::component::backtrace_gpu_vcn>), true,
double)
8 changes: 8 additions & 0 deletions source/lib/rocprof-sys/library/rocm_smi.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#include "core/state.hpp"
#include "library/thread_data.hpp"

#include "rocm_smi/rocm_smi.h"

#include <chrono>
#include <cstdint>
#include <deque>
Expand Down Expand Up @@ -75,6 +77,7 @@ struct settings
bool temp = true;
bool power = true;
bool mem_usage = true;
bool vcn_activity = true;
};

struct data
Expand Down Expand Up @@ -105,6 +108,7 @@ struct data
temp_t m_temp = 0;
power_t m_power = 0;
mem_usage_t m_mem_usage = 0;
// Value-initialized (zeroed) like the sibling members, so the metrics are
// never indeterminate before rsmi_dev_gpu_metrics_info_get fills them in.
rsmi_gpu_metrics_t m_gpu_metrics = {};

friend std::ostream& operator<<(std::ostream& _os, const data& _v)
{
Expand Down Expand Up @@ -179,5 +183,9 @@ ROCPROFSYS_DECLARE_EXTERN_COMPONENT(
TIMEMORY_ESC(data_tracker<double, rocprofsys::component::backtrace_gpu_memory>), true,
double)

ROCPROFSYS_DECLARE_EXTERN_COMPONENT(
TIMEMORY_ESC(data_tracker<double, rocprofsys::component::backtrace_gpu_vcn>), true,
double)

# endif
#endif
7 changes: 7 additions & 0 deletions source/lib/rocprof-sys/library/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ using component::sampling_gpu_busy;
using component::sampling_gpu_memory;
using component::sampling_gpu_power;
using component::sampling_gpu_temp;
using component::sampling_gpu_vcn;
using component::sampling_percent;
using component::sampling_wall_clock;
} // namespace sampling
Expand Down Expand Up @@ -1572,6 +1573,12 @@ struct sampling_initialization
sampling_gpu_temp::display_unit() = "degC";
sampling_gpu_temp::set_precision(1);
sampling_gpu_temp::set_format_flags(sampling_gpu_temp::get_format_flags());

sampling_gpu_vcn::label() = "sampling_gpu_vcn_percent";
sampling_gpu_vcn::description() = "Utilization of VCN(s)";
sampling_gpu_vcn::set_precision(0);
sampling_gpu_vcn::set_format_flags(sampling_gpu_vcn::get_format_flags() &
std::ios_base::showpoint);
}
};
} // namespace
Expand Down

0 comments on commit 61adbfe

Please sign in to comment.