Skip to content

Commit

Permalink
Merge amd-dev into amd-master 20231215
Browse files Browse the repository at this point in the history
Signed-off-by: guanyu12 <[email protected]>
Change-Id: I411755ed67b0d92d37676ca4b7f0709971f4a80f
  • Loading branch information
guanyu12 committed Dec 15, 2023
2 parents 2e133e1 + 16ed186 commit 5c53dd3
Show file tree
Hide file tree
Showing 13 changed files with 565 additions and 32 deletions.
3 changes: 2 additions & 1 deletion amdsmi_cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,6 @@ LIMIT:
DRIVER:
DRIVER_NAME: amdgpu
DRIVER_VERSION: 6.1.10
DRIVER_DATE: 2015/01/01 00:00
VRAM:
VRAM_TYPE: GDDR6
VRAM_VENDOR: SAMSUNG
Expand All @@ -448,6 +447,8 @@ CACHE:
CACHE 0:
CACHE_SIZE: 16 KB
CACHE_LEVEL: 1
MAX_NUM_CU_SHARED: 1
NUM_CACHE_INSTANCE: 304
RAS:
EEPROM_VERSION: N/A
PARITY_SCHEMA: N/A
Expand Down
1 change: 1 addition & 0 deletions amdsmi_cli/amdsmi_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def _print_error(e, destination):
amd_smi_commands.topology,
amd_smi_commands.set_value,
amd_smi_commands.reset,
amd_smi_commands.monitor,
amd_smi_commands.rocm_smi)
try:
try:
Expand Down
416 changes: 410 additions & 6 deletions amdsmi_cli/amdsmi_commands.py

Large diffs are not rendered by default.

56 changes: 47 additions & 9 deletions amdsmi_cli/amdsmi_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import json
import re
import time
from typing import Dict
import yaml
from enum import Enum

Expand All @@ -37,6 +38,8 @@ def __init__(self, format='human_readable', destination='stdout') -> None:
self.watch_output = []
self.format = format # csv, json, or human_readable
self.destination = destination # stdout, path to a file (append)
self.table_header = ""
self.table_title = ""
self.helpers = AMDSMIHelpers()


Expand Down Expand Up @@ -95,7 +98,30 @@ def _capitalize_keys(self, input_dict):
return output_dict


def _convert_json_to_human_readable(self, json_object):
def _convert_json_to_human_readable(self, json_object: Dict[str, any], tabular=False):
# TODO make dynamic
if tabular:
table_values = ''
for key, value in json_object.items():
value = str(value)
if key == 'gpu':
table_values += value.rjust(3)
elif key == 'timestamp':
table_values += value.rjust(10) + ' '
elif key == 'power_usage':
table_values += value.rjust(7)
elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'):
table_values += value.rjust(11)
elif key == 'vram_total' or 'ecc' in key:
table_values += value.rjust(12)
elif key in ('throttle_status', 'pcie_replay'):
table_values += value.rjust(13)
elif 'gpu_' in key: # handle topology tables
table_values += value.rjust(13)
else:
table_values += value.rjust(10)
return table_values.rstrip()

# First Capitalize all keys in the json object
capitalized_json = self._capitalize_keys(json_object)
json_string = json.dumps(capitalized_json, indent=4)
Expand Down Expand Up @@ -266,7 +292,7 @@ def store_watch_output(self, multiple_device_enabled=False):
self.output = {}


def print_output(self, multiple_device_enabled=False, watching_output=False):
def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False):
""" Print current output acording to format and then destination
params:
multiple_device_enabled (bool) - True if printing output from
Expand All @@ -280,10 +306,11 @@ def print_output(self, multiple_device_enabled=False, watching_output=False):
watching_output=watching_output)
elif self.is_csv_format():
self._print_csv_output(multiple_device_enabled=multiple_device_enabled,
watching_output=watching_output)
watching_output=watching_output)
elif self.is_human_readable_format():
self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled,
watching_output=watching_output)
watching_output=watching_output,
tabular=tabular)


def _print_json_output(self, multiple_device_enabled=False, watching_output=False):
Expand Down Expand Up @@ -360,14 +387,19 @@ def _print_csv_output(self, multiple_device_enabled=False, watching_output=False
writer.writerows(stored_csv_output)


def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False):
def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False, tabular=False):
human_readable_output = ''
if tabular:
if self.table_title:
human_readable_output += self.table_title + ':\n'
human_readable_output += self.table_header + '\n'

if multiple_device_enabled:
human_readable_output = ''
for output in self.multiple_device_output:
human_readable_output += self._convert_json_to_human_readable(output)
human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular)
human_readable_output += '\n'
else:
human_readable_output = self._convert_json_to_human_readable(self.output)
human_readable_output += self._convert_json_to_human_readable(self.output, tabular=tabular)

if self.destination == 'stdout':
try:
Expand All @@ -380,8 +412,14 @@ def _print_human_readable_output(self, multiple_device_enabled=False, watching_o
if watching_output:
with self.destination.open('w') as output_file:
human_readable_output = ''
if tabular:
if self.table_title:
human_readable_output += self.table_title + '\n'
human_readable_output += self.table_header + '\n'
for output in self.watch_output:
human_readable_output += self._convert_json_to_human_readable(output)
human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular)
if tabular:
human_readable_output += '\n'
output_file.write(human_readable_output + '\n')
else:
with self.destination.open('a') as output_file:
Expand Down
52 changes: 51 additions & 1 deletion amdsmi_cli/amdsmi_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ class AMDSMIParser(argparse.ArgumentParser):
argparse (ArgumentParser): argparse.ArgumentParser
"""
def __init__(self, version, list, static, firmware, bad_pages, metric,
process, profile, event, topology, set_value, reset, rocmsmi):
process, profile, event, topology, set_value, reset, monitor,
rocmsmi):

# Helper variables
self.helpers = AMDSMIHelpers()
Expand Down Expand Up @@ -105,6 +106,7 @@ def __init__(self, version, list, static, firmware, bad_pages, metric,
self._add_topology_parser(self.subparsers, topology)
self._add_set_value_parser(self.subparsers, set_value)
self._add_reset_parser(self.subparsers, reset)
self._add_monitor_parser(self.subparsers, monitor)
self._add_rocm_smi_parser(self.subparsers, rocmsmi)


Expand Down Expand Up @@ -813,6 +815,54 @@ def _add_reset_parser(self, subparsers, func):
reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help)


def _add_monitor_parser(self, subparsers, func):
if not(self.helpers.is_baremetal() and self.helpers.is_linux()):
# This subparser is only applicable to Baremetal Linux
return

# Subparser help text
monitor_help = "Monitor metrics for target devices"
monitor_subcommand_help = "Monitor a target device for the specified arguments.\
\nIf no arguments are provided, all arguments will be enabled.\
\nUse the watch arguments to run continuously"
monitor_optionals_title = "Monitor Arguments"

# Help text for Arguments only on Guest and BM platforms
power_usage_help = "Monitor power usage in Watts"
temperature_help = "Monitor temperature in Celsius"
gfx_util_help = "Monitor graphics utilization (%%) and clock (MHz)"
mem_util_help = "Monitor memory utilization (%%) and clock (MHz)"
encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)"
decoder_util_help = "Monitor decoder utilization (%%) and clock (MHz)"
throttle_help = "Monitor thermal throttle status"
ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts"
mem_usage_help = "Monitor memory usage in MB"
pcie_throughput_help = "Monitor PCIe Tx/Rx in MB/s"

# Create monitor subparser
monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help)
monitor_parser._optionals.title = monitor_optionals_title
monitor_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog)
monitor_parser.set_defaults(func=func)

# Add Universal Arguments
self._add_command_modifiers(monitor_parser)
self._add_device_arguments(monitor_parser, required=False)
self._add_watch_arguments(monitor_parser)

# Add monitor arguments
monitor_parser.add_argument('-p', '--power-usage', action='store_true', required=False, help=power_usage_help)
monitor_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
monitor_parser.add_argument('-u', '--gfx', action='store_true', required=False, help=gfx_util_help)
monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help)
monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help)
monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help)
monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help)
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help)
monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_throughput_help)


def _add_rocm_smi_parser(self, subparsers, func):
return
# Subparser help text
Expand Down
3 changes: 3 additions & 0 deletions example/amd_smi_drm_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,9 @@ int main() {
cache_info.cache[i].cache_level,
cache_info.cache[i].cache_size_kb,
cache_info.cache[i].flags);
printf("\tMax number CU shared: %d, Number of instances: %d\n",
cache_info.cache[i].max_num_cu_shared,
cache_info.cache[i].num_cache_instance);
}

// Get power measure
Expand Down
2 changes: 2 additions & 0 deletions include/amd_smi/amdsmi.h
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,8 @@ typedef struct {
uint32_t cache_size_kb; /* In KB */
uint32_t cache_level;
uint32_t flags; // amdsmi_cache_flags_type_t which is a bitmask
uint32_t max_num_cu_shared; /* Indicates how many Compute Units share this cache instance */
uint32_t num_cache_instance; /* total number of instances of this cache type */
uint32_t reserved[3];
} cache[AMDSMI_MAX_CACHE_TYPES];
uint32_t reserved[15];
Expand Down
10 changes: 10 additions & 0 deletions py-interface/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,14 @@
from .amdsmi_interface import amdsmi_is_P2P_accessible
from .amdsmi_interface import amdsmi_get_xgmi_info

# # Partition Functions
from .amdsmi_interface import amdsmi_get_gpu_compute_partition
from .amdsmi_interface import amdsmi_set_gpu_compute_partition
from .amdsmi_interface import amdsmi_reset_gpu_compute_partition
from .amdsmi_interface import amdsmi_get_gpu_memory_partition
from .amdsmi_interface import amdsmi_set_gpu_memory_partition
from .amdsmi_interface import amdsmi_reset_gpu_memory_partition

# # Individual GPU Metrics Functions
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_hotspot
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_mem
Expand Down Expand Up @@ -263,6 +271,8 @@
from .amdsmi_interface import AmdSmiTemperatureMetric
from .amdsmi_interface import AmdSmiVoltageMetric
from .amdsmi_interface import AmdSmiVoltageType
from .amdsmi_interface import AmdSmiComputePartitionType
from .amdsmi_interface import AmdSmiMemoryPartitionType
from .amdsmi_interface import AmdSmiPowerProfilePresetMasks
from .amdsmi_interface import AmdSmiGpuBlock
from .amdsmi_interface import AmdSmiRasErrState
Expand Down
34 changes: 20 additions & 14 deletions py-interface/amdsmi_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,17 +1308,22 @@ def amdsmi_get_gpu_cache_info(
for cache_index in range(cache_info.num_cache_types):
cache_size = cache_info.cache[cache_index].cache_size_kb
cache_level = cache_info.cache[cache_index].cache_level
max_num_cu_shared = cache_info.cache[cache_index].max_num_cu_shared
num_cache_instance = cache_info.cache[cache_index].num_cache_instance
cache_flags = cache_info.cache[cache_index].flags
data_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_DATA_CACHE)
inst_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_INST_CACHE)
cpu_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_CPU_CACHE)
simd_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_SIMD_CACHE)
cache_info_dict[f"cache {cache_index}"] = {"cache_size": cache_size,
"cache_level": cache_level,
"data_cache": data_cache,
"instruction_cache": inst_cache,
"cpu_cache": cpu_cache,
"simd_cache": simd_cache}
"max_num_cu_shared": max_num_cu_shared,
"num_cache_instance": num_cache_instance}
if (data_cache): cache_info_dict[f"cache {cache_index}"]["data_cache"] = data_cache
if (inst_cache): cache_info_dict[f"cache {cache_index}"]["inst_cache"] = inst_cache
if (cpu_cache): cache_info_dict[f"cache {cache_index}"]["cpu_cache"] = cpu_cache
if (simd_cache): cache_info_dict[f"cache {cache_index}"]["simd_cache"] = simd_cache


if cache_info_dict == {}:
raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NO_DATA)
Expand Down Expand Up @@ -1624,8 +1629,7 @@ def amdsmi_get_gpu_driver_info(

return {
"driver_name": info.driver_name.decode("utf-8"),
"driver_version": info.driver_version.decode("utf-8"),
"driver_date": info.driver_date.decode("utf-8")
"driver_version": info.driver_version.decode("utf-8")
}


Expand Down Expand Up @@ -3244,7 +3248,7 @@ def amdsmi_get_gpu_metrics_temp_hotspot(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

hotspot_value = ctypes.c_int16()
hotspot_value = ctypes.c_uint16()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_hotspot(
processor_handle, ctypes.byref(hotspot_value)
Expand All @@ -3265,7 +3269,7 @@ def amdsmi_get_gpu_metrics_temp_mem(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

mem_value = ctypes.c_int16()
mem_value = ctypes.c_uint16()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_mem(
processor_handle, ctypes.byref(mem_value)
Expand All @@ -3286,7 +3290,7 @@ def amdsmi_get_gpu_metrics_temp_vrsoc(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

vrsoc_value = ctypes.c_int16()
vrsoc_value = ctypes.c_uint16()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrsoc(
processor_handle, ctypes.byref(vrsoc_value)
Expand Down Expand Up @@ -3754,7 +3758,7 @@ def amdsmi_get_gpu_metrics_vcn_activity(
)
)

return [vcn_activity.value for vcn_activity in vcn_activity_value]
return vcn_activity_value


def amdsmi_get_gpu_metrics_xgmi_read_data(
Expand Down Expand Up @@ -3811,7 +3815,9 @@ def amdsmi_get_gpu_metrics_curr_gfxclk(
)
)

return [curr_gfxclk.value for curr_gfxclk in current_gfxclk_value]
print([curr_gfxclk for curr_gfxclk in current_gfxclk_value])

return [curr_gfxclk for curr_gfxclk in current_gfxclk_value]


def amdsmi_get_gpu_metrics_curr_socclk(
Expand Down Expand Up @@ -3879,7 +3885,7 @@ def amdsmi_get_gpu_metrics_temp_edge(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

edge_value = ctypes.c_int16()
edge_value = ctypes.c_uint16()

_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_edge(
Expand All @@ -3901,7 +3907,7 @@ def amdsmi_get_gpu_metrics_temp_vrgfx(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

vrgfx_value = ctypes.c_int16()
vrgfx_value = ctypes.c_uint16()

_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrgfx(
Expand All @@ -3923,7 +3929,7 @@ def amdsmi_get_gpu_metrics_temp_vrmem(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

vrmem_value = ctypes.c_int16()
vrmem_value = ctypes.c_uint16()

_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrmem(
Expand Down
2 changes: 2 additions & 0 deletions py-interface/amdsmi_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,8 @@ class struct_cache_(Structure):
('cache_size_kb', ctypes.c_uint32),
('cache_level', ctypes.c_uint32),
('flags', ctypes.c_uint32),
('max_num_cu_shared', ctypes.c_uint32),
('num_cache_instance', ctypes.c_uint32),
('reserved', ctypes.c_uint32 * 3),
]

Expand Down
2 changes: 2 additions & 0 deletions rocm_smi/include/rocm_smi/rocm_smi.h
Original file line number Diff line number Diff line change
Expand Up @@ -876,6 +876,8 @@ typedef struct {
so HSA_CACHE_TYPE_DATA|HSA_CACHE_TYPE_HSACU == 9
*/
uint32_t flags;
uint32_t max_num_cu_shared; /* Indicates how many Compute Units share this cache instance */
uint32_t num_cache_instance; /* total number of instances of this cache type */
} cache[RSMI_MAX_CACHE_TYPES];
} rsmi_gpu_cache_info_t;
/// \cond Ignore in docs.
Expand Down
Loading

0 comments on commit 5c53dd3

Please sign in to comment.