Skip to content

Commit

Permalink
Merge amd-master into rocm-rel-6.0 20231214
Browse files Browse the repository at this point in the history
Signed-off-by: Maisam Arif <[email protected]>
Change-Id: Icff41b8e3c8a8374dde68dc21e4f57488e6fbc09
  • Loading branch information
marifamd committed Dec 15, 2023
2 parents 3f96bd1 + 5c53dd3 commit dba4f54
Show file tree
Hide file tree
Showing 20 changed files with 594 additions and 54 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* @marifamd @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan
5 changes: 3 additions & 2 deletions amdsmi_cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ ASIC:
DEVICE_ID: 0x73bf
REV_ID: 0xc3
ASIC_SERIAL: 0xffffffffffffffff
XGMI_PHYSICAL_ID: N/A
OAM_ID: N/A
BUS:
BDF: 0000:23:00.0
MAX_PCIE_SPEED: 16 GT/s
Expand Down Expand Up @@ -439,7 +439,6 @@ LIMIT:
DRIVER:
DRIVER_NAME: amdgpu
DRIVER_VERSION: 6.1.10
DRIVER_DATE: 2015/01/01 00:00
VRAM:
VRAM_TYPE: GDDR6
VRAM_VENDOR: SAMSUNG
Expand All @@ -448,6 +447,8 @@ CACHE:
CACHE 0:
CACHE_SIZE: 16 KB
CACHE_LEVEL: 1
MAX_NUM_CU_SHARED: 1
NUM_CACHE_INSTANCE: 304
RAS:
EEPROM_VERSION: N/A
PARITY_SCHEMA: N/A
Expand Down
1 change: 1 addition & 0 deletions amdsmi_cli/amdsmi_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def _print_error(e, destination):
amd_smi_commands.topology,
amd_smi_commands.set_value,
amd_smi_commands.reset,
amd_smi_commands.monitor,
amd_smi_commands.rocm_smi)
try:
try:
Expand Down
420 changes: 412 additions & 8 deletions amdsmi_cli/amdsmi_commands.py

Large diffs are not rendered by default.

56 changes: 47 additions & 9 deletions amdsmi_cli/amdsmi_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import json
import re
import time
from typing import Dict
import yaml
from enum import Enum

Expand All @@ -37,6 +38,8 @@ def __init__(self, format='human_readable', destination='stdout') -> None:
self.watch_output = []
self.format = format # csv, json, or human_readable
self.destination = destination # stdout, path to a file (append)
self.table_header = ""
self.table_title = ""
self.helpers = AMDSMIHelpers()


Expand Down Expand Up @@ -95,7 +98,30 @@ def _capitalize_keys(self, input_dict):
return output_dict


def _convert_json_to_human_readable(self, json_object):
def _convert_json_to_human_readable(self, json_object: Dict[str, any], tabular=False):
# TODO make dynamic
if tabular:
table_values = ''
for key, value in json_object.items():
value = str(value)
if key == 'gpu':
table_values += value.rjust(3)
elif key == 'timestamp':
table_values += value.rjust(10) + ' '
elif key == 'power_usage':
table_values += value.rjust(7)
elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'):
table_values += value.rjust(11)
elif key == 'vram_total' or 'ecc' in key:
table_values += value.rjust(12)
elif key in ('throttle_status', 'pcie_replay'):
table_values += value.rjust(13)
elif 'gpu_' in key: # handle topology tables
table_values += value.rjust(13)
else:
table_values += value.rjust(10)
return table_values.rstrip()

# First Capitalize all keys in the json object
capitalized_json = self._capitalize_keys(json_object)
json_string = json.dumps(capitalized_json, indent=4)
Expand Down Expand Up @@ -266,7 +292,7 @@ def store_watch_output(self, multiple_device_enabled=False):
self.output = {}


def print_output(self, multiple_device_enabled=False, watching_output=False):
def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False):
""" Print current output according to format and then destination
params:
multiple_device_enabled (bool) - True if printing output from
Expand All @@ -280,10 +306,11 @@ def print_output(self, multiple_device_enabled=False, watching_output=False):
watching_output=watching_output)
elif self.is_csv_format():
self._print_csv_output(multiple_device_enabled=multiple_device_enabled,
watching_output=watching_output)
watching_output=watching_output)
elif self.is_human_readable_format():
self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled,
watching_output=watching_output)
watching_output=watching_output,
tabular=tabular)


def _print_json_output(self, multiple_device_enabled=False, watching_output=False):
Expand Down Expand Up @@ -360,14 +387,19 @@ def _print_csv_output(self, multiple_device_enabled=False, watching_output=False
writer.writerows(stored_csv_output)


def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False):
def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False, tabular=False):
human_readable_output = ''
if tabular:
if self.table_title:
human_readable_output += self.table_title + ':\n'
human_readable_output += self.table_header + '\n'

if multiple_device_enabled:
human_readable_output = ''
for output in self.multiple_device_output:
human_readable_output += self._convert_json_to_human_readable(output)
human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular)
human_readable_output += '\n'
else:
human_readable_output = self._convert_json_to_human_readable(self.output)
human_readable_output += self._convert_json_to_human_readable(self.output, tabular=tabular)

if self.destination == 'stdout':
try:
Expand All @@ -380,8 +412,14 @@ def _print_human_readable_output(self, multiple_device_enabled=False, watching_o
if watching_output:
with self.destination.open('w') as output_file:
human_readable_output = ''
if tabular:
if self.table_title:
human_readable_output += self.table_title + '\n'
human_readable_output += self.table_header + '\n'
for output in self.watch_output:
human_readable_output += self._convert_json_to_human_readable(output)
human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular)
if tabular:
human_readable_output += '\n'
output_file.write(human_readable_output + '\n')
else:
with self.destination.open('a') as output_file:
Expand Down
52 changes: 51 additions & 1 deletion amdsmi_cli/amdsmi_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ class AMDSMIParser(argparse.ArgumentParser):
argparse (ArgumentParser): argparse.ArgumentParser
"""
def __init__(self, version, list, static, firmware, bad_pages, metric,
process, profile, event, topology, set_value, reset, rocmsmi):
process, profile, event, topology, set_value, reset, monitor,
rocmsmi):

# Helper variables
self.helpers = AMDSMIHelpers()
Expand Down Expand Up @@ -105,6 +106,7 @@ def __init__(self, version, list, static, firmware, bad_pages, metric,
self._add_topology_parser(self.subparsers, topology)
self._add_set_value_parser(self.subparsers, set_value)
self._add_reset_parser(self.subparsers, reset)
self._add_monitor_parser(self.subparsers, monitor)
self._add_rocm_smi_parser(self.subparsers, rocmsmi)


Expand Down Expand Up @@ -813,6 +815,54 @@ def _add_reset_parser(self, subparsers, func):
reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help)


def _add_monitor_parser(self, subparsers, func):
if not(self.helpers.is_baremetal() and self.helpers.is_linux()):
# This subparser is only applicable to Baremetal Linux
return

# Subparser help text
monitor_help = "Monitor metrics for target devices"
monitor_subcommand_help = "Monitor a target device for the specified arguments.\
\nIf no arguments are provided, all arguments will be enabled.\
\nUse the watch arguments to run continuously"
monitor_optionals_title = "Monitor Arguments"

# Help text for Arguments only on Guest and BM platforms
power_usage_help = "Monitor power usage in Watts"
temperature_help = "Monitor temperature in Celsius"
gfx_util_help = "Monitor graphics utilization (%%) and clock (MHz)"
mem_util_help = "Monitor memory utilization (%%) and clock (MHz)"
encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)"
decoder_util_help = "Monitor decoder utilization (%%) and clock (MHz)"
throttle_help = "Monitor thermal throttle status"
ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts"
mem_usage_help = "Monitor memory usage in MB"
pcie_throughput_help = "Monitor PCIe Tx/Rx in MB/s"

# Create monitor subparser
monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help)
monitor_parser._optionals.title = monitor_optionals_title
monitor_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog)
monitor_parser.set_defaults(func=func)

# Add Universal Arguments
self._add_command_modifiers(monitor_parser)
self._add_device_arguments(monitor_parser, required=False)
self._add_watch_arguments(monitor_parser)

# Add monitor arguments
monitor_parser.add_argument('-p', '--power-usage', action='store_true', required=False, help=power_usage_help)
monitor_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
monitor_parser.add_argument('-u', '--gfx', action='store_true', required=False, help=gfx_util_help)
monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help)
monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help)
monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help)
monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help)
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help)
monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_throughput_help)


def _add_rocm_smi_parser(self, subparsers, func):
return
# Subparser help text
Expand Down
3 changes: 3 additions & 0 deletions example/amd_smi_drm_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,9 @@ int main() {
cache_info.cache[i].cache_level,
cache_info.cache[i].cache_size_kb,
cache_info.cache[i].flags);
printf("\tMax number CU shared: %d, Number of instances: %d\n",
cache_info.cache[i].max_num_cu_shared,
cache_info.cache[i].num_cache_instance);
}

// Get power measure
Expand Down
3 changes: 1 addition & 2 deletions example/amd_smi_nodrm_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,7 @@ int main() {
printf("\tVendorID: 0x%x\n", asic_info.vendor_id);
printf("\tRevisionID: 0x%x\n", asic_info.rev_id);
printf("\tAsic serial: 0x%s\n", asic_info.asic_serial);
printf("\tXGMI physical id: 0x%x\n\n",
asic_info.xgmi_physical_id);
printf("\tOAM id: 0x%x\n\n", asic_info.oam_id);

// Get VBIOS info
amdsmi_vbios_info_t vbios_info = {};
Expand Down
4 changes: 3 additions & 1 deletion include/amd_smi/amdsmi.h
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,8 @@ typedef struct {
uint32_t cache_size_kb; /* In KB */
uint32_t cache_level;
uint32_t flags; // amdsmi_cache_flags_type_t which is a bitmask
uint32_t max_num_cu_shared; /* Indicates how many Compute Units share this cache instance */
uint32_t num_cache_instance; /* total number of instance of this cache type */
uint32_t reserved[3];
} cache[AMDSMI_MAX_CACHE_TYPES];
uint32_t reserved[15];
Expand All @@ -498,7 +500,7 @@ typedef struct {
uint64_t device_id; //< The device id of a GPU
uint32_t rev_id;
char asic_serial[AMDSMI_NORMAL_STRING_LENGTH];
uint16_t xgmi_physical_id; //< 0xFFFF if not supported
uint16_t oam_id; //< 0xFFFF if not supported
uint16_t reserved[37];
} amdsmi_asic_info_t;

Expand Down
7 changes: 6 additions & 1 deletion include/amd_smi/impl/amd_smi_system.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,12 @@ class AMDSmiSystem {
#endif
private:
AMDSmiSystem() : init_flag_(AMDSMI_INIT_AMD_GPUS) {}
amdsmi_status_t get_gpu_bdf_by_index(uint32_t index, std::string& bdf);

/* The GPU socket id is used to identify the socket, so that the XCDs
on the same physical device will be collected under the same socket.
The BD part of the BDF is used as GPU socket to represent a physical device.
*/
amdsmi_status_t get_gpu_socket_id(uint32_t index, std::string& socketid);
amdsmi_status_t populate_amd_gpu_devices();
uint64_t init_flag_;
AMDSmiDrm drm_;
Expand Down
4 changes: 2 additions & 2 deletions py-interface/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ Field | Content
`device_id` | device id
`rev_id` | revision id
`asic_serial` | asic serial
`xgmi_physical_id` | xgmi physical id
`oam_id` | oam id

Exceptions that can be thrown by `amdsmi_get_gpu_asic_info` function:

Expand All @@ -375,7 +375,7 @@ try:
print(hex(asic_info['device_id']))
print(hex(asic_info['rev_id']))
print(asic_info['asic_serial'])
print(asic_info['xgmi_physical_id'])
print(asic_info['oam_id'])
except AmdSmiException as e:
print(e)
```
Expand Down
10 changes: 10 additions & 0 deletions py-interface/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,14 @@
from .amdsmi_interface import amdsmi_is_P2P_accessible
from .amdsmi_interface import amdsmi_get_xgmi_info

# # Partition Functions
from .amdsmi_interface import amdsmi_get_gpu_compute_partition
from .amdsmi_interface import amdsmi_set_gpu_compute_partition
from .amdsmi_interface import amdsmi_reset_gpu_compute_partition
from .amdsmi_interface import amdsmi_get_gpu_memory_partition
from .amdsmi_interface import amdsmi_set_gpu_memory_partition
from .amdsmi_interface import amdsmi_reset_gpu_memory_partition

# # Individual GPU Metrics Functions
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_hotspot
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_mem
Expand Down Expand Up @@ -263,6 +271,8 @@
from .amdsmi_interface import AmdSmiTemperatureMetric
from .amdsmi_interface import AmdSmiVoltageMetric
from .amdsmi_interface import AmdSmiVoltageType
from .amdsmi_interface import AmdSmiComputePartitionType
from .amdsmi_interface import AmdSmiMemoryPartitionType
from .amdsmi_interface import AmdSmiPowerProfilePresetMasks
from .amdsmi_interface import AmdSmiGpuBlock
from .amdsmi_interface import AmdSmiRasErrState
Expand Down
Loading

0 comments on commit dba4f54

Please sign in to comment.