diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000..ffdc4f48 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @marifamd @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index c7473a0e..4f1a9a5a 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -409,7 +409,7 @@ ASIC: DEVICE_ID: 0x73bf REV_ID: 0xc3 ASIC_SERIAL: 0xffffffffffffffff - XGMI_PHYSICAL_ID: N/A + OAM_ID: N/A BUS: BDF: 0000:23:00.0 MAX_PCIE_SPEED: 16 GT/s @@ -439,7 +439,6 @@ LIMIT: DRIVER: DRIVER_NAME: amdgpu DRIVER_VERSION: 6.1.10 - DRIVER_DATE: 2015/01/01 00:00 VRAM: VRAM_TYPE: GDDR6 VRAM_VENDOR: SAMSUNG @@ -448,6 +447,8 @@ CACHE: CACHE 0: CACHE_SIZE: 16 KB CACHE_LEVEL: 1 + MAX_NUM_CU_SHARED: 1 + NUM_CACHE_INSTANCE: 304 RAS: EEPROM_VERSION: N/A PARITY_SCHEMA: N/A diff --git a/amdsmi_cli/amdsmi_cli.py b/amdsmi_cli/amdsmi_cli.py index 3bafe179..263bd0c7 100755 --- a/amdsmi_cli/amdsmi_cli.py +++ b/amdsmi_cli/amdsmi_cli.py @@ -68,6 +68,7 @@ def _print_error(e, destination): amd_smi_commands.topology, amd_smi_commands.set_value, amd_smi_commands.reset, + amd_smi_commands.monitor, amd_smi_commands.rocm_smi) try: try: diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 35ba4f4e..e8ec181d 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -245,8 +245,8 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, asic_info['rev_id'] = hex(asic_info['rev_id']) if asic_info['asic_serial'] != '': asic_info['asic_serial'] = hex(int(asic_info['asic_serial'], base=16)) - if asic_info['xgmi_physical_id'] == 0xFFFF: # uint 16 max - asic_info['xgmi_physical_id'] = "N/A" + if asic_info['oam_id'] == 0xFFFF: # uint 16 max + asic_info['oam_id'] = "N/A" static_dict['asic'] = asic_info except amdsmi_exception.AmdSmiLibraryException as e: static_dict['asic'] = "N/A" @@ -431,8 +431,8 @@ def static(self, args, multiple_devices=False, 
gpu=None, asic=None, static_dict['limit'] = limit_info if args.driver: driver_info = {"driver_name" : "N/A", - "driver_version" : "N/A", - "driver_date" : "N/A"} + "driver_version" : "N/A" + } try: driver_info = amdsmi_interface.amdsmi_get_gpu_driver_info(args.gpu) @@ -477,12 +477,11 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, cache_info = amdsmi_interface.amdsmi_get_gpu_cache_info(args.gpu) for cache_key, cache_dict in cache_info.items(): for key, value in cache_dict.items(): - if key == 'cache_size' or key == 'cache_level': + if key == 'cache_size' or key == 'cache_level' or \ + key == 'max_num_cu_shared' or key == 'num_cache_instance': continue if value: cache_info[cache_key][key] = "ENABLED" - else: - cache_info[cache_key][key] = "DISABLED" if self.logger.is_human_readable_format(): for _ , cache_values in cache_info.items(): cache_values['cache_size'] = f"{cache_values['cache_size']} KB" @@ -1729,14 +1728,21 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if not any([args.access, args.weight, args.hops, args.link_type, args.numa_bw]): args.access = args.weight = args.hops = args.link_type= args.numa_bw = True + # Clear the table header; TODO make this a function + self.logger.table_header = ''.rjust(12) + # Populate the possible gpus topo_values = [] for gpu in args.gpu: gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu) topo_values.append({"gpu" : gpu_id}) + gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu) + self.logger.table_header += gpu_bdf.rjust(13) if args.access: + tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): + tabular_output_dict = {'gpu': amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)} src_gpu_links = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -1757,8 +1763,18 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, topo_values[src_gpu_index]['link_accessibility'] = src_gpu_links 
+ tabular_output_dict.update(src_gpu_links) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "ACCESS TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + if args.weight: + tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): + tabular_output_dict = {'gpu': amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)} src_gpu_weight = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -1780,8 +1796,18 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, topo_values[src_gpu_index]['weight'] = src_gpu_weight + tabular_output_dict.update(src_gpu_weight) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "WEIGHT TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + if args.hops: + tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): + tabular_output_dict = {'gpu': amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)} src_gpu_hops = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -1803,8 +1829,18 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, topo_values[src_gpu_index]['hops'] = src_gpu_hops + tabular_output_dict.update(src_gpu_hops) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "HOPS TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + if args.link_type: + tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): + tabular_output_dict = {'gpu': amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)} src_gpu_link_type = {} for dest_gpu in 
args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -1831,8 +1867,18 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, topo_values[src_gpu_index]['link_type'] = src_gpu_link_type + tabular_output_dict.update(src_gpu_link_type) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "LINK TYPE TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + if args.numa_bw: + tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): + tabular_output_dict = {'gpu': amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu)} src_gpu_link_type = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -1868,6 +1914,14 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, topo_values[src_gpu_index]['numa_bandwidth'] = src_gpu_link_type + tabular_output_dict.update(src_gpu_link_type) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "NUMA BW TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + self.logger.multiple_device_output = topo_values if self.logger.is_csv_format(): @@ -1876,7 +1930,8 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, new_output.append(self.logger.flatten_dict(elem, topology_override=True)) self.logger.multiple_device_output = new_output - self.logger.print_output(multiple_device_enabled=True) + if not self.logger.is_human_readable_format(): + self.logger.print_output(multiple_device_enabled=True) def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, @@ -2254,6 +2309,355 @@ def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, self.logger.print_output() + def monitor(self, 
args, multiple_devices=False, watching_output=False, gpu=None, + watch=None, watch_time=None, iterations=None, power_usage=None, + temperature=None, gfx_util=None, mem_util=None, encoder=None, decoder=None, + throttle_status=None, ecc=None, vram_usage=None, pcie=None): + """ Populate a table with each GPU as an index to rows of targeted data + + Args: + args (Namespace): Namespace containing the parsed CLI args + multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. + gpu (device_handle, optional): device_handle for target device. Defaults to None. + watch (bool, optional): Value override for args.watch. Defaults to None. + watch_time (int, optional): Value override for args.watch_time. Defaults to None. + iterations (int, optional): Value override for args.iterations. Defaults to None. + power_usage (bool, optional): Value override for args.power_usage. Defaults to None. + temperature (bool, optional): Value override for args.temperature. Defaults to None. + gfx (bool, optional): Value override for args.gfx. Defaults to None. + mem (bool, optional): Value override for args.mem. Defaults to None. + encoder (bool, optional): Value override for args.encoder. Defaults to None. + decoder (bool, optional): Value override for args.decoder. Defaults to None. + throttle_status (bool, optional): Value override for args.throttle_status. Defaults to None. + ecc (bool, optional): Value override for args.ecc. Defaults to None. + vram_usage (bool, optional): Value override for args.vram_usage. Defaults to None. + pcie (bool, optional): Value override for args.pcie. Defaults to None. 
+ + Raises: + ValueError: Value error if no gpu value is provided + IndexError: Index error if gpu list is empty + + Return: + Nothing + """ + # Set args.* to passed in arguments + if gpu: + args.gpu = gpu + if watch: + args.watch = watch + if watch_time: + args.watch_time = watch_time + if iterations: + args.iterations = iterations + + # monitor args + if power_usage: + args.power_usage = power_usage + if temperature: + args.temperature = temperature + if gfx_util: + args.gfx = gfx_util + if mem_util: + args.mem = mem_util + if encoder: + args.encoder = encoder + if decoder: + args.decoder = decoder + if throttle_status: + args.throttle_status = throttle_status + if ecc: + args.ecc = ecc + if vram_usage: + args.vram_usage = vram_usage + if pcie: + args.pcie = pcie + + # Handle No GPU passed + if args.gpu == None: + args.gpu = self.device_handles + + # If all arguments are False, the print all values + if not any([args.power_usage, args.temperature, args.gfx, args.mem, + args.encoder, args.decoder, args.throttle_status, args.ecc, + args.vram_usage, args.pcie]): + args.power_usage = args.temperature = args.gfx = args.mem = \ + args.encoder = args.decoder = args.throttle_status = args.ecc = \ + args.vram_usage = args.pcie = True + + # Handle watch logic, will only enter this block once + if args.watch: + self.helpers.handle_watch(args=args, subcommand=self.monitor, logger=self.logger) + return + + # Handle multiple GPUs + if isinstance(args.gpu, list): + if len(args.gpu) > 1: + # Deepcopy gpus as recursion will destroy the gpu list + stored_gpus = [] + for gpu in args.gpu: + stored_gpus.append(gpu) + + # Store output from multiple devices + for device_handle in args.gpu: + self.monitor(args, multiple_devices=True, watching_output=watching_output, gpu=device_handle) + + # Reload original gpus + args.gpu = stored_gpus + + # Print multiple device output + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output, tabular=True) + + # Add 
output to total watch output and clear multiple device output + if watching_output: + self.logger.store_watch_output(multiple_device_enabled=True) + + # Flush the watching output + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output, tabular=True) + + return + elif len(args.gpu) == 1: + args.gpu = args.gpu[0] + else: + raise IndexError("args.gpu should not be an empty list") + + monitor_values = {} + + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + + # Clear the table header; TODO make this a function + self.logger.table_header = '' + + # Store timestamp for watch output + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + self.logger.table_header += 'TIMESTAMP'.rjust(10) + ' ' + + self.logger.table_header += 'GPU' + + if args.power_usage: + try: + gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + power_usage = gpu_metrics_info['current_socket_power'] + if power_usage >= 0xFFFFFFFF: + power_usage = gpu_metrics_info['average_socket_power'] + if power_usage >= 0xFFFFFFFF: + power_usage = "N/A" + monitor_values['power_usage'] = power_usage + if self.logger.is_human_readable_format() and monitor_values['power_usage'] != "N/A": + monitor_values['power_usage'] = f"{monitor_values['power_usage']} W" + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['power_usage'] = "N/A" + logging.debug("Failed to get power usage on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'POWER'.rjust(7) + if args.temperature: + try: + temperature = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['temperature_hotspot'] + monitor_values['hotspot_temperature'] = temperature + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['hotspot_temperature'] = "N/A" + logging.debug("Failed to get hotspot temperature on gpu %s | %s", gpu_id, e.get_error_info()) + + try: + temperature = 
amdsmi_interface.amdsmi_get_gpu_metrics_temp_mem(args.gpu) + monitor_values['memory_temperature'] = temperature + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['memory_temperature'] = "N/A" + logging.debug("Failed to get memory temperature on gpu %s | %s", gpu_id, e.get_error_info()) + + if self.logger.is_human_readable_format() and monitor_values['hotspot_temperature'] != "N/A": + monitor_values['hotspot_temperature'] = f"{monitor_values['hotspot_temperature']} \N{DEGREE SIGN}C" + + if self.logger.is_human_readable_format() and monitor_values['memory_temperature'] != "N/A": + monitor_values['memory_temperature'] = f"{monitor_values['memory_temperature']} \N{DEGREE SIGN}C" + + self.logger.table_header += 'GPU_TEMP'.rjust(10) + self.logger.table_header += 'MEM_TEMP'.rjust(10) + if args.gfx: + try: + gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_avg_gfx_activity(args.gpu) + monitor_values['gfx'] = gfx_util + if self.logger.is_human_readable_format(): + monitor_values['gfx'] = f"{monitor_values['gfx']} %" + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['gfx'] = "N/A" + logging.debug("Failed to get gfx utilization on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'GFX_UTIL'.rjust(10) + + try: + gfx_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_gfxclk'] + monitor_values['gfx_clock'] = gfx_clock + if self.logger.is_human_readable_format(): + monitor_values['gfx_clock'] = f"{monitor_values['gfx_clock']} MHz" + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['gfx_clock'] = "N/A" + logging.debug("Failed to get gfx clock on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'GFX_CLOCK'.rjust(11) + if args.mem: + try: + mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_avg_umc_activity(args.gpu) + monitor_values['mem'] = mem_util + if self.logger.is_human_readable_format(): + monitor_values['mem'] = f"{monitor_values['mem']} 
%" + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['mem'] = "N/A" + logging.debug("Failed to get mem utilization on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'MEM_UTIL'.rjust(10) + + try: + mem_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_uclk'] + monitor_values['mem_clock'] = mem_clock + if self.logger.is_human_readable_format(): + monitor_values['mem_clock'] = f"{monitor_values['mem_clock']} MHz" + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['mem_clock'] = "N/A" + logging.debug("Failed to get mem clock on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'MEM_CLOCK'.rjust(11) + if args.encoder: + try: + # Get List of vcn activity values + encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_vcn_activity(args.gpu) + encoding_activity_avg = [] + for value in encoder_util: + if value < 150: # each encoder chiplet's value range should be a percent + encoding_activity_avg.append(value) + # Averaging the possible encoding activity values + if encoding_activity_avg: + encoding_activity_avg = sum(encoding_activity_avg) / len(encoding_activity_avg) + else: + encoding_activity_avg = "N/A" + monitor_values['encoder'] = encoding_activity_avg + if self.logger.is_human_readable_format() and monitor_values['encoder'] != "N/A": + monitor_values['encoder'] = f"{monitor_values['encoder']} %" + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['encoder'] = "N/A" + logging.debug("Failed to get encoder utilization on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'ENC_UTIL'.rjust(10) + + try: + encoder_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_vclk0'] + monitor_values['encoder_clock'] = encoder_clock + if self.logger.is_human_readable_format(): + monitor_values['encoder_clock'] = f"{monitor_values['encoder_clock']} MHz" + except amdsmi_exception.AmdSmiLibraryException as e: + 
monitor_values['encoder_clock'] = "N/A" + logging.debug("Failed to get encoder clock on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'ENC_CLOCK'.rjust(11) + if args.decoder: + try: + decoder_util = "N/A" # Not yet implemented + monitor_values['decoder'] = decoder_util + # if self.logger.is_human_readable_format(): + # monitor_values['decoder'] = f"{monitor_values['decoder']} %" + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['decoder'] = "N/A" + logging.debug("Failed to get decoder utilization on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'DEC_UTIL'.rjust(10) + + try: + decoder_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_dclk0'] + monitor_values['decoder_clock'] = decoder_clock + if self.logger.is_human_readable_format(): + monitor_values['decoder_clock'] = f"{monitor_values['decoder_clock']} MHz" + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['decoder_clock'] = "N/A" + logging.debug("Failed to get decoder clock on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'DEC_CLOCK'.rjust(11) + if args.throttle_status: + try: + throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_throttle_status(args.gpu) + if throttle_status: + throttle_status = "THROTTLED" + else: + throttle_status = "UNTHROTTLED" + monitor_values['throttle_status'] = throttle_status + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['throttle_status'] = "N/A" + logging.debug("Failed to get throttle status on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'THROTTLE'.rjust(13) + if args.ecc: + try: + ecc = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu) + monitor_values['single_bit_ecc'] = ecc['correctable_count'] + monitor_values['double_bit_ecc'] = ecc['uncorrectable_count'] + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['ecc'] = "N/A" + 
logging.debug("Failed to get ecc on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'SINGLE_ECC'.rjust(12) + self.logger.table_header += 'DOUBLE_ECC'.rjust(12) + + try: + pcie_replay = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu) + monitor_values['pcie_replay'] = pcie_replay + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['pcie_replay'] = "N/A" + logging.debug("Failed to get pcie replay counter on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'PCIE_REPLAY'.rjust(13) + if args.vram_usage: + try: + vram_usage = amdsmi_interface.amdsmi_get_gpu_vram_usage(args.gpu) + monitor_values['vram_used'] = vram_usage['vram_used'] + monitor_values['vram_total'] = vram_usage['vram_total'] + if self.logger.is_human_readable_format(): + monitor_values['vram_used'] = f"{monitor_values['vram_used']} MB" + monitor_values['vram_total'] = f"{monitor_values['vram_total']} MB" + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['vram_used'] = "N/A" + monitor_values['vram_total'] = "N/A" + logging.debug("Failed to get vram memory usage on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'VRAM_USED'.rjust(11) + self.logger.table_header += 'VRAM_TOTAL'.rjust(12) + if args.pcie: + try: + pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) + sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz'] + received = pcie_bw['received'] * pcie_bw['max_pkt_sz'] + + if self.logger.is_human_readable_format(): + if sent > 0: + sent = sent // 1024 // 1024 + sent = f"{sent} MB/s" + + if received > 0: + received = received // 1024 // 1024 + received = f"{received} MB/s" + pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} B" + + monitor_values['pcie_tx'] = sent + monitor_values['pcie_rx'] = received + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['pcie_tx'] = "N/A" + monitor_values['pcie_rx'] = "N/A" + logging.debug("Failed to get pci 
throughput on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'PCIE_TX'.rjust(10) + self.logger.table_header += 'PCIE_RX'.rjust(10) + + self.logger.store_output(args.gpu, 'values', monitor_values) + + if multiple_devices: + self.logger.store_multiple_device_output() + return # Skip printing when there are multiple devices + + self.logger.print_output(watching_output=watching_output, tabular=True) + + if watching_output: # End of single gpu add to watch_output + self.logger.store_watch_output(multiple_device_enabled=False) + + def rocm_smi(self, args): print("Placeholder for rocm-smi legacy commands") diff --git a/amdsmi_cli/amdsmi_logger.py b/amdsmi_cli/amdsmi_logger.py index 3ebf1198..fb2c2dbe 100644 --- a/amdsmi_cli/amdsmi_logger.py +++ b/amdsmi_cli/amdsmi_logger.py @@ -24,6 +24,7 @@ import json import re import time +from typing import Dict import yaml from enum import Enum @@ -37,6 +38,8 @@ def __init__(self, format='human_readable', destination='stdout') -> None: self.watch_output = [] self.format = format # csv, json, or human_readable self.destination = destination # stdout, path to a file (append) + self.table_header = "" + self.table_title = "" self.helpers = AMDSMIHelpers() @@ -95,7 +98,30 @@ def _capitalize_keys(self, input_dict): return output_dict - def _convert_json_to_human_readable(self, json_object): + def _convert_json_to_human_readable(self, json_object: Dict[str, any], tabular=False): + # TODO make dynamic + if tabular: + table_values = '' + for key, value in json_object.items(): + value = str(value) + if key == 'gpu': + table_values += value.rjust(3) + elif key == 'timestamp': + table_values += value.rjust(10) + ' ' + elif key == 'power_usage': + table_values += value.rjust(7) + elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'): + table_values += value.rjust(11) + elif key == 'vram_total' or 'ecc' in key: + table_values += value.rjust(12) + elif key in ('throttle_status', 
'pcie_replay'): + table_values += value.rjust(13) + elif 'gpu_' in key: # handle topology tables + table_values += value.rjust(13) + else: + table_values += value.rjust(10) + return table_values.rstrip() + # First Capitalize all keys in the json object capitalized_json = self._capitalize_keys(json_object) json_string = json.dumps(capitalized_json, indent=4) @@ -266,7 +292,7 @@ def store_watch_output(self, multiple_device_enabled=False): self.output = {} - def print_output(self, multiple_device_enabled=False, watching_output=False): + def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False): """ Print current output acording to format and then destination params: multiple_device_enabled (bool) - True if printing output from @@ -280,10 +306,11 @@ def print_output(self, multiple_device_enabled=False, watching_output=False): watching_output=watching_output) elif self.is_csv_format(): self._print_csv_output(multiple_device_enabled=multiple_device_enabled, - watching_output=watching_output) + watching_output=watching_output) elif self.is_human_readable_format(): self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled, - watching_output=watching_output) + watching_output=watching_output, + tabular=tabular) def _print_json_output(self, multiple_device_enabled=False, watching_output=False): @@ -360,14 +387,19 @@ def _print_csv_output(self, multiple_device_enabled=False, watching_output=False writer.writerows(stored_csv_output) - def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False): + def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False, tabular=False): + human_readable_output = '' + if tabular: + if self.table_title: + human_readable_output += self.table_title + ':\n' + human_readable_output += self.table_header + '\n' + if multiple_device_enabled: - human_readable_output = '' for output in self.multiple_device_output: - 
human_readable_output += self._convert_json_to_human_readable(output) + human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular) human_readable_output += '\n' else: - human_readable_output = self._convert_json_to_human_readable(self.output) + human_readable_output += self._convert_json_to_human_readable(self.output, tabular=tabular) if self.destination == 'stdout': try: @@ -380,8 +412,14 @@ def _print_human_readable_output(self, multiple_device_enabled=False, watching_o if watching_output: with self.destination.open('w') as output_file: human_readable_output = '' + if tabular: + if self.table_title: + human_readable_output += self.table_title + '\n' + human_readable_output += self.table_header + '\n' for output in self.watch_output: - human_readable_output += self._convert_json_to_human_readable(output) + human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular) + if tabular: + human_readable_output += '\n' output_file.write(human_readable_output + '\n') else: with self.destination.open('a') as output_file: diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 262b2358..4828854c 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -66,7 +66,8 @@ class AMDSMIParser(argparse.ArgumentParser): argparse (ArgumentParser): argparse.ArgumentParser """ def __init__(self, version, list, static, firmware, bad_pages, metric, - process, profile, event, topology, set_value, reset, rocmsmi): + process, profile, event, topology, set_value, reset, monitor, + rocmsmi): # Helper variables self.helpers = AMDSMIHelpers() @@ -105,6 +106,7 @@ def __init__(self, version, list, static, firmware, bad_pages, metric, self._add_topology_parser(self.subparsers, topology) self._add_set_value_parser(self.subparsers, set_value) self._add_reset_parser(self.subparsers, reset) + self._add_monitor_parser(self.subparsers, monitor) self._add_rocm_smi_parser(self.subparsers, rocmsmi) @@ -813,6 
+815,54 @@ def _add_reset_parser(self, subparsers, func): reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help) + def _add_monitor_parser(self, subparsers, func): + if not(self.helpers.is_baremetal() and self.helpers.is_linux()): + # This subparser is only applicable to Baremetal Linux + return + + # Subparser help text + monitor_help = "Monitor metrics for target devices" + monitor_subcommand_help = "Monitor a target device for the specified arguments.\ + \nIf no arguments are provided, all arguments will be enabled.\ + \nUse the watch arguments to run continuously" + monitor_optionals_title = "Monitor Arguments" + + # Help text for Arguments only on Guest and BM platforms + power_usage_help = "Monitor power usage in Watts" + temperature_help = "Monitor temperature in Celsius" + gfx_util_help = "Monitor graphics utilization (%%) and clock (MHz)" + mem_util_help = "Monitor memory utilization (%%) and clock (MHz)" + encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)" + decoder_util_help = "Monitor decoder utilization (%%) and clock (MHz)" + throttle_help = "Monitor thermal throttle status" + ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts" + mem_usage_help = "Monitor memory usage in MB" + pcie_throughput_help = "Monitor PCIe Tx/Rx in MB/s" + + # Create monitor subparser + monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help) + monitor_parser._optionals.title = monitor_optionals_title + monitor_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + monitor_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(monitor_parser) + self._add_device_arguments(monitor_parser, required=False) + self._add_watch_arguments(monitor_parser) + + # Add monitor arguments + monitor_parser.add_argument('-p', '--power-usage', action='store_true', required=False, 
help=power_usage_help) + monitor_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) + monitor_parser.add_argument('-u', '--gfx', action='store_true', required=False, help=gfx_util_help) + monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help) + monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help) + monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help) + monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help) + monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) + monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help) + monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_throughput_help) + + def _add_rocm_smi_parser(self, subparsers, func): return # Subparser help text diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc index 7d046cf4..280011b0 100644 --- a/example/amd_smi_drm_example.cc +++ b/example/amd_smi_drm_example.cc @@ -316,6 +316,9 @@ int main() { cache_info.cache[i].cache_level, cache_info.cache[i].cache_size_kb, cache_info.cache[i].flags); + printf("\tMax number CU shared: %d, Number of instances: %d\n", + cache_info.cache[i].max_num_cu_shared, + cache_info.cache[i].num_cache_instance); } // Get power measure diff --git a/example/amd_smi_nodrm_example.cc b/example/amd_smi_nodrm_example.cc index 05ff84d6..0f829375 100644 --- a/example/amd_smi_nodrm_example.cc +++ b/example/amd_smi_nodrm_example.cc @@ -151,8 +151,7 @@ int main() { printf("\tVendorID: 0x%x\n", asic_info.vendor_id); printf("\tRevisionID: 0x%x\n", asic_info.rev_id); printf("\tAsic serial: 0x%s\n", asic_info.asic_serial); - printf("\tXGMI physical id: 0x%x\n\n", - 
asic_info.xgmi_physical_id); + printf("\tOAM id: 0x%x\n\n", asic_info.oam_id); // Get VBIOS info amdsmi_vbios_info_t vbios_info = {}; diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 66c2a3d6..6dff3c33 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -475,6 +475,8 @@ typedef struct { uint32_t cache_size_kb; /* In KB */ uint32_t cache_level; uint32_t flags; // amdsmi_cache_flags_type_t which is a bitmask + uint32_t max_num_cu_shared; /* Indicates how many Compute Units share this cache instance */ + uint32_t num_cache_instance; /* total number of instances of this cache type */ uint32_t reserved[3]; } cache[AMDSMI_MAX_CACHE_TYPES]; uint32_t reserved[15]; @@ -498,7 +500,7 @@ typedef struct { uint64_t device_id; //< The device id of a GPU uint32_t rev_id; char asic_serial[AMDSMI_NORMAL_STRING_LENGTH]; - uint16_t xgmi_physical_id; //< 0xFFFF if not supported + uint16_t oam_id; //< 0xFFFF if not supported uint16_t reserved[37]; } amdsmi_asic_info_t; diff --git a/include/amd_smi/impl/amd_smi_system.h b/include/amd_smi/impl/amd_smi_system.h index 348782cf..596c9092 100644 --- a/include/amd_smi/impl/amd_smi_system.h +++ b/include/amd_smi/impl/amd_smi_system.h @@ -99,7 +99,12 @@ class AMDSmiSystem { #endif private: AMDSmiSystem() : init_flag_(AMDSMI_INIT_AMD_GPUS) {} - amdsmi_status_t get_gpu_bdf_by_index(uint32_t index, std::string& bdf); + + /* The GPU socket id is used to identify the socket, so that the XCDs + on the same physical device will be collected under the same socket. + The BD part of the BDF is used as GPU socket to represent a physical device.
+ */ + amdsmi_status_t get_gpu_socket_id(uint32_t index, std::string& socketid); amdsmi_status_t populate_amd_gpu_devices(); uint64_t init_flag_; AMDSmiDrm drm_; diff --git a/py-interface/README.md b/py-interface/README.md index dbe0e9fa..013c3477 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -351,7 +351,7 @@ Field | Content `device_id` | device id `rev_id` | revision id `asic_serial` | asic serial -`xgmi_physical_id` | xgmi physical id +`oam_id` | oam id Exceptions that can be thrown by `amdsmi_get_gpu_asic_info` function: @@ -375,7 +375,7 @@ try: print(hex(asic_info['device_id'])) print(hex(asic_info['rev_id'])) print(asic_info['asic_serial']) - print(asic_info['xgmi_physical_id']) + print(asic_info['oam_id']) except AmdSmiException as e: print(e) ``` diff --git a/py-interface/__init__.py b/py-interface/__init__.py index 4d98d7fd..2c54cfff 100644 --- a/py-interface/__init__.py +++ b/py-interface/__init__.py @@ -194,6 +194,14 @@ from .amdsmi_interface import amdsmi_is_P2P_accessible from .amdsmi_interface import amdsmi_get_xgmi_info +# # Partition Functions +from .amdsmi_interface import amdsmi_get_gpu_compute_partition +from .amdsmi_interface import amdsmi_set_gpu_compute_partition +from .amdsmi_interface import amdsmi_reset_gpu_compute_partition +from .amdsmi_interface import amdsmi_get_gpu_memory_partition +from .amdsmi_interface import amdsmi_set_gpu_memory_partition +from .amdsmi_interface import amdsmi_reset_gpu_memory_partition + # # Individual GPU Metrics Functions from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_hotspot from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_mem @@ -263,6 +271,8 @@ from .amdsmi_interface import AmdSmiTemperatureMetric from .amdsmi_interface import AmdSmiVoltageMetric from .amdsmi_interface import AmdSmiVoltageType +from .amdsmi_interface import AmdSmiComputePartitionType +from .amdsmi_interface import AmdSmiMemoryPartitionType from .amdsmi_interface import AmdSmiPowerProfilePresetMasks 
from .amdsmi_interface import AmdSmiGpuBlock from .amdsmi_interface import AmdSmiRasErrState diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 924c782e..dc4a9245 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -1243,7 +1243,7 @@ def amdsmi_get_gpu_asic_info( "device_id": asic_info.device_id, "rev_id": asic_info.rev_id, "asic_serial": asic_info.asic_serial.decode("utf-8"), - "xgmi_physical_id": asic_info.xgmi_physical_id + "oam_id": asic_info.oam_id } @@ -1308,6 +1308,8 @@ def amdsmi_get_gpu_cache_info( for cache_index in range(cache_info.num_cache_types): cache_size = cache_info.cache[cache_index].cache_size_kb cache_level = cache_info.cache[cache_index].cache_level + max_num_cu_shared = cache_info.cache[cache_index].max_num_cu_shared + num_cache_instance = cache_info.cache[cache_index].num_cache_instance cache_flags = cache_info.cache[cache_index].flags data_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_DATA_CACHE) inst_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_INST_CACHE) @@ -1315,10 +1317,13 @@ def amdsmi_get_gpu_cache_info( simd_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_SIMD_CACHE) cache_info_dict[f"cache {cache_index}"] = {"cache_size": cache_size, "cache_level": cache_level, - "data_cache": data_cache, - "instruction_cache": inst_cache, - "cpu_cache": cpu_cache, - "simd_cache": simd_cache} + "max_num_cu_shared": max_num_cu_shared, + "num_cache_instance": num_cache_instance} + if (data_cache): cache_info_dict[f"cache {cache_index}"]["data_cache"] = data_cache + if (inst_cache): cache_info_dict[f"cache {cache_index}"]["inst_cache"] = inst_cache + if (cpu_cache): cache_info_dict[f"cache {cache_index}"]["cpu_cache"] = cpu_cache + if (simd_cache): cache_info_dict[f"cache {cache_index}"]["simd_cache"] = simd_cache + if cache_info_dict == {}: raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NO_DATA) @@ -1624,8 +1629,7 @@ def 
amdsmi_get_gpu_driver_info( return { "driver_name": info.driver_name.decode("utf-8"), - "driver_version": info.driver_version.decode("utf-8"), - "driver_date": info.driver_date.decode("utf-8") + "driver_version": info.driver_version.decode("utf-8") } @@ -3244,7 +3248,7 @@ def amdsmi_get_gpu_metrics_temp_hotspot( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - hotspot_value = ctypes.c_int16() + hotspot_value = ctypes.c_uint16() _check_res( amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_hotspot( processor_handle, ctypes.byref(hotspot_value) @@ -3265,7 +3269,7 @@ def amdsmi_get_gpu_metrics_temp_mem( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - mem_value = ctypes.c_int16() + mem_value = ctypes.c_uint16() _check_res( amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_mem( processor_handle, ctypes.byref(mem_value) @@ -3286,7 +3290,7 @@ def amdsmi_get_gpu_metrics_temp_vrsoc( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - vrsoc_value = ctypes.c_int16() + vrsoc_value = ctypes.c_uint16() _check_res( amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrsoc( processor_handle, ctypes.byref(vrsoc_value) @@ -3754,7 +3758,7 @@ def amdsmi_get_gpu_metrics_vcn_activity( ) ) - return [vcn_activity.value for vcn_activity in vcn_activity_value] + return vcn_activity_value def amdsmi_get_gpu_metrics_xgmi_read_data( @@ -3811,7 +3815,9 @@ def amdsmi_get_gpu_metrics_curr_gfxclk( ) ) - return [curr_gfxclk.value for curr_gfxclk in current_gfxclk_value] + print([curr_gfxclk for curr_gfxclk in current_gfxclk_value]) + + return [curr_gfxclk for curr_gfxclk in current_gfxclk_value] def amdsmi_get_gpu_metrics_curr_socclk( @@ -3879,7 +3885,7 @@ def amdsmi_get_gpu_metrics_temp_edge( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - edge_value = ctypes.c_int16() + edge_value = ctypes.c_uint16() _check_res( amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_edge( @@ -3901,7 +3907,7 @@ def amdsmi_get_gpu_metrics_temp_vrgfx( processor_handle, 
amdsmi_wrapper.amdsmi_processor_handle ) - vrgfx_value = ctypes.c_int16() + vrgfx_value = ctypes.c_uint16() _check_res( amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrgfx( @@ -3923,7 +3929,7 @@ def amdsmi_get_gpu_metrics_temp_vrmem( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - vrmem_value = ctypes.c_int16() + vrmem_value = ctypes.c_uint16() _check_res( amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrmem( diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 9ab1307a..dccdcd81 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -773,6 +773,8 @@ class struct_cache_(Structure): ('cache_size_kb', ctypes.c_uint32), ('cache_level', ctypes.c_uint32), ('flags', ctypes.c_uint32), + ('max_num_cu_shared', ctypes.c_uint32), + ('num_cache_instance', ctypes.c_uint32), ('reserved', ctypes.c_uint32 * 3), ] @@ -820,7 +822,7 @@ class struct_amdsmi_asic_info_t(Structure): ('device_id', ctypes.c_uint64), ('rev_id', ctypes.c_uint32), ('asic_serial', ctypes.c_char * 32), - ('xgmi_physical_id', ctypes.c_uint16), + ('oam_id', ctypes.c_uint16), ('reserved', ctypes.c_uint16 * 37), ] diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index e397ad27..0236803e 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -876,6 +876,8 @@ typedef struct { so HSA_CACHE_TYPE_DATA|HSA_CACHE_TYPE_HSACU == 9 */ uint32_t flags; + uint32_t max_num_cu_shared; /* Indicates how many Compute Units share this cache instance */ + uint32_t num_cache_instance; /* total number of instance of this cache type */ } cache[RSMI_MAX_CACHE_TYPES]; } rsmi_gpu_cache_info_t; /// \cond Ignore in docs. @@ -1746,7 +1748,7 @@ rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id); * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
* */ -rsmi_status_t rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id); +rsmi_status_t rsmi_dev_oam_id_get(uint32_t dv_ind, uint16_t *id); /** @} */ // end of IDQuer diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 3a71904b..98870867 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -941,7 +941,7 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { } rsmi_status_t -rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id) { +rsmi_dev_oam_id_get(uint32_t dv_ind, uint16_t *id) { std::ostringstream ss; rsmi_status_t ret; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 9aa92b7f..469d7cbf 100755 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -395,7 +395,7 @@ static const std::map kDevFuncDependsMap = { // Functions with only mandatory dependencies {"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}}, {"rsmi_dev_id_get", {{kDevDevIDFName}, {}}}, - {"rsmi_dev_xgmi_physical_id_get", {{kDevXGMIPhysicalIDFName}, {}}}, + {"rsmi_dev_oam_id_get", {{kDevXGMIPhysicalIDFName}, {}}}, {"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}}, {"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}}, {"rsmi_dev_name_get", {{kDevVendorIDFName, diff --git a/rocm_smi/src/rocm_smi_kfd.cc b/rocm_smi/src/rocm_smi_kfd.cc index 2c74b289..fb2c2157 100755 --- a/rocm_smi/src/rocm_smi_kfd.cc +++ b/rocm_smi/src/rocm_smi_kfd.cc @@ -930,17 +930,27 @@ int KFDNode::get_cache_info(rsmi_gpu_cache_info_t *info) { int cache_type = std::stoi(type); if (cache_type <= 0) continue; - // only count once + // num_cu_shared – this can be fetched by counting the number of 1’s in the sibling_map. 
+ std::string sibling_map = + get_properties_from_file(prop_file, "sibling_map "); + uint32_t num_cu_shared = + std::count(sibling_map.begin(), sibling_map.end(), '1'); + + // known cache type bool is_count_already = false; for (unsigned int i=0; i < info->num_cache_types; i++) { if (info->cache[i].cache_level == static_cast(cache_level) && info->cache[i].flags == static_cast(cache_type)) { is_count_already = true; + if (info->cache[i].max_num_cu_shared < num_cu_shared) + info->cache[i].max_num_cu_shared = num_cu_shared; + info->cache[i].num_cache_instance++; break; } } if (is_count_already) continue; + // new cache type if (info->num_cache_types >= RSMI_MAX_CACHE_TYPES) return 1; std::string size = get_properties_from_file(prop_file, "size "); int cache_size = std::stoi(size); @@ -948,6 +958,8 @@ int KFDNode::get_cache_info(rsmi_gpu_cache_info_t *info) { info->cache[info->num_cache_types].cache_level = cache_level; info->cache[info->num_cache_types].cache_size_kb = cache_size; + info->cache[info->num_cache_types].max_num_cu_shared = num_cu_shared; + info->cache[info->num_cache_types].num_cache_instance = 1; info->cache[info->num_cache_types].flags = cache_type; info->num_cache_types++; } catch (...) 
{ diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index f92e28c2..917784d5 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -520,6 +520,8 @@ amdsmi_status_t amdsmi_get_gpu_cache_info( for (unsigned int i =0; i < rsmi_info.num_cache_types; i++) { info->cache[i].cache_size_kb = rsmi_info.cache[i].cache_size_kb; info->cache[i].cache_level = rsmi_info.cache[i].cache_level; + info->cache[i].max_num_cu_shared = rsmi_info.cache[i].max_num_cu_shared; + info->cache[i].num_cache_instance = rsmi_info.cache[i].num_cache_instance; // convert from sysfs type to CRAT type(HSA Cache Affinity type) info->cache[i].flags = 0; if (rsmi_info.cache[i].flags & HSA_CACHE_TYPE_DATA) @@ -753,9 +755,9 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i info->vendor_name, AMDSMI_MAX_STRING_LENGTH); // default to 0xffff as not supported - info->xgmi_physical_id = std::numeric_limits::max(); - status = rsmi_wrapper(rsmi_dev_xgmi_physical_id_get, processor_handle, - &(info->xgmi_physical_id)); + info->oam_id = std::numeric_limits::max(); + status = rsmi_wrapper(rsmi_dev_oam_id_get, processor_handle, + &(info->oam_id)); return AMDSMI_STATUS_SUCCESS; } diff --git a/src/amd_smi/amd_smi_system.cc b/src/amd_smi/amd_smi_system.cc index 939dea6f..e9fa857b 100644 --- a/src/amd_smi/amd_smi_system.cc +++ b/src/amd_smi/amd_smi_system.cc @@ -231,7 +231,7 @@ amdsmi_status_t AMDSmiSystem::populate_amd_gpu_devices() { for (uint32_t i=0; i < device_count; i++) { // GPU device uses the bdf as the socket id std::string socket_id; - amd_smi_status = get_gpu_bdf_by_index(i, socket_id); + amd_smi_status = get_gpu_socket_id(i, socket_id); if (amd_smi_status != AMDSMI_STATUS_SUCCESS) { return amd_smi_status; } @@ -256,8 +256,8 @@ amdsmi_status_t AMDSmiSystem::populate_amd_gpu_devices() { return AMDSMI_STATUS_SUCCESS; } -amdsmi_status_t AMDSmiSystem::get_gpu_bdf_by_index(uint32_t index, - std::string& bdf) { +amdsmi_status_t 
AMDSmiSystem::get_gpu_socket_id(uint32_t index, + std::string& socket_id) { uint64_t bdfid = 0; rsmi_status_t ret = rsmi_dev_pci_id_get(index, &bdfid); if (ret != RSMI_STATUS_SUCCESS) { @@ -269,11 +269,13 @@ amdsmi_status_t AMDSmiSystem::get_gpu_bdf_by_index(uint32_t index, uint64_t device_id = (bdfid >> 3) & 0x1f; uint64_t function = bdfid & 0x7; + // The BD part of the BDF is used as the socket id as it + // represents a physical device. std::stringstream ss; ss << std::setfill('0') << std::uppercase << std::hex << std::setw(4) << domain << ":" << std::setw(2) << bus << ":" - << std::setw(2) << device_id << "." << std::setw(2) << function; - bdf = ss.str(); + << std::setw(2) << device_id; + socket_id = ss.str(); return AMDSMI_STATUS_SUCCESS; }