Skip to content

Commit

Permalink
Merge amd-dev into amd-master 20231215
Browse files Browse the repository at this point in the history
Signed-off-by: guanyu12 <[email protected]>
Change-Id: I411755ed67b0d92d37676ca4b7f0709971f4a80f
  • Loading branch information
guanyu12 committed Dec 15, 2023
2 parents 2e133e1 + 16ed186 commit 5c53dd3
Show file tree
Hide file tree
Showing 13 changed files with 565 additions and 32 deletions.
3 changes: 2 additions & 1 deletion amdsmi_cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,6 @@ LIMIT:
DRIVER:
DRIVER_NAME: amdgpu
DRIVER_VERSION: 6.1.10
DRIVER_DATE: 2015/01/01 00:00
VRAM:
VRAM_TYPE: GDDR6
VRAM_VENDOR: SAMSUNG
Expand All @@ -448,6 +447,8 @@ CACHE:
CACHE 0:
CACHE_SIZE: 16 KB
CACHE_LEVEL: 1
MAX_NUM_CU_SHARED: 1
NUM_CACHE_INSTANCE: 304
RAS:
EEPROM_VERSION: N/A
PARITY_SCHEMA: N/A
Expand Down
1 change: 1 addition & 0 deletions amdsmi_cli/amdsmi_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def _print_error(e, destination):
amd_smi_commands.topology,
amd_smi_commands.set_value,
amd_smi_commands.reset,
amd_smi_commands.monitor,
amd_smi_commands.rocm_smi)
try:
try:
Expand Down
416 changes: 410 additions & 6 deletions amdsmi_cli/amdsmi_commands.py

Large diffs are not rendered by default.

56 changes: 47 additions & 9 deletions amdsmi_cli/amdsmi_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import json
import re
import time
from typing import Dict
import yaml
from enum import Enum

Expand All @@ -37,6 +38,8 @@ def __init__(self, format='human_readable', destination='stdout') -> None:
self.watch_output = []
self.format = format # csv, json, or human_readable
self.destination = destination # stdout, path to a file (append)
self.table_header = ""
self.table_title = ""
self.helpers = AMDSMIHelpers()


Expand Down Expand Up @@ -95,7 +98,30 @@ def _capitalize_keys(self, input_dict):
return output_dict


def _convert_json_to_human_readable(self, json_object):
def _convert_json_to_human_readable(self, json_object: Dict[str, any], tabular=False):
# TODO make dynamic
if tabular:
table_values = ''
for key, value in json_object.items():
value = str(value)
if key == 'gpu':
table_values += value.rjust(3)
elif key == 'timestamp':
table_values += value.rjust(10) + ' '
elif key == 'power_usage':
table_values += value.rjust(7)
elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'):
table_values += value.rjust(11)
elif key == 'vram_total' or 'ecc' in key:
table_values += value.rjust(12)
elif key in ('throttle_status', 'pcie_replay'):
table_values += value.rjust(13)
elif 'gpu_' in key: # handle topology tables
table_values += value.rjust(13)
else:
table_values += value.rjust(10)
return table_values.rstrip()

# First Capitalize all keys in the json object
capitalized_json = self._capitalize_keys(json_object)
json_string = json.dumps(capitalized_json, indent=4)
Expand Down Expand Up @@ -266,7 +292,7 @@ def store_watch_output(self, multiple_device_enabled=False):
self.output = {}


def print_output(self, multiple_device_enabled=False, watching_output=False):
def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False):
""" Print current output acording to format and then destination
params:
multiple_device_enabled (bool) - True if printing output from
Expand All @@ -280,10 +306,11 @@ def print_output(self, multiple_device_enabled=False, watching_output=False):
watching_output=watching_output)
elif self.is_csv_format():
self._print_csv_output(multiple_device_enabled=multiple_device_enabled,
watching_output=watching_output)
watching_output=watching_output)
elif self.is_human_readable_format():
self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled,
watching_output=watching_output)
watching_output=watching_output,
tabular=tabular)


def _print_json_output(self, multiple_device_enabled=False, watching_output=False):
Expand Down Expand Up @@ -360,14 +387,19 @@ def _print_csv_output(self, multiple_device_enabled=False, watching_output=False
writer.writerows(stored_csv_output)


def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False):
def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False, tabular=False):
human_readable_output = ''
if tabular:
if self.table_title:
human_readable_output += self.table_title + ':\n'
human_readable_output += self.table_header + '\n'

if multiple_device_enabled:
human_readable_output = ''
for output in self.multiple_device_output:
human_readable_output += self._convert_json_to_human_readable(output)
human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular)
human_readable_output += '\n'
else:
human_readable_output = self._convert_json_to_human_readable(self.output)
human_readable_output += self._convert_json_to_human_readable(self.output, tabular=tabular)

if self.destination == 'stdout':
try:
Expand All @@ -380,8 +412,14 @@ def _print_human_readable_output(self, multiple_device_enabled=False, watching_o
if watching_output:
with self.destination.open('w') as output_file:
human_readable_output = ''
if tabular:
if self.table_title:
human_readable_output += self.table_title + '\n'
human_readable_output += self.table_header + '\n'
for output in self.watch_output:
human_readable_output += self._convert_json_to_human_readable(output)
human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular)
if tabular:
human_readable_output += '\n'
output_file.write(human_readable_output + '\n')
else:
with self.destination.open('a') as output_file:
Expand Down
52 changes: 51 additions & 1 deletion amdsmi_cli/amdsmi_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ class AMDSMIParser(argparse.ArgumentParser):
argparse (ArgumentParser): argparse.ArgumentParser
"""
def __init__(self, version, list, static, firmware, bad_pages, metric,
process, profile, event, topology, set_value, reset, rocmsmi):
process, profile, event, topology, set_value, reset, monitor,
rocmsmi):

# Helper variables
self.helpers = AMDSMIHelpers()
Expand Down Expand Up @@ -105,6 +106,7 @@ def __init__(self, version, list, static, firmware, bad_pages, metric,
self._add_topology_parser(self.subparsers, topology)
self._add_set_value_parser(self.subparsers, set_value)
self._add_reset_parser(self.subparsers, reset)
self._add_monitor_parser(self.subparsers, monitor)
self._add_rocm_smi_parser(self.subparsers, rocmsmi)


Expand Down Expand Up @@ -813,6 +815,54 @@ def _add_reset_parser(self, subparsers, func):
reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help)


def _add_monitor_parser(self, subparsers, func):
if not(self.helpers.is_baremetal() and self.helpers.is_linux()):
# This subparser is only applicable to Baremetal Linux
return

# Subparser help text
monitor_help = "Monitor metrics for target devices"
monitor_subcommand_help = "Monitor a target device for the specified arguments.\
\nIf no arguments are provided, all arguments will be enabled.\
\nUse the watch arguments to run continuously"
monitor_optionals_title = "Monitor Arguments"

# Help text for Arguments only on Guest and BM platforms
power_usage_help = "Monitor power usage in Watts"
temperature_help = "Monitor temperature in Celsius"
gfx_util_help = "Monitor graphics utilization (%%) and clock (MHz)"
mem_util_help = "Monitor memory utilization (%%) and clock (MHz)"
encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)"
decoder_util_help = "Monitor decoder utilization (%%) and clock (MHz)"
throttle_help = "Monitor thermal throttle status"
ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts"
mem_usage_help = "Monitor memory usage in MB"
pcie_throughput_help = "Monitor PCIe Tx/Rx in MB/s"

# Create monitor subparser
monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help)
monitor_parser._optionals.title = monitor_optionals_title
monitor_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog)
monitor_parser.set_defaults(func=func)

# Add Universal Arguments
self._add_command_modifiers(monitor_parser)
self._add_device_arguments(monitor_parser, required=False)
self._add_watch_arguments(monitor_parser)

# Add monitor arguments
monitor_parser.add_argument('-p', '--power-usage', action='store_true', required=False, help=power_usage_help)
monitor_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
monitor_parser.add_argument('-u', '--gfx', action='store_true', required=False, help=gfx_util_help)
monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help)
monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help)
monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help)
monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help)
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help)
monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_throughput_help)


def _add_rocm_smi_parser(self, subparsers, func):
return
# Subparser help text
Expand Down
3 changes: 3 additions & 0 deletions example/amd_smi_drm_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,9 @@ int main() {
cache_info.cache[i].cache_level,
cache_info.cache[i].cache_size_kb,
cache_info.cache[i].flags);
printf("\tMax number CU shared: %d, Number of instances: %d\n",
cache_info.cache[i].max_num_cu_shared,
cache_info.cache[i].num_cache_instance);
}

// Get power measure
Expand Down
2 changes: 2 additions & 0 deletions include/amd_smi/amdsmi.h
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,8 @@ typedef struct {
uint32_t cache_size_kb; /* In KB */
uint32_t cache_level;
uint32_t flags; // amdsmi_cache_flags_type_t which is a bitmask
uint32_t max_num_cu_shared; /* Indicates how many Compute Units share this cache instance */
uint32_t num_cache_instance; /* total number of instances of this cache type */
uint32_t reserved[3];
} cache[AMDSMI_MAX_CACHE_TYPES];
uint32_t reserved[15];
Expand Down
10 changes: 10 additions & 0 deletions py-interface/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,14 @@
from .amdsmi_interface import amdsmi_is_P2P_accessible
from .amdsmi_interface import amdsmi_get_xgmi_info

# # Partition Functions
from .amdsmi_interface import amdsmi_get_gpu_compute_partition
from .amdsmi_interface import amdsmi_set_gpu_compute_partition
from .amdsmi_interface import amdsmi_reset_gpu_compute_partition
from .amdsmi_interface import amdsmi_get_gpu_memory_partition
from .amdsmi_interface import amdsmi_set_gpu_memory_partition
from .amdsmi_interface import amdsmi_reset_gpu_memory_partition

# # Individual GPU Metrics Functions
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_hotspot
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_mem
Expand Down Expand Up @@ -263,6 +271,8 @@
from .amdsmi_interface import AmdSmiTemperatureMetric
from .amdsmi_interface import AmdSmiVoltageMetric
from .amdsmi_interface import AmdSmiVoltageType
from .amdsmi_interface import AmdSmiComputePartitionType
from .amdsmi_interface import AmdSmiMemoryPartitionType
from .amdsmi_interface import AmdSmiPowerProfilePresetMasks
from .amdsmi_interface import AmdSmiGpuBlock
from .amdsmi_interface import AmdSmiRasErrState
Expand Down
34 changes: 20 additions & 14 deletions py-interface/amdsmi_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,17 +1308,22 @@ def amdsmi_get_gpu_cache_info(
for cache_index in range(cache_info.num_cache_types):
cache_size = cache_info.cache[cache_index].cache_size_kb
cache_level = cache_info.cache[cache_index].cache_level
max_num_cu_shared = cache_info.cache[cache_index].max_num_cu_shared
num_cache_instance = cache_info.cache[cache_index].num_cache_instance
cache_flags = cache_info.cache[cache_index].flags
data_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_DATA_CACHE)
inst_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_INST_CACHE)
cpu_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_CPU_CACHE)
simd_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_SIMD_CACHE)
cache_info_dict[f"cache {cache_index}"] = {"cache_size": cache_size,
"cache_level": cache_level,
"data_cache": data_cache,
"instruction_cache": inst_cache,
"cpu_cache": cpu_cache,
"simd_cache": simd_cache}
"max_num_cu_shared": max_num_cu_shared,
"num_cache_instance": num_cache_instance}
if (data_cache): cache_info_dict[f"cache {cache_index}"]["data_cache"] = data_cache
if (inst_cache): cache_info_dict[f"cache {cache_index}"]["inst_cache"] = inst_cache
if (cpu_cache): cache_info_dict[f"cache {cache_index}"]["cpu_cache"] = cpu_cache
if (simd_cache): cache_info_dict[f"cache {cache_index}"]["simd_cache"] = simd_cache


if cache_info_dict == {}:
raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NO_DATA)
Expand Down Expand Up @@ -1624,8 +1629,7 @@ def amdsmi_get_gpu_driver_info(

return {
"driver_name": info.driver_name.decode("utf-8"),
"driver_version": info.driver_version.decode("utf-8"),
"driver_date": info.driver_date.decode("utf-8")
"driver_version": info.driver_version.decode("utf-8")
}


Expand Down Expand Up @@ -3244,7 +3248,7 @@ def amdsmi_get_gpu_metrics_temp_hotspot(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

hotspot_value = ctypes.c_int16()
hotspot_value = ctypes.c_uint16()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_hotspot(
processor_handle, ctypes.byref(hotspot_value)
Expand All @@ -3265,7 +3269,7 @@ def amdsmi_get_gpu_metrics_temp_mem(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

mem_value = ctypes.c_int16()
mem_value = ctypes.c_uint16()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_mem(
processor_handle, ctypes.byref(mem_value)
Expand All @@ -3286,7 +3290,7 @@ def amdsmi_get_gpu_metrics_temp_vrsoc(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

vrsoc_value = ctypes.c_int16()
vrsoc_value = ctypes.c_uint16()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrsoc(
processor_handle, ctypes.byref(vrsoc_value)
Expand Down Expand Up @@ -3754,7 +3758,7 @@ def amdsmi_get_gpu_metrics_vcn_activity(
)
)

return [vcn_activity.value for vcn_activity in vcn_activity_value]
return vcn_activity_value


def amdsmi_get_gpu_metrics_xgmi_read_data(
Expand Down Expand Up @@ -3811,7 +3815,9 @@ def amdsmi_get_gpu_metrics_curr_gfxclk(
)
)

return [curr_gfxclk.value for curr_gfxclk in current_gfxclk_value]
print([curr_gfxclk for curr_gfxclk in current_gfxclk_value])

return [curr_gfxclk for curr_gfxclk in current_gfxclk_value]


def amdsmi_get_gpu_metrics_curr_socclk(
Expand Down Expand Up @@ -3879,7 +3885,7 @@ def amdsmi_get_gpu_metrics_temp_edge(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

edge_value = ctypes.c_int16()
edge_value = ctypes.c_uint16()

_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_edge(
Expand All @@ -3901,7 +3907,7 @@ def amdsmi_get_gpu_metrics_temp_vrgfx(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

vrgfx_value = ctypes.c_int16()
vrgfx_value = ctypes.c_uint16()

_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrgfx(
Expand All @@ -3923,7 +3929,7 @@ def amdsmi_get_gpu_metrics_temp_vrmem(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
)

vrmem_value = ctypes.c_int16()
vrmem_value = ctypes.c_uint16()

_check_res(
amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrmem(
Expand Down
2 changes: 2 additions & 0 deletions py-interface/amdsmi_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,8 @@ class struct_cache_(Structure):
('cache_size_kb', ctypes.c_uint32),
('cache_level', ctypes.c_uint32),
('flags', ctypes.c_uint32),
('max_num_cu_shared', ctypes.c_uint32),
('num_cache_instance', ctypes.c_uint32),
('reserved', ctypes.c_uint32 * 3),
]

Expand Down
2 changes: 2 additions & 0 deletions rocm_smi/include/rocm_smi/rocm_smi.h
Original file line number Diff line number Diff line change
Expand Up @@ -876,6 +876,8 @@ typedef struct {
so HSA_CACHE_TYPE_DATA|HSA_CACHE_TYPE_HSACU == 9
*/
uint32_t flags;
uint32_t max_num_cu_shared; /* Indicates how many Compute Units share this cache instance */
uint32_t num_cache_instance; /* total number of instances of this cache type */
} cache[RSMI_MAX_CACHE_TYPES];
} rsmi_gpu_cache_info_t;
/// \cond Ignore in docs.
Expand Down
Loading

0 comments on commit 5c53dd3

Please sign in to comment.