Merge amd-master into rocm-rel-6.0 20231214

Signed-off-by: Maisam Arif <[email protected]> Change-Id: Icff41b8e3c8a8374dde68dc21e4f57488e6fbc09
ROCm · Dec 15, 2023 · dba4f54 · dba4f54
2 parents 3f96bd1 + 5c53dd3
commit dba4f54
Show file tree

Hide file tree

Showing 20 changed files with 594 additions and 54 deletions.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -0,0 +1 @@
+* @marifamd @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan
diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md
@@ -409,7 +409,7 @@ ASIC:
     DEVICE_ID: 0x73bf
     REV_ID: 0xc3
     ASIC_SERIAL: 0xffffffffffffffff
-    XGMI_PHYSICAL_ID: N/A
+    OAM_ID: N/A
 BUS:
     BDF: 0000:23:00.0
     MAX_PCIE_SPEED: 16 GT/s
@@ -439,7 +439,6 @@ LIMIT:
 DRIVER:
     DRIVER_NAME: amdgpu
     DRIVER_VERSION: 6.1.10
-    DRIVER_DATE: 2015/01/01 00:00
 VRAM:
     VRAM_TYPE: GDDR6
     VRAM_VENDOR: SAMSUNG
@@ -448,6 +447,8 @@ CACHE:
     CACHE 0:
         CACHE_SIZE: 16 KB
         CACHE_LEVEL: 1
+        MAX_NUM_CU_SHARED: 1
+        NUM_CACHE_INSTANCE: 304
 RAS:
     EEPROM_VERSION: N/A
     PARITY_SCHEMA: N/A

diff --git a/amdsmi_cli/amdsmi_cli.py b/amdsmi_cli/amdsmi_cli.py
@@ -68,6 +68,7 @@ def _print_error(e, destination):
                                     amd_smi_commands.topology,
                                     amd_smi_commands.set_value,
                                     amd_smi_commands.reset,
+                                    amd_smi_commands.monitor,
                                     amd_smi_commands.rocm_smi)
     try:
         try:

diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py
diff --git a/amdsmi_cli/amdsmi_logger.py b/amdsmi_cli/amdsmi_logger.py
@@ -24,6 +24,7 @@
 import json
 import re
 import time
+from typing import Dict
 import yaml
 from enum import Enum
 
@@ -37,6 +38,8 @@ def __init__(self, format='human_readable', destination='stdout') -> None:
         self.watch_output = []
         self.format = format # csv, json, or human_readable
         self.destination = destination # stdout, path to a file (append)
+        self.table_header = ""
+        self.table_title = ""
         self.helpers = AMDSMIHelpers()
 
 
@@ -95,7 +98,30 @@ def _capitalize_keys(self, input_dict):
         return output_dict
 
 
-    def _convert_json_to_human_readable(self, json_object):
+    def _convert_json_to_human_readable(self, json_object: Dict[str, any], tabular=False):
+        # TODO make dynamic
+        if tabular:
+            table_values = ''
+            for key, value in json_object.items():
+                value = str(value)
+                if key == 'gpu':
+                    table_values += value.rjust(3)
+                elif key == 'timestamp':
+                    table_values += value.rjust(10) + '  '
+                elif key == 'power_usage':
+                    table_values += value.rjust(7)
+                elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'):
+                    table_values += value.rjust(11)
+                elif key == 'vram_total' or 'ecc' in key:
+                    table_values += value.rjust(12)
+                elif key in ('throttle_status', 'pcie_replay'):
+                    table_values += value.rjust(13)
+                elif 'gpu_' in key: # handle topology tables
+                    table_values += value.rjust(13)
+                else:
+                    table_values += value.rjust(10)
+            return table_values.rstrip()
+
         # First Capitalize all keys in the json object
         capitalized_json = self._capitalize_keys(json_object)
         json_string = json.dumps(capitalized_json, indent=4)
@@ -266,7 +292,7 @@ def store_watch_output(self, multiple_device_enabled=False):
             self.output = {}
 
 
-    def print_output(self, multiple_device_enabled=False, watching_output=False):
+    def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False):
         """ Print current output acording to format and then destination
             params:
                 multiple_device_enabled (bool) - True if printing output from
@@ -280,10 +306,11 @@ def print_output(self, multiple_device_enabled=False, watching_output=False):
                                     watching_output=watching_output)
         elif self.is_csv_format():
             self._print_csv_output(multiple_device_enabled=multiple_device_enabled,
-                                    watching_output=watching_output)
+                                   watching_output=watching_output)
         elif self.is_human_readable_format():
             self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled,
-                                                watching_output=watching_output)
+                                              watching_output=watching_output,
+                                              tabular=tabular)
 
 
     def _print_json_output(self, multiple_device_enabled=False, watching_output=False):
@@ -360,14 +387,19 @@ def _print_csv_output(self, multiple_device_enabled=False, watching_output=False
                     writer.writerows(stored_csv_output)
 
 
-    def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False):
+    def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False, tabular=False):
+        human_readable_output = ''
+        if tabular:
+            if self.table_title:
+                human_readable_output += self.table_title + ':\n'
+            human_readable_output += self.table_header + '\n'
+
         if multiple_device_enabled:
-            human_readable_output = ''
             for output in self.multiple_device_output:
-                human_readable_output += self._convert_json_to_human_readable(output)
+                human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular)
                 human_readable_output += '\n'
         else:
-            human_readable_output = self._convert_json_to_human_readable(self.output)
+            human_readable_output += self._convert_json_to_human_readable(self.output, tabular=tabular)
 
         if self.destination == 'stdout':
             try:
@@ -380,8 +412,14 @@ def _print_human_readable_output(self, multiple_device_enabled=False, watching_o
             if watching_output:
                 with self.destination.open('w') as output_file:
                     human_readable_output = ''
+                    if tabular:
+                        if self.table_title:
+                            human_readable_output += self.table_title + '\n'
+                        human_readable_output += self.table_header + '\n'
                     for output in self.watch_output:
-                        human_readable_output +=  self._convert_json_to_human_readable(output)
+                        human_readable_output += self._convert_json_to_human_readable(output, tabular=tabular)
+                        if tabular:
+                            human_readable_output += '\n'
                     output_file.write(human_readable_output + '\n')
             else:
                 with self.destination.open('a') as output_file:

diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py
@@ -66,7 +66,8 @@ class AMDSMIParser(argparse.ArgumentParser):
         argparse (ArgumentParser): argparse.ArgumentParser
     """
     def __init__(self, version, list, static, firmware, bad_pages, metric,
-                 process, profile, event, topology, set_value, reset, rocmsmi):
+                 process, profile, event, topology, set_value, reset, monitor,
+                 rocmsmi):
 
         # Helper variables
         self.helpers = AMDSMIHelpers()
@@ -105,6 +106,7 @@ def __init__(self, version, list, static, firmware, bad_pages, metric,
         self._add_topology_parser(self.subparsers, topology)
         self._add_set_value_parser(self.subparsers, set_value)
         self._add_reset_parser(self.subparsers, reset)
+        self._add_monitor_parser(self.subparsers, monitor)
         self._add_rocm_smi_parser(self.subparsers, rocmsmi)
 
 
@@ -813,6 +815,54 @@ def _add_reset_parser(self, subparsers, func):
         reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help)
 
 
+    def _add_monitor_parser(self, subparsers, func):
+        if not(self.helpers.is_baremetal() and self.helpers.is_linux()):
+            # This subparser is only applicable to Baremetal Linux
+            return
+
+        # Subparser help text
+        monitor_help = "Monitor metrics for target devices"
+        monitor_subcommand_help = "Monitor a target device for the specified arguments.\
+                                  \nIf no arguments are provided, all arguments will be enabled.\
+                                  \nUse the watch arguments to run continuously"
+        monitor_optionals_title = "Monitor Arguments"
+
+        # Help text for monitor arguments (this subparser is only added on Baremetal Linux)
+        power_usage_help = "Monitor power usage in Watts"
+        temperature_help = "Monitor temperature in Celsius"
+        gfx_util_help = "Monitor graphics utilization (%%) and clock (MHz)"
+        mem_util_help = "Monitor memory utilization (%%) and clock (MHz)"
+        encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)"
+        decoder_util_help = "Monitor decoder utilization (%%) and clock (MHz)"
+        throttle_help = "Monitor thermal throttle status"
+        ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts"
+        mem_usage_help = "Monitor memory usage in MB"
+        pcie_throughput_help = "Monitor PCIe Tx/Rx in MB/s"
+
+        # Create monitor subparser
+        monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help)
+        monitor_parser._optionals.title = monitor_optionals_title
+        monitor_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog)
+        monitor_parser.set_defaults(func=func)
+
+        # Add Universal Arguments
+        self._add_command_modifiers(monitor_parser)
+        self._add_device_arguments(monitor_parser, required=False)
+        self._add_watch_arguments(monitor_parser)
+
+        # Add monitor arguments
+        monitor_parser.add_argument('-p', '--power-usage', action='store_true', required=False, help=power_usage_help)
+        monitor_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
+        monitor_parser.add_argument('-u', '--gfx', action='store_true', required=False, help=gfx_util_help)
+        monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help)
+        monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help)
+        monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help)
+        monitor_parser.add_argument('-s', '--throttle-status', action='store_true', required=False, help=throttle_help)
+        monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
+        monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help)
+        monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_throughput_help)
+
+
     def _add_rocm_smi_parser(self, subparsers, func):
         return
         # Subparser help text

diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc
@@ -316,6 +316,9 @@ int main() {
                     cache_info.cache[i].cache_level,
                     cache_info.cache[i].cache_size_kb,
                     cache_info.cache[i].flags);
+                printf("\tMax number CU shared: %d, Number of instances: %d\n",
+                    cache_info.cache[i].max_num_cu_shared,
+                    cache_info.cache[i].num_cache_instance);
             }
 
             // Get power measure

diff --git a/example/amd_smi_nodrm_example.cc b/example/amd_smi_nodrm_example.cc
@@ -151,8 +151,7 @@ int main() {
             printf("\tVendorID: 0x%x\n", asic_info.vendor_id);
             printf("\tRevisionID: 0x%x\n", asic_info.rev_id);
             printf("\tAsic serial: 0x%s\n", asic_info.asic_serial);
-            printf("\tXGMI physical id: 0x%x\n\n",
-                            asic_info.xgmi_physical_id);
+            printf("\tOAM id: 0x%x\n\n", asic_info.oam_id);
 
             // Get VBIOS info
             amdsmi_vbios_info_t vbios_info = {};

diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
@@ -475,6 +475,8 @@ typedef struct {
     uint32_t cache_size_kb; /* In KB */
     uint32_t cache_level;
     uint32_t flags;  // amdsmi_cache_flags_type_t which is a bitmask
+    uint32_t max_num_cu_shared;  /* Indicates how many Compute Units share this cache instance */
+    uint32_t num_cache_instance;  /* Total number of instances of this cache type */
     uint32_t reserved[3];
   } cache[AMDSMI_MAX_CACHE_TYPES];
   uint32_t reserved[15];
@@ -498,7 +500,7 @@ typedef struct {
   uint64_t device_id;   //< The device id of a GPU
   uint32_t rev_id;
   char asic_serial[AMDSMI_NORMAL_STRING_LENGTH];
-  uint16_t xgmi_physical_id;   //< 0xFFFF if not supported
+  uint16_t oam_id;   //< 0xFFFF if not supported
   uint16_t reserved[37];
 } amdsmi_asic_info_t;
 

diff --git a/include/amd_smi/impl/amd_smi_system.h b/include/amd_smi/impl/amd_smi_system.h
@@ -99,7 +99,12 @@ class AMDSmiSystem {
 #endif
  private:
     AMDSmiSystem() : init_flag_(AMDSMI_INIT_AMD_GPUS) {}
-    amdsmi_status_t get_gpu_bdf_by_index(uint32_t index, std::string& bdf);
+
+    /* The GPU socket id is used to identify the socket, so that the XCDs
+    on the same physical device will be collected under the same socket.
+    The BD part of the BDF is used as the GPU socket id to represent a physical device.
+    */
+    amdsmi_status_t get_gpu_socket_id(uint32_t index, std::string& socketid);
     amdsmi_status_t populate_amd_gpu_devices();
     uint64_t init_flag_;
     AMDSmiDrm drm_;

diff --git a/py-interface/README.md b/py-interface/README.md
@@ -351,7 +351,7 @@ Field | Content
 `device_id` |  device id
 `rev_id` |  revision id
 `asic_serial` | asic serial
-`xgmi_physical_id` | xgmi physical id
+`oam_id` | oam id
 
 Exceptions that can be thrown by `amdsmi_get_gpu_asic_info` function:
 
@@ -375,7 +375,7 @@ try:
             print(hex(asic_info['device_id']))
             print(hex(asic_info['rev_id']))
             print(asic_info['asic_serial'])
-            print(asic_info['xgmi_physical_id'])
+            print(asic_info['oam_id'])
 except AmdSmiException as e:
     print(e)
 ```

diff --git a/py-interface/__init__.py b/py-interface/__init__.py
@@ -194,6 +194,14 @@
 from .amdsmi_interface import amdsmi_is_P2P_accessible
 from .amdsmi_interface import amdsmi_get_xgmi_info
 
+# # Partition Functions
+from .amdsmi_interface import amdsmi_get_gpu_compute_partition
+from .amdsmi_interface import amdsmi_set_gpu_compute_partition
+from .amdsmi_interface import amdsmi_reset_gpu_compute_partition
+from .amdsmi_interface import amdsmi_get_gpu_memory_partition
+from .amdsmi_interface import amdsmi_set_gpu_memory_partition
+from .amdsmi_interface import amdsmi_reset_gpu_memory_partition
+
 # # Individual GPU Metrics Functions
 from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_hotspot
 from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_mem
@@ -263,6 +271,8 @@
 from .amdsmi_interface import AmdSmiTemperatureMetric
 from .amdsmi_interface import AmdSmiVoltageMetric
 from .amdsmi_interface import AmdSmiVoltageType
+from .amdsmi_interface import AmdSmiComputePartitionType
+from .amdsmi_interface import AmdSmiMemoryPartitionType
 from .amdsmi_interface import AmdSmiPowerProfilePresetMasks
 from .amdsmi_interface import AmdSmiGpuBlock
 from .amdsmi_interface import AmdSmiRasErrState
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		* @marifamd @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan