From 087176e839c93ad0891b520c2e6d8972d5a7cc4d Mon Sep 17 00:00:00 2001 From: Stephen Sun <5379172+stephenxs@users.noreply.github.com> Date: Sun, 12 Feb 2023 17:23:47 +0800 Subject: [PATCH] [Mellanox] Advance hw-mgmt to v.7.0020.4104 (#13372) - Why I did it Advance hw-mgmt service to V.7.0020.4104 Add missing thermal sensors that are supported by hw-mgmt package Delay system health service until hw-mgmt has started on Mellanox platform in order to avoid reading some sensors before they are ready. Depends on sonic-net/sonic-linux-kernel#305 - How I did it 1. Update hw mgmt version 2. Add missing sensors 3. Delay service - How to verify it Regression test. Signed-off-by: Stephen Sun --- .../x86_64-mlnx_msn2010-r0/platform.json | 3 + .../x86_64-mlnx_msn2100-r0/platform.json | 3 + .../x86_64-mlnx_msn2410-r0/platform.json | 3 + .../x86_64-mlnx_msn2700-r0/platform.json | 3 + .../x86_64-nvidia_sn2201-r0/platform.json | 3 + platform/mellanox/hw-management.mk | 2 +- ...mine-reboot-cause-service-start-afte.patch | 10 ++-- ...02-Disable-hw-mgmt-on-SimX-platforms.patch | 58 ++++++++----------- ...on-upstream-kernel-modules-from-load.patch | 8 +-- ...th-service-starts-after-hw-managemen.patch | 29 ++++++++++ platform/mellanox/hw-management/hw-mgmt | 2 +- 11 files changed, 78 insertions(+), 46 deletions(-) create mode 100644 platform/mellanox/hw-management/0004-Make-system-health-service-starts-after-hw-managemen.patch diff --git a/device/mellanox/x86_64-mlnx_msn2010-r0/platform.json b/device/mellanox/x86_64-mlnx_msn2010-r0/platform.json index 1e8fe747f906..d48b4e8e8fc8 100644 --- a/device/mellanox/x86_64-mlnx_msn2010-r0/platform.json +++ b/device/mellanox/x86_64-mlnx_msn2010-r0/platform.json @@ -67,6 +67,9 @@ }, { "name": "CPU Core 3 Temp" + }, + { + "name": "SODIMM 1 Temp" } ], "sfps": [ diff --git a/device/mellanox/x86_64-mlnx_msn2100-r0/platform.json b/device/mellanox/x86_64-mlnx_msn2100-r0/platform.json index 74b4397759c1..485a3614b66b 100644 --- 
a/device/mellanox/x86_64-mlnx_msn2100-r0/platform.json +++ b/device/mellanox/x86_64-mlnx_msn2100-r0/platform.json @@ -67,6 +67,9 @@ }, { "name": "CPU Core 3 Temp" + }, + { + "name": "SODIMM 1 Temp" } ], "sfps": [ diff --git a/device/mellanox/x86_64-mlnx_msn2410-r0/platform.json b/device/mellanox/x86_64-mlnx_msn2410-r0/platform.json index 6ed5a3a84aa8..98a60a5e898f 100644 --- a/device/mellanox/x86_64-mlnx_msn2410-r0/platform.json +++ b/device/mellanox/x86_64-mlnx_msn2410-r0/platform.json @@ -114,6 +114,9 @@ }, { "name": "CPU Pack Temp" + }, + { + "name": "SODIMM 1 Temp" } ], "sfps": [ diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/platform.json b/device/mellanox/x86_64-mlnx_msn2700-r0/platform.json index 063a211e785c..22599926e3fa 100644 --- a/device/mellanox/x86_64-mlnx_msn2700-r0/platform.json +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/platform.json @@ -114,6 +114,9 @@ }, { "name": "CPU Pack Temp" + }, + { + "name": "SODIMM 1 Temp" } ], "sfps": [ diff --git a/device/mellanox/x86_64-nvidia_sn2201-r0/platform.json b/device/mellanox/x86_64-nvidia_sn2201-r0/platform.json index 83250c75f74e..b0ab14793910 100644 --- a/device/mellanox/x86_64-nvidia_sn2201-r0/platform.json +++ b/device/mellanox/x86_64-nvidia_sn2201-r0/platform.json @@ -101,6 +101,9 @@ }, { "name": "ASIC" + }, + { + "name": "SODIMM 1 Temp" } ], "sfps": [{ diff --git a/platform/mellanox/hw-management.mk b/platform/mellanox/hw-management.mk index 0c8c8c1b548d..a12bd9b32aca 100644 --- a/platform/mellanox/hw-management.mk +++ b/platform/mellanox/hw-management.mk @@ -16,7 +16,7 @@ # # Mellanox HW Management -MLNX_HW_MANAGEMENT_VERSION = 7.0020.3006 +MLNX_HW_MANAGEMENT_VERSION = 7.0020.4104 export MLNX_HW_MANAGEMENT_VERSION diff --git a/platform/mellanox/hw-management/0001-Make-SONiC-determine-reboot-cause-service-start-afte.patch b/platform/mellanox/hw-management/0001-Make-SONiC-determine-reboot-cause-service-start-afte.patch index 11539e606369..f7667ed6c4ea 100644 --- 
a/platform/mellanox/hw-management/0001-Make-SONiC-determine-reboot-cause-service-start-afte.patch +++ b/platform/mellanox/hw-management/0001-Make-SONiC-determine-reboot-cause-service-start-afte.patch @@ -1,8 +1,8 @@ -From 1a1011b6da491d35001df5a7204d4eecb2769767 Mon Sep 17 00:00:00 2001 +From 489764eb124e03087eb408dec27d769fa4f98459 Mon Sep 17 00:00:00 2001 From: keboliu Date: Fri, 15 Jan 2021 14:41:16 +0800 -Subject: [PATCH] Make SONiC determine-reboot-cause service start after hw-mgmt - service +Subject: [PATCH 1/4] Make SONiC determine-reboot-cause service start after + hw-mgmt service Signed-off-by: Kebo Liu --- @@ -10,7 +10,7 @@ Signed-off-by: Kebo Liu 1 file changed, 1 insertion(+) diff --git a/debian/hw-management.hw-management.service b/debian/hw-management.hw-management.service -index 39a2a54..2104b87 100755 +index 8bdcaef..1c25ffb 100755 --- a/debian/hw-management.hw-management.service +++ b/debian/hw-management.hw-management.service @@ -1,6 +1,7 @@ @@ -22,5 +22,5 @@ index 39a2a54..2104b87 100755 [Service] Type=oneshot -- -1.9.1 +2.20.1 diff --git a/platform/mellanox/hw-management/0002-Disable-hw-mgmt-on-SimX-platforms.patch b/platform/mellanox/hw-management/0002-Disable-hw-mgmt-on-SimX-platforms.patch index c12474712b9b..61aab18090a2 100644 --- a/platform/mellanox/hw-management/0002-Disable-hw-mgmt-on-SimX-platforms.patch +++ b/platform/mellanox/hw-management/0002-Disable-hw-mgmt-on-SimX-platforms.patch @@ -1,59 +1,47 @@ -From 79dadd5b0d2f5e860b525c12d4d3843607b03a9f Mon Sep 17 00:00:00 2001 +From 422b64397f2f33b394d037820f0ceb4c09e3a725 Mon Sep 17 00:00:00 2001 From: Alexander Allen Date: Fri, 21 Jan 2022 16:47:19 +0000 -Subject: [PATCH] Disable hw-mgmt on SimX platforms +Subject: [PATCH 2/4] Disable hw-mgmt on SimX platforms --- - usr/usr/bin/hw-management-ready.sh | 31 ++++++++++++++++-------------- + usr/usr/bin/hw-management-ready.sh | 11 +++++++---- usr/usr/bin/hw-management.sh | 9 +++++++++ - 2 files changed, 26 insertions(+), 14 deletions(-) + 2 
files changed, 16 insertions(+), 4 deletions(-) diff --git a/usr/usr/bin/hw-management-ready.sh b/usr/usr/bin/hw-management-ready.sh -index 5a9698c..364f906 100755 +index 88672a8..7558c68 100755 --- a/usr/usr/bin/hw-management-ready.sh +++ b/usr/usr/bin/hw-management-ready.sh -@@ -51,19 +51,22 @@ if [ -d /var/run/hw-management ]; then +@@ -51,17 +51,20 @@ if [ -d /var/run/hw-management ]; then rm -fr /var/run/hw-management fi -case $board_type in -VMOD0014) -- while [ ! -d /sys/devices/pci0000:00/0000:00:1f.0/NVSN2201:00/mlxreg-hotplug/hwmon ] -- do -- sleep 1 -- done -- ;; ++if [ -z "$(lspci -vvv | grep SimX)" ]; then ++ case $board_type in ++ VMOD0014) + if [ ! -d /sys/devices/pci0000:00/0000:00:1f.0/NVSN2201:00/mlxreg-hotplug/hwmon ]; then + timeout 180 bash -c 'until [ -d /sys/devices/pci0000:00/0000:00:1f.0/NVSN2201:00/mlxreg-hotplug/hwmon ]; do sleep 0.2; done' + fi + ;; -*) -- while [ ! -d /sys/devices/platform/mlxplat/mlxreg-hotplug/hwmon ] -- do -- sleep 1 -- done -- ;; ++ *) + if [ ! -d /sys/devices/platform/mlxplat/mlxreg-hotplug/hwmon ]; then + timeout 180 bash -c 'until [ -d /sys/devices/platform/mlxplat/mlxreg-hotplug/hwmon ]; do sleep 0.2; done' + fi + ;; -esac -+if [ -z "$(lspci -vvv | grep SimX)" ]; then -+ case $board_type in -+ VMOD0014) -+ while [ ! -d /sys/devices/pci0000:00/0000:00:1f.0/NVSN2201:00/mlxreg-hotplug/hwmon ] -+ do -+ sleep 1 -+ done -+ ;; -+ *) -+ while [ ! -d /sys/devices/platform/mlxplat/mlxreg-hotplug/hwmon ] -+ do -+ sleep 1 -+ done -+ ;; -+ esac ++ esac +fi + echo "Start Chassis HW management service." logger -t hw-management -p daemon.notice "Start Chassis HW management service." 
diff --git a/usr/usr/bin/hw-management.sh b/usr/usr/bin/hw-management.sh -index ebfabb0..c0c038e 100755 +index 1ee05b5..50d922b 100755 --- a/usr/usr/bin/hw-management.sh +++ b/usr/usr/bin/hw-management.sh -@@ -1495,6 +1495,13 @@ do_chip_down() +@@ -2310,6 +2310,13 @@ do_chip_down() /usr/bin/hw-management-thermal-events.sh change hotplug_asic down %S %p } @@ -67,7 +55,7 @@ index ebfabb0..c0c038e 100755 __usage=" Usage: $(basename "$0") [Options] -@@ -1520,6 +1527,8 @@ Options: +@@ -2335,6 +2342,8 @@ Options: force-reload Performs hw-management 'stop' and the 'start. " @@ -77,5 +65,5 @@ index ebfabb0..c0c038e 100755 start) if [ -d /var/run/hw-management ]; then -- -2.17.1 +2.20.1 diff --git a/platform/mellanox/hw-management/0003-Remove-unused-non-upstream-kernel-modules-from-load.patch b/platform/mellanox/hw-management/0003-Remove-unused-non-upstream-kernel-modules-from-load.patch index 496085e9d4d8..ba83bf6764ee 100644 --- a/platform/mellanox/hw-management/0003-Remove-unused-non-upstream-kernel-modules-from-load.patch +++ b/platform/mellanox/hw-management/0003-Remove-unused-non-upstream-kernel-modules-from-load.patch @@ -1,14 +1,14 @@ -From 14b06a12802fc0e15116a64f419d002d0d21d695 Mon Sep 17 00:00:00 2001 +From 439639e939f896f9aee42a4dbd5216feb728220c Mon Sep 17 00:00:00 2001 From: Alexander Allen Date: Thu, 17 Feb 2022 04:19:50 +0000 -Subject: [PATCH] Remove unused non-upstream kernel modules from load +Subject: [PATCH 3/4] Remove unused non-upstream kernel modules from load --- usr/etc/modules-load.d/05-hw-management-modules.conf | 2 -- 1 file changed, 2 deletions(-) diff --git a/usr/etc/modules-load.d/05-hw-management-modules.conf b/usr/etc/modules-load.d/05-hw-management-modules.conf -index 39f621e..c0980bc 100644 +index cfcfaa4..dd3b5ca 100644 --- a/usr/etc/modules-load.d/05-hw-management-modules.conf +++ b/usr/etc/modules-load.d/05-hw-management-modules.conf @@ -15,8 +15,6 @@ xdpe12284 @@ -21,5 +21,5 @@ index 39f621e..c0980bc 100644 gpio-pca953x pmbus -- 
-2.17.1 +2.20.1 diff --git a/platform/mellanox/hw-management/0004-Make-system-health-service-starts-after-hw-managemen.patch b/platform/mellanox/hw-management/0004-Make-system-health-service-starts-after-hw-managemen.patch new file mode 100644 index 000000000000..6af57c842d2d --- /dev/null +++ b/platform/mellanox/hw-management/0004-Make-system-health-service-starts-after-hw-managemen.patch @@ -0,0 +1,29 @@ +From 038bce6bf808ec9d082e96fec4184e060b3a85a9 Mon Sep 17 00:00:00 2001 +From: Stephen Sun +Date: Mon, 28 Nov 2022 03:55:14 +0000 +Subject: [PATCH 4/4] Make system-health service starts after hw-management to + avoid failures + +On SN2410, it can fail to read the file led_status_capability if it starts from ONIE + +Signed-off-by: Stephen Sun +--- + debian/hw-management.hw-management.service | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/debian/hw-management.hw-management.service b/debian/hw-management.hw-management.service +index 1c25ffb..0fbd877 100755 +--- a/debian/hw-management.hw-management.service ++++ b/debian/hw-management.hw-management.service +@@ -1,7 +1,7 @@ + [Unit] + Description=Chassis HW management service of Mellanox systems + Documentation=man:hw-management.service(8) +-Before=determine-reboot-cause.service ++Before=determine-reboot-cause.service system-health.service + + [Service] + Type=oneshot +-- +2.20.1 + diff --git a/platform/mellanox/hw-management/hw-mgmt b/platform/mellanox/hw-management/hw-mgmt index 137109ed15be..9c3f6b2d6ac4 160000 --- a/platform/mellanox/hw-management/hw-mgmt +++ b/platform/mellanox/hw-management/hw-mgmt @@ -1 +1 @@ -Subproject commit 137109ed15be147a99b59ceb9dfa1799688dfb71 +Subproject commit 9c3f6b2d6ac4be2eda936522f369ff8920deb4e2