From 894046e199ff003aeb2307dc78b9d2a9e1242b40 Mon Sep 17 00:00:00 2001 From: Deepak Singhal <115033986+deepak-singhal0408@users.noreply.github.com> Date: Mon, 20 Nov 2023 14:38:20 -0800 Subject: [PATCH 1/2] Disable systemd auto-restart of dependent services for spineRouters (#17203) Currently hostcfgd script overrides the systemd service files of the features depending upon auto_restart enable/disable. I am skipping dependent features(syncd, gbsyncd for now) to have "RESTART=Always" for them to not start immediately, and instead get started by SWSS through swss.sh script. The issue of syncd double stop is also applicable to pizza box platforms, however no traffic impact is seen there, whereas on VOQ chassis, we do see traffic impact due to early start of syncd service. --- src/sonic-host-services/scripts/hostcfgd | 15 ++- .../tests/hostcfgd/hostcfgd_test.py | 16 ++- .../tests/hostcfgd/test_vectors.py | 111 ++++++++++++++++-- 3 files changed, 123 insertions(+), 19 deletions(-) diff --git a/src/sonic-host-services/scripts/hostcfgd b/src/sonic-host-services/scripts/hostcfgd index 3c06bd7ba4f4..e7c5c9ebe3fd 100755 --- a/src/sonic-host-services/scripts/hostcfgd +++ b/src/sonic-host-services/scripts/hostcfgd @@ -364,7 +364,20 @@ class FeatureHandler(object): Returns: None. """ - restart_field_str = "always" if "enabled" in feature_config.auto_restart else "no" + # As per the current code(due to various dependencies) SWSS service stop/start also stops/starts the dependent services(syncd, teamd, bgpd etc) + # There is an issue seen of syncd service getting stopped twice upon a critical process crash in syncd service due to above reason. + # Also early start of syncd service has traffic impact on VOQ chassis. + # to fix the above issue, we are disabling the auto restart of syncd service as it will be started by swss service. + # This change can be extended to other dependent services as well in future and also on pizza box platforms. + + device_type = self._device_config.get('DEVICE_METADATA', {}).get('localhost', {}).get('type') + is_dependent_service = feature_config.name in ['syncd', 'gbsyncd'] + if device_type == 'SpineRouter' and is_dependent_service: + syslog.syslog(syslog.LOG_INFO, "Skipped setting Restart field in systemd for {}".format(feature_config.name)) + restart_field_str = "no" + else: + restart_field_str = "always" if "enabled" in feature_config.auto_restart else "no" + feature_systemd_config = "[Service]\nRestart={}\n".format(restart_field_str) feature_names, feature_suffixes = self.get_multiasic_feature_instances(feature_config) diff --git a/src/sonic-host-services/tests/hostcfgd/hostcfgd_test.py b/src/sonic-host-services/tests/hostcfgd/hostcfgd_test.py index 16d437de0c2d..268cbe361e02 100644 --- a/src/sonic-host-services/tests/hostcfgd/hostcfgd_test.py +++ b/src/sonic-host-services/tests/hostcfgd/hostcfgd_test.py @@ -45,7 +45,7 @@ def checks_config_table(self, feature_table, expected_table): return True if not ddiff else False - def checks_systemd_config_file(self, feature_table, feature_systemd_name_map=None): + def checks_systemd_config_file(self, device_type, feature_table, feature_systemd_name_map=None): """Checks whether the systemd configuration file of each feature was created or not and whether the `Restart=` field in the file is set correctly or not. @@ -62,6 +62,7 @@ def checks_systemd_config_file(self, feature_table, feature_systemd_name_map=Non 'auto_restart.conf') for feature_name in feature_table: + is_dependent_feature = True if feature_name in ['syncd', 'gbsyncd'] else False auto_restart_status = feature_table[feature_name].get('auto_restart', 'disabled') if "enabled" in auto_restart_status: auto_restart_status = "enabled" @@ -77,7 +78,10 @@ def checks_systemd_config_file(self, feature_table, feature_systemd_name_map=Non with open(feature_systemd_config_file_path) as systemd_config_file: status = systemd_config_file.read().strip() - assert status == '[Service]\nRestart={}'.format(truth_table[auto_restart_status]) + if device_type == 'SpineRouter' and is_dependent_feature: + assert status == '[Service]\nRestart=no' + else: + assert status == '[Service]\nRestart={}'.format(truth_table[auto_restart_status]) def get_state_db_set_calls(self, feature_table): """Returns a Mock call objects which recorded the `set` calls to `FEATURE` table in `STATE_DB`. @@ -134,6 +138,7 @@ def test_sync_state_field(self, test_scenario_name, config_data, fs): device_config = {} device_config['DEVICE_METADATA'] = MockConfigDb.CONFIG_DB['DEVICE_METADATA'] device_config.update(config_data['device_runtime_metadata']) + device_type = MockConfigDb.CONFIG_DB['DEVICE_METADATA']['localhost']['type'] feature_handler = hostcfgd.FeatureHandler(MockConfigDb(), feature_state_table_mock, device_config) feature_table = MockConfigDb.CONFIG_DB['FEATURE'] @@ -158,13 +163,13 @@ def test_sync_state_field(self, test_scenario_name, config_data, fs): feature_table_state_db_calls = self.get_state_db_set_calls(feature_table) - self.checks_systemd_config_file(config_data['config_db']['FEATURE'], feature_systemd_name_map) + self.checks_systemd_config_file(device_type, config_data['config_db']['FEATURE'], feature_systemd_name_map) mocked_subprocess.check_call.assert_has_calls(config_data['enable_feature_subprocess_calls'], any_order=True) mocked_subprocess.check_call.assert_has_calls(config_data['daemon_reload_subprocess_call'], any_order=True) feature_state_table_mock.set.assert_has_calls(feature_table_state_db_calls) - self.checks_systemd_config_file(config_data['config_db']['FEATURE'], feature_systemd_name_map) + self.checks_systemd_config_file(device_type, config_data['config_db']['FEATURE'], feature_systemd_name_map) @parameterized.expand(HOSTCFGD_TEST_VECTOR) @patchfs @@ -196,6 +201,7 @@ def test_handler(self, test_scenario_name, config_data, fs): device_config = {} device_config['DEVICE_METADATA'] = MockConfigDb.CONFIG_DB['DEVICE_METADATA'] device_config.update(config_data['device_runtime_metadata']) + device_type = MockConfigDb.CONFIG_DB['DEVICE_METADATA']['localhost']['type'] feature_handler = hostcfgd.FeatureHandler(MockConfigDb(), feature_state_table_mock, device_config) feature_table = MockConfigDb.CONFIG_DB['FEATURE'] @@ -207,7 +213,7 @@ def test_handler(self, test_scenario_name, config_data, fs): feature_names, _ = feature_handler.get_multiasic_feature_instances(feature) feature_systemd_name_map[feature_name] = feature_names - self.checks_systemd_config_file(config_data['config_db']['FEATURE'], feature_systemd_name_map) + self.checks_systemd_config_file(device_type, config_data['config_db']['FEATURE'], feature_systemd_name_map) mocked_subprocess.check_call.assert_has_calls(config_data['enable_feature_subprocess_calls'], any_order=True) mocked_subprocess.check_call.assert_has_calls(config_data['daemon_reload_subprocess_call'], diff --git a/src/sonic-host-services/tests/hostcfgd/test_vectors.py b/src/sonic-host-services/tests/hostcfgd/test_vectors.py index 43c7efedcd38..c083666f1db7 100644 --- a/src/sonic-host-services/tests/hostcfgd/test_vectors.py +++ b/src/sonic-host-services/tests/hostcfgd/test_vectors.py @@ -580,8 +580,14 @@ "auto_restart": "enabled", "high_mem_alert": "disabled" }, - - + "syncd": { + "state": "enabled", + "has_timer": "False", + "has_global_scope": "True", + "has_per_asic_scope": "True", + "auto_restart": "enabled", + "high_mem_alert": "disabled" + }, }, }, "expected_config_db": { @@ -610,12 +616,23 @@ "high_mem_alert": "disabled", "state": "enabled" }, + "syncd": { + "auto_restart": "enabled", + "has_global_scope": "True", + "has_per_asic_scope": "True", + "has_timer": "False", + "high_mem_alert": "disabled", + "state": "enabled" + }, }, }, "enable_feature_subprocess_calls": [ call("sudo systemctl stop bgp.service", shell=True), call("sudo systemctl disable bgp.service", shell=True), call("sudo systemctl mask bgp.service", shell=True), + call("sudo systemctl start syncd.service", shell=True), + call("sudo systemctl enable syncd.service", shell=True), + call("sudo systemctl unmask syncd.service", shell=True), ], "daemon_reload_subprocess_call": [ call("sudo systemctl daemon-reload", shell=True), @@ -675,8 +692,14 @@ "auto_restart": "enabled", "high_mem_alert": "disabled" }, - - + "syncd": { + "state": "enabled", + "has_timer": "False", + "has_global_scope": "True", + "has_per_asic_scope": "True", + "auto_restart": "enabled", + "high_mem_alert": "disabled" + }, }, }, "expected_config_db": { @@ -705,12 +728,23 @@ "high_mem_alert": "disabled", "state": "enabled" }, + "syncd": { + "auto_restart": "enabled", + "has_global_scope": "True", + "has_per_asic_scope": "True", + "has_timer": "False", + "high_mem_alert": "disabled", + "state": "enabled" + }, }, }, "enable_feature_subprocess_calls": [ call("sudo systemctl stop bgp.service", shell=True), call("sudo systemctl disable bgp.service", shell=True), call("sudo systemctl mask bgp.service", shell=True), + call("sudo systemctl start syncd.service", shell=True), + call("sudo systemctl enable syncd.service", shell=True), + call("sudo systemctl unmask syncd.service", shell=True), ], "daemon_reload_subprocess_call": [ call("sudo systemctl daemon-reload", shell=True), @@ -770,8 +804,14 @@ "auto_restart": "enabled", "high_mem_alert": "disabled" }, - - + "syncd": { + "state": "enabled", + "has_timer": "False", + "has_global_scope": "True", + "has_per_asic_scope": "True", + "auto_restart": "enabled", + "high_mem_alert": "disabled" + }, }, }, "expected_config_db": { @@ -800,6 +840,14 @@ "high_mem_alert": "disabled", "state": "enabled" }, + "syncd": { + "auto_restart": "enabled", + "has_global_scope": "True", + "has_per_asic_scope": "True", + "has_timer": "False", + "high_mem_alert": "disabled", + "state": "enabled" + }, }, }, "enable_feature_subprocess_calls": [ @@ -809,7 +857,9 @@ call("sudo systemctl start teamd.service", shell=True), call("sudo systemctl enable teamd.service", shell=True), call("sudo systemctl unmask teamd.service", shell=True), - + call("sudo systemctl start syncd.service", shell=True), + call("sudo systemctl enable syncd.service", shell=True), + call("sudo systemctl unmask syncd.service", shell=True), ], "daemon_reload_subprocess_call": [ call("sudo systemctl daemon-reload", shell=True), @@ -869,8 +919,14 @@ "auto_restart": "enabled", "high_mem_alert": "disabled" }, - - + "syncd": { + "state": "enabled", + "has_timer": "False", + "has_global_scope": "True", + "has_per_asic_scope": "True", + "auto_restart": "enabled", + "high_mem_alert": "disabled" + }, }, }, "expected_config_db": { @@ -899,6 +955,14 @@ "high_mem_alert": "disabled", "state": "enabled" }, + "syncd": { + "auto_restart": "enabled", + "has_global_scope": "True", + "has_per_asic_scope": "True", + "has_timer": "False", + "high_mem_alert": "disabled", + "state": "enabled" + }, }, }, "enable_feature_subprocess_calls": [ @@ -908,7 +972,9 @@ call("sudo systemctl start teamd.service", shell=True), call("sudo systemctl enable teamd.service", shell=True), call("sudo systemctl unmask teamd.service", shell=True), - + call("sudo systemctl start syncd.service", shell=True), + call("sudo systemctl enable syncd.service", shell=True), + call("sudo systemctl unmask syncd.service", shell=True), ], "daemon_reload_subprocess_call": [ call("sudo systemctl daemon-reload", shell=True), @@ -969,8 +1035,14 @@ "auto_restart": "enabled", "high_mem_alert": "disabled" }, - - + "syncd": { + "state": "enabled", + "has_timer": "False", + "has_global_scope": "False", + "has_per_asic_scope": "True", + "auto_restart": "enabled", + "high_mem_alert": "disabled" + }, }, }, "expected_config_db": { @@ -999,6 +1071,14 @@ "high_mem_alert": "disabled", "state": "enabled" }, + "syncd": { + "auto_restart": "enabled", + "has_global_scope": "False", + "has_per_asic_scope": "True", + "has_timer": "False", + "high_mem_alert": "disabled", + "state": "enabled" + }, }, }, "enable_feature_subprocess_calls": [ @@ -1020,7 +1100,12 @@ call("sudo systemctl stop lldp@1.service", shell=True), call("sudo systemctl disable lldp@1.service", shell=True), call("sudo systemctl mask lldp@1.service", shell=True), - + call("sudo systemctl start syncd@0.service", shell=True), + call("sudo systemctl enable syncd@0.service", shell=True), + call("sudo systemctl unmask syncd@0.service", shell=True), + call("sudo systemctl start syncd@1.service", shell=True), + call("sudo systemctl enable syncd@1.service", shell=True), + call("sudo systemctl unmask syncd@1.service", shell=True), ], "daemon_reload_subprocess_call": [ call("sudo systemctl daemon-reload", shell=True), From 0d4710ec07f2e42cf641c0304e01b010c7593f54 Mon Sep 17 00:00:00 2001 From: Junhua Zhai Date: Thu, 26 Oct 2023 02:58:50 +0800 Subject: [PATCH 2/2] [gearbox] use credo sai v0.9.3 (#16860) Update credo sai package to the latest v0.9.3, which fixes the issue aristanetworks/sonic#92. --- platform/components/docker-gbsyncd-credo.mk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/platform/components/docker-gbsyncd-credo.mk b/platform/components/docker-gbsyncd-credo.mk index 34f7101759eb..2181add0f9c3 100644 --- a/platform/components/docker-gbsyncd-credo.mk +++ b/platform/components/docker-gbsyncd-credo.mk @@ -1,9 +1,9 @@ DOCKER_GBSYNCD_PLATFORM_CODE = credo -LIBSAI_CREDO = libsaicredo_0.8.2_amd64.deb -$(LIBSAI_CREDO)_URL = "https://sonicstorage.blob.core.windows.net/packages/credosai/libsaicredo_0.8.2_amd64.deb?sv=2021-04-10&st=2023-01-31T04%3A24%3A23Z&se=2100-01-31T04%3A24%3A00Z&sr=b&sp=r&sig=RZPbmaIetvDRtwifrVT4s%2FaQxB%2FBTOyCqXtMtoNRjmY%3D" -LIBSAI_CREDO_OWL = libsaicredo-owl_0.8.2_amd64.deb -$(LIBSAI_CREDO_OWL)_URL = "https://sonicstorage.blob.core.windows.net/packages/credosai/libsaicredo-owl_0.8.2_amd64.deb?sv=2021-04-10&st=2023-01-31T04%3A25%3A43Z&se=2100-01-31T04%3A25%3A00Z&sr=b&sp=r&sig=%2BdSFujwy0gY%2FiH50Ffi%2FsqZOAHBOFPUcBdR06fHEZkI%3D" +LIBSAI_CREDO = libsaicredo_0.9.3_amd64.deb +$(LIBSAI_CREDO)_URL = "https://sonicstorage.blob.core.windows.net/packages/credosai/libsaicredo_0.9.3_amd64.deb?sv=2021-04-10&st=2023-10-12T02%3A21%3A05Z&se=2031-10-13T02%3A21%3A00Z&sr=b&sp=r&sig=UXC%2FYKm%2BvHRjGmM3xjnFMQzY%2BMpxhKtMxNHQPdwvtN8%3D" +LIBSAI_CREDO_OWL = libsaicredo-owl_0.9.3_amd64.deb +$(LIBSAI_CREDO_OWL)_URL = "https://sonicstorage.blob.core.windows.net/packages/credosai/libsaicredo-owl_0.9.3_amd64.deb?sv=2021-04-10&st=2023-10-12T02%3A21%3A51Z&se=2031-10-13T02%3A21%3A00Z&sr=b&sp=r&sig=olu3%2Bq5eJYRtXCygJWgKUx%2FdlrlB%2FWE0i9ruftYdB7g%3D" ifneq ($($(LIBSAI_CREDO)_URL),) include $(PLATFORM_PATH)/../template/docker-gbsyncd-base.mk