From 8b3bc18e7c187a45df2040ddd7e0b2db2509de12 Mon Sep 17 00:00:00 2001 From: Stepan Blyshchak <38952541+stepanblyschak@users.noreply.github.com> Date: Sun, 28 Feb 2021 22:54:15 -0800 Subject: [PATCH] [reload] Improve reload by using sonic.target. (#1199) - What I did To remove the list of hardcoded order-dependent lists of services to stop/restart/reset-failed. - How I did it Used sonic.target to stop/restart/reset-failed. - How to verify it Execute config reload and observe the services do restart. Signed-off-by: Stepan Blyshchak --- config/main.py | 176 +++++++------------------------------------ tests/config_test.py | 60 ++------------- 2 files changed, 33 insertions(+), 203 deletions(-) diff --git a/config/main.py b/config/main.py index eb17ad350454..10035cdc3f03 100644 --- a/config/main.py +++ b/config/main.py @@ -9,7 +9,6 @@ import re import subprocess import sys -import threading import time from socket import AF_INET, AF_INET6 @@ -63,10 +62,6 @@ INIT_CFG_FILE = '/etc/sonic/init_cfg.json' -SYSTEMCTL_ACTION_STOP="stop" -SYSTEMCTL_ACTION_RESTART="restart" -SYSTEMCTL_ACTION_RESET_FAILED="reset-failed" - DEFAULT_NAMESPACE = '' CFG_LOOPBACK_PREFIX = "Loopback" CFG_LOOPBACK_PREFIX_LEN = len(CFG_LOOPBACK_PREFIX) @@ -224,54 +219,6 @@ def breakout_Ports(cm, delPorts=list(), portJson=dict(), force=False, \ # Helper functions # -# Execute action per NPU instance for multi instance services. -def execute_systemctl_per_asic_instance(inst, event, service, action): - try: - click.echo("Executing {} of service {}@{}...".format(action, service, inst)) - clicommon.run_command("systemctl {} {}@{}.service".format(action, service, inst)) - except SystemExit as e: - log.log_error("Failed to execute {} of service {}@{} with error {}".format(action, service, inst, e)) - # Set the event object if there is a failure and exception was raised. - event.set() - -# Execute action on list of systemd services -def execute_systemctl(list_of_services, action): - num_asic = multi_asic.get_num_asics() - generated_services_list, generated_multi_instance_services = _get_sonic_generated_services(num_asic) - if ((generated_services_list == []) and - (generated_multi_instance_services == [])): - log.log_error("Failed to get generated services") - return - - for service in list_of_services: - if (service + '.service' in generated_services_list): - try: - click.echo("Executing {} of service {}...".format(action, service)) - clicommon.run_command("systemctl {} {}".format(action, service)) - except SystemExit as e: - log.log_error("Failed to execute {} of service {} with error {}".format(action, service, e)) - raise - - if (service + '.service' in generated_multi_instance_services): - # With Multi NPU, Start a thread per instance to do the "action" on multi instance services. - if multi_asic.is_multi_asic(): - threads = [] - # Use this event object to co-ordinate if any threads raised exception - e = threading.Event() - - kwargs = {'service': service, 'action': action} - for inst in range(num_asic): - t = threading.Thread(target=execute_systemctl_per_asic_instance, args=(inst, e), kwargs=kwargs) - threads.append(t) - t.start() - - # Wait for all the threads to finish. - for inst in range(num_asic): - threads[inst].join() - - # Check if any of the threads have raised exception, if so exit the process. - if e.is_set(): - sys.exit(1) def _get_device_type(): """ @@ -720,97 +667,26 @@ def _get_disabled_services_list(config_db): return disabled_services_list -def _stop_services(config_db): - # This list is order-dependent. Please add services in the order they should be stopped - # on Mellanox platform pmon is stopped by syncd - services_to_stop = [ - 'telemetry', - 'restapi', - 'swss', - 'lldp', - 'pmon', - 'bgp', - 'hostcfgd', - 'nat' - ] - - if asic_type == 'mellanox' and 'pmon' in services_to_stop: - services_to_stop.remove('pmon') - - disabled_services = _get_disabled_services_list(config_db) - - for service in disabled_services: - if service in services_to_stop: - services_to_stop.remove(service) - - execute_systemctl(services_to_stop, SYSTEMCTL_ACTION_STOP) - - -def _reset_failed_services(config_db): - # This list is order-independent. Please keep list in alphabetical order - services_to_reset = [ - 'bgp', - 'dhcp_relay', - 'hostcfgd', - 'hostname-config', - 'interfaces-config', - 'lldp', - 'mux', - 'nat', - 'ntp-config', - 'pmon', - 'radv', - 'restapi', - 'rsyslog-config', - 'sflow', - 'snmp', - 'swss', - 'syncd', - 'teamd', - 'telemetry', - 'macsec', - ] - - disabled_services = _get_disabled_services_list(config_db) - - for service in disabled_services: - if service in services_to_reset: - services_to_reset.remove(service) - - execute_systemctl(services_to_reset, SYSTEMCTL_ACTION_RESET_FAILED) - - -def _restart_services(config_db): - # This list is order-dependent. Please add services in the order they should be started - # on Mellanox platform pmon is started by syncd - services_to_restart = [ - 'hostname-config', - 'interfaces-config', - 'ntp-config', - 'rsyslog-config', - 'swss', - 'mux', - 'bgp', - 'pmon', - 'lldp', - 'hostcfgd', - 'nat', - 'sflow', - 'restapi', - 'telemetry', - 'macsec', - ] - - disabled_services = _get_disabled_services_list(config_db) - - for service in disabled_services: - if service in services_to_restart: - services_to_restart.remove(service) - - if asic_type == 'mellanox' and 'pmon' in services_to_restart: - services_to_restart.remove('pmon') - - execute_systemctl(services_to_restart, SYSTEMCTL_ACTION_RESTART) + +def _stop_services(): + click.echo("Stopping SONiC target ...") + clicommon.run_command("sudo systemctl stop sonic.target") + + +def _get_sonic_services(): + out = clicommon.run_command("systemctl list-dependencies --plain sonic.target | sed '1d'", return_cmd=True) + return [unit.strip() for unit in out.splitlines()] + + +def _reset_failed_services(): + for service in _get_sonic_services(): + click.echo("Resetting failed status on {}".format(service)) + clicommon.run_command("systemctl reset-failed {}".format(service)) + + +def _restart_services(): + click.echo("Restarting SONiC target ...") + clicommon.run_command("sudo systemctl restart sonic.target") # Reload Monit configuration to pick up new hostname in case it changed click.echo("Reloading Monit configuration ...") @@ -1115,7 +991,7 @@ def reload(db, filename, yes, load_sysinfo, no_service_restart): #Stop services before config push if not no_service_restart: log.log_info("'reload' stopping services...") - _stop_services(db.cfgdb) + _stop_services() # In Single ASIC platforms we have single DB service. In multi-ASIC platforms we have a global DB # service running in the host + DB services running in each ASIC namespace created per ASIC. @@ -1186,9 +1062,9 @@ def reload(db, filename, yes, load_sysinfo, no_service_restart): # We first run "systemctl reset-failed" to remove the "failed" # status from all services before we attempt to restart them if not no_service_restart: - _reset_failed_services(db.cfgdb) + _reset_failed_services() log.log_info("'reload' restarting services...") - _restart_services(db.cfgdb) + _restart_services() @config.command("load_mgmt_config") @click.option('-y', '--yes', is_flag=True, callback=_abort_if_false, @@ -1227,7 +1103,7 @@ def load_minigraph(db, no_service_restart): #Stop services before config push if not no_service_restart: log.log_info("'load_minigraph' stopping services...") - _stop_services(db.cfgdb) + _stop_services() # For Single Asic platform the namespace list has the empty string # for mulit Asic platform the empty string to generate the config @@ -1283,10 +1159,10 @@ def load_minigraph(db, no_service_restart): # We first run "systemctl reset-failed" to remove the "failed" # status from all services before we attempt to restart them if not no_service_restart: - _reset_failed_services(db.cfgdb) + _reset_failed_services() #FIXME: After config DB daemon is implemented, we'll no longer need to restart every service. log.log_info("'load_minigraph' restarting services...") - _restart_services(db.cfgdb) + _restart_services() click.echo("Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`.") diff --git a/tests/config_test.py b/tests/config_test.py index 30a1f10ec8f5..89d8313d53a8 100644 --- a/tests/config_test.py +++ b/tests/config_test.py @@ -12,43 +12,11 @@ from utilities_common.db import Db load_minigraph_command_output="""\ -Executing stop of service telemetry... -Executing stop of service swss... -Executing stop of service lldp... -Executing stop of service pmon... -Executing stop of service bgp... -Executing stop of service hostcfgd... -Executing stop of service nat... +Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db Running command: pfcwd start_default Running command: config qos reload --no-dynamic-buffer -Executing reset-failed of service bgp... -Executing reset-failed of service dhcp_relay... -Executing reset-failed of service hostcfgd... -Executing reset-failed of service hostname-config... -Executing reset-failed of service interfaces-config... -Executing reset-failed of service lldp... -Executing reset-failed of service nat... -Executing reset-failed of service ntp-config... -Executing reset-failed of service pmon... -Executing reset-failed of service radv... -Executing reset-failed of service rsyslog-config... -Executing reset-failed of service snmp... -Executing reset-failed of service swss... -Executing reset-failed of service syncd... -Executing reset-failed of service teamd... -Executing reset-failed of service telemetry... -Executing restart of service hostname-config... -Executing restart of service interfaces-config... -Executing restart of service ntp-config... -Executing restart of service rsyslog-config... -Executing restart of service swss... -Executing restart of service bgp... -Executing restart of service pmon... -Executing restart of service lldp... -Executing restart of service hostcfgd... -Executing restart of service nat... -Executing restart of service telemetry... +Restarting SONiC target ... Reloading Monit configuration ... Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`. """ @@ -56,9 +24,12 @@ def mock_run_command_side_effect(*args, **kwargs): command = args[0] - if 'display_cmd' in kwargs and kwargs['display_cmd'] == True: + if kwargs.get('display_cmd'): click.echo(click.style("Running command: ", fg='cyan') + click.style(command, fg='green')) + if kwargs.get('return_cmd'): + return '' + class TestLoadMinigraph(object): @classmethod @@ -78,24 +49,7 @@ def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic): traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_command_output - assert mock_run_command.call_count == 38 - - def test_load_minigraph_with_disabled_telemetry(self, get_cmd_module, setup_single_broadcom_asic): - with mock.patch("utilities_common.cli.run_command", mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command: - (config, show) = get_cmd_module - db = Db() - runner = CliRunner() - result = runner.invoke(config.config.commands["feature"].commands["state"], ["telemetry", "disabled"], obj=db) - assert result.exit_code == 0 - result = runner.invoke(show.cli.commands["feature"].commands["status"], ["telemetry"], obj=db) - print(result.output) - assert result.exit_code == 0 - result = runner.invoke(config.config.commands["load_minigraph"], ["-y"], obj=db) - print(result.exit_code) - print(result.output) - assert result.exit_code == 0 - assert "telemetry" not in result.output - assert mock_run_command.call_count == 35 + assert mock_run_command.call_count == 7 @classmethod def teardown_class(cls):