From 3eb462f35d02ccaac0ed643fd5539911a94f5f77 Mon Sep 17 00:00:00 2001
From: Hua Liu <58683130+liuh-80@users.noreply.github.com>
Date: Wed, 19 Jun 2024 09:39:41 +0800
Subject: [PATCH] Improve load_minigraph to wait eth0 restart before exit
 (#3365) (#3371)

Improve load_minigraph to wait for the eth0 restart to finish before exiting.
This is a cherry-pick of PR #3365.
---
 config/main.py       | 37 +++++++++++++++++++++++++++++++++++++
 tests/config_test.py |  9 +++++++--
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/config/main.py b/config/main.py
index 34d95a58b3..dd87abc314 100644
--- a/config/main.py
+++ b/config/main.py
@@ -891,10 +891,47 @@ def _reset_failed_services():
     for service in _get_sonic_services():
         clicommon.run_command(['systemctl', 'reset-failed', str(service)])
 
+
+def get_service_finish_timestamp(service):
+    out, _ = clicommon.run_command(['sudo',
+                                    'systemctl',
+                                    'show',
+                                    '--no-pager',
+                                    service,
+                                    '-p',
+                                    'ExecMainExitTimestamp',
+                                    '--value'],
+                                   return_cmd=True)
+    return out.strip(' \t\n\r')
+
+
+def wait_service_restart_finish(service, last_timestamp, timeout=30):
+    start_time = time.time()
+    elapsed_time = 0
+    while elapsed_time < timeout:
+        current_timestamp = get_service_finish_timestamp(service)
+        if current_timestamp and (current_timestamp != last_timestamp):
+            return
+
+        time.sleep(1)
+        elapsed_time = time.time() - start_time
+
+    log.log_warning("Service: {} does not restart in {} seconds, stop waiting".format(service, timeout))
+
+
 def _restart_services():
+    last_interface_config_timestamp = get_service_finish_timestamp('interfaces-config')
+    last_networking_timestamp = get_service_finish_timestamp('networking')
+
     click.echo("Restarting SONiC target ...")
     clicommon.run_command(['sudo', 'systemctl', 'restart', 'sonic.target'])
 
+    # These services restart eth0, which makes the device lose network connectivity for 10 seconds.
+    # When TACACS is enabled, every remote user command is authorized by the TACACS service over the network.
+    # If load_minigraph exits before the eth0 restart finishes, commands run right after it may fail.
+    wait_service_restart_finish('interfaces-config', last_interface_config_timestamp)
+    wait_service_restart_finish('networking', last_networking_timestamp)
+
     try:
         subprocess.check_call(['sudo', 'monit', 'status'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
         click.echo("Enabling container monitoring ...")
diff --git a/tests/config_test.py b/tests/config_test.py
index cc0ac22e98..d19511a71b 100644
--- a/tests/config_test.py
+++ b/tests/config_test.py
@@ -1,3 +1,4 @@
+import datetime
 import pytest
 import filecmp
 import importlib
@@ -186,6 +187,10 @@ def mock_run_command_side_effect(*args, **kwargs):
             return 'enabled', 0
         elif command == 'cat /var/run/dhclient.eth0.pid':
             return '101', 0
+        elif command == 'sudo systemctl show --no-pager interfaces-config -p ExecMainExitTimestamp --value':
+            return f'{datetime.datetime.now()}', 0
+        elif command == 'sudo systemctl show --no-pager networking -p ExecMainExitTimestamp --value':
+            return f'{datetime.datetime.now()}', 0
         else:
             return '', 0
 
@@ -413,7 +418,7 @@ def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic):
             assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_command_output
             # Verify "systemctl reset-failed" is called for services under sonic.target
             mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss'])
-            assert mock_run_command.call_count == 8
+            assert mock_run_command.call_count == 12
 
     @mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', mock.MagicMock(return_value=(load_minigraph_platform_path, None)))
     def test_load_minigraph_platform_plugin(self, get_cmd_module, setup_single_broadcom_asic):
@@ -428,7 +433,7 @@ def test_load_minigraph_platform_plugin(self, get_cmd_module, setup_single_broad
             assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_platform_plugin_command_output
             # Verify "systemctl reset-failed" is called for services under sonic.target
             mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss'])
-            assert mock_run_command.call_count == 8
+            assert mock_run_command.call_count == 12
 
     @mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', mock.MagicMock(return_value=(load_minigraph_platform_false_path, None)))
     def test_load_minigraph_platform_plugin_fail(self, get_cmd_module, setup_single_broadcom_asic):