From 5a4678e75615727903dadbf6e4fea0d19835a728 Mon Sep 17 00:00:00 2001 From: Nazarii Hnydyn Date: Mon, 20 Sep 2021 18:27:15 +0300 Subject: [PATCH] [202012][teammgrd]: Improve LAGs cleanup on shutdown (#1916) This PR is intended to fix the LAGs cleanup degradation caused by the python2.7 -> python3 migration. The approach is to replace the `teamd -k -t` call with a raw `SIGTERM` and add a PID alive check. This will make sure that `teammgrd` is stopped only after all managed processes have been killed. resolves: https://github.com/Azure/sonic-buildimage/issues/8071 **What I did** * Replaced the `teamd -k -t` call with a raw `SIGTERM` * Added a PID alive check **Why I did it** * To fix the LAGs cleanup timeout issue caused by the python2.7 -> python3 upgrade **How I verified it** 1. Configure 64 LAG RIFs 2. Reload config **Details if related** * N/A --- cfgmgr/teammgr.cpp | 45 ++++++++++++++++++++++++++++++++++++++++----- cfgmgr/teammgr.h | 2 -- cfgmgr/teammgrd.cpp | 5 +++-- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/cfgmgr/teammgr.cpp b/cfgmgr/teammgr.cpp index cdff7013a0..e9ca84cd3d 100644 --- a/cfgmgr/teammgr.cpp +++ b/cfgmgr/teammgr.cpp @@ -112,18 +112,53 @@ void TeamMgr::doTask(Consumer &consumer) } } - void TeamMgr::cleanTeamProcesses() { SWSS_LOG_ENTER(); SWSS_LOG_NOTICE("Cleaning up LAGs during shutdown..."); - for (const auto& it: m_lagList) + + std::unordered_map<std::string, pid_t> aliasPidMap; + + for (const auto& alias: m_lagList) + { + std::string res; + pid_t pid; + + { + std::stringstream cmd; + cmd << "cat " << shellquote("/var/run/teamd/" + alias + ".pid"); + EXEC_WITH_ERROR_THROW(cmd.str(), res); + + pid = static_cast<pid_t>(std::stoul(res, nullptr, 10)); + aliasPidMap[alias] = pid; + + SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid); + } + + { + std::stringstream cmd; + cmd << "kill -TERM " << pid; + EXEC_WITH_ERROR_THROW(cmd.str(), res); + + SWSS_LOG_INFO("Sent SIGTERM to port channel %s pid %d", alias.c_str(), pid); + } + } + + for (const auto& cit: aliasPidMap) { 
- //This will call team -k kill -t which internally send SIGTERM - removeLag(it); + const auto &alias = cit.first; + const auto &pid = cit.second; + + std::stringstream cmd; + std::string res; + + SWSS_LOG_NOTICE("Waiting for port channel %s pid %d to stop...", alias.c_str(), pid); + + cmd << "tail -f --pid=" << pid << " /dev/null"; + EXEC_WITH_ERROR_THROW(cmd.str(), res); } - return; + SWSS_LOG_NOTICE("LAGs cleanup is done"); } void TeamMgr::doLagTask(Consumer &consumer) diff --git a/cfgmgr/teammgr.h b/cfgmgr/teammgr.h index 0c0ff62579..cb7786a0eb 100644 --- a/cfgmgr/teammgr.h +++ b/cfgmgr/teammgr.h @@ -32,7 +32,6 @@ class TeamMgr : public Orch ProducerStateTable m_appLagTable; std::set<std::string> m_lagList; - std::map<std::string, pid_t> m_lagPIDList; MacAddress m_mac; @@ -49,7 +48,6 @@ class TeamMgr : public Orch bool setLagAdminStatus(const std::string &alias, const std::string &admin_status); bool setLagMtu(const std::string &alias, const std::string &mtu); bool setLagLearnMode(const std::string &alias, const std::string &learn_mode); - bool isPortEnslaved(const std::string &); bool findPortMaster(std::string &, const std::string &); diff --git a/cfgmgr/teammgrd.cpp b/cfgmgr/teammgrd.cpp index 1ff2ed760e..e38456eebe 100644 --- a/cfgmgr/teammgrd.cpp +++ b/cfgmgr/teammgrd.cpp @@ -66,7 +66,7 @@ int main(int argc, char **argv) } while (!received_sigterm) - { + { Selectable *sel; int ret; @@ -91,7 +91,8 @@ int main(int argc, char **argv) catch (const exception &e) { SWSS_LOG_ERROR("Runtime error: %s", e.what()); + return EXIT_FAILURE; } - return -1; + return EXIT_SUCCESS; }