diff --git a/fdbsyncd/fdbsync.cpp b/fdbsyncd/fdbsync.cpp index 9a88e557bf..30c9e2d54c 100644 --- a/fdbsyncd/fdbsync.cpp +++ b/fdbsyncd/fdbsync.cpp @@ -43,6 +43,36 @@ FdbSync::~FdbSync() } } + +// Check if interface entries are restored in kernel +bool FdbSync::isIntfRestoreDone() +{ + vector required_modules = { + "vxlanmgrd", + "intfmgrd", + "vlanmgrd", + "vrfmgrd" + }; + + for (string& module : required_modules) + { + WarmStart::WarmStartState state; + + WarmStart::getWarmStartState(module, state); + if (state == WarmStart::REPLAYED || state == WarmStart::RECONCILED) + { + SWSS_LOG_INFO("Module %s Replayed or Reconciled %d",module.c_str(), (int) state); + } + else + { + SWSS_LOG_INFO("Module %s NOT Replayed or Reconciled %d",module.c_str(), (int) state); + return false; + } + } + + return true; +} + void FdbSync::processCfgEvpnNvo() { std::deque entries; @@ -447,6 +477,10 @@ void FdbSync::macDelVxlanDB(string key) fvVector.push_back(t); fvVector.push_back(v); + SWSS_LOG_NOTICE("%sVXLAN_FDB_TABLE: DEL_KEY %s vtep:%s type:%s", + m_AppRestartAssist->isWarmStartInProgress() ? "WARM-RESTART:" : "" , + key.c_str(), vtep.c_str(), type.c_str()); + // If warmstart is in progress, we take all netlink changes into the cache map if (m_AppRestartAssist->isWarmStartInProgress()) { @@ -454,7 +488,6 @@ void FdbSync::macDelVxlanDB(string key) return; } - SWSS_LOG_INFO("VXLAN_FDB_TABLE: DEL_KEY %s vtep:%s type:%s", key.c_str(), vtep.c_str(), type.c_str()); m_fdbTable.del(key); return; @@ -476,6 +509,9 @@ void FdbSync::macAddVxlan(string key, struct in_addr vtep, string type, uint32_t fvVector.push_back(t); fvVector.push_back(v); + SWSS_LOG_INFO("%sVXLAN_FDB_TABLE: ADD_KEY %s vtep:%s type:%s", + m_AppRestartAssist->isWarmStartInProgress() ? 
"WARM-RESTART:" : "" , + key.c_str(), svtep.c_str(), type.c_str()); // If warmstart is in progress, we take all netlink changes into the cache map if (m_AppRestartAssist->isWarmStartInProgress()) { @@ -483,7 +519,6 @@ void FdbSync::macAddVxlan(string key, struct in_addr vtep, string type, uint32_t return; } - SWSS_LOG_INFO("VXLAN_FDB_TABLE: ADD_KEY %s vtep:%s type:%s", key.c_str(), svtep.c_str(), type.c_str()); m_fdbTable.set(key, fvVector); return; diff --git a/fdbsyncd/fdbsync.h b/fdbsyncd/fdbsync.h index c8248ffefb..ee6aa0845b 100644 --- a/fdbsyncd/fdbsync.h +++ b/fdbsyncd/fdbsync.h @@ -9,8 +9,17 @@ #include "netmsg.h" #include "warmRestartAssist.h" -// The timeout value (in seconds) for fdbsyncd reconcilation logic -#define DEFAULT_FDBSYNC_WARMSTART_TIMER 30 +/* + * Default timer interval for fdbsyncd reconciliation + */ +#define DEFAULT_FDBSYNC_WARMSTART_TIMER 120 + +/* + * This is the MAX time in seconds, fdbsyncd will wait after warm-reboot + * for the interface entries to be recreated in kernel before attempting to + * write the FDB data to kernel + */ +#define INTF_RESTORE_MAX_WAIT_TIME 180 namespace swss { @@ -43,7 +52,7 @@ class FdbSync : public NetMsg virtual void onMsg(int nlmsg_type, struct nl_object *obj); - bool isFdbRestoreDone(); + bool isIntfRestoreDone(); AppRestartAssist *getRestartAssist() { diff --git a/fdbsyncd/fdbsyncd.cpp b/fdbsyncd/fdbsyncd.cpp index eeffeb68c1..41ab5a9824 100644 --- a/fdbsyncd/fdbsyncd.cpp +++ b/fdbsyncd/fdbsyncd.cpp @@ -7,6 +7,7 @@ #include "netdispatcher.h" #include "netlink.h" #include "fdbsyncd/fdbsync.h" +#include "warm_restart.h" using namespace std; using namespace swss; @@ -35,6 +36,7 @@ int main(int argc, char **argv) Selectable *temps; int ret; Select s; + SelectableTimer replayCheckTimer(timespec{0, 0}); using namespace std::chrono; @@ -45,7 +47,29 @@ int main(int argc, char **argv) if (sync.getRestartAssist()->isWarmStartInProgress()) { sync.getRestartAssist()->readTablesToMap(); - SWSS_LOG_NOTICE("Starting 
ReconcileTimer"); + + steady_clock::time_point starttime = steady_clock::now(); + while (!sync.isIntfRestoreDone()) + { + duration time_span = + duration_cast>(steady_clock::now() - starttime); + int pasttime = int(time_span.count()); + + if (pasttime > INTF_RESTORE_MAX_WAIT_TIME) + { + SWSS_LOG_INFO("timed-out before all interface data was replayed to kernel!!!"); + throw runtime_error("fdbsyncd: timedout on interface data replay"); + } + sleep(1); + } + replayCheckTimer.setInterval(timespec{1, 0}); + replayCheckTimer.start(); + s.addSelectable(&replayCheckTimer); + } + else + { + sync.getRestartAssist()->warmStartDisabled(); + sync.m_reconcileDone = true; } netlink.registerGroup(RTNLGRP_LINK); @@ -67,7 +91,7 @@ int main(int argc, char **argv) { s.select(&temps); - if(temps == (Selectable *)sync.getFdbStateTable()) + if (temps == (Selectable *)sync.getFdbStateTable()) { sync.processStateFdb(); } @@ -75,6 +99,33 @@ int main(int argc, char **argv) { sync.processCfgEvpnNvo(); } + else if (temps == &replayCheckTimer) + { + if (sync.getFdbStateTable()->empty() && sync.getCfgEvpnNvoTable()->empty()) + { + sync.getRestartAssist()->appDataReplayed(); + SWSS_LOG_NOTICE("FDB Replay Complete"); + s.removeSelectable(&replayCheckTimer); + + /* Obtain warm-restart timer defined for routing application */ + uint32_t warmRestartIval = WarmStart::getWarmStartTimer("bgp","bgp"); + if (warmRestartIval) + { + sync.getRestartAssist()->setReconcileInterval(warmRestartIval); + } + //Else the interval is already set to default value + + //TODO: Optimise the reconciliation time using eoiu - issue#1657 + SWSS_LOG_NOTICE("Starting ReconcileTimer"); + sync.getRestartAssist()->startReconcileTimer(s); + } + else + { + replayCheckTimer.setInterval(timespec{1, 0}); + // re-start replay check timer + replayCheckTimer.start(); + } + } else { /* @@ -88,7 +139,7 @@ int main(int argc, char **argv) sync.m_reconcileDone = true; sync.getRestartAssist()->stopReconcileTimer(s); 
sync.getRestartAssist()->reconcile(); - SWSS_LOG_NOTICE("VXLAN FDB VNI Reconcillation Complete (Timer)"); + SWSS_LOG_NOTICE("VXLAN FDB VNI Reconcillation Complete"); } } } diff --git a/fpmsyncd/fpmsyncd.cpp b/fpmsyncd/fpmsyncd.cpp index 9f5c9e1a65..412bfd6c98 100644 --- a/fpmsyncd/fpmsyncd.cpp +++ b/fpmsyncd/fpmsyncd.cpp @@ -18,6 +18,7 @@ using namespace swss; */ const uint32_t DEFAULT_ROUTING_RESTART_INTERVAL = 120; + // Wait 3 seconds after detecting EOIU reached state // TODO: support eoiu hold interval config const uint32_t DEFAULT_EOIU_HOLD_INTERVAL = 3; @@ -67,6 +68,7 @@ int main(int argc, char **argv) SelectableTimer eoiuCheckTimer(timespec{0, 0}); // After eoiu flags are detected, start a hold timer before starting reconciliation. SelectableTimer eoiuHoldTimer(timespec{0, 0}); + /* * Pipeline should be flushed right away to deal with state pending * from previous try/catch iterations. @@ -108,6 +110,10 @@ int main(int argc, char **argv) s.addSelectable(&eoiuCheckTimer); SWSS_LOG_NOTICE("Warm-Restart eoiuCheckTimer timer started."); } + else + { + sync.m_warmStartHelper.setState(WarmStart::WSDISABLED); + } while (true) { @@ -132,6 +138,7 @@ int main(int argc, char **argv) { SWSS_LOG_NOTICE("Warm-Restart EOIU hold timer expired."); } + if (sync.m_warmStartHelper.inProgress()) { sync.m_warmStartHelper.reconcile(); diff --git a/tests/test_warm_reboot.py b/tests/test_warm_reboot.py index a7a567fb6f..84f2e57b87 100644 --- a/tests/test_warm_reboot.py +++ b/tests/test_warm_reboot.py @@ -76,7 +76,7 @@ def swss_app_check_RestoreCount_single(state_db, restore_count, name): if fv[0] == "restore_count": assert int(fv[1]) == restore_count[key] + 1 elif fv[0] == "state": - assert fv[1] == "reconciled" or fv[1] == "disabled" + assert fv[1] == "reconciled" or fv[1] == "disabled" def swss_app_check_warmstart_state(state_db, name, state): warmtbl = swsscommon.Table(state_db, swsscommon.STATE_WARM_RESTART_TABLE_NAME) @@ -1150,7 +1150,7 @@ def test_routing_WarmRestart(self, 
dvs, testlog): time.sleep(5) # Verify FSM - swss_app_check_warmstart_state(state_db, "bgp", "") + swss_app_check_warmstart_state(state_db, "bgp", "disabled") # Verify that multiple changes are seen in swss and sairedis logs as there's # no warm-reboot logic in place. diff --git a/warmrestart/warmRestartAssist.cpp b/warmrestart/warmRestartAssist.cpp index ca19398577..e8e99add22 100644 --- a/warmrestart/warmRestartAssist.cpp +++ b/warmrestart/warmRestartAssist.cpp @@ -117,6 +117,16 @@ AppRestartAssist::cache_state_t AppRestartAssist::getCacheEntryState(const std:: throw std::logic_error("cache entry state is invalid"); } +void AppRestartAssist::appDataReplayed() +{ + WarmStart::setWarmStartState(m_appName, WarmStart::REPLAYED); +} + +void AppRestartAssist::warmStartDisabled() +{ + WarmStart::setWarmStartState(m_appName, WarmStart::WSDISABLED); +} + // Read table(s) from APPDB and append stale flag then insert to cachemap void AppRestartAssist::readTablesToMap() { @@ -274,6 +284,13 @@ void AppRestartAssist::reconcile() return; } +// set the reconcile interval +void AppRestartAssist::setReconcileInterval(uint32_t time) +{ + m_reconcileTimer = time; + m_warmStartTimer.setInterval(timespec{m_reconcileTimer, 0}); +} + // start the timer, take Select class "s" to add the timer. 
void AppRestartAssist::startReconcileTimer(Select &s) { diff --git a/warmrestart/warmRestartAssist.h b/warmrestart/warmRestartAssist.h index 8587d84d4a..e4cac31b6e 100644 --- a/warmrestart/warmRestartAssist.h +++ b/warmrestart/warmRestartAssist.h @@ -75,10 +75,13 @@ class AppRestartAssist DELETE = 3 }; // These functions were used as described in the class description + void setReconcileInterval(uint32_t time); void startReconcileTimer(Select &s); void stopReconcileTimer(Select &s); bool checkReconcileTimer(Selectable *s); void readTablesToMap(void); + void appDataReplayed(void); + void warmStartDisabled(void); void insertToMap(std::string tableName, std::string key, std::vector fvVector, bool delete_key); void reconcile(void); bool isWarmStartInProgress(void)