From f0b54102776dfa6880e1651124705d83e1f3ab64 Mon Sep 17 00:00:00 2001 From: Jipan Yang Date: Wed, 24 Apr 2019 18:56:00 -0700 Subject: [PATCH 1/9] [warm-reboot]: add bgp eoiu support to speed up route reconciliation in fpmsyncd Signed-off-by: Jipan Yang --- debian/swss.install | 1 + doc/swss-schema.md | 14 ++- fpmsyncd/bgp_eoiu_marker.py | 216 ++++++++++++++++++++++++++++++++++++ fpmsyncd/fpmsyncd.cpp | 88 +++++++++++++-- 4 files changed, 310 insertions(+), 9 deletions(-) create mode 100644 fpmsyncd/bgp_eoiu_marker.py diff --git a/debian/swss.install b/debian/swss.install index 23f9b18b8a..7dd28196ac 100644 --- a/debian/swss.install +++ b/debian/swss.install @@ -1,3 +1,4 @@ swssconfig/sample/netbouncer.json etc/swss/config.d swssconfig/sample/00-copp.config.json etc/swss/config.d neighsyncd/restore_neighbors.py usr/bin +fpmsyncd/bgp_eoiu_marker.py usr/bin diff --git a/doc/swss-schema.md b/doc/swss-schema.md index 7d934b876b..31868c915d 100644 --- a/doc/swss-schema.md +++ b/doc/swss-schema.md @@ -809,7 +809,7 @@ Stores information for physical switch ports managed by the switch chip. Ports t ;Stores application and orchdameon warm start status ;Status: work in progress - key = WARM_RESTART_TABLE:process_name ; process_name is a unique process identifier. + key = WARM_RESTART_TABLE|process_name ; process_name is a unique process identifier. ; with exception of 'warm-shutdown' operation. ; 'warm-shutdown' operation key is used to ; track warm shutdown stages and results. @@ -839,6 +839,18 @@ Stores information for physical switch ports managed by the switch chip. Ports t key = NEIGH_RESTORE_TABLE|Flags restored = "true" / "false" ; restored state +### BGP\_STATE\_TABLE + ;Stores bgp status + ;Status: work in progress + + key = WARM_RESTART_TABLE|family|eoiu ; family = "IPv4" / "IPv6" ; address family. + + state = "unknown" / "reached" / "consumed" ; unknown: eoiu state not fetched yet. + ; reached: bgp eoiu done. + ; + ; consumed: the reached state has been consumed by application. + + ## Configuration files What configuration files should we have? Do apps, orch agent each need separate files? diff --git a/fpmsyncd/bgp_eoiu_marker.py b/fpmsyncd/bgp_eoiu_marker.py new file mode 100644 index 0000000000..3109fff90d --- /dev/null +++ b/fpmsyncd/bgp_eoiu_marker.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python + +"""" +Description: bgp_eoiu_marker.py -- populating bgp eoiu marker flags in stateDB during warm reboot. + The script is started by supervisord in bgp docker when the docker is started. + It does not do anything in case neither system nor bgp warm restart is enabled. + + The script check bgp neighbor state via vtysh cli interface periodically (every 1 second). + It looks for explicit EOR and implicit EOR (keep alive after established) in the json output of show ip bgp neighbors A.B.C.D json + + Once the script has collected all needed EORs, it set a EOIU flags in stateDB. + + fpmsyncd may hold a few seconds (2~5 seconds) after getting the flag before starting routing reconciliation. + 2-5 seconds should be enough for all the route to be synced to fpmsyncd from bgp. If not, the system probably is already in wrong state. + + For any reason the script failed to set EOIU flag in stateDB, the current warm_restart bgp_timer will kick in later. +""" + +import sys +import swsssdk +import time +import syslog +import traceback +import commands +import json +from swsscommon import swsscommon +import errno +from time import gmtime, strftime + +class BgpStateCheck(): + # timeout the restore process in 120 seconds if not finished + # This is in consistent with the default timerout for bgp warm restart set in fpmsyncd + + DEF_TIME_OUT = 120 + + # every 1 seconds to check bgp neighbors state + CHECK_INTERVAL = 1 + def __init__(self): + self.ipv4_neighbors = [] + self.ipv4_neigh_eor_status = {} + self.ipv6_neighbors = [] + self.ipv6_neigh_eor_status = {} + self.keepalivesRecvCnt = {} + self.bgp_ipv4_eoiu = False + self.bgp_ipv6_eoiu = False + + def get_all_peers(self): + try: + cmd = "vtysh -c 'show bgp summary json'" + output = commands.getoutput(cmd) + peer_info = json.loads(output) + if "ipv4Unicast" in peer_info and "peers" in peer_info["ipv4Unicast"]: + self.ipv4_neighbors = peer_info["ipv4Unicast"]["peers"].keys() + + if "ipv6Unicast" in peer_info and "peers" in peer_info["ipv6Unicast"]: + self.ipv6_neighbors = peer_info["ipv6Unicast"]["peers"].keys() + + syslog.syslog('BGP ipv4 neighbors: {}'.format(self.ipv4_neighbors)) + syslog.syslog('BGP ipv4 neighbors: {}'.format(self.ipv6_neighbors)) + + except Exception: + syslog.syslog(syslog.LOG_ERR, "*ERROR* get_all_peers Exception: %s" % (traceback.format_exc())) + time.sleep(5) + self.get_all_peers() + + def init_peers_eor_status(self): + # init neigh eor status to unknown + for neigh in self.ipv4_neighbors: + self.ipv4_neigh_eor_status[neigh] = "unknown" + for neigh in self.ipv6_neighbors: + self.ipv6_neigh_eor_status[neigh] = "unknown" + + # Set the statedb "BGP_STATE_TABLE|eoiu", so fpmsyncd can get the bgp eoiu signal + # Only two families: 'ipv4' and 'ipv6' + # state is 'true' or 'false' + def set_bgp_eoiu_marker(self, family, state): + db = swsssdk.SonicV2Connector(host='127.0.0.1') + db.connect(db.STATE_DB, False) + key = "BGP_STATE_TABLE|%s|eoiu" % family + db.set(db.STATE_DB, key, 'state', state) + timesamp = strftime("%Y-%m-%d %H:%M:%S", gmtime()) + db.set(db.STATE_DB, key, 'timestamp', timesamp) + db.close(db.STATE_DB) + return + + def clean_bgp_eoiu_marker(self): + db = swsssdk.SonicV2Connector(host='127.0.0.1') + db.connect(db.STATE_DB, False) + db.delete(db.STATE_DB, "BGP_STATE_TABLE|IPv4|eoiu") + db.delete(db.STATE_DB, "BGP_STATE_TABLE|IPv6|eoiu") + db.close(db.STATE_DB) + syslog.syslog('Cleaned ipv4 and ipv6 eoiu marker flags') + return + + def bgp_eor_received(self, neigh, is_ipv4): + try: + neighstr = "%s" % neigh + eor_received = False + cmd = "vtysh -c 'show bgp neighbors %s json'" % neighstr + output = commands.getoutput(cmd) + neig_status = json.loads(output) + if neighstr in neig_status: + if "gracefulRestartInfo" in neig_status[neighstr]: + if "endOfRibRecv" in neig_status[neighstr]["gracefulRestartInfo"]: + eor_info = neig_status[neighstr]["gracefulRestartInfo"]["endOfRibRecv"] + if is_ipv4 and "IPv4 Unicast" in eor_info and eor_info["IPv4 Unicast"] == "true": + eor_received = True + elif not is_ipv4 and "IPv6 Unicast" in eor_info and eor_info["IPv6 Unicast"] == "true": + eor_received = True + if eor_received: + syslog.syslog('BGP eor received for neighbors: {}'.format(neigh)) + + # No explict eor, try implicit eor + if eor_received == False and "bgpState" in neig_status[neighstr] and neig_status[neighstr]["bgpState"] == "Established": + # if "messageStats" in neig_status and "keepalivesRecv" in neig_status["messageStats"]: + # it looks we need to record the keepalivesRecv count for detecting count change + if neighstr not in self.keepalivesRecvCnt: + self.keepalivesRecvCnt[neighstr] = neig_status[neighstr]["messageStats"]["keepalivesRecv"] + else : + eor_received = (self.keepalivesRecvCnt[neighstr] is not neig_status[neighstr]["messageStats"]["keepalivesRecv"]) + if eor_received: + syslog.syslog('BGP implicit eor received for neighbors: {}'.format(neigh)) + + return eor_received + + except Exception: + syslog.syslog(syslog.LOG_ERR, "*ERROR* bgp_eor_received Exception: %s" % (traceback.format_exc())) + + + # This function is to collect eor state based on the saved ipv4_neigh_eor_status and ipv6_neigh_eor_status dictionaries + # It iterates through the dictionary, and check whether the specific neighbor has EOR received. + # EOR may be explicit EOR (End-Of-RIB) or an implicit-EOR. + # The first keep-alive after BGP has reached Established is considered an implicit-EOR. + # + # ipv4 and ipv6 neighbors are processed separately. + # Once all ipv4 neighbors have EOR received, bgp_ipv4_eoiu becomes True. + # Once all ipv6 neighbors have EOR received, bgp_ipv6_eoiu becomes True. + + # The neighbor EoR states were checked in a loop with an interval (CHECK_INTERVAL) + # The function will timeout in case eoiu states never meet the condition + # after some time (DEF_TIME_OUT). + def wait_for_bgp_eoiu(self): + #mtime = monotonic.time.time + wait_time = self.DEF_TIME_OUT + while wait_time >= 0: + if not self.bgp_ipv4_eoiu: + for neigh, eor_status in self.ipv4_neigh_eor_status.items(): + if eor_status == "unknown" and self.bgp_eor_received(neigh, True): + self.ipv4_neigh_eor_status[neigh] = "rcvd" + if "unknown" not in self.ipv4_neigh_eor_status.values(): + self.bgp_ipv4_eoiu = True + syslog.syslog("BGP ipv4 eoiu reached") + + if not self.bgp_ipv6_eoiu: + for neigh, eor_status in self.ipv6_neigh_eor_status.items(): + if eor_status == "unknown" and self.bgp_eor_received(neigh, False): + self.ipv6_neigh_eor_status[neigh] = "rcvd" + if "unknown" not in self.ipv6_neigh_eor_status.values(): + self.bgp_ipv6_eoiu = True + syslog.syslog('BGP ipv6 eoiu reached') + + if self.bgp_ipv6_eoiu and self.bgp_ipv4_eoiu: + break; + time.sleep(self.CHECK_INTERVAL) + wait_time -= self.CHECK_INTERVAL + + if not self.bgp_ipv6_eoiu: + syslog.syslog(syslog.LOG_ERR, "BGP ipv6 eoiu not reached: {}".format(self.ipv6_neigh_eor_status)); + + if not self.bgp_ipv4_eoiu: + syslog.syslog(syslog.LOG_ERR, "BGP ipv4 eoiu not reached: {}".format(self.ipv4_neigh_eor_status)); + +def main(): + + print "bgp_eoiu_marker service is started" + + try: + bgp_state_check = BgpStateCheck() + except Exception, e: + syslog.syslog(syslog.LOG_ERR, "{}: error exit 1, reason {}".format(THIS_MODULE, str(e))) + exit(1) + + # Always clean the eoiu marker in stateDB first + bgp_state_check.clean_bgp_eoiu_marker() + + # Use warmstart python binding to check warmstart information + warmstart = swsscommon.WarmStart() + warmstart.initialize("bgp", "bgp") + warmstart.checkWarmStart("bgp", "bgp", False) + + # if bgp or system warm reboot not enabled, don't run + if not warmstart.isWarmStart(): + print "bgp_eoiu_marker service is skipped as warm restart not enabled" + return + + bgp_state_check.set_bgp_eoiu_marker("IPv4", "unknown") + bgp_state_check.set_bgp_eoiu_marker("IPv6", "unknown") + bgp_state_check.get_all_peers() + bgp_state_check.init_peers_eor_status() + try: + bgp_state_check.wait_for_bgp_eoiu() + except Exception as e: + syslog.syslog(syslog.LOG_ERR, str(e)) + sys.exit(1) + + # set statedb to signal other processes like fpmsynd + if bgp_state_check.bgp_ipv4_eoiu: + bgp_state_check.set_bgp_eoiu_marker("IPv4", "reached") + if bgp_state_check.bgp_ipv6_eoiu: + bgp_state_check.set_bgp_eoiu_marker("IPv6", "reached") + + print "bgp_eoiu_marker service is done" + return + +if __name__ == '__main__': + main() diff --git a/fpmsyncd/fpmsyncd.cpp b/fpmsyncd/fpmsyncd.cpp index 42eba65f0f..1762dfbb9d 100644 --- a/fpmsyncd/fpmsyncd.cpp +++ b/fpmsyncd/fpmsyncd.cpp @@ -18,6 +18,30 @@ using namespace swss; */ const uint32_t DEFAULT_ROUTING_RESTART_INTERVAL = 120; +// Wait 3 seconds after detecting EOIU reached state +// TODO: support eoiu hold interval config +const uint32_t DEFAULT_EOIU_HOLD_INTERVAL = 3; + +// Check if eoiu state reached by both ipv4 and ipv6 +static bool eoiuFlagsSet(Table &bgpStateTable) +{ + string value; + + bgpStateTable.hget("IPv4|eoiu", "state", value); + if (value != "reached") + { + SWSS_LOG_DEBUG("IPv4|eoiu state: %s", value.c_str()); + return false; + } + bgpStateTable.hget("IPv6|eoiu", "state", value); + if (value != "reached") + { + SWSS_LOG_DEBUG("IPv6|eoiu state: %s", value.c_str()); + return false; + } + SWSS_LOG_NOTICE("Warm-Restart bgp eoiu reached for both ipv4 and ipv6"); + return true; +} int main(int argc, char **argv) { @@ -26,6 +50,9 @@ int main(int argc, char **argv) RedisPipeline pipeline(&db); RouteSync sync(&pipeline); + DBConnector stateDb(STATE_DB, 0); + Table bgpStateTable(&stateDb, STATE_BGP_TABLE_NAME); + NetDispatcher::getInstance().registerMessageHandler(RTM_NEWROUTE, &sync); NetDispatcher::getInstance().registerMessageHandler(RTM_DELROUTE, &sync); @@ -36,7 +63,10 @@ int main(int argc, char **argv) FpmLink fpm; Select s; SelectableTimer warmStartTimer(timespec{0, 0}); - + // Before eoiu flags detected, check them periodically. It also stop upon detection of reconciliation done. + SelectableTimer eoiuCheckTimer(timespec{0, 0}); + // After eoiu flags are detected, start a hold timer before starting reconciliation. + SelectableTimer eoiuHoldTimer(timespec{0, 0}); /* * Pipeline should be flushed right away to deal with state pending * from previous try/catch iterations. @@ -69,7 +99,14 @@ int main(int argc, char **argv) { warmStartTimer.start(); s.addSelectable(&warmStartTimer); + SWSS_LOG_NOTICE("Warm-Restart timer started."); } + + // Also start periodic eoiu check timer, first wait 5 seconds, then check every 1 second + eoiuCheckTimer.setInterval(timespec{5, 0}); + eoiuCheckTimer.start(); + s.addSelectable(&eoiuCheckTimer); + SWSS_LOG_NOTICE("Warm-Restart eoiuCheckTimer timer started."); } while (true) @@ -80,19 +117,54 @@ int main(int argc, char **argv) s.select(&temps); /* - * Upon expiration of the warm-restart timer, proceed to run the - * reconciliation process and remove warm-restart timer from + * Upon expiration of the warm-restart timer or eoiu Hold Timer, proceed to run the + * reconciliation process if not done yet and remove the timer from * select() loop. + * Note: route reconciliation always succeeds, it will not be done twice. */ - if (warmStartEnabled && temps == &warmStartTimer) + if (temps == &warmStartTimer || temps == &eoiuHoldTimer) { - SWSS_LOG_NOTICE("Warm-Restart timer expired."); - sync.m_warmStartHelper.reconcile(); - s.removeSelectable(&warmStartTimer); - + if (temps == &warmStartTimer) + { + SWSS_LOG_NOTICE("Warm-Restart timer expired."); + } + else + { + SWSS_LOG_NOTICE("Warm-Restart EOIU hold timer expired."); + } + if (sync.m_warmStartHelper.inProgress()) + { + sync.m_warmStartHelper.reconcile(); + SWSS_LOG_NOTICE("Warm-Restart reconciliation processed."); + } + // remove the one-shot timer. + s.removeSelectable(temps); pipeline.flush(); SWSS_LOG_DEBUG("Pipeline flushed"); } + else if (temps == &eoiuCheckTimer) + { + if (sync.m_warmStartHelper.inProgress()) + { + if (eoiuFlagsSet(bgpStateTable)) + { + eoiuHoldTimer.setInterval(timespec{DEFAULT_EOIU_HOLD_INTERVAL, 0}); + eoiuHoldTimer.start(); + s.addSelectable(&eoiuHoldTimer); + SWSS_LOG_NOTICE("Warm-Restart started EOIU hold timer which is to expire in %d seconds.", DEFAULT_EOIU_HOLD_INTERVAL); + s.removeSelectable(&eoiuCheckTimer); + continue; + } + eoiuCheckTimer.setInterval(timespec{1, 0}); + // re-start eoiu check timer + eoiuCheckTimer.start(); + SWSS_LOG_DEBUG("Warm-Restart eoiuCheckTimer restarted"); + } + else + { + s.removeSelectable(&eoiuCheckTimer); + } + } else if (!warmStartEnabled || sync.m_warmStartHelper.isReconciled()) { pipeline.flush(); From b59f192139e4a99304dac0078bea842d1550fd98 Mon Sep 17 00:00:00 2001 From: Jipan Yang Date: Wed, 24 Apr 2019 20:56:19 -0700 Subject: [PATCH 2/9] Fix explict eor check error, endofRibRecv[IPv4 Unicast] is of type Boolean Signed-off-by: Jipan Yang --- fpmsyncd/bgp_eoiu_marker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fpmsyncd/bgp_eoiu_marker.py b/fpmsyncd/bgp_eoiu_marker.py index 3109fff90d..1b436e0ad5 100644 --- a/fpmsyncd/bgp_eoiu_marker.py +++ b/fpmsyncd/bgp_eoiu_marker.py @@ -103,9 +103,9 @@ def bgp_eor_received(self, neigh, is_ipv4): if "gracefulRestartInfo" in neig_status[neighstr]: if "endOfRibRecv" in neig_status[neighstr]["gracefulRestartInfo"]: eor_info = neig_status[neighstr]["gracefulRestartInfo"]["endOfRibRecv"] - if is_ipv4 and "IPv4 Unicast" in eor_info and eor_info["IPv4 Unicast"] == "true": + if is_ipv4 and "IPv4 Unicast" in eor_info and eor_info["IPv4 Unicast"] == True: eor_received = True - elif not is_ipv4 and "IPv6 Unicast" in eor_info and eor_info["IPv6 Unicast"] == "true": + elif not is_ipv4 and "IPv6 Unicast" in eor_info and eor_info["IPv6 Unicast"] == True: eor_received = True if eor_received: syslog.syslog('BGP eor received for neighbors: {}'.format(neigh)) From 009b5714fcc2a2c3ed4f0696d96a6a6c6b8d82cb Mon Sep 17 00:00:00 2001 From: Jipan Yang Date: Thu, 25 Apr 2019 11:35:43 -0700 Subject: [PATCH 3/9] Add missing typo fix Signed-off-by: Jipan Yang --- fpmsyncd/fpmsyncd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fpmsyncd/fpmsyncd.cpp b/fpmsyncd/fpmsyncd.cpp index 1762dfbb9d..0c2d2cf795 100644 --- a/fpmsyncd/fpmsyncd.cpp +++ b/fpmsyncd/fpmsyncd.cpp @@ -50,7 +50,7 @@ int main(int argc, char **argv) RedisPipeline pipeline(&db); RouteSync sync(&pipeline); - DBConnector stateDb(STATE_DB, 0); + DBConnector stateDb(STATE_DB, DBConnector::DEFAULT_UNIXSOCKET, 0); Table bgpStateTable(&stateDb, STATE_BGP_TABLE_NAME); NetDispatcher::getInstance().registerMessageHandler(RTM_NEWROUTE, &sync); From 9fc3a6476636a83c55b97898878f4e2830aa1adb Mon Sep 17 00:00:00 2001 From: Jipan Yang Date: Mon, 29 Apr 2019 18:34:11 -0700 Subject: [PATCH 4/9] Add DB interface for eoiu_hold timer value fetching Signed-off-by: Jipan Yang --- fpmsyncd/fpmsyncd.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fpmsyncd/fpmsyncd.cpp b/fpmsyncd/fpmsyncd.cpp index 0c2d2cf795..6f5dd88d8c 100644 --- a/fpmsyncd/fpmsyncd.cpp +++ b/fpmsyncd/fpmsyncd.cpp @@ -148,10 +148,20 @@ int main(int argc, char **argv) { if (eoiuFlagsSet(bgpStateTable)) { - eoiuHoldTimer.setInterval(timespec{DEFAULT_EOIU_HOLD_INTERVAL, 0}); + /* Obtain eoiu hold timer defined for bgp docker */ + uint32_t eoiuHoldIval = WarmStart::getWarmStartTimer("eoiu_hold", "bgp"); + if (!eoiuHoldIval) + { + eoiuHoldTimer.setInterval(timespec{DEFAULT_EOIU_HOLD_INTERVAL, 0}); + eoiuHoldIval = DEFAULT_EOIU_HOLD_INTERVAL; + } + else + { + eoiuHoldTimer.setInterval(timespec{eoiuHoldIval, 0}); + } eoiuHoldTimer.start(); s.addSelectable(&eoiuHoldTimer); - SWSS_LOG_NOTICE("Warm-Restart started EOIU hold timer which is to expire in %d seconds.", DEFAULT_EOIU_HOLD_INTERVAL); + SWSS_LOG_NOTICE("Warm-Restart started EOIU hold timer which is to expire in %d seconds.", eoiuHoldIval); s.removeSelectable(&eoiuCheckTimer); continue; } From f1e249d1161fe9be4917f275f31dd4a66e017b40 Mon Sep 17 00:00:00 2001 From: Jipan Yang Date: Mon, 24 Jun 2019 16:03:50 +0800 Subject: [PATCH 5/9] address review comments Signed-off-by: Jipan Yang --- fpmsyncd/bgp_eoiu_marker.py | 42 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/fpmsyncd/bgp_eoiu_marker.py b/fpmsyncd/bgp_eoiu_marker.py index 1b436e0ad5..1098380079 100644 --- a/fpmsyncd/bgp_eoiu_marker.py +++ b/fpmsyncd/bgp_eoiu_marker.py @@ -43,25 +43,31 @@ def __init__(self): self.keepalivesRecvCnt = {} self.bgp_ipv4_eoiu = False self.bgp_ipv6_eoiu = False + self.get_peers_wt = self.DEF_TIME_OUT def get_all_peers(self): - try: - cmd = "vtysh -c 'show bgp summary json'" - output = commands.getoutput(cmd) - peer_info = json.loads(output) - if "ipv4Unicast" in peer_info and "peers" in peer_info["ipv4Unicast"]: - self.ipv4_neighbors = peer_info["ipv4Unicast"]["peers"].keys() - - if "ipv6Unicast" in peer_info and "peers" in peer_info["ipv6Unicast"]: - self.ipv6_neighbors = peer_info["ipv6Unicast"]["peers"].keys() - - syslog.syslog('BGP ipv4 neighbors: {}'.format(self.ipv4_neighbors)) - syslog.syslog('BGP ipv4 neighbors: {}'.format(self.ipv6_neighbors)) - - except Exception: - syslog.syslog(syslog.LOG_ERR, "*ERROR* get_all_peers Exception: %s" % (traceback.format_exc())) - time.sleep(5) - self.get_all_peers() + while self.get_peers_wt >= 0: + try: + cmd = "vtysh -c 'show bgp summary json'" + output = commands.getoutput(cmd) + peer_info = json.loads(output) + if "ipv4Unicast" in peer_info and "peers" in peer_info["ipv4Unicast"]: + self.ipv4_neighbors = peer_info["ipv4Unicast"]["peers"].keys() + + if "ipv6Unicast" in peer_info and "peers" in peer_info["ipv6Unicast"]: + self.ipv6_neighbors = peer_info["ipv6Unicast"]["peers"].keys() + + syslog.syslog('BGP ipv4 neighbors: {}'.format(self.ipv4_neighbors)) + syslog.syslog('BGP ipv4 neighbors: {}'.format(self.ipv6_neighbors)) + return + + except Exception: + syslog.syslog(syslog.LOG_ERR, "*ERROR* get_all_peers Exception: %s" % (traceback.format_exc())) + time.sleep(5) + self.get_peers_wt -= 5 + self.get_all_peers() + syslog.syslog(syslog.LOG_ERR, "Failed to get bgp neighbor info in {} seconds, exiting".format(self.DEF_TIME_OUT)); + sys.exit(1) def init_peers_eor_status(self): # init neigh eor status to unknown @@ -116,7 +122,7 @@ def bgp_eor_received(self, neigh, is_ipv4): # it looks we need to record the keepalivesRecv count for detecting count change if neighstr not in self.keepalivesRecvCnt: self.keepalivesRecvCnt[neighstr] = neig_status[neighstr]["messageStats"]["keepalivesRecv"] - else : + else: eor_received = (self.keepalivesRecvCnt[neighstr] is not neig_status[neighstr]["messageStats"]["keepalivesRecv"]) if eor_received: syslog.syslog('BGP implicit eor received for neighbors: {}'.format(neigh)) From 15c7fbf8de23e62eb1c49df2069634c09785fcca Mon Sep 17 00:00:00 2001 From: Jipan Yang Date: Mon, 24 Jun 2019 16:12:51 +0800 Subject: [PATCH 6/9] Remove obsoleted comment Signed-off-by: Jipan Yang --- fpmsyncd/bgp_eoiu_marker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fpmsyncd/bgp_eoiu_marker.py b/fpmsyncd/bgp_eoiu_marker.py index 1098380079..679e84d5b7 100644 --- a/fpmsyncd/bgp_eoiu_marker.py +++ b/fpmsyncd/bgp_eoiu_marker.py @@ -146,7 +146,6 @@ def bgp_eor_received(self, neigh, is_ipv4): # The function will timeout in case eoiu states never meet the condition # after some time (DEF_TIME_OUT). def wait_for_bgp_eoiu(self): - #mtime = monotonic.time.time wait_time = self.DEF_TIME_OUT while wait_time >= 0: if not self.bgp_ipv4_eoiu: From c6d9f002451d7bc131cd145970389803802bdbea Mon Sep 17 00:00:00 2001 From: Jipan Yang Date: Sat, 27 Jul 2019 02:12:28 -0700 Subject: [PATCH 7/9] Add vs test case for bgp eoiu processing in fpmsyncd Signed-off-by: Jipan Yang --- doc/swss-schema.md | 17 ++++++++++++++++- fpmsyncd/bgp_eoiu_marker.py | 2 +- tests/test_warm_reboot.py | 2 +- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/swss-schema.md b/doc/swss-schema.md index 31868c915d..befe0c248c 100644 --- a/doc/swss-schema.md +++ b/doc/swss-schema.md @@ -843,13 +843,28 @@ Stores information for physical switch ports managed by the switch chip. Ports t ;Stores bgp status ;Status: work in progress - key = WARM_RESTART_TABLE|family|eoiu ; family = "IPv4" / "IPv6" ; address family. + key = BGP_STATE_TABLE|family|eoiu ; family = "IPv4" / "IPv6" ; address family. state = "unknown" / "reached" / "consumed" ; unknown: eoiu state not fetched yet. ; reached: bgp eoiu done. ; ; consumed: the reached state has been consumed by application. + timestamp = time-stamp ; "%Y-%m-%d %H:%M:%S", full-date and partial-time separated by + ; white space. Example: 2019-04-25 09:39:19 + ;value annotations + date-fullyear = 4DIGIT + date-month = 2DIGIT ; 01-12 + date-mday = 2DIGIT ; 01-28, 01-29, 01-30, 01-31 based on + ; month/year + time-hour = 2DIGIT ; 00-23 + time-minute = 2DIGIT ; 00-59 + time-second = 2DIGIT ; 00-58, 00-59, 00-60 based on leap second + ; rules + + partial-time = time-hour ":" time-minute ":" time-second + full-date = date-fullyear "-" date-month "-" date-mday + time-stamp = full-date %x20 partial-time ## Configuration files What configuration files should we have? Do apps, orch agent each need separate files? diff --git a/fpmsyncd/bgp_eoiu_marker.py b/fpmsyncd/bgp_eoiu_marker.py index 679e84d5b7..83051d7878 100644 --- a/fpmsyncd/bgp_eoiu_marker.py +++ b/fpmsyncd/bgp_eoiu_marker.py @@ -78,7 +78,7 @@ def init_peers_eor_status(self): # Set the statedb "BGP_STATE_TABLE|eoiu", so fpmsyncd can get the bgp eoiu signal # Only two families: 'ipv4' and 'ipv6' - # state is 'true' or 'false' + # state is "unknown" / "reached" / "consumed" def set_bgp_eoiu_marker(self, family, state): db = swsssdk.SonicV2Connector(host='127.0.0.1') db.connect(db.STATE_DB, False) diff --git a/tests/test_warm_reboot.py b/tests/test_warm_reboot.py index 31795d3ddd..5db1f8af12 100644 --- a/tests/test_warm_reboot.py +++ b/tests/test_warm_reboot.py @@ -1,3 +1,4 @@ + from swsscommon import swsscommon import os import re @@ -1944,4 +1945,3 @@ def test_system_warmreboot_neighbor_syncup(self, dvs, testlog): intf_tbl._del("Ethernet{}|{}00::1/64".format(i*4, i*4)) intf_tbl._del("Ethernet{}".format(i*4, i*4)) intf_tbl._del("Ethernet{}".format(i*4, i*4)) - From 54dd4e91bcadd5eb4871fcd15981281481497ef3 Mon Sep 17 00:00:00 2001 From: Jipan Yang Date: Sat, 27 Jul 2019 12:06:37 -0700 Subject: [PATCH 8/9] [VS test]: Revert back to INFO level log for fpmsyncd Signed-off-by: Jipan Yang --- tests/test_warm_reboot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_warm_reboot.py b/tests/test_warm_reboot.py index 5db1f8af12..7771cc935c 100644 --- a/tests/test_warm_reboot.py +++ b/tests/test_warm_reboot.py @@ -1927,6 +1927,7 @@ def test_system_warmreboot_neighbor_syncup(self, dvs, testlog): time.sleep(10) + # check syslog and sairedis.rec file for activities check_syslog_for_neighbor_entry(dvs, marker, 0, NUM_OF_NEIGHS/2, "ipv4") check_syslog_for_neighbor_entry(dvs, marker, 0, NUM_OF_NEIGHS/2, "ipv6") From a9161b41488c7687401398965bc02b3baf8a4b60 Mon Sep 17 00:00:00 2001 From: Jipan Yang Date: Wed, 31 Jul 2019 11:09:42 -0700 Subject: [PATCH 9/9] Redo "Add vs test case for bgp eoiu processing in fpmsyncd" after resolving merge conflict Signed-off-by: Jipan Yang --- tests/test_warm_reboot.py | 121 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/tests/test_warm_reboot.py b/tests/test_warm_reboot.py index 7771cc935c..165e2f267c 100644 --- a/tests/test_warm_reboot.py +++ b/tests/test_warm_reboot.py @@ -1659,6 +1659,127 @@ def test_routing_WarmRestart(self, dvs, testlog): rt_key = json.loads(addobjs[0]['key']) assert rt_key['dest'] == "192.168.100.0/24" + + ############################################################################# + # + # Testcase 16. Restart zebra and make no control-plane changes. + # Set WARM_RESTART_TABLE|IPv4|eoiu + # WARM_RESTART_TABLE|IPv6|eoiu + # Check route reconciliation wait time is reduced + # For this and all subsequent test-cases routing-warm-reboot + # feature will be kept enabled. + # + ############################################################################# + + + time.sleep(1) + # Hold time from EOIU detected for both Ipv4/Ipv6 to start route reconciliation + DEFAULT_EOIU_HOLD_INTERVAL = 3 + + # change to 20 for easy timeline check + restart_timer = 20 + + # clean up as that in bgp_eoiu_marker.py + del_entry_tbl(state_db, "BGP_STATE_TABLE", "IPv4|eoiu") + del_entry_tbl(state_db, "BGP_STATE_TABLE", "IPv6|eoiu") + + dvs.runcmd("config warm_restart bgp_timer {}".format(restart_timer)) + # Restart zebra + dvs.stop_zebra() + dvs.start_zebra() + + # + # Verify FSM: no eoiu, just default warm restart timer + # + swss_app_check_warmstart_state(state_db, "bgp", "restored") + # Periodic eoiu check timer, first wait 5 seconds, then check every 1 second + # DEFAULT_EOIU_HOLD_INTERVAL is 3 seconds. + # Since no EOIU set, after 3+ 5 + 1 seconds, the state still in restored state + time.sleep(DEFAULT_EOIU_HOLD_INTERVAL + 5 +1) + swss_app_check_warmstart_state(state_db, "bgp", "restored") + # default restart timer kicks in: + time.sleep(restart_timer - DEFAULT_EOIU_HOLD_INTERVAL -5) + swss_app_check_warmstart_state(state_db, "bgp", "reconciled") + + + time.sleep(1) + # Restart zebra + dvs.stop_zebra() + dvs.start_zebra() + + # + # Verify FSM: eoiu works as expected + # + swss_app_check_warmstart_state(state_db, "bgp", "restored") + # Set BGP_STATE_TABLE|Ipv4|eoiu BGP_STATE_TABLE|IPv6|eoiu + create_entry_tbl( + state_db, + "BGP_STATE_TABLE", "IPv4|eoiu", + [ + ("state", "reached"), + ("timestamp", "2019-04-25 09:39:19"), + ] + ) + create_entry_tbl( + state_db, + "BGP_STATE_TABLE", "IPv6|eoiu", + [ + ("state", "reached"), + ("timestamp", "2019-04-25 09:39:22"), + ] + ) + + # after DEFAULT_EOIU_HOLD_INTERVAL + inital eoiu check timer wait time + 1 seconds: 3+5+1 + # verify that bgp reached reconciled state + time.sleep(DEFAULT_EOIU_HOLD_INTERVAL + 5 + 1) + swss_app_check_warmstart_state(state_db, "bgp", "reconciled") + + # Verify swss changes -- none are expected this time + (addobjs, delobjs) = dvs.GetSubscribedAppDbObjects(pubsubAppDB) + assert len(addobjs) == 0 and len(delobjs) == 0 + # Verify swss changes -- none are expected this time + (addobjs, delobjs) = dvs.GetSubscribedAsicDbObjects(pubsubAsicDB) + assert len(addobjs) == 0 and len(delobjs) == 0 + + + del_entry_tbl(state_db, "BGP_STATE_TABLE", "IPv4|eoiu") + del_entry_tbl(state_db, "BGP_STATE_TABLE", "IPv6|eoiu") + time.sleep(1) + # Restart zebra + dvs.stop_zebra() + dvs.start_zebra() + + # + # Verify FSM: partial eoiu, fallback to default warm restart timer + # + swss_app_check_warmstart_state(state_db, "bgp", "restored") + # Set BGP_STATE_TABLE|Ipv4|eoiu but not BGP_STATE_TABLE|IPv6|eoiu + create_entry_tbl( + state_db, + "BGP_STATE_TABLE", "IPv4|eoiu", + [ + ("state", "reached"), + ("timestamp", "2019-04-25 09:39:19"), + ] + ) + + # Periodic eoiu check timer, first wait 5 seconds, then check every 1 second + # DEFAULT_EOIU_HOLD_INTERVAL is 3 seconds. + # Current bgp eoiu needs flag set on both Ipv4/Ipv6 to work, after 3+ 5 + 1 seconds, the state still in restored state + time.sleep(DEFAULT_EOIU_HOLD_INTERVAL + 5 +1) + swss_app_check_warmstart_state(state_db, "bgp", "restored") + # Fall back to warm restart timer, it kicks in after 15 seconds, +1 to avoid race condition: + time.sleep(restart_timer - DEFAULT_EOIU_HOLD_INTERVAL -5 ) + swss_app_check_warmstart_state(state_db, "bgp", "reconciled") + + # Verify swss changes -- none are expected this time + (addobjs, delobjs) = dvs.GetSubscribedAppDbObjects(pubsubAppDB) + assert len(addobjs) == 0 and len(delobjs) == 0 + + # Verify swss changes -- none are expected this time + (addobjs, delobjs) = dvs.GetSubscribedAsicDbObjects(pubsubAsicDB) + assert len(addobjs) == 0 and len(delobjs) == 0 + intf_tbl._del("{}|111.0.0.1/24".format(intfs[0])) intf_tbl._del("{}|1110::1/64".format(intfs[0])) intf_tbl._del("{}|122.0.0.1/24".format(intfs[1]))