Skip to content

Commit

Permalink
Warmboot Vlan neigh restore fix (#1040)
Browse files Browse the repository at this point in the history
* Send arp request after first Vlan member port is added

* Add wait logic after Vlan member add, nbrmgr to wait for restore complete

* Address comment to pass db as a parameter and open only once
  • Loading branch information
prsunny authored Sep 9, 2019
1 parent 5841e06 commit 313ef5c
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 12 deletions.
16 changes: 15 additions & 1 deletion cfgmgr/nbrmgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ NbrMgr::NbrMgr(DBConnector *cfgDb, DBConnector *appDb, DBConnector *stateDb, con
m_statePortTable(stateDb, STATE_PORT_TABLE_NAME),
m_stateLagTable(stateDb, STATE_LAG_TABLE_NAME),
m_stateVlanTable(stateDb, STATE_VLAN_TABLE_NAME),
m_stateIntfTable(stateDb, STATE_INTERFACE_TABLE_NAME)
m_stateIntfTable(stateDb, STATE_INTERFACE_TABLE_NAME),
m_stateNeighRestoreTable(stateDb, STATE_NEIGH_RESTORE_TABLE_NAME)
{
int err = 0;

Expand Down Expand Up @@ -91,6 +92,19 @@ bool NbrMgr::isIntfStateOk(const string &alias)
return false;
}

bool NbrMgr::isNeighRestoreDone()
{
string value;

m_stateNeighRestoreTable.hget("Flags", "restored", value);
if (value == "true")
{
SWSS_LOG_INFO("Kernel neighbor table restore is done");
return true;
}
return false;
}

bool NbrMgr::setNeighbor(const string& alias, const IpAddress& ip, const MacAddress& mac)
{
SWSS_LOG_ENTER();
Expand Down
4 changes: 3 additions & 1 deletion cfgmgr/nbrmgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ class NbrMgr : public Orch
NbrMgr(DBConnector *cfgDb, DBConnector *appDb, DBConnector *stateDb, const vector<string> &tableNames);
using Orch::doTask;

bool isNeighRestoreDone();

private:
bool isIntfStateOk(const string &alias);
bool setNeighbor(const string& alias, const IpAddress& ip, const MacAddress& mac);

void doTask(Consumer &consumer);

Table m_statePortTable, m_stateLagTable, m_stateVlanTable, m_stateIntfTable;
Table m_statePortTable, m_stateLagTable, m_stateVlanTable, m_stateIntfTable, m_stateNeighRestoreTable;
struct nl_sock *m_nl_sock;
};

Expand Down
19 changes: 19 additions & 0 deletions cfgmgr/nbrmgrd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <mutex>
#include <fstream>
#include <iostream>
#include <chrono>

#include "select.h"
#include "exec.h"
Expand All @@ -12,6 +13,9 @@
using namespace std;
using namespace swss;

#define RESTORE_NEIGH_WAIT_TIME_OUT 120
#define RESTORE_NEIGH_WAIT_TIME_INT 10

/* select() function timeout retry time, in millisecond */
#define SELECT_TIMEOUT 1000

Expand Down Expand Up @@ -50,6 +54,21 @@ int main(int argc, char **argv)

NbrMgr nbrmgr(&cfgDb, &appDb, &stateDb, cfg_nbr_tables);

chrono::steady_clock::time_point starttime = chrono::steady_clock::now();
while (!nbrmgr.isNeighRestoreDone())
{
chrono::duration<double> time_span = chrono::duration_cast<chrono::duration<double>>
(chrono::steady_clock::now() - starttime);
int pasttime = int(time_span.count());
SWSS_LOG_INFO("Kernel neighbor table restoration waited for %d seconds", pasttime);
if (pasttime > RESTORE_NEIGH_WAIT_TIME_OUT)
{
SWSS_LOG_WARN("Kernel neighbor table restore is not finished!");
break;
}
sleep(RESTORE_NEIGH_WAIT_TIME_INT);
}

std::vector<Orch *> cfgOrchList = {&nbrmgr};

swss::Select s;
Expand Down
57 changes: 47 additions & 10 deletions neighsyncd/restore_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,29 @@
from scapy.all import conf, in6_getnsma, inet_pton, inet_ntop, in6_getnsmac, get_if_hwaddr, Ether, ARP, IPv6, ICMPv6ND_NS, ICMPv6NDOptSrcLLAddr
from swsscommon import swsscommon
import errno
import syslog

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)
logger.addHandler(logging.NullHandler())

SYSLOG_IDENTIFIER = 'restore_neighbor'

def log_info(msg):
syslog.openlog(SYSLOG_IDENTIFIER)
syslog.syslog(syslog.LOG_INFO, msg)
syslog.closelog()

def log_warning(msg):
syslog.openlog(SYSLOG_IDENTIFIER)
syslog.syslog(syslog.LOG_WARNING, msg)
syslog.closelog()

def log_error(msg):
syslog.openlog(SYSLOG_IDENTIFIER)
syslog.syslog(syslog.LOG_ERR, msg)
syslog.closelog()

# timeout the restore process in 110 seconds if not finished
# This is mostly to wait for interfaces to be created and up after system warm-reboot
# and this process is started by supervisord in swss docker.
Expand Down Expand Up @@ -58,12 +76,27 @@ def is_intf_oper_state_up(intf):
state_file = open(oper_file.format(intf), 'r')
state = state_file.readline().rstrip()
except Exception as e:
logger.info('Error: {}'.format(str(e)))
log_info('Error: {}'.format(str(e)))
return False
if state == '1':
return True
return False

def is_intf_up(intf, db):
if not is_intf_oper_state_up(intf):
return False
if 'Vlan' in intf:
table_name = 'VLAN_MEMBER_TABLE|{}|*'.format(intf)
key = db.keys(db.STATE_DB, table_name)
if key is None:
log_info ("Vlan member is not yet created")
return False
if is_intf_up.counter == 0:
time.sleep(3*CHECK_INTERVAL)
is_intf_up.counter = 1
log_info ("intf {} is up".format(intf))
return True

# read the neigh table from AppDB to memory, format as below
# build map as below, this can efficiently access intf and family groups later
# { intf1 -> { { family1 -> [[ip1, mac1], [ip2, mac2] ...] }
Expand Down Expand Up @@ -131,7 +164,7 @@ def read_neigh_table_to_maps():

# Use netlink to set neigh table into kernel, not overwrite the existing ones
def set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac):
logging.info('Add neighbor entries: family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
log_info('Add neighbor entries: family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
family, intf_idx, dst_ip, dmac))

if family not in ip_family:
Expand All @@ -152,7 +185,7 @@ def set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac):
# If neigh exists, log it but no exception raise, other exceptions, raise
except NetlinkError as e:
if e[0] == errno.EEXIST:
logger.warning('Neigh exists in kernel with family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
log_warning('Neigh exists in kernel with family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
family, intf_idx, dst_ip, dmac))
else:
raise
Expand Down Expand Up @@ -196,10 +229,13 @@ def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT):
ipclass = IPRoute()
mtime = monotonic.time.time
start_time = mtime()
is_intf_up.counter = 0
db = swsssdk.SonicV2Connector(host='127.0.0.1')
db.connect(db.STATE_DB, False)
while (mtime() - start_time) < timeout:
for intf, family_neigh_map in intf_neigh_map.items():
# only try to restore to kernel when link is up
if is_intf_oper_state_up(intf):
if is_intf_up(intf, db):
src_mac = get_if_hwaddr(intf)
intf_idx = ipclass.link_lookup(ifname=intf)[0]
# create socket per intf to send packets
Expand All @@ -215,6 +251,8 @@ def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT):
# use netlink to set neighbor entries
set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac)

log_info('Sending Neigh with family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
family, intf_idx, dst_ip, dmac))
# sending arp/ns packet to update kernel neigh info
s.send(build_arp_ns_pkt(family, src_mac, src_ip, dst_ip))
# delete this family on the intf
Expand All @@ -229,28 +267,27 @@ def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT):
if not intf_neigh_map:
break
time.sleep(CHECK_INTERVAL)
db.close(db.STATE_DB)


def main():

print "restore_neighbors service is started"

log_info ("restore_neighbors service is started")
# Use warmstart python binding to check warmstart information
warmstart = swsscommon.WarmStart()
warmstart.initialize("neighsyncd", "swss")
warmstart.checkWarmStart("neighsyncd", "swss", False)

# if swss or system warm reboot not enabled, don't run
if not warmstart.isWarmStart():
print "restore_neighbors service is skipped as warm restart not enabled"
log_info ("restore_neighbors service is skipped as warm restart not enabled")
return

# swss restart not system warm reboot, set statedb directly
if not warmstart.isSystemWarmRebootEnabled():
set_statedb_neigh_restore_done()
print "restore_neighbors service is done as system warm reboot not enabled"
log_info ("restore_neighbors service is done as system warm reboot not enabled")
return

# read the neigh table from appDB to internal map
try:
intf_neigh_map = read_neigh_table_to_maps()
Expand All @@ -266,7 +303,7 @@ def main():

# set statedb to signal other processes like neighsyncd
set_statedb_neigh_restore_done()
print "restore_neighbor service is done for system warmreboot"
log_info ("restore_neighbor service is done for system warmreboot")
return

if __name__ == '__main__':
Expand Down

0 comments on commit 313ef5c

Please sign in to comment.