Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SWSS] Static LAG Support #3090

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 22 additions & 7 deletions cfgmgr/teammgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ void TeamMgr::doLagTask(Consumer &consumer)
int min_links = 0;
bool fallback = false;
bool fast_rate = false;
bool static_lag = false;
string admin_status = DEFAULT_ADMIN_STATUS_STR;
string mtu = DEFAULT_MTU_STR;
string learn_mode;
Expand Down Expand Up @@ -301,11 +302,16 @@ void TeamMgr::doLagTask(Consumer &consumer)
SWSS_LOG_INFO("Get fast_rate `%s`",
fast_rate ? "true" : "false");
}
else if (fvField(i) == "static")
{
static_lag = true;
SWSS_LOG_INFO ("static %d", static_lag);
}
}

if (m_lagList.find(alias) == m_lagList.end())
{
if (addLag(alias, min_links, fallback, fast_rate) == task_need_retry)
if (addLag(alias, min_links, fallback, fast_rate, static_lag) == task_need_retry)
{
// If LAG creation fails, we need to clean up any potentially orphaned teamd processes
removeLag(alias);
Expand Down Expand Up @@ -562,7 +568,7 @@ bool TeamMgr::setLagLearnMode(const string &alias, const string &learn_mode)
return true;
}

task_process_status TeamMgr::addLag(const string &alias, int min_links, bool fallback, bool fast_rate)
task_process_status TeamMgr::addLag(const string &alias, int min_links, bool fallback, bool fast_rate, bool static_lag)
{
SWSS_LOG_ENTER();

Expand Down Expand Up @@ -603,11 +609,20 @@ task_process_status TeamMgr::addLag(const string &alias, int min_links, bool fal
}
}

conf << "'{\"device\":\"" << alias << "\","
<< "\"hwaddr\":\"" << mac_boot.to_string() << "\","
<< "\"runner\":{"
<< "\"active\":true,"
<< "\"name\":\"lacp\"";
if (static_lag) {
conf << "'{\"device\":\"" << alias << "\","
<< "\"hwaddr\":\"" << mac_boot.to_string() << "\","
<< "\"runner\":{"
<< "\"name\":\"roundrobin\"";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe based on HLD discussion the default mode should be loadbalance and not round robin. Can you please clarify why do we have roundrobin defined as default?
sonic-net/SONiC#1039 (comment)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In addition if we have roundrobin how is it achieved in the hardware. If I understand roundrobin is to send one packet per link in roundrobin fashion https://man.archlinux.org/man/teamd.conf.5.en . However we don't have such hash algorithm in SAI.
If SAI is going with loadbalance we should have the same defined in teamd as well

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @dgsudharsan ,

Apologies for the delay... unwell last week (this week too ... )

No Round Robin is only for Linux traffic (CPU traffic), Right now it is not mapped to saiars.h (which support packet spraying in hardware for ECMP and LAG)
Regarding spec will modify to round robin..

With loadbalance facing issue of 100% teamd always thats the reason for the change to roundrobin.

Thanks,
Kannan.S


} else {

conf << "'{\"device\":\"" << alias << "\","
<< "\"hwaddr\":\"" << mac_boot.to_string() << "\","
<< "\"runner\":{"
<< "\"active\":true,"
<< "\"name\":\"lacp\"";
}

if (min_links != 0)
{
Expand Down
2 changes: 1 addition & 1 deletion cfgmgr/teammgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class TeamMgr : public Orch
void doLagMemberTask(Consumer &consumer);
void doPortUpdateTask(Consumer &consumer);

task_process_status addLag(const std::string &alias, int min_links, bool fall_back, bool fast_rate);
task_process_status addLag(const std::string &alias, int min_links, bool fall_back, bool fast_rate, bool static_lag);
bool removeLag(const std::string &alias);
task_process_status addLagMember(const std::string &lag, const std::string &member);
bool removeLagMember(const std::string &lag, const std::string &member);
Expand Down
20 changes: 19 additions & 1 deletion tests/mock_tests/teammgrd/teammgr_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,22 @@ namespace teammgr_ut
}
ASSERT_EQ(kill_cmd_called, 1);
}
}

TEST_F(TeamMgrTest, testProcessKilledAfterAddStaticFailure)
{
swss::TeamMgr teammgr(m_config_db.get(), m_app_db.get(), m_state_db.get(), cfg_lag_tables);
swss::Table cfg_lag_table = swss::Table(m_config_db.get(), CFG_LAG_TABLE_NAME);
cfg_lag_table.set("PortChannel1", { { "admin_status", "up" },
{ "mtu", "9100" },
{ "static", "true" } });
teammgr.addExistingData(&cfg_lag_table);
teammgr.doTask();
int kill_cmd_called = 0;
for (auto cmd : mockCallArgs){
if (cmd.find("kill -TERM 1234") != std::string::npos){
kill_cmd_called++;
}
}
ASSERT_EQ(kill_cmd_called, 1);
}
}
69 changes: 69 additions & 0 deletions tests/test_portchannel.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,75 @@ def test_portchannel_member_netdev_oper_status(self, dvs, testlog):
# wait for port-channel deletion
time.sleep(1)


def test_static_portchannel_member_netdev_oper_status(self, dvs, testlog):
config_db = swsscommon.DBConnector(swsscommon.CONFIG_DB, dvs.redis_sock, 0)
state_db = swsscommon.DBConnector(swsscommon.STATE_DB, dvs.redis_sock, 0)
app_db = swsscommon.DBConnector(swsscommon.APPL_DB, dvs.redis_sock, 0)

# create port-channel
tbl = swsscommon.Table(config_db, "PORTCHANNEL")
fvs = swsscommon.FieldValuePairs([("admin_status", "up"),("mtu", "9100"),("oper_status", "up"),("static", "true")])
tbl.set("PortChannel111", fvs)

# set port-channel oper status
tbl = swsscommon.ProducerStateTable(app_db, "LAG_TABLE")
fvs = swsscommon.FieldValuePairs([("admin_status", "up"),("mtu", "9100"),("oper_status", "up")])
tbl.set("PortChannel111", fvs)

# add members to port-channel
tbl = swsscommon.Table(config_db, "PORTCHANNEL_MEMBER")
fvs = swsscommon.FieldValuePairs([("NULL", "NULL")])
tbl.set("PortChannel111|Ethernet0", fvs)
tbl.set("PortChannel111|Ethernet4", fvs)

# wait for port-channel netdev creation
time.sleep(1)

# set netdev oper status
(exitcode, _) = dvs.runcmd("ip link set up dev Ethernet0")
assert exitcode == 0, "ip link set failed"

(exitcode, _) = dvs.runcmd("ip link set up dev Ethernet4")
assert exitcode == 0, "ip link set failed"

(exitcode, _) = dvs.runcmd("ip link set dev PortChannel111 carrier on")
assert exitcode == 0, "ip link set failed"

# verify port-channel members netdev oper status
tbl = swsscommon.Table(state_db, "PORT_TABLE")
status, fvs = tbl.get("Ethernet0")
assert status is True
fvs = dict(fvs)
assert fvs['netdev_oper_status'] == 'up'

status, fvs = tbl.get("Ethernet4")
assert status is True
fvs = dict(fvs)
assert fvs['netdev_oper_status'] == 'up'

# wait for port-channel netdev creation
time.sleep(1)

(exit_code, output) = dvs.runcmd("teamdctl " + "PortChannel111" + " state dump")
port_state_dump = json.loads(output)
runner_name = port_state_dump["setup"]["runner_name"]
assert runner_name == "roundrobin"


# remove port-channel members
tbl = swsscommon.Table(config_db, "PORTCHANNEL_MEMBER")
tbl._del("PortChannel111|Ethernet0")
tbl._del("PortChannel111|Ethernet4")

# remove port-channel
tbl = swsscommon.Table(config_db, "PORTCHANNEL")
tbl._del("PortChannel111")

# wait for port-channel deletion
time.sleep(1)


# Add Dummy always-pass test at end as workaroud
# for issue when Flaky fail on final test it invokes module tear-down before retrying
def test_nonflaky_dummy():
Expand Down
63 changes: 63 additions & 0 deletions tlm_teamd/values_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,47 @@ std::string ValuesStore::get_value(json_t * root, const std::string & path, Valu
throw std::runtime_error("Reach the end of the ValuesStore::get_value. Path=" + path);
}

///
/// Extract static lag values for LAG with name lag_name, from the parsed json tree with root, to the temporary storage
/// @param lag_name a name of the LAG
/// @param root a pointer to the parsed json tree
/// @param storage a reference to the temporary storage
///

void ValuesStore::extract_values_static(const std::string & lag_name, json_t * root, HashOfRecords & storage)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to have a separate function to extract values for static? This appears to be duplicate of the existing API. Can you clarify the difference?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

teamdctl state information for Static lag and Dynamic LACP are little different.

Dynamic LACP has the following fields along with setup
{
"ports": {
"Ethernet48": {
"ifinfo": {
"dev_addr": "00:e0:ec:cc:a8:de",
"dev_addr_len": 6,
"ifindex": 40,
"ifname": "Ethernet48"
},
"link": {
"duplex": "half",
"speed": 0,
"up": false
},
"link_watches": {
"list": {
"link_watch_0": {
"delay_down": 0,
"delay_up": 0,
"down_count": 5,
"name": "ethtool",
"up": false
}
},
"up": false
},
"runner": {
"actor_lacpdu_info": {
"key": 103,
"port": 49,
"port_priority": 255,
"state": 5,
"system": "00:e0:ec:cc:a8:de",
"system_priority": 65535
},
"aggregator": {
"id": 0,
"selected": false
},
"key": 103,
"partner_lacpdu_info": {
"key": 0,
"port": 0,
"port_priority": 0,
"state": 0,
"system": "00:00:00:00:00:00",
"system_priority": 0
},
"partner_retry_count": 3,
"prio": 255,
"selected": false,
"state": "disabled"
}
}
},
"runner": {
"active": true,
"enable_retry_count_feature": true,
"fallback": true,
"fast_rate": false,
"retry_count": 3,
"select_policy": "lacp_prio",
"sys_prio": 65535
},
"setup": {
"daemonized": true,
"dbus_enabled": false,
"debug_level": 1,
"kernel_team_mode_name": "loadbalance",
"pid": 297,
"pid_file": "/var/run/teamd/PortChannel03.pid",
"runner_name": "lacp",
"zmq_enabled": false
},

"team_device": {
"ifinfo": {
"dev_addr": "00:e0:ec:cc:a8:de",
"dev_addr_len": 6,
"ifindex": 50,
"ifname": "PortChannel03"
}
}
}

Static Lag will not contain the runner option (Dynamic LACP needs runner option to determine peer lacp link protocol information)

{
"ports": {
"Ethernet32": {
"ifinfo": {
"dev_addr": "00:e0:ec:cc:a8:de",
"dev_addr_len": 6,
"ifindex": 39,
"ifname": "Ethernet32"
},
"link": {
"duplex": "half",
"speed": 0,
"up": true
},
"link_watches": {
"list": {
"link_watch_0": {
"delay_down": 0,
"delay_up": 0,
"down_count": 3,
"name": "ethtool",
"up": true
}
},
"up": true
}
},
"Ethernet64": {
"ifinfo": {
"dev_addr": "00:e0:ec:cc:a8:de",
"dev_addr_len": 6,
"ifindex": 37,
"ifname": "Ethernet64"
},
"link": {
"duplex": "half",
"speed": 0,
"up": false
},
"link_watches": {
"list": {
"link_watch_0": {
"delay_down": 0,
"delay_up": 0,
"down_count": 3,
"name": "ethtool",
"up": false
}
},
"up": false
}
}
},
"setup": {
"daemonized": true,
"dbus_enabled": false,
"debug_level": 1,
"kernel_team_mode_name": "roundrobin",
"pid": 49,
"pid_file": "/var/run/teamd/PortChannel02.pid",
"runner_name": "roundrobin",
"zmq_enabled": false
},
"team_device": {
"ifinfo": {
"dev_addr": "00:e0:ec:cc:a8:de",
"dev_addr_len": 6,
"ifindex": 9,
"ifname": "PortChannel02"
}
}
}

Since Static doesnt have those runner information modifying existing LAG Member table done for LACP can create more collaterals, thats the reason for separating it out between Static and LACP.

{

const std::string key = "LAG_TABLE|" + lag_name;
Records lag_values;

for (const auto & p: m_lag_static_paths)
{
const auto & path = p.first;
const auto & type = p.second;
const auto & value = get_value(root, path, type);
lag_values.emplace(path, value);
}
storage.emplace(key, lag_values);

const auto & ports = get_ports(root);
for (const auto & port: ports)
{
const std::string key = "LAG_MEMBER_TABLE|" + lag_name + "|" + port;
Records member_values;
for (const auto & p: m_member_static_paths)
{
const auto & path = p.first;
const auto & type = p.second;
const std::string full_path = "ports." + port + "." + path;
const auto & value = get_value(root, full_path, type);
member_values.emplace(path, value);
}
storage.emplace(key, member_values);
}

return;
}

///
/// Extract values for LAG with name lag_name, from the parsed json tree with root, to the temporary storage
/// @param lag_name a name of the LAG
Expand All @@ -155,6 +196,28 @@ void ValuesStore::extract_values(const std::string & lag_name, json_t * root, Ha

const std::string key = "LAG_TABLE|" + lag_name;
Records lag_values;


bool is_static = false;

for (const auto & p: m_lag_static_paths)
{
const auto & path = p.first;
const auto & type = p.second;
if (path == "setup.runner_name") {
const auto & value = get_value(root, path, type);
if(value == "loadbalance" ||
value == "roundrobin")
is_static = true;
break;
}
}

if (is_static) {
extract_values_static(lag_name, root, storage);
return;
}

for (const auto & p: m_lag_paths)
{
const auto & path = p.first;
Expand Down
14 changes: 14 additions & 0 deletions tlm_teamd/values_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class ValuesStore
std::vector<std::string> update_storage(const HashOfRecords & storage);
void update_db(const HashOfRecords & storage, const std::vector<std::string> & keys_to_refresh);
void extract_values(const std::string & lag_name, json_t * root, HashOfRecords & storage);
void extract_values_static(const std::string & lag_name, json_t * root, HashOfRecords & storage);

HashOfRecords m_storage; // our main storage
const swss::DBConnector * m_db;
Expand Down Expand Up @@ -70,4 +71,17 @@ class ValuesStore
{ "runner.selected", ValuesStore::json_type::boolean },
{ "runner.state", ValuesStore::json_type::string },
};
const std::vector<std::pair<std::string, ValuesStore::json_type>> m_lag_static_paths = {
{ "setup.runner_name", ValuesStore::json_type::string },
{ "setup.kernel_team_mode_name", ValuesStore::json_type::string },
{ "setup.pid", ValuesStore::json_type::integer },
{ "team_device.ifinfo.dev_addr", ValuesStore::json_type::string },
{ "team_device.ifinfo.ifindex", ValuesStore::json_type::integer },
};
const std::vector<std::pair<std::string, ValuesStore::json_type>> m_member_static_paths = {
{ "ifinfo.dev_addr", ValuesStore::json_type::string },
{ "ifinfo.ifindex", ValuesStore::json_type::integer },
{ "link.up", ValuesStore::json_type::boolean },
{ "link_watches.list.link_watch_0.up", ValuesStore::json_type::boolean },
};
};
Loading