Skip to content

Commit

Permalink
[chassis][voq]Collect counters for fabric links (#1944)
Browse files Browse the repository at this point in the history
Counters are port stats and queue stats. Currently only fabric asics
could be collected. J2 fabric counter collection doesn't work yet.
J2 fabric port counters fail to be collected due to logical port id
for fabric links is set up to 512 while SAI supports at most 256 ports.
J2 fabric queue counters are not supported by SAI at this moment (BCM
confirmed).

Signed-off-by: Maxime Lorrillere <[email protected]>

Signed-off-by: Maxime Lorrillere <[email protected]>
  • Loading branch information
mlorrillere authored Oct 5, 2022
1 parent df92fb7 commit 31c9321
Show file tree
Hide file tree
Showing 9 changed files with 249 additions and 38 deletions.
121 changes: 95 additions & 26 deletions orchagent/fabricportsorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,18 @@
#include "timer.h"

#define FABRIC_POLLING_INTERVAL_DEFAULT (30)
#define FABRIC_PORT_PREFIX "PORT"
#define FABRIC_PORT_ERROR 0
#define FABRIC_PORT_SUCCESS 1
#define FABRIC_PORT_STAT_COUNTER_FLEX_COUNTER_GROUP "FABRIC_PORT_STAT_COUNTER"
#define FABRIC_PORT_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS 10000
#define FABRIC_QUEUE_STAT_COUNTER_FLEX_COUNTER_GROUP "FABRIC_QUEUE_STAT_COUNTER"
#define FABRIC_QUEUE_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS 100000
#define FABRIC_PORT_TABLE "FABRIC_PORT_TABLE"

extern sai_object_id_t gSwitchId;
extern sai_switch_api_t *sai_switch_api;
extern sai_port_api_t *sai_port_api;
extern sai_queue_api_t *sai_queue_api;

const vector<sai_port_stat_t> port_stat_ids =
{
Expand All @@ -42,7 +43,8 @@ static const vector<sai_queue_stat_t> queue_stat_ids =
SAI_QUEUE_STAT_CURR_OCCUPANCY_LEVEL,
};

FabricPortsOrch::FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pri_t> &tableNames) :
FabricPortsOrch::FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pri_t> &tableNames,
bool fabricPortStatEnabled, bool fabricQueueStatEnabled) :
Orch(appl_db, tableNames),
port_stat_manager(FABRIC_PORT_STAT_COUNTER_FLEX_COUNTER_GROUP, StatsMode::READ,
FABRIC_PORT_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS, true),
Expand All @@ -55,14 +57,17 @@ FabricPortsOrch::FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pr
SWSS_LOG_NOTICE( "FabricPortsOrch constructor" );

m_state_db = shared_ptr<DBConnector>(new DBConnector("STATE_DB", 0));
m_stateTable = unique_ptr<Table>(new Table(m_state_db.get(), FABRIC_PORT_TABLE));
m_stateTable = unique_ptr<Table>(new Table(m_state_db.get(), APP_FABRIC_PORT_TABLE_NAME));

m_counter_db = shared_ptr<DBConnector>(new DBConnector("COUNTERS_DB", 0));
m_laneQueueCounterTable = unique_ptr<Table>(new Table(m_counter_db.get(), COUNTERS_QUEUE_NAME_MAP));
m_lanePortCounterTable = unique_ptr<Table>(new Table(m_counter_db.get(), COUNTERS_QUEUE_PORT_MAP));
m_portNameQueueCounterTable = unique_ptr<Table>(new Table(m_counter_db.get(), COUNTERS_FABRIC_QUEUE_NAME_MAP));
m_portNamePortCounterTable = unique_ptr<Table>(new Table(m_counter_db.get(), COUNTERS_FABRIC_PORT_NAME_MAP));

m_flex_db = shared_ptr<DBConnector>(new DBConnector("FLEX_COUNTER_DB", 0));
m_flexCounterTable = unique_ptr<ProducerTable>(new ProducerTable(m_flex_db.get(), FABRIC_PORT_TABLE));
m_flexCounterTable = unique_ptr<ProducerTable>(new ProducerTable(m_flex_db.get(), APP_FABRIC_PORT_TABLE_NAME));

m_fabricPortStatEnabled = fabricPortStatEnabled;
m_fabricQueueStatEnabled = fabricQueueStatEnabled;

getFabricPortList();

Expand Down Expand Up @@ -147,32 +152,96 @@ bool FabricPortsOrch::allPortsReady()

void FabricPortsOrch::generatePortStats()
{
// FIX_ME: This function installs flex counters for port stats
// on fabric ports for fabric asics and voq asics (that connect
// to fabric asics via fabric ports). These counters will be
// installed in FLEX_COUNTER_DB, and queried by syncd and updated
// to COUNTERS_DB.
// However, currently BCM SAI doesn't update its code to query
// port stats (metrics in list port_stat_ids) yet.
// Also, BCM sets too low value for "Max logical port count" (256),
// causing syncd to crash on voq asics that now include regular front
// panel ports, fabric ports, and multiple logical ports.
// So, this function will just do nothing for now, and we will readd
// code to install port stats counters when BCM completely supports.
if (!m_fabricPortStatEnabled) return;

SWSS_LOG_NOTICE("Generate fabric port stats");

vector<FieldValueTuple> portNamePortCounterMap;
for (auto p : m_fabricLanePortMap)
{
int lane = p.first;
sai_object_id_t port = p.second;

std::ostringstream portName;
portName << FABRIC_PORT_PREFIX << lane;
portNamePortCounterMap.emplace_back(portName.str(), sai_serialize_object_id(port));

// Install flex counters for port stats
std::unordered_set<std::string> counter_stats;
for (const auto& it: port_stat_ids)
{
counter_stats.emplace(sai_serialize_port_stat(it));
}
port_stat_manager.setCounterIdList(port, CounterType::PORT, counter_stats);
}
m_portNamePortCounterTable->set("", portNamePortCounterMap);
}

void FabricPortsOrch::generateQueueStats()
{
if (!m_fabricQueueStatEnabled) return;
if (m_isQueueStatsGenerated) return;
if (!m_getFabricPortListDone) return;

// FIX_ME: Similar to generatePortStats(), generateQueueStats() installs
// flex counters for queue stats on fabric ports for fabric asics and voq asics.
// However, currently BCM SAI doesn't fully support queue stats query.
// Query on queue type and index is not supported for fabric asics while
// voq asics are not completely supported.
// So, this function will just do nothing for now, and we will readd
// code to install queue stats counters when BCM completely supports.
SWSS_LOG_NOTICE("Generate queue map for fabric ports");

sai_status_t status;
sai_attribute_t attr;

for (auto p : m_fabricLanePortMap)
{
int lane = p.first;
sai_object_id_t port = p.second;

// Each serdes has some pipes (queues) for unicast and multicast.
// But normally fabric serdes uses only one pipe.
attr.id = SAI_PORT_ATTR_QOS_NUMBER_OF_QUEUES;
status = sai_port_api->get_port_attribute(port, 1, &attr);
if (status != SAI_STATUS_SUCCESS)
{
throw runtime_error("FabricPortsOrch get port queue number failure");
}
int num_queues = attr.value.u32;

if (num_queues > 0)
{
vector<sai_object_id_t> m_queue_ids;
m_queue_ids.resize(num_queues);

attr.id = SAI_PORT_ATTR_QOS_QUEUE_LIST;
attr.value.objlist.count = (uint32_t) num_queues;
attr.value.objlist.list = m_queue_ids.data();

status = sai_port_api->get_port_attribute(port, 1, &attr);
if (status != SAI_STATUS_SUCCESS)
{
throw runtime_error("FabricPortsOrch get port queue list failure");
}

// Maintain queue map and install flex counters for queue stats
vector<FieldValueTuple> portNameQueueMap;

// Fabric serdes queue type is SAI_QUEUE_TYPE_FABRIC_TX. Since we always
// maintain only one queue for fabric serdes, m_queue_ids size is 1.
// And so, there is no need to query SAI_QUEUE_ATTR_TYPE and SAI_QUEUE_ATTR_INDEX
// for queue. Actually, SAI does not support query these attributes on fabric serdes.
int queueIndex = 0;
std::ostringstream portName;
portName << FABRIC_PORT_PREFIX << lane << ":" << queueIndex;
const auto queue = sai_serialize_object_id(m_queue_ids[queueIndex]);
portNameQueueMap.emplace_back(portName.str(), queue);

// We collect queue counters like occupancy level
std::unordered_set<string> counter_stats;
for (const auto& it: queue_stat_ids)
{
counter_stats.emplace(sai_serialize_queue_stat(it));
}
queue_stat_manager.setCounterIdList(m_queue_ids[queueIndex], CounterType::QUEUE, counter_stats);

m_portNameQueueCounterTable->set("", portNameQueueMap);
}
}

m_isQueueStatsGenerated = true;
}
Expand All @@ -199,7 +268,7 @@ void FabricPortsOrch::updateFabricPortState()
int lane = p.first;
sai_object_id_t port = p.second;

string key = "PORT" + to_string(lane);
string key = FABRIC_PORT_PREFIX + to_string(lane);
std::vector<FieldValueTuple> values;
uint32_t remote_peer;
uint32_t remote_port;
Expand Down
10 changes: 7 additions & 3 deletions orchagent/fabricportsorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,22 @@
class FabricPortsOrch : public Orch, public Subject
{
public:
FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pri_t> &tableNames);
FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pri_t> &tableNames,
bool fabricPortStatEnabled=true, bool fabricQueueStatEnabled=true);
bool allPortsReady();
void generateQueueStats();

private:
bool m_fabricPortStatEnabled;
bool m_fabricQueueStatEnabled;

shared_ptr<DBConnector> m_state_db;
shared_ptr<DBConnector> m_counter_db;
shared_ptr<DBConnector> m_flex_db;

unique_ptr<Table> m_stateTable;
unique_ptr<Table> m_laneQueueCounterTable;
unique_ptr<Table> m_lanePortCounterTable;
unique_ptr<Table> m_portNameQueueCounterTable;
unique_ptr<Table> m_portNamePortCounterTable;
unique_ptr<ProducerTable> m_flexCounterTable;

swss::SelectableTimer *m_timer = nullptr;
Expand Down
3 changes: 3 additions & 0 deletions orchagent/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,9 @@ int main(int argc, char **argv)
if (gMySwitchType == "voq")
{
orchDaemon->setFabricEnabled(true);
// SAI doesn't fully support counters for non fabric asics
orchDaemon->setFabricPortStatEnabled(false);
orchDaemon->setFabricQueueStatEnabled(false);
}
}
else
Expand Down
2 changes: 1 addition & 1 deletion orchagent/orchdaemon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ bool OrchDaemon::init()
vector<table_name_with_pri_t> fabric_port_tables = {
// empty for now
};
gFabricPortsOrch = new FabricPortsOrch(m_applDb, fabric_port_tables);
gFabricPortsOrch = new FabricPortsOrch(m_applDb, fabric_port_tables, m_fabricPortStatEnabled, m_fabricQueueStatEnabled);
m_orchList.push_back(gFabricPortsOrch);
}

Expand Down
10 changes: 10 additions & 0 deletions orchagent/orchdaemon.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ class OrchDaemon
{
m_fabricEnabled = enabled;
}
void setFabricPortStatEnabled(bool enabled)
{
m_fabricPortStatEnabled = enabled;
}
void setFabricQueueStatEnabled(bool enabled)
{
m_fabricQueueStatEnabled = enabled;
}
void logRotate();
private:
DBConnector *m_applDb;
Expand All @@ -77,6 +85,8 @@ class OrchDaemon
DBConnector *m_chassisAppDb;

bool m_fabricEnabled = false;
bool m_fabricPortStatEnabled = true;
bool m_fabricQueueStatEnabled = true;

std::vector<Orch *> m_orchList;
Select *m_select;
Expand Down
40 changes: 32 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,12 @@ def random_string(size=4, chars=string.ascii_uppercase + string.digits):


class AsicDbValidator(DVSDatabase):
def __init__(self, db_id: int, connector: str):
def __init__(self, db_id: int, connector: str, switch_type: str):
DVSDatabase.__init__(self, db_id, connector)
self._wait_for_asic_db_to_initialize()
self._populate_default_asic_db_values()
self._generate_oid_to_interface_mapping()
if switch_type not in ['fabric']:
self._wait_for_asic_db_to_initialize()
self._populate_default_asic_db_values()
self._generate_oid_to_interface_mapping()

def _wait_for_asic_db_to_initialize(self) -> None:
"""Wait up to 30 seconds for the default fields to appear in ASIC DB."""
Expand Down Expand Up @@ -497,7 +498,9 @@ def _polling_function():
wait_for_result(_polling_function, service_polling_config)

def init_asic_db_validator(self) -> None:
self.asicdb = AsicDbValidator(self.ASIC_DB_ID, self.redis_sock)
self.get_config_db()
metadata = self.config_db.get_entry('DEVICE_METADATA|localhost', '')
self.asicdb = AsicDbValidator(self.ASIC_DB_ID, self.redis_sock, metadata.get("switch_type"))

def init_appl_db_validator(self) -> None:
self.appldb = ApplDbValidator(self.APPL_DB_ID, self.redis_sock)
Expand Down Expand Up @@ -526,11 +529,13 @@ def _polling_function():
port_table_keys = app_db.get_keys("PORT_TABLE")
return ("PortInitDone" in port_table_keys and "PortConfigDone" in port_table_keys, None)

wait_for_result(_polling_function, startup_polling_config)
if metadata.get('switch_type') not in ['fabric']:
wait_for_result(_polling_function, startup_polling_config)

# Verify that all ports have been created
asic_db = self.get_asic_db()
asic_db.wait_for_n_keys("ASIC_STATE:SAI_OBJECT_TYPE_PORT", num_ports + 1) # +1 CPU Port
if metadata.get('switch_type') not in ['fabric']:
asic_db = self.get_asic_db()
asic_db.wait_for_n_keys("ASIC_STATE:SAI_OBJECT_TYPE_PORT", num_ports + 1) # +1 CPU Port

# Verify that fabric ports are monitored in STATE_DB
if metadata.get('switch_type', 'npu') in ['voq', 'fabric']:
Expand Down Expand Up @@ -1802,6 +1807,25 @@ def dvs(request, manage_dvs) -> DockerVirtualSwitch:

return manage_dvs(log_path, dvs_env)

@pytest.yield_fixture(scope="module")
def vst(request):
vctns = request.config.getoption("--vctns")
topo = request.config.getoption("--topo")
forcedvs = request.config.getoption("--forcedvs")
keeptb = request.config.getoption("--keeptb")
imgname = request.config.getoption("--imgname")
max_cpu = request.config.getoption("--max_cpu")
log_path = vctns if vctns else request.module.__name__
dvs_env = getattr(request.module, "DVS_ENV", [])
if not topo:
# use ecmp topology as default
topo = "virtual_chassis/chassis_supervisor.json"
vct = DockerVirtualChassisTopology(vctns, imgname, keeptb, dvs_env, log_path, max_cpu,
forcedvs, topo)
yield vct
vct.get_logs(request.module.__name__)
vct.destroy()

@pytest.fixture(scope="module")
def vct(request):
vctns = request.config.getoption("--vctns")
Expand Down
Loading

0 comments on commit 31c9321

Please sign in to comment.