From f89008681fcc20d88a1341030f3d5739750fe3c4 Mon Sep 17 00:00:00 2001 From: Rajasekar Raja Date: Wed, 28 Aug 2024 12:11:16 -0700 Subject: [PATCH] bgpd: backpressure - Optimize EVPN L2VNI remote routes processing Anytime BGP gets a L2 VNI ADD from zebra, - Walking the entire global routing table per L2VNI is very expensive. - The next read (say of another VNI ADD) from the socket does not proceed unless this walk is complete. So for triggers where a bulk of L2VNI's are flapped, this results in huge output buffer FIFO growth spiking up the memory in zebra since bgp is slow/busy processing the first message. To avoid this, idea is to hookup the VPN off the bgp_master struct and maintain a VPN FIFO list which is processed later on, where we walk a chunk of VPNs and do the remote route install. Note: So far in the L3 backpressure cases(#15524), we have considered the fact that zebra is slow, and the buffer grows in the BGP. However this is the reverse i.e. BGP is very busy processing the first ZAPI message from zebra due to which the buffer grows huge in zebra and memory spikes up. Ticket :#3864372 Signed-off-by: Rajasekar Raja --- bgpd/bgp_evpn.c | 226 +++++++++++++++++++++++++++++++--------- bgpd/bgp_evpn.h | 1 + bgpd/bgp_evpn_private.h | 5 +- bgpd/bgp_zebra.c | 17 +++ bgpd/bgp_zebra.h | 1 + bgpd/bgpd.c | 35 +++++-- bgpd/bgpd.h | 1 + 7 files changed, 227 insertions(+), 59 deletions(-) diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c index f173bd01f20f..c8a089a72673 100644 --- a/bgpd/bgp_evpn.c +++ b/bgpd/bgp_evpn.c @@ -3979,30 +3979,81 @@ static int install_uninstall_routes_for_vrf(struct bgp *bgp_vrf, bool install) return 0; } +#define BGP_PROC_L2VNI_LIMIT 10 +static int install_evpn_remote_route_per_l2vni(struct bgp *bgp, struct bgp_path_info *pi, + const struct prefix_evpn *evp) +{ + int ret = 0; + uint8_t vni_iter = 0; + struct bgpevpn *t_vpn = NULL; + struct bgpevpn *t_vpn_next = NULL; + + for (t_vpn = zebra_l2_vni_first(&bm->zebra_l2_vni_head); + t_vpn && vni_iter < BGP_PROC_L2VNI_LIMIT; t_vpn = t_vpn_next) { + t_vpn_next = zebra_l2_vni_next(&bm->zebra_l2_vni_head, t_vpn); + vni_iter++; + /* + * Skip install/uninstall if the route entry is not needed to + * be imported into the VNI i.e. RTs dont match + */ + if (!is_route_matching_for_vni(bgp, t_vpn, pi)) + continue; + + ret = install_evpn_route_entry(bgp, t_vpn, evp, pi); + + if (ret) { + flog_err(EC_BGP_EVPN_FAIL, + "%u: Failed to install EVPN %s route in VNI %u during BP", + bgp->vrf_id, bgp_evpn_route_type_str[evp->prefix.route_type].str, + t_vpn->vni); + zebra_l2_vni_del(&bm->zebra_l2_vni_head, t_vpn); + + return ret; + } + } + + return 0; +} + /* * Install or uninstall routes of specified type that are appropriate for this * particular VNI. */ -static int install_uninstall_routes_for_vni(struct bgp *bgp, - struct bgpevpn *vpn, bool install) +int install_uninstall_routes_for_vni(struct bgp *bgp, struct bgpevpn *vpn, bool install) { afi_t afi; safi_t safi; struct bgp_dest *rd_dest, *dest; struct bgp_table *table; struct bgp_path_info *pi; - int ret; + int ret = 0; + uint8_t count = 0; + bool walk_fifo = false; afi = AFI_L2VPN; safi = SAFI_EVPN; - /* Walk entire global routing table and evaluate routes which could be + if (!bgp) { + walk_fifo = true; + bgp = bgp_get_evpn(); + if (!bgp) { + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("No BGP EVPN instance found..."); + + return -1; + } + } + + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("%s: Total %u L2VNI VPNs pending to be processed for remote route installation", + __func__, (uint32_t)zebra_l2_vni_count(&bm->zebra_l2_vni_head)); + /* + * Walk entire global routing table and evaluate routes which could be * imported into this VPN. Note that we cannot just look at the routes - * for - * the VNI's RD - remote routes applicable for this VNI could have any - * RD. + * for the VNI's RD - remote routes applicable for this VNI could have + * any RD. + * Note: EVPN routes are a 2-level table. */ - /* EVPN routes are a 2-level table. */ for (rd_dest = bgp_table_top(bgp->rib[afi][safi]); rd_dest; rd_dest = bgp_route_next(rd_dest)) { table = bgp_dest_get_bgp_table_info(rd_dest); @@ -4015,54 +4066,79 @@ static int install_uninstall_routes_for_vni(struct bgp *bgp, (const struct prefix_evpn *)bgp_dest_get_prefix( dest); - if (evp->prefix.route_type != BGP_EVPN_IMET_ROUTE && - evp->prefix.route_type != BGP_EVPN_AD_ROUTE && - evp->prefix.route_type != BGP_EVPN_MAC_IP_ROUTE) + /* Proceed only for IMET/AD/MAC_IP routes */ + switch (evp->prefix.route_type) { + case BGP_EVPN_IMET_ROUTE: + case BGP_EVPN_AD_ROUTE: + case BGP_EVPN_MAC_IP_ROUTE: + break; + default: continue; + } for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { - /* Consider "valid" remote routes applicable for - * this VNI. */ - if (!(CHECK_FLAG(pi->flags, BGP_PATH_VALID) - && pi->type == ZEBRA_ROUTE_BGP - && pi->sub_type == BGP_ROUTE_NORMAL)) - continue; - - if (!is_route_matching_for_vni(bgp, vpn, pi)) + /* + * Skip install/uninstall if + * - Not a valid remote routes + * - Install & evpn route matchesi macvrf SOO + */ + if (!(CHECK_FLAG(pi->flags, BGP_PATH_VALID) && + pi->type == ZEBRA_ROUTE_BGP && + pi->sub_type == BGP_ROUTE_NORMAL) || + (install && bgp_evpn_route_matches_macvrf_soo(pi, evp))) continue; - if (install) { - if (bgp_evpn_route_matches_macvrf_soo( - pi, evp)) + if (walk_fifo) { + ret = install_evpn_remote_route_per_l2vni(bgp, pi, evp); + if (ret) { + bgp_dest_unlock_node(rd_dest); + bgp_dest_unlock_node(dest); + return ret; + } + } else { + /* + * Skip install/uninstall if the route + * entry is not needed to be imported + * into the VNI i.e. RTs dont match + */ + if (!is_route_matching_for_vni(bgp, vpn, pi)) continue; - ret = install_evpn_route_entry(bgp, vpn, - evp, pi); - } else - ret = uninstall_evpn_route_entry( - bgp, vpn, evp, pi); - - if (ret) { - flog_err(EC_BGP_EVPN_FAIL, - "%u: Failed to %s EVPN %s route in VNI %u", - bgp->vrf_id, - install ? "install" - : "uninstall", - evp->prefix.route_type == - BGP_EVPN_MAC_IP_ROUTE - ? "MACIP" - : "IMET", - vpn->vni); - - bgp_dest_unlock_node(rd_dest); - bgp_dest_unlock_node(dest); - return ret; + if (install) + ret = install_evpn_route_entry(bgp, vpn, evp, pi); + else + ret = uninstall_evpn_route_entry(bgp, vpn, evp, pi); + + if (ret) { + flog_err(EC_BGP_EVPN_FAIL, + "%u: Failed to %s EVPN %s route in VNI %u", + bgp->vrf_id, + install ? "install" : "uninstall", + bgp_evpn_route_type_str[evp->prefix.route_type] + .str, + vpn->vni); + + bgp_dest_unlock_node(rd_dest); + bgp_dest_unlock_node(dest); + return ret; + } } } } } + if (walk_fifo) { + while (count < BGP_PROC_L2VNI_LIMIT) { + vpn = zebra_l2_vni_pop(&bm->zebra_l2_vni_head); + if (!vpn) + return 0; + + UNSET_FLAG(vpn->flags, VNI_FLAG_ADD); + count++; + } + } + return 0; } @@ -7024,6 +7100,45 @@ int bgp_evpn_local_l3vni_del(vni_t l3vni, vrf_id_t vrf_id) return 0; } +static void bgp_evpn_l2vni_remote_route_processing(struct bgpevpn *vpn) +{ + /* + * Anytime BGP gets a Bulk of L2 VNIs ADD/UPD from zebra, + * - Walking the entire global routing table per VNI is very expensive. + * - The next read (say of another VNI ADD/UPD) from the socket does + * not proceed unless this walk is complete. + * This results in huge output buffer FIFO growth spiking up the + * memory in zebra. + * + * To avoid this, idea is to hookup the VPN off the struct bgp_master + * and maintain a VPN FIFO list which is processed later on, where we + * walk a chunk of VPNs and do the remote route install. + */ + if (!CHECK_FLAG(vpn->flags, VNI_FLAG_ADD)) { + zebra_l2_vni_add_tail(&bm->zebra_l2_vni_head, vpn); + SET_FLAG(vpn->flags, VNI_FLAG_ADD); + } + + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("Scheduling L2VNI ADD to be processed later for VNI %u", vpn->vni); + + /* + * If there are no VNI's in the bgp VPN FIFO list i.e. an update + * for an already processed VNI comes in, schedule the remote + * route install immediately. + * + * In all other cases, it is ok to schedule the remote route install + * after a small sleep. This is to give benefit of doubt in case more + * L2VNI ADD events come. + */ + if (zebra_l2_vni_count(&bm->zebra_l2_vni_head)) + event_add_timer_msec(bm->master, bgp_zebra_process_remote_routes_for_l2vni, NULL, + 20, &bm->t_bgp_zebra_l2_vni); + else + event_add_event(bm->master, bgp_zebra_process_remote_routes_for_l2vni, NULL, 0, + &bm->t_bgp_zebra_l2_vni); +} + /* * When bgp instance goes down also clean up what might have been left over * from evpn. @@ -7047,6 +7162,10 @@ int bgp_evpn_local_vni_del(struct bgp *bgp, vni_t vni) if (!vpn) return 0; + /* Remove the VPN from the bgp VPN FIFO (if exists) */ + UNSET_FLAG(vpn->flags, VNI_FLAG_ADD); + zebra_l2_vni_del(&bm->zebra_l2_vni_head, vpn); + /* Remove all local EVPN routes and schedule for processing (to * withdraw from peers). */ @@ -7203,12 +7322,6 @@ int bgp_evpn_local_vni_add(struct bgp *bgp, vni_t vni, } } - /* If we have learnt and retained remote routes (VTEPs, MACs) for this - * VNI, - * install them. - */ - install_routes_for_vni(bgp, vpn); - /* If we are advertising gateway mac-ip It needs to be conveyed again to zebra */ bgp_zebra_advertise_gw_macip(bgp, vpn->advertise_gw_macip, vpn->vni); @@ -7216,6 +7329,8 @@ int bgp_evpn_local_vni_add(struct bgp *bgp, vni_t vni, /* advertise svi mac-ip knob to zebra */ bgp_zebra_advertise_svi_macip(bgp, vpn->advertise_svi_macip, vpn->vni); + bgp_evpn_l2vni_remote_route_processing(vpn); + return 0; } @@ -7245,8 +7360,17 @@ void bgp_evpn_flood_control_change(struct bgp *bgp) */ void bgp_evpn_cleanup_on_disable(struct bgp *bgp) { - hash_iterate(bgp->vnihash, (void (*)(struct hash_bucket *, - void *))cleanup_vni_on_disable, + struct bgpevpn *vpn = NULL; + uint32_t vni_count = zebra_l2_vni_count(&bm->zebra_l2_vni_head); + + /* Cleanup VNI FIFO list from this bgp instance */ + while (vni_count) { + vpn = zebra_l2_vni_pop(&bm->zebra_l2_vni_head); + UNSET_FLAG(vpn->flags, VNI_FLAG_ADD); + vni_count--; + } + + hash_iterate(bgp->vnihash, (void (*)(struct hash_bucket *, void *))cleanup_vni_on_disable, bgp); } diff --git a/bgpd/bgp_evpn.h b/bgpd/bgp_evpn.h index 1a333a5a09a3..75dde616ce78 100644 --- a/bgpd/bgp_evpn.h +++ b/bgpd/bgp_evpn.h @@ -200,4 +200,5 @@ bool bgp_evpn_skip_vrf_import_of_local_es(struct bgp *bgp_vrf, const struct pref int uninstall_evpn_route_entry_in_vrf(struct bgp *bgp_vrf, const struct prefix_evpn *evp, struct bgp_path_info *parent_pi); extern void bgp_zebra_evpn_pop_items_from_announce_fifo(struct bgpevpn *vpn); +extern int install_uninstall_routes_for_vni(struct bgp *bgp, struct bgpevpn *vpn, bool install); #endif /* _QUAGGA_BGP_EVPN_H */ diff --git a/bgpd/bgp_evpn_private.h b/bgpd/bgp_evpn_private.h index 65f17aa73d55..cc92807d42b4 100644 --- a/bgpd/bgp_evpn_private.h +++ b/bgpd/bgp_evpn_private.h @@ -62,8 +62,9 @@ struct bgpevpn { #define VNI_FLAG_RD_CFGD 0x4 /* RD is user configured. */ #define VNI_FLAG_IMPRT_CFGD 0x8 /* Import RT is user configured */ #define VNI_FLAG_EXPRT_CFGD 0x10 /* Export RT is user configured */ -#define VNI_FLAG_USE_TWO_LABELS 0x20 /* Attach both L2-VNI and L3-VNI if - needed for this VPN */ +/* Attach both L2-VNI and L3-VNI if needed for this VPN */ +#define VNI_FLAG_USE_TWO_LABELS 0x20 +#define VNI_FLAG_ADD 0x40 /* L2VNI Add */ struct bgp *bgp_vrf; /* back pointer to the vrf instance */ diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c index 688dfacaa0b6..1eb614385c06 100644 --- a/bgpd/bgp_zebra.c +++ b/bgpd/bgp_zebra.c @@ -3025,6 +3025,23 @@ static void bgp_zebra_connected(struct zclient *zclient) BGP_GR_ROUTER_DETECT_AND_SEND_CAPABILITY_TO_ZEBRA(bgp, bgp->peer); } +void bgp_zebra_process_remote_routes_for_l2vni(struct event *e) +{ + /* + * If we have learnt and retained remote routes (VTEPs, MACs) + * for this VNI, install them. + */ + install_uninstall_routes_for_vni(NULL, NULL, true); + + /* + * If there are VNIs still pending to be processed, schedule them + * after a small sleep so that CPU can be used for other purposes. + */ + if (zebra_l2_vni_count(&bm->zebra_l2_vni_head)) + event_add_timer_msec(bm->master, bgp_zebra_process_remote_routes_for_l2vni, NULL, + 20, &bm->t_bgp_zebra_l2_vni); +} + static int bgp_zebra_process_local_es_add(ZAPI_CALLBACK_ARGS) { esi_t esi; diff --git a/bgpd/bgp_zebra.h b/bgpd/bgp_zebra.h index 8deecba747b3..993d002998f2 100644 --- a/bgpd/bgp_zebra.h +++ b/bgpd/bgp_zebra.h @@ -135,4 +135,5 @@ extern void bgp_zebra_release_label_range(uint32_t start, uint32_t end); extern enum zclient_send_status bgp_zebra_withdraw_actual(struct bgp_dest *dest, struct bgp_path_info *info, struct bgp *bgp); +extern void bgp_zebra_process_remote_routes_for_l2vni(struct event *e); #endif /* _QUAGGA_BGP_ZEBRA_H */ diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c index 8b05ac880ec4..45c4229afa3c 100644 --- a/bgpd/bgpd.c +++ b/bgpd/bgpd.c @@ -3966,11 +3966,14 @@ int bgp_delete(struct bgp *bgp) afi_t afi; safi_t safi; int i; + uint32_t vni_count; + struct bgpevpn *vpn = NULL; struct bgp_dest *dest = NULL; struct bgp_dest *dest_next = NULL; struct bgp_table *dest_table = NULL; struct graceful_restart_info *gr_info; - uint32_t cnt_before, cnt_after; + uint32_t b_ann_cnt = 0, b_l2_cnt = 0; + uint32_t a_ann_cnt = 0, a_l2_cnt = 0; assert(bgp); @@ -3978,7 +3981,7 @@ int bgp_delete(struct bgp *bgp) * Iterate the pending dest list and remove all the dest pertaininig to * the bgp under delete. */ - cnt_before = zebra_announce_count(&bm->zebra_announce_head); + b_ann_cnt = zebra_announce_count(&bm->zebra_announce_head); for (dest = zebra_announce_first(&bm->zebra_announce_head); dest; dest = dest_next) { dest_next = zebra_announce_next(&bm->zebra_announce_head, dest); @@ -3990,10 +3993,28 @@ int bgp_delete(struct bgp *bgp) } } - cnt_after = zebra_announce_count(&bm->zebra_announce_head); - if (BGP_DEBUG(zebra, ZEBRA)) - zlog_debug("Zebra Announce Fifo cleanup count before %u and after %u during BGP %s deletion", - cnt_before, cnt_after, bgp->name_pretty); + /* + * Pop all VPNs yet to be processed for remote routes install if the + * bgp-evpn instance is getting deleted + */ + if (bgp == bgp_get_evpn()) { + b_l2_cnt = zebra_l2_vni_count(&bm->zebra_l2_vni_head); + vni_count = b_l2_cnt; + while (vni_count) { + vpn = zebra_l2_vni_pop(&bm->zebra_l2_vni_head); + UNSET_FLAG(vpn->flags, VNI_FLAG_ADD); + vni_count--; + } + } + + if (BGP_DEBUG(zebra, ZEBRA)) { + a_ann_cnt = zebra_announce_count(&bm->zebra_announce_head); + a_l2_cnt = zebra_l2_vni_count(&bm->zebra_l2_vni_head); + zlog_debug("FIFO Cleanup Count during BGP %s deletion :: " + "Zebra Announce - before %u after %u :: " + "BGP L2_VNI - before %u after %u", + bgp->name_pretty, b_ann_cnt, a_ann_cnt, b_l2_cnt, a_l2_cnt); + } bgp_soft_reconfig_table_task_cancel(bgp, NULL, NULL); @@ -8517,6 +8538,7 @@ void bgp_master_init(struct event_loop *master, const int buffer_size, bm->stalepath_time = BGP_DEFAULT_STALEPATH_TIME; bm->select_defer_time = BGP_DEFAULT_SELECT_DEFERRAL_TIME; bm->rib_stale_time = BGP_DEFAULT_RIB_STALE_TIME; + bm->t_bgp_zebra_l2_vni = NULL; bgp_mac_init(); /* init the rd id space. @@ -8764,6 +8786,7 @@ void bgp_terminate(void) EVENT_OFF(bm->t_bgp_sync_label_manager); EVENT_OFF(bm->t_bgp_start_label_manager); EVENT_OFF(bm->t_bgp_zebra_route); + EVENT_OFF(bm->t_bgp_zebra_l2_vni); bgp_mac_finish(); } diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h index 5835eaac6b57..139871e4825c 100644 --- a/bgpd/bgpd.h +++ b/bgpd/bgpd.h @@ -206,6 +206,7 @@ struct bgp_master { /* To preserve ordering of installations into zebra across all Vrfs */ struct zebra_announce_head zebra_announce_head; + struct event *t_bgp_zebra_l2_vni; /* To preserve ordering of processing of L2 VNIs in BGP */ struct zebra_l2_vni_head zebra_l2_vni_head;