diff --git a/src/sonic-frr/patch/0030-zebra-backpressure-Zebra-push-back-on-Buffer-Stream-.patch b/src/sonic-frr/patch/0030-zebra-backpressure-Zebra-push-back-on-Buffer-Stream-.patch new file mode 100644 index 000000000000..0bebd1ea12ba --- /dev/null +++ b/src/sonic-frr/patch/0030-zebra-backpressure-Zebra-push-back-on-Buffer-Stream-.patch @@ -0,0 +1,174 @@ +From 7c711ff437985b23a4dd919a98b22b8ea14ef553 Mon Sep 17 00:00:00 2001 +From: Rajasekar Raja +Date: Mon, 12 Feb 2024 10:44:18 -0800 +Subject: [PATCH 02/11] zebra: backpressure - Zebra push back on Buffer/Stream + creation + +Currently, the way zebra works is it creates a pthread per client (BGP is +of interest in this case) and this thread loops itself in zserv_read() +to check for any incoming data. If there is any, it reads, +validates and adds it to the ibuf_fifo, signalling the main thread to +process the message. The main thread, when it gets a chance, processes +the message and invokes the function pointer registered in the header +command. (Ex: zserv_handlers). + +Finally, if all of this was successful, this task reschedules itself and +loops in zserv_read() again. + +However, if there are already items on the ibuf FIFO, that means zebra is +slow in processing. And with the current mechanism, if the zebra main +thread is busy, the ibuf FIFO keeps growing, holding up the memory. + +Show memory zebra: (Example: 15k streams hoarding ~160 MB of data) +--- qmem libfrr --- +Stream : 44 variable 3432352 15042 161243800 + +Fix: +Client IO Thread: (zserv_read) + - Stop doing the read events when we know there are X number of items + on the FIFO already. (X - zebra zapi-packets <1-10000> (Default-1000)) + + - Determine the number of items on the zserv->ibuf_fifo. Subtract this + from the work items and only pull the number of items off that would + take us to X items on the ibuf_fifo again. + + - If the number of items in the ibuf_fifo has reached the maximum + * either initially when zserv_read() is called, (or) + * when processing the remainders of the incoming buffer, + the client IO thread is woken by the zebra main. + +Main thread: (zserv_process_message) +After processing its items, the main thread always schedules a wakeup to +the client IO thread to read more items from the socket buffer. This way +we ensure + - Client IO thread always tries to read the socket buffer and add more + items to the ibuf_fifo (until the max limit) + - the hidden config change (zebra zapi-packets <>) is taken into account + +Ticket: #3390099 + +Signed-off-by: Donald Sharp +Signed-off-by: Rajasekar Raja + +diff --git a/zebra/zserv.c b/zebra/zserv.c +index 2024f34534..de6e404fc4 100644 +--- a/zebra/zserv.c ++++ b/zebra/zserv.c +@@ -318,6 +318,14 @@ zwrite_fail: + * this task reschedules itself. + * + * Any failure in any of these actions is handled by terminating the client. ++ * ++ * The client's input buffer ibuf_fifo can hold at most as many items as ++ * configured in packets_to_process. This way we are not filling up the FIFO ++ * more than the maximum when the zebra main is busy. If the fifo has space, ++ * we reschedule ourselves to read more. ++ * ++ * The main thread processes the items in ibuf_fifo and always signals the ++ * client IO thread.
+ */ + static void zserv_read(struct thread *thread) + { +@@ -325,15 +333,25 @@ static void zserv_read(struct thread *thread) + int sock; + size_t already; + struct stream_fifo *cache; +- uint32_t p2p_orig; +- +- uint32_t p2p; ++ uint32_t p2p; /* Temp p2p used to process */ ++ uint32_t p2p_orig; /* Configured p2p (Default-1000) */ ++ int p2p_avail; /* How much space is available for p2p */ + struct zmsghdr hdr; ++ size_t client_ibuf_fifo_cnt = stream_fifo_count_safe(client->ibuf_fifo); + + p2p_orig = atomic_load_explicit(&zrouter.packets_to_process, + memory_order_relaxed); ++ p2p_avail = p2p_orig - client_ibuf_fifo_cnt; ++ ++ /* ++ * Do nothing if ibuf_fifo count has reached its max limit. Otherwise ++ * proceed and reschedule ourselves if there is space in the ibuf_fifo. ++ */ ++ if (p2p_avail <= 0) ++ return; ++ ++ p2p = p2p_avail; + cache = stream_fifo_new(); +- p2p = p2p_orig; + sock = THREAD_FD(thread); + + while (p2p) { +@@ -433,7 +451,7 @@ static void zserv_read(struct thread *thread) + p2p--; + } + +- if (p2p < p2p_orig) { ++ if (p2p < (uint32_t)p2p_avail) { + uint64_t time_now = monotime(NULL); + + /* update session statistics */ +@@ -447,19 +465,23 @@ static void zserv_read(struct thread *thread) + while (cache->head) + stream_fifo_push(client->ibuf_fifo, + stream_fifo_pop(cache)); ++ /* Need to update count as main thread could have processed a few */ ++ client_ibuf_fifo_cnt = ++ stream_fifo_count_safe(client->ibuf_fifo); + } + + /* Schedule job to process those packets */ + zserv_event(client, ZSERV_PROCESS_MESSAGES); +- + } + + if (IS_ZEBRA_DEBUG_PACKET) +- zlog_debug("Read %d packets from client: %s", p2p_orig - p2p, +- zebra_route_string(client->proto)); ++ zlog_debug("Read %d packets from client: %s. Current ibuf fifo count: %zu. Conf P2p %d", ++ p2p_avail - p2p, zebra_route_string(client->proto), ++ client_ibuf_fifo_cnt, p2p_orig); + +- /* Reschedule ourselves */ +- zserv_client_event(client, ZSERV_CLIENT_READ); ++ /* Reschedule ourselves since we have space in ibuf_fifo */ ++ if (client_ibuf_fifo_cnt < p2p_orig) ++ zserv_client_event(client, ZSERV_CLIENT_READ); + + stream_fifo_free(cache); + +@@ -495,14 +517,20 @@ static void zserv_client_event(struct zserv *client, + * as the task argument. + * + * Each message is popped off the client's input queue and the action associated +- * with the message is executed. This proceeds until there are no more messages, +- * an error occurs, or the processing limit is reached. ++ * with the message is executed. This proceeds until an error occurs, or the ++ * processing limit is reached. + * + * The client's I/O thread can push at most zrouter.packets_to_process messages + * onto the input buffer before notifying us there are packets to read. As long + * as we always process zrouter.packets_to_process messages here, then we can + * rely on the read thread to handle queuing this task enough times to process + * everything on the input queue. ++ * ++ * After processing, the main thread always schedules a wakeup to the client ++ * IO thread to read more items from the socket buffer. This way we ensure ++ * - Client IO thread always tries to read the socket buffer and add more ++ * items to the ibuf_fifo (until max limit) ++ * - the hidden config change (zebra zapi-packets <>) is taken into account. + */
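The zserv_read() changes above implement the IO-thread half of the backpressure: read only enough packets to top the ibuf_fifo up to the configured limit, and re-arm the read event only while there is still space. A minimal standalone sketch of that arithmetic follows; read_budget, fifo_count and packets_to_process are illustrative names, not FRR symbols:

```c
/* Sketch of the zserv_read() throttling arithmetic; illustrative only. */
#include <stdio.h>

/* How many packets the IO thread may pull this pass (0 = back off). */
static int read_budget(unsigned fifo_count, unsigned packets_to_process)
{
	int avail = (int)packets_to_process - (int)fifo_count;

	/* FIFO already at/over the limit: do nothing until the main
	 * thread drains some items and wakes us again. */
	if (avail <= 0)
		return 0;

	return avail; /* only top the FIFO back up to the limit */
}

int main(void)
{
	/* Default limit is 1000 (zebra zapi-packets <1-10000>). */
	printf("%d\n", read_budget(0, 1000));    /* 1000: FIFO empty   */
	printf("%d\n", read_budget(900, 1000));  /* 100: partial space */
	printf("%d\n", read_budget(1000, 1000)); /* 0: back-pressure   */
	return 0;
}
```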
+ static void zserv_process_messages(struct thread *thread) + { +@@ -538,6 +566,9 @@ static void zserv_process_messages(struct thread *thread) + /* Reschedule ourselves if necessary */ + if (need_resched) + zserv_event(client, ZSERV_PROCESS_MESSAGES); ++ ++ /* Ensure to include the read socket in the select/poll/etc.. */ ++ zserv_client_event(client, ZSERV_CLIENT_READ); + } + + int zserv_send_message(struct zserv *client, struct stream *msg) +-- +2.17.1 + diff --git a/src/sonic-frr/patch/0031-bgpd-backpressure-Add-a-typesafe-list-for-Zebra-Anno.patch b/src/sonic-frr/patch/0031-bgpd-backpressure-Add-a-typesafe-list-for-Zebra-Anno.patch new file mode 100644 index 000000000000..21a2b90f44f4 --- /dev/null +++ b/src/sonic-frr/patch/0031-bgpd-backpressure-Add-a-typesafe-list-for-Zebra-Anno.patch @@ -0,0 +1,90 @@ +From 7796ce2bb6eb1650ae1bec41ab2f270807b33c62 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 25 Jan 2024 12:53:24 -0500 +Subject: [PATCH 03/11] bgpd: backpressure - Add a typesafe list for Zebra + Announcement + +Modify the bgp master to hold a typesafe list for bgp_dests that need +to be passed to zebra. + +Future commits will use this. + +Ticket: #3390099 + +Signed-off-by: Donald Sharp +Signed-off-by: Rajasekar Raja + +diff --git a/bgpd/bgp_main.c b/bgpd/bgp_main.c +index 90ae580bab..e28dde5d16 100644 +--- a/bgpd/bgp_main.c ++++ b/bgpd/bgp_main.c +@@ -214,6 +214,8 @@ static __attribute__((__noreturn__)) void bgp_exit(int status) + bgp_evpn_mh_finish(); + bgp_l3nhg_finish(); + ++ zebra_announce_fini(&bm->zebra_announce_head); ++ + /* reverse bgp_dump_init */ + bgp_dump_finish(); + +diff --git a/bgpd/bgp_table.h b/bgpd/bgp_table.h +index 121afc481f..d43bf86eb9 100644 +--- a/bgpd/bgp_table.h ++++ b/bgpd/bgp_table.h +@@ -101,6 +101,8 @@ struct bgp_node { + + STAILQ_ENTRY(bgp_dest) pq; + ++ struct zebra_announce_item zai; ++ + uint64_t version; + + mpls_label_t local_label; +@@ -121,6 +123,8 @@ struct bgp_node { + enum bgp_path_selection_reason reason; + }; + ++DECLARE_LIST(zebra_announce, struct bgp_dest, zai); ++ + extern void bgp_delete_listnode(struct bgp_dest *dest); + /* + * bgp_table_iter_t +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index 023047050b..392423e028 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -8017,6 +8017,8 @@ void bgp_master_init(struct thread_master *master, const int buffer_size, + memset(&bgp_master, 0, sizeof(bgp_master)); + + bm = &bgp_master; ++ ++ zebra_announce_init(&bm->zebra_announce_head); + bm->bgp = list_new(); + bm->listen_sockets = list_new(); + bm->port = BGP_PORT_DEFAULT; +diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h +index 72b5b50fb4..55f53bf9d3 100644 +--- a/bgpd/bgpd.h ++++ b/bgpd/bgpd.h +@@ -32,6 +32,8 @@ + #include "srv6.h" + #include "iana_afi.h" + ++PREDECL_LIST(zebra_announce); ++ + /* For union sockunion. */
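For readers unfamiliar with FRR's typesafe containers: the PREDECL_LIST/DECLARE_LIST pair above generates a FIFO whose linkage (the zai member) lives inside the element, plus init/fini/add_tail/pop/count accessors. A hand-rolled miniature of the same intrusive-list idea, purely illustrative (stand-in types, not lib/typesafe.h):

```c
/* Miniature intrusive FIFO, mimicking what DECLARE_LIST generates. */
#include <assert.h>
#include <stddef.h>

struct dest {               /* stand-in for struct bgp_dest            */
	struct dest *zai_next;  /* stand-in for struct zebra_announce_item */
	int id;
};

struct announce_head {      /* stand-in for struct zebra_announce_head */
	struct dest *first, *last;
	size_t count;
};

static void announce_add_tail(struct announce_head *h, struct dest *d)
{
	d->zai_next = NULL;
	if (h->last)
		h->last->zai_next = d;
	else
		h->first = d;
	h->last = d;
	h->count++;
}

static struct dest *announce_pop(struct announce_head *h)
{
	struct dest *d = h->first;

	if (!d)
		return NULL;
	h->first = d->zai_next;
	if (!h->first)
		h->last = NULL;
	h->count--;
	return d;
}

int main(void)
{
	struct announce_head head = { 0 };
	struct dest a = { .id = 1 }, b = { .id = 2 };

	announce_add_tail(&head, &a);
	announce_add_tail(&head, &b);
	assert(announce_pop(&head)->id == 1); /* FIFO order preserved */
	assert(head.count == 1);
	return 0;
}
```

Because the linkage is embedded in the element, queueing a dest needs no allocation, which matters when thousands of prefixes are queued at once.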
+ #include "queue.h" + #include "sockunion.h" +@@ -180,6 +182,9 @@ struct bgp_master { + uint32_t inq_limit; + uint32_t outq_limit; + ++ /* To preserve ordering of installations into zebra across all Vrfs */ ++ struct zebra_announce_head zebra_announce_head; ++ + QOBJ_FIELDS; + }; + DECLARE_QOBJ_TYPE(bgp_master); +-- +2.17.1 + diff --git a/src/sonic-frr/patch/0032-bgpd-fix-flushing-ipv6-flowspec-entries-when-peering.patch b/src/sonic-frr/patch/0032-bgpd-fix-flushing-ipv6-flowspec-entries-when-peering.patch new file mode 100644 index 000000000000..bfefccceff10 --- /dev/null +++ b/src/sonic-frr/patch/0032-bgpd-fix-flushing-ipv6-flowspec-entries-when-peering.patch @@ -0,0 +1,158 @@ +From 69e38aa82f325129ebad4535be5d834c599b5c0b Mon Sep 17 00:00:00 2001 +From: Philippe Guibert +Date: Wed, 7 Feb 2024 22:34:34 +0100 +Subject: [PATCH 04/11] bgpd: fix flushing ipv6 flowspec entries when peering + stops + +When a BGP flowspec peering stops, the BGP RIB entries for IPv6 +flowspec entries are removed, but not the ZEBRA RIB IPv6 entries. + +Actually, when bgp_zebra_withdraw() is called, only +the AFI_IP parameter is passed to the bgp_pbr_update_entry() function +in charge of the Flowspec add/delete in zebra. Fix this by passing +the AFI parameter to the bgp_zebra_withdraw() function. + +Note that using topotest does not show the problem, as the +flowspec driver code is not present and was refused. Without that, +routes are not installed, and cannot be uninstalled. + +Fixes: 529efa234655 ("bgpd: allow flowspec entries to be announced to zebra") +Link: https://github.com/FRRouting/frr/pull/2025 + +Signed-off-by: Philippe Guibert + +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index fbff57634a..455cd6cdbb 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -3312,7 +3312,8 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + */ + if (old_select && + is_route_parent_evpn(old_select)) +- bgp_zebra_withdraw(p, old_select, bgp, safi); ++ bgp_zebra_withdraw(p, old_select, bgp, afi, ++ safi); + + bgp_zebra_announce(dest, p, new_select, bgp, afi, safi); + } else { +@@ -3322,7 +3323,8 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + || old_select->sub_type == BGP_ROUTE_AGGREGATE + || old_select->sub_type == BGP_ROUTE_IMPORTED)) + +- bgp_zebra_withdraw(p, old_select, bgp, safi); ++ bgp_zebra_withdraw(p, old_select, bgp, afi, ++ safi); + } + } + +@@ -4201,7 +4203,7 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, + if (pi && pi->attr->rmap_table_id != new_attr.rmap_table_id) { + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED)) + /* remove from RIB previous entry */ +- bgp_zebra_withdraw(p, pi, bgp, safi); ++ bgp_zebra_withdraw(p, pi, bgp, afi, safi); + } + + if (peer->sort == BGP_PEER_EBGP) { +@@ -5841,7 +5843,7 @@ bool bgp_inbound_policy_exists(struct peer *peer, struct bgp_filter *filter) + } + + static void bgp_cleanup_table(struct bgp *bgp, struct bgp_table *table, +- safi_t safi) ++ afi_t afi, safi_t safi) + { + struct bgp_dest *dest; + struct bgp_path_info *pi; +@@ -5865,7 +5867,8 @@ static void bgp_cleanup_table(struct bgp *bgp, struct bgp_table *table, + || pi->sub_type == BGP_ROUTE_IMPORTED)) { + + if (bgp_fibupd_safi(safi)) +- bgp_zebra_withdraw(p, pi, bgp, safi); ++ bgp_zebra_withdraw(p, pi, bgp, afi, ++ safi); + } + + bgp_path_info_reap(dest, pi); +@@ -5882,7 +5885,7 @@ void bgp_cleanup_routes(struct bgp *bgp) + for (afi = AFI_IP; afi < AFI_MAX; ++afi) { + if (afi == AFI_L2VPN) + continue;
+- bgp_cleanup_table(bgp, bgp->rib[afi][SAFI_UNICAST], ++ bgp_cleanup_table(bgp, bgp->rib[afi][SAFI_UNICAST], afi, + SAFI_UNICAST); + /* + * VPN and ENCAP and EVPN tables are two-level (RD is top level) +@@ -5894,7 +5897,7 @@ void bgp_cleanup_routes(struct bgp *bgp) + dest = bgp_route_next(dest)) { + table = bgp_dest_get_bgp_table_info(dest); + if (table != NULL) { +- bgp_cleanup_table(bgp, table, safi); ++ bgp_cleanup_table(bgp, table, afi, safi); + bgp_table_finish(&table); + bgp_dest_set_bgp_table_info(dest, NULL); + bgp_dest_unlock_node(dest); +@@ -5905,7 +5908,7 @@ void bgp_cleanup_routes(struct bgp *bgp) + dest = bgp_route_next(dest)) { + table = bgp_dest_get_bgp_table_info(dest); + if (table != NULL) { +- bgp_cleanup_table(bgp, table, safi); ++ bgp_cleanup_table(bgp, table, afi, safi); + bgp_table_finish(&table); + bgp_dest_set_bgp_table_info(dest, NULL); + bgp_dest_unlock_node(dest); +@@ -5917,7 +5920,7 @@ void bgp_cleanup_routes(struct bgp *bgp) + dest = bgp_route_next(dest)) { + table = bgp_dest_get_bgp_table_info(dest); + if (table != NULL) { +- bgp_cleanup_table(bgp, table, SAFI_EVPN); ++ bgp_cleanup_table(bgp, table, afi, SAFI_EVPN); + bgp_table_finish(&table); + bgp_dest_set_bgp_table_info(dest, NULL); + bgp_dest_unlock_node(dest); +diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c +index ff79746b4c..69240a3b83 100644 +--- a/bgpd/bgp_zebra.c ++++ b/bgpd/bgp_zebra.c +@@ -1761,7 +1761,7 @@ void bgp_zebra_announce_table_all_subtypes(struct bgp *bgp, afi_t afi, + } + + void bgp_zebra_withdraw(const struct prefix *p, struct bgp_path_info *info, +- struct bgp *bgp, safi_t safi) ++ struct bgp *bgp, afi_t afi, safi_t safi) + { + struct zapi_route api; + struct peer *peer; +@@ -1780,7 +1780,7 @@ void bgp_zebra_withdraw(const struct prefix *p, struct bgp_path_info *info, + + if (safi == SAFI_FLOWSPEC) { + peer = info->peer; +- bgp_pbr_update_entry(peer->bgp, p, info, AFI_IP, safi, false); ++ bgp_pbr_update_entry(peer->bgp, p, info, afi, safi, false); + return; + } + +@@ -1821,7 +1821,7 @@ void bgp_zebra_withdraw_table_all_subtypes(struct bgp *bgp, afi_t afi, safi_t sa + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED) + && (pi->type == ZEBRA_ROUTE_BGP)) + bgp_zebra_withdraw(bgp_dest_get_prefix(dest), +- pi, bgp, safi); ++ pi, bgp, afi, safi); + } + } + } +diff --git a/bgpd/bgp_zebra.h b/bgpd/bgp_zebra.h +index 0a41069411..a5fe8d7ace 100644 +--- a/bgpd/bgp_zebra.h ++++ b/bgpd/bgp_zebra.h +@@ -49,7 +49,7 @@ extern void bgp_zebra_announce(struct bgp_dest *dest, const struct prefix *p, + extern void bgp_zebra_announce_table(struct bgp *bgp, afi_t afi, safi_t safi); + extern void bgp_zebra_withdraw(const struct prefix *p, + struct bgp_path_info *path, struct bgp *bgp, +- safi_t safi); ++ afi_t afi, safi_t safi); + + /* Announce routes of any bgp subtype of a table to zebra */ + extern void bgp_zebra_announce_table_all_subtypes(struct bgp *bgp, afi_t afi, +-- +2.17.1 + diff --git a/src/sonic-frr/patch/0033-bgpd-backpressure-cleanup-bgp_zebra_XX-func-args.patch b/src/sonic-frr/patch/0033-bgpd-backpressure-cleanup-bgp_zebra_XX-func-args.patch new file mode 100644 index 000000000000..16383dc95caa --- /dev/null +++ b/src/sonic-frr/patch/0033-bgpd-backpressure-cleanup-bgp_zebra_XX-func-args.patch @@ -0,0 +1,234 @@ +From 679ad9ee5f3c15570d697506183d37aa29f6ebf2 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 25 Jan 2024 13:07:37 -0500 +Subject: [PATCH 05/11] bgpd: backpressure - cleanup bgp_zebra_XX func args + +Since installing/withdrawing routes into zebra is going to be changed +around to 
be dest based in a list, + - Retrieve the afi/safi to use based upon the dest's afi/safi + instead of passing it in. + - Prefix is known by the dest. Remove this arg as well + +Ticket: #3390099 + +Signed-off-by: Donald Sharp +Signed-off-by: Rajasekar Raja + +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index 455cd6cdbb..d19f27110e 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -3214,8 +3214,8 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + || new_select->sub_type + == BGP_ROUTE_IMPORTED)) + +- bgp_zebra_announce(dest, p, old_select, +- bgp, afi, safi); ++ bgp_zebra_announce(dest, old_select, ++ bgp); + } + } + +@@ -3312,10 +3312,9 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + */ + if (old_select && + is_route_parent_evpn(old_select)) +- bgp_zebra_withdraw(p, old_select, bgp, afi, +- safi); ++ bgp_zebra_withdraw(dest, old_select, bgp); + +- bgp_zebra_announce(dest, p, new_select, bgp, afi, safi); ++ bgp_zebra_announce(dest, new_select, bgp); + } else { + /* Withdraw the route from the kernel. */ + if (old_select && old_select->type == ZEBRA_ROUTE_BGP +@@ -3323,8 +3322,7 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + || old_select->sub_type == BGP_ROUTE_AGGREGATE + || old_select->sub_type == BGP_ROUTE_IMPORTED)) + +- bgp_zebra_withdraw(p, old_select, bgp, afi, +- safi); ++ bgp_zebra_withdraw(dest, old_select, bgp); + } + } + +@@ -4203,7 +4201,7 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, + if (pi && pi->attr->rmap_table_id != new_attr.rmap_table_id) { + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED)) + /* remove from RIB previous entry */ +- bgp_zebra_withdraw(p, pi, bgp, afi, safi); ++ bgp_zebra_withdraw(dest, pi, bgp); + } + + if (peer->sort == BGP_PEER_EBGP) { +@@ -5867,8 +5865,7 @@ static void bgp_cleanup_table(struct bgp *bgp, struct bgp_table *table, + || pi->sub_type == BGP_ROUTE_IMPORTED)) { + + if (bgp_fibupd_safi(safi)) +- bgp_zebra_withdraw(p, pi, bgp, afi, +- safi); ++ bgp_zebra_withdraw(dest, pi, bgp); + } + + bgp_path_info_reap(dest, pi); +diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c +index 69240a3b83..920df835a4 100644 +--- a/bgpd/bgp_zebra.c ++++ b/bgpd/bgp_zebra.c +@@ -1292,9 +1292,8 @@ static bool bgp_zebra_use_nhop_weighted(struct bgp *bgp, struct attr *attr, + return true; + } + +-void bgp_zebra_announce(struct bgp_dest *dest, const struct prefix *p, +- struct bgp_path_info *info, struct bgp *bgp, afi_t afi, +- safi_t safi) ++void bgp_zebra_announce(struct bgp_dest *dest, struct bgp_path_info *info, ++ struct bgp *bgp) + { + struct zapi_route api = { 0 }; + struct zapi_nexthop *api_nh; +@@ -1321,6 +1320,8 @@ void bgp_zebra_announce(struct bgp_dest *dest, const struct prefix *p, + uint32_t ttl = 0; + uint32_t bos = 0; + uint32_t exp = 0; ++ struct bgp_table *table = bgp_dest_table(dest); ++ const struct prefix *p = bgp_dest_get_prefix(dest); + + /* + * BGP is installing this route and bgp has been configured +@@ -1339,9 +1340,9 @@ void bgp_zebra_announce(struct bgp_dest *dest, const struct prefix *p, + if (bgp->main_zebra_update_hold) + return; + +- if (safi == SAFI_FLOWSPEC) { +- bgp_pbr_update_entry(bgp, bgp_dest_get_prefix(dest), info, afi, +- safi, true); ++ if (table->safi == SAFI_FLOWSPEC) { ++ bgp_pbr_update_entry(bgp, p, info, table->afi, table->safi, ++ true); + return; + } + +@@ -1354,7 +1355,7 @@ void bgp_zebra_announce(struct bgp_dest *dest, const struct prefix *p, + /* Make Zebra API structure. 
*/ + api.vrf_id = bgp->vrf_id; + api.type = ZEBRA_ROUTE_BGP; +- api.safi = safi; ++ api.safi = table->safi; + api.prefix = *p; + SET_FLAG(api.message, ZAPI_MESSAGE_NEXTHOP); + +@@ -1458,12 +1459,13 @@ void bgp_zebra_announce(struct bgp_dest *dest, const struct prefix *p, + } + } + +- if (bgp->table_map[afi][safi].name) { ++ if (bgp->table_map[table->afi][table->safi].name) { + /* Copy info and attributes, so the route-map + apply doesn't modify the BGP route info. */ + local_attr = *mpinfo->attr; + mpinfo_cp->attr = &local_attr; +- if (!bgp_table_map_apply(bgp->table_map[afi][safi].map, ++ if (!bgp_table_map_apply(bgp->table_map[table->afi] ++ [table->safi].map, + p, mpinfo_cp)) + continue; + +@@ -1619,7 +1621,7 @@ void bgp_zebra_announce(struct bgp_dest *dest, const struct prefix *p, + api.tag = tag; + } + +- distance = bgp_distance_apply(p, info, afi, safi, bgp); ++ distance = bgp_distance_apply(p, info, table->afi, table->safi, bgp); + if (distance) { + SET_FLAG(api.message, ZAPI_MESSAGE_DISTANCE); + api.distance = distance; +@@ -1731,9 +1733,7 @@ void bgp_zebra_announce_table(struct bgp *bgp, afi_t afi, safi_t safi) + && (pi->sub_type == BGP_ROUTE_NORMAL + || pi->sub_type == BGP_ROUTE_IMPORTED))) + +- bgp_zebra_announce(dest, +- bgp_dest_get_prefix(dest), +- pi, bgp, afi, safi); ++ bgp_zebra_announce(dest, pi, bgp); + } + + /* Announce routes of any bgp subtype of a table to zebra */ +@@ -1755,16 +1755,16 @@ void bgp_zebra_announce_table_all_subtypes(struct bgp *bgp, afi_t afi, + for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED) && + pi->type == ZEBRA_ROUTE_BGP) +- bgp_zebra_announce(dest, +- bgp_dest_get_prefix(dest), +- pi, bgp, afi, safi); ++ bgp_zebra_announce(dest, pi, bgp); + } + +-void bgp_zebra_withdraw(const struct prefix *p, struct bgp_path_info *info, +- struct bgp *bgp, afi_t afi, safi_t safi) ++void bgp_zebra_withdraw(struct bgp_dest *dest, struct bgp_path_info *info, ++ struct bgp *bgp) + { + struct zapi_route api; + struct peer *peer; ++ struct bgp_table *table = bgp_dest_table(dest); ++ const struct prefix *p = bgp_dest_get_prefix(dest); + + /* + * If we are withdrawing the route, we don't need to have this +@@ -1778,16 +1778,17 @@ void bgp_zebra_withdraw(const struct prefix *p, struct bgp_path_info *info, + if (!bgp_install_info_to_zebra(bgp)) + return; + +- if (safi == SAFI_FLOWSPEC) { ++ if (table->safi == SAFI_FLOWSPEC) { + peer = info->peer; +- bgp_pbr_update_entry(peer->bgp, p, info, afi, safi, false); ++ bgp_pbr_update_entry(peer->bgp, p, info, table->afi, ++ table->safi, false); + return; + } + + memset(&api, 0, sizeof(api)); + api.vrf_id = bgp->vrf_id; + api.type = ZEBRA_ROUTE_BGP; +- api.safi = safi; ++ api.safi = table->safi; + api.prefix = *p; + + if (info->attr->rmap_table_id) { +@@ -1820,8 +1821,7 @@ void bgp_zebra_withdraw_table_all_subtypes(struct bgp *bgp, afi_t afi, safi_t sa + for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED) + && (pi->type == ZEBRA_ROUTE_BGP)) +- bgp_zebra_withdraw(bgp_dest_get_prefix(dest), +- pi, bgp, afi, safi); ++ bgp_zebra_withdraw(dest, pi, bgp); + } + } + } +diff --git a/bgpd/bgp_zebra.h b/bgpd/bgp_zebra.h +index a5fe8d7ace..b77e423f8f 100644 +--- a/bgpd/bgp_zebra.h ++++ b/bgpd/bgp_zebra.h +@@ -43,13 +43,11 @@ extern void bgp_zebra_destroy(void); + extern int bgp_zebra_get_table_range(uint32_t chunk_size, + uint32_t *start, uint32_t *end); + extern int bgp_if_update_all(void); +-extern void 
bgp_zebra_announce(struct bgp_dest *dest, const struct prefix *p, +- struct bgp_path_info *path, struct bgp *bgp, +- afi_t afi, safi_t safi); ++extern void bgp_zebra_announce(struct bgp_dest *dest, ++ struct bgp_path_info *path, struct bgp *bgp); + extern void bgp_zebra_announce_table(struct bgp *bgp, afi_t afi, safi_t safi); +-extern void bgp_zebra_withdraw(const struct prefix *p, +- struct bgp_path_info *path, struct bgp *bgp, +- afi_t afi, safi_t safi); ++extern void bgp_zebra_withdraw(struct bgp_dest *dest, ++ struct bgp_path_info *path, struct bgp *bgp); + + /* Announce routes of any bgp subtype of a table to zebra */ + extern void bgp_zebra_announce_table_all_subtypes(struct bgp *bgp, afi_t afi, +-- +2.17.1 + diff --git a/src/sonic-frr/patch/0034-gpd-backpressure-Handle-BGP-Zebra-Install-evt-Creat.patch b/src/sonic-frr/patch/0034-gpd-backpressure-Handle-BGP-Zebra-Install-evt-Creat.patch new file mode 100644 index 000000000000..724cf1297a59 --- /dev/null +++ b/src/sonic-frr/patch/0034-gpd-backpressure-Handle-BGP-Zebra-Install-evt-Creat.patch @@ -0,0 +1,512 @@ +From 6d5604a9315801e9380c11357d663ad6537ed8ab Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Fri, 26 Jan 2024 14:48:53 -0500 +Subject: [PATCH 06/11] bgpd : backpressure - Handle BGP-Zebra Install evt + Creation + +BGP is now keeping a list of dests with the dest having a pointer +to the bgp_path_info that it will be working on. + +1) When bgp receives a prefix, process it, add the bgp_dest of the +prefix into the new Fifo list if not present, update the flags (Ex: +earlier if the prefix was advertised and now it is withdrawn), +increment the ref_count and DO NOT advertise the install/withdraw +to zebra yet. + +2) Schedule an event to wake up to invoke the new function which will +walk the list one by one and install/withdraw the routes into zebra. + a) if BUFFER_EMPTY, process the next item on the list + b) if BUFFER_PENDING, bail out and the callback in + zclient_flush_data() will invoke the same function when BUFFER_EMPTY + +Changes: + - rename old bgp_zebra_announce to bgp_zebra_announce_actual + - rename old bgp_zebra_withdraw to bgp_zebra_withdraw_actual + - Handle new fifo list cleanup in bgp_exit() + - New funcs: bgp_handle_route_announcements_to_zebra() and + bgp_zebra_route_install() + - Define a callback function to invoke + bgp_handle_route_announcements_to_zebra() when BUFFER_EMPTY in + zclient_flush_data() + +The current change deals with bgp installing routes via +bgp_process_main_one() + +Ticket: #3390099 + +Signed-off-by: Donald Sharp +Signed-off-by: Rajasekar Raja + +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index d19f27110e..c29442d96c 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -3214,8 +3214,8 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + || new_select->sub_type + == BGP_ROUTE_IMPORTED)) + +- bgp_zebra_announce(dest, old_select, +- bgp); ++ bgp_zebra_route_install( ++ dest, old_select, bgp, true); + } + } + +@@ -3312,9 +3312,10 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + */ + if (old_select && + is_route_parent_evpn(old_select)) +- bgp_zebra_withdraw(dest, old_select, bgp); ++ bgp_zebra_route_install(dest, old_select, bgp, ++ false); + +- bgp_zebra_announce(dest, new_select, bgp); ++ bgp_zebra_route_install(dest, new_select, bgp, true); + } else { + /* Withdraw the route from the kernel. 
*/ + if (old_select && old_select->type == ZEBRA_ROUTE_BGP +@@ -3322,7 +3323,8 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + || old_select->sub_type == BGP_ROUTE_AGGREGATE + || old_select->sub_type == BGP_ROUTE_IMPORTED)) + +- bgp_zebra_withdraw(dest, old_select, bgp); ++ bgp_zebra_route_install(dest, old_select, bgp, ++ false); + } + } + +@@ -4201,7 +4203,7 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, + if (pi && pi->attr->rmap_table_id != new_attr.rmap_table_id) { + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED)) + /* remove from RIB previous entry */ +- bgp_zebra_withdraw(dest, pi, bgp); ++ bgp_zebra_route_install(dest, pi, bgp, false); + } + + if (peer->sort == BGP_PEER_EBGP) { +@@ -5865,7 +5867,8 @@ static void bgp_cleanup_table(struct bgp *bgp, struct bgp_table *table, + || pi->sub_type == BGP_ROUTE_IMPORTED)) { + + if (bgp_fibupd_safi(safi)) +- bgp_zebra_withdraw(dest, pi, bgp); ++ bgp_zebra_withdraw_actual(dest, pi, ++ bgp); + } + + bgp_path_info_reap(dest, pi); +diff --git a/bgpd/bgp_table.h b/bgpd/bgp_table.h +index d43bf86eb9..45d61f8dfd 100644 +--- a/bgpd/bgp_table.h ++++ b/bgpd/bgp_table.h +@@ -102,6 +102,7 @@ struct bgp_node { + STAILQ_ENTRY(bgp_dest) pq; + + struct zebra_announce_item zai; ++ struct bgp_path_info *za_bgp_pi; + + uint64_t version; + +@@ -117,6 +118,8 @@ struct bgp_node { + #define BGP_NODE_FIB_INSTALLED (1 << 6) + #define BGP_NODE_LABEL_REQUESTED (1 << 7) + #define BGP_NODE_SOFT_RECONFIG (1 << 8) ++#define BGP_NODE_SCHEDULE_FOR_INSTALL (1 << 10) ++#define BGP_NODE_SCHEDULE_FOR_DELETE (1 << 11) + + struct bgp_addpath_node_data tx_addpath; + +diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c +index 920df835a4..1162941ef1 100644 +--- a/bgpd/bgp_zebra.c ++++ b/bgpd/bgp_zebra.c +@@ -1292,8 +1292,9 @@ static bool bgp_zebra_use_nhop_weighted(struct bgp *bgp, struct attr *attr, + return true; + } + +-void bgp_zebra_announce(struct bgp_dest *dest, struct bgp_path_info *info, +- struct bgp *bgp) ++static enum zclient_send_status ++bgp_zebra_announce_actual(struct bgp_dest *dest, struct bgp_path_info *info, ++ struct bgp *bgp) + { + struct zapi_route api = { 0 }; + struct zapi_nexthop *api_nh; +@@ -1323,27 +1324,10 @@ void bgp_zebra_announce(struct bgp_dest *dest, struct bgp_path_info *info, + struct bgp_table *table = bgp_dest_table(dest); + const struct prefix *p = bgp_dest_get_prefix(dest); + +- /* +- * BGP is installing this route and bgp has been configured +- * to suppress announcements until the route has been installed +- * let's set the fact that we expect this route to be installed +- */ +- if (BGP_SUPPRESS_FIB_ENABLED(bgp)) +- SET_FLAG(dest->flags, BGP_NODE_FIB_INSTALL_PENDING); +- +- /* Don't try to install if we're not connected to Zebra or Zebra doesn't +- * know of this instance. +- */ +- if (!bgp_install_info_to_zebra(bgp)) +- return; +- +- if (bgp->main_zebra_update_hold) +- return; +- + if (table->safi == SAFI_FLOWSPEC) { + bgp_pbr_update_entry(bgp, p, info, table->afi, table->safi, + true); +- return; ++ return ZCLIENT_SEND_SUCCESS; + } + + /* +@@ -1704,10 +1688,11 @@ void bgp_zebra_announce(struct bgp_dest *dest, struct bgp_path_info *info, + zlog_debug("%s: %pFX: announcing to zebra (recursion %sset)", + __func__, p, (recursion_flag ? "" : "NOT ")); + } +- zclient_route_send(is_add ? ZEBRA_ROUTE_ADD : ZEBRA_ROUTE_DELETE, +- zclient, &api); ++ return zclient_route_send(is_add ? 
ZEBRA_ROUTE_ADD : ZEBRA_ROUTE_DELETE, ++ zclient, &api); + } + ++ + /* Announce all routes of a table to zebra */ + void bgp_zebra_announce_table(struct bgp *bgp, afi_t afi, safi_t safi) + { +@@ -1733,7 +1718,7 @@ void bgp_zebra_announce_table(struct bgp *bgp, afi_t afi, safi_t safi) + && (pi->sub_type == BGP_ROUTE_NORMAL + || pi->sub_type == BGP_ROUTE_IMPORTED))) + +- bgp_zebra_announce(dest, pi, bgp); ++ bgp_zebra_route_install(dest, pi, bgp, true); + } + + /* Announce routes of any bgp subtype of a table to zebra */ +@@ -1755,34 +1740,23 @@ void bgp_zebra_announce_table_all_subtypes(struct bgp *bgp, afi_t afi, + for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED) && + pi->type == ZEBRA_ROUTE_BGP) +- bgp_zebra_announce(dest, pi, bgp); ++ bgp_zebra_route_install(dest, pi, bgp, true); + } + +-void bgp_zebra_withdraw(struct bgp_dest *dest, struct bgp_path_info *info, +- struct bgp *bgp) ++enum zclient_send_status bgp_zebra_withdraw_actual(struct bgp_dest *dest, ++ struct bgp_path_info *info, ++ struct bgp *bgp) + { + struct zapi_route api; + struct peer *peer; + struct bgp_table *table = bgp_dest_table(dest); + const struct prefix *p = bgp_dest_get_prefix(dest); + +- /* +- * If we are withdrawing the route, we don't need to have this +- * flag set. So unset it. +- */ +- UNSET_FLAG(info->net->flags, BGP_NODE_FIB_INSTALL_PENDING); +- +- /* Don't try to install if we're not connected to Zebra or Zebra doesn't +- * know of this instance. +- */ +- if (!bgp_install_info_to_zebra(bgp)) +- return; +- + if (table->safi == SAFI_FLOWSPEC) { + peer = info->peer; + bgp_pbr_update_entry(peer->bgp, p, info, table->afi, + table->safi, false); +- return; ++ return ZCLIENT_SEND_SUCCESS; + } + + memset(&api, 0, sizeof(api)); +@@ -1800,7 +1774,172 @@ void bgp_zebra_withdraw(struct bgp_dest *dest, struct bgp_path_info *info, + zlog_debug("Tx route delete VRF %u %pFX", bgp->vrf_id, + &api.prefix); + +- zclient_route_send(ZEBRA_ROUTE_DELETE, zclient, &api); ++ return zclient_route_send(ZEBRA_ROUTE_DELETE, zclient, &api); ++} ++ ++/* ++ * Walk the new Fifo list one by one and invoke bgp_zebra_announce/withdraw ++ * to install/withdraw the routes to zebra. ++ * ++ * If status = ZCLIENT_SEND_SUCCESS (Buffer empty), i.e. Zebra is free to ++ * receive more incoming data, then pick the next item on the list and ++ * continue processing. ++ * ++ * If status = ZCLIENT_SEND_BUFFERED (Buffer pending), i.e. Zebra is busy, ++ * break and bail out of the function because at some point when zebra ++ * is free, a callback is triggered which in turn calls this same function and ++ * continues processing items on the list. ++ */ ++#define ZEBRA_ANNOUNCEMENTS_LIMIT 1000 ++static void bgp_handle_route_announcements_to_zebra(struct thread *e) ++{ ++ uint32_t count = 0; ++ struct bgp_dest *dest = NULL; ++ struct bgp_table *table = NULL; ++ enum zclient_send_status status = ZCLIENT_SEND_SUCCESS; ++ bool install; ++ ++ while (count < ZEBRA_ANNOUNCEMENTS_LIMIT) { ++ dest = zebra_announce_pop(&bm->zebra_announce_head); ++ ++ if (!dest) ++ break; ++ ++ table = bgp_dest_table(dest); ++ install = ++ CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL); ++ ++ if (BGP_DEBUG(zebra, ZEBRA)) ++ zlog_debug( ++ "BGP %s route %pBD(%s) with dest %p and flags 0x%x to zebra", ++ install ? 
"announcing" : "withdrawing", dest, ++ table->bgp->name_pretty, dest, dest->flags); ++ ++ if (install) { ++ status = bgp_zebra_announce_actual( ++ dest, dest->za_bgp_pi, table->bgp); ++ UNSET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL); ++ } else { ++ status = bgp_zebra_withdraw_actual( ++ dest, dest->za_bgp_pi, table->bgp); ++ UNSET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_DELETE); ++ } ++ ++ bgp_path_info_unlock(dest->za_bgp_pi); ++ dest->za_bgp_pi = NULL; ++ bgp_dest_unlock_node(dest); ++ ++ if (status == ZCLIENT_SEND_BUFFERED) ++ break; ++ ++ count++; ++ } ++ ++ if (status != ZCLIENT_SEND_BUFFERED && ++ zebra_announce_count(&bm->zebra_announce_head)) ++ thread_add_event(bm->master, ++ bgp_handle_route_announcements_to_zebra, NULL, ++ 0, &bm->t_bgp_zebra_route); ++} ++ ++/* ++ * Callback function invoked when zclient_flush_data() receives a BUFFER_EMPTY ++ * i.e. zebra is free to receive more incoming data. ++ */ ++static void bgp_zebra_buffer_write_ready(void) ++{ ++ bgp_handle_route_announcements_to_zebra(NULL); ++} ++ ++/* ++ * BGP is now keeping a list of dests with the dest having a pointer ++ * to the bgp_path_info that it will be working on. ++ * Here is the sequence of events that should happen: ++ * ++ * Current State New State Action ++ * ------------- --------- ------ ++ * ---- Install Place dest on list, save pi, mark ++ * as going to be installed ++ * ---- Withdrawal Place dest on list, save pi, mark ++ * as going to be deleted ++ * ++ * Install Install Leave dest on list, release old pi, ++ * save new pi, mark as going to be ++ * Installed ++ * Install Withdrawal Leave dest on list, release old pi, ++ * save new pi, mark as going to be ++ * withdrawan, remove install flag ++ * ++ * Withdrawal Install Special case, send withdrawal immediately ++ * Leave dest on list, release old pi, ++ * save new pi, mark as going to be ++ * installed. ++ * Withdrawal Withdrawal Leave dest on list, release old pi, ++ * save new pi, mark as going to be ++ * withdrawn. ++ */ ++void bgp_zebra_route_install(struct bgp_dest *dest, struct bgp_path_info *info, ++ struct bgp *bgp, bool install) ++{ ++ /* ++ * BGP is installing this route and bgp has been configured ++ * to suppress announcements until the route has been installed ++ * let's set the fact that we expect this route to be installed ++ */ ++ if (install) { ++ if (BGP_SUPPRESS_FIB_ENABLED(bgp)) ++ SET_FLAG(dest->flags, BGP_NODE_FIB_INSTALL_PENDING); ++ ++ if (bgp->main_zebra_update_hold) ++ return; ++ } else { ++ UNSET_FLAG(dest->flags, BGP_NODE_FIB_INSTALL_PENDING); ++ } ++ ++ /* ++ * Don't try to install if we're not connected to Zebra or Zebra doesn't ++ * know of this instance. 
++ */ ++ if (!bgp_install_info_to_zebra(bgp)) ++ return; ++ ++ if (!CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL) && ++ !CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_DELETE)) { ++ zebra_announce_add_tail(&bm->zebra_announce_head, dest); ++ /* ++ * If neither flag is set and za_bgp_pi is not set then it is a ++ * bug ++ */ ++ assert(!dest->za_bgp_pi); ++ bgp_path_info_lock(info); ++ bgp_dest_lock_node(dest); ++ dest->za_bgp_pi = info; ++ } else if (CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL)) { ++ assert(dest->za_bgp_pi); ++ bgp_path_info_unlock(dest->za_bgp_pi); ++ bgp_path_info_lock(info); ++ dest->za_bgp_pi = info; ++ } else if (CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_DELETE)) { ++ assert(dest->za_bgp_pi); ++ if (install) ++ bgp_zebra_withdraw_actual(dest, dest->za_bgp_pi, bgp); ++ ++ bgp_path_info_unlock(dest->za_bgp_pi); ++ bgp_path_info_lock(info); ++ dest->za_bgp_pi = info; ++ } ++ ++ if (install) { ++ UNSET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_DELETE); ++ SET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL); ++ } else { ++ UNSET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL); ++ SET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_DELETE); ++ } ++ ++ thread_add_event(bm->master, bgp_handle_route_announcements_to_zebra, ++ NULL, 0, &bm->t_bgp_zebra_route); + } + + /* Withdraw all entries in a BGP instances RIB table from Zebra */ +@@ -1821,7 +1960,7 @@ void bgp_zebra_withdraw_table_all_subtypes(struct bgp *bgp, afi_t afi, safi_t sa + for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED) + && (pi->type == ZEBRA_ROUTE_BGP)) +- bgp_zebra_withdraw(dest, pi, bgp); ++ bgp_zebra_route_install(dest, pi, bgp, false); + } + } + } +@@ -3470,6 +3609,7 @@ void bgp_zebra_init(struct thread_master *master, unsigned short instance) + zclient = zclient_new(master, &zclient_options_default, bgp_handlers, + array_size(bgp_handlers)); + zclient_init(zclient, ZEBRA_ROUTE_BGP, 0, &bgpd_privs); ++ zclient->zebra_buffer_write_ready = bgp_zebra_buffer_write_ready; + zclient->zebra_connected = bgp_zebra_connected; + zclient->instance = instance; + } +diff --git a/bgpd/bgp_zebra.h b/bgpd/bgp_zebra.h +index b77e423f8f..45fcf7f514 100644 +--- a/bgpd/bgp_zebra.h ++++ b/bgpd/bgp_zebra.h +@@ -43,11 +43,10 @@ extern void bgp_zebra_destroy(void); + extern int bgp_zebra_get_table_range(uint32_t chunk_size, + uint32_t *start, uint32_t *end); + extern int bgp_if_update_all(void); +-extern void bgp_zebra_announce(struct bgp_dest *dest, +- struct bgp_path_info *path, struct bgp *bgp); ++extern void bgp_zebra_route_install(struct bgp_dest *dest, ++ struct bgp_path_info *path, struct bgp *bgp, ++ bool install); + extern void bgp_zebra_announce_table(struct bgp *bgp, afi_t afi, safi_t safi); +-extern void bgp_zebra_withdraw(struct bgp_dest *dest, +- struct bgp_path_info *path, struct bgp *bgp); + + /* Announce routes of any bgp subtype of a table to zebra */ + extern void bgp_zebra_announce_table_all_subtypes(struct bgp *bgp, afi_t afi, +@@ -131,4 +130,7 @@ extern int bgp_zebra_update(afi_t afi, safi_t safi, vrf_id_t vrf_id, int type); + extern int bgp_zebra_stale_timer_update(struct bgp *bgp); + extern int bgp_zebra_srv6_manager_get_locator_chunk(const char *name); + extern int bgp_zebra_srv6_manager_release_locator_chunk(const char *name); ++extern enum zclient_send_status ++bgp_zebra_withdraw_actual(struct bgp_dest *dest, struct bgp_path_info *info, ++ struct bgp *bgp); + #endif /* _QUAGGA_BGP_ZEBRA_H */ 
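Taken together, bgp_zebra_route_install() and bgp_handle_route_announcements_to_zebra() above form a producer/consumer pair: route_install only marks the dest and queues it, while the drain loop stops either after ZEBRA_ANNOUNCEMENTS_LIMIT sends or as soon as a send returns ZCLIENT_SEND_BUFFERED. A compact sketch of the flag transitions described in the state table (flag names mirror the patch; the path-info lock/unlock bookkeeping is elided):

```c
/* Sketch of the install/withdraw scheduling flags; illustrative only. */
#include <stdio.h>

#define F_INSTALL (1 << 0) /* models BGP_NODE_SCHEDULE_FOR_INSTALL */
#define F_DELETE  (1 << 1) /* models BGP_NODE_SCHEDULE_FOR_DELETE  */

static unsigned schedule(unsigned flags, int install)
{
	/* The "Withdrawal -> Install" special case is handled by the
	 * caller, which sends the pending withdraw immediately before
	 * re-marking the dest; only the flag flip is modeled here. */
	if (install) {
		flags &= ~F_DELETE;
		flags |= F_INSTALL;
	} else {
		flags &= ~F_INSTALL;
		flags |= F_DELETE;
	}
	return flags; /* exactly one of the two flags is now set */
}

int main(void)
{
	unsigned f = 0;

	f = schedule(f, 1); /* new route: queued for install     */
	f = schedule(f, 0); /* superseded: flipped to withdrawal */
	printf("install=%d delete=%d\n", !!(f & F_INSTALL), !!(f & F_DELETE));
	return 0;
}
```

Keeping exactly one pending intent per dest is what lets the FIFO hold each dest at most once, regardless of how many times best-path selection flaps while zebra is busy.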
+diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index 392423e028..da133d71c1 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -3688,10 +3688,20 @@ int bgp_delete(struct bgp *bgp) + afi_t afi; + safi_t safi; + int i; ++ struct bgp_dest *dest = NULL; + struct graceful_restart_info *gr_info; + + assert(bgp); + ++ while (zebra_announce_count(&bm->zebra_announce_head)) { ++ dest = zebra_announce_pop(&bm->zebra_announce_head); ++ if (dest->za_bgp_pi->peer->bgp == bgp) { ++ bgp_path_info_unlock(dest->za_bgp_pi); ++ bgp_dest_unlock_node(dest); ++ } else ++ zebra_announce_add_tail(&bm->zebra_announce_head, dest); ++ } ++ + bgp_soft_reconfig_table_task_cancel(bgp, NULL, NULL); + + /* make sure we withdraw any exported routes */ +@@ -8035,6 +8045,7 @@ void bgp_master_init(struct thread_master *master, const int buffer_size, + bm->tcp_dscp = IPTOS_PREC_INTERNETCONTROL; + bm->inq_limit = BM_DEFAULT_Q_LIMIT; + bm->outq_limit = BM_DEFAULT_Q_LIMIT; ++ bm->t_bgp_zebra_route = NULL; + + bgp_mac_init(); + /* init the rd id space. +@@ -8278,6 +8289,7 @@ void bgp_terminate(void) + list_delete(&bm->listen_sockets); + + THREAD_OFF(bm->t_rmap_update); ++ THREAD_OFF(bm->t_bgp_zebra_route); + + bgp_mac_finish(); + } +diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h +index 55f53bf9d3..bdf31f5161 100644 +--- a/bgpd/bgpd.h ++++ b/bgpd/bgpd.h +@@ -182,6 +182,8 @@ struct bgp_master { + uint32_t inq_limit; + uint32_t outq_limit; + ++ struct thread *t_bgp_zebra_route; ++ + /* To preserve ordering of installations into zebra across all Vrfs */ + struct zebra_announce_head zebra_announce_head; + +diff --git a/lib/zclient.c b/lib/zclient.c +index 0082b21485..c48c1c6ee4 100644 +--- a/lib/zclient.c ++++ b/lib/zclient.c +@@ -285,6 +285,7 @@ static void zclient_flush_data(struct thread *thread) + zclient->sock, &zclient->t_write); + break; + case BUFFER_EMPTY: ++ /* Currently only Sharpd and Bgpd have callbacks defined */ + if (zclient->zebra_buffer_write_ready) + (*zclient->zebra_buffer_write_ready)(); + break; +-- +2.17.1 + diff --git a/src/sonic-frr/patch/0035-bgpd-backpressure-Handle-BGP-Zebra-EPVN-Install-evt-.patch b/src/sonic-frr/patch/0035-bgpd-backpressure-Handle-BGP-Zebra-EPVN-Install-evt-.patch new file mode 100644 index 000000000000..e83526b6e8ef --- /dev/null +++ b/src/sonic-frr/patch/0035-bgpd-backpressure-Handle-BGP-Zebra-EPVN-Install-evt-.patch @@ -0,0 +1,995 @@ +From 84f7778808b7fee771f26c3cae46292ef85f06c0 Mon Sep 17 00:00:00 2001 +From: Rajasekar Raja +Date: Thu, 15 Feb 2024 11:23:51 -0800 +Subject: [PATCH 07/11] bgpd : backpressure - Handle BGP-Zebra(EPVN) Install + evt Creation + +The current changes deal with EVPN route installation to zebra. + +In evpn_route_select_install() we invoke evpn_zebra_install/uninstall, +which call zclient_send_message(). + +This is a continuation of code changes (similar to +ccfe452763d16c432fa81fd20e805bec819b345e) but to handle the EVPN part +of the code. + +Ticket: #3390099 + +Signed-off-by: Rajasekar Raja + +diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c +index 2b2cfa0f4c..622fd6afd2 100644 +--- a/bgpd/bgp_evpn.c ++++ b/bgpd/bgp_evpn.c +@@ -863,11 +863,10 @@ struct bgp_dest *bgp_evpn_vni_node_lookup(const struct bgpevpn *vpn, + /* + * Add (update) or delete MACIP from zebra. */
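The hunks below convert the EVPN send helpers from a bare int return to enum zclient_send_status, so that a buffered send can pause the drain loop until the BUFFER_EMPTY callback fires. A self-contained sketch of that plumbing; the enum values mirror lib/zclient.h, while send_one() is a hypothetical stand-in for helpers like bgp_zebra_send_remote_macip():

```c
/* Sketch of propagating zclient send status up a call chain. */
#include <stdio.h>

enum zclient_send_status {
	ZCLIENT_SEND_FAILURE = -1,
	ZCLIENT_SEND_SUCCESS = 0,
	ZCLIENT_SEND_BUFFERED = 1,
};

/* Hypothetical sender: pretend the socket buffer fills after 3 messages. */
static enum zclient_send_status send_one(int n)
{
	return n < 3 ? ZCLIENT_SEND_SUCCESS : ZCLIENT_SEND_BUFFERED;
}

int main(void)
{
	enum zclient_send_status st = ZCLIENT_SEND_SUCCESS;
	int n;

	for (n = 0; n < 10; n++) {
		st = send_one(n);
		if (st == ZCLIENT_SEND_BUFFERED)
			break; /* wait for the write-ready callback to resume */
	}
	printf("stopped at %d, status %d\n", n, st);
	return 0;
}
```

Returning the status (instead of swallowing it, as the old void/int helpers did) is what makes the EVPN path participate in the same backpressure scheme as the unicast path from patch 0034.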
+-static int bgp_zebra_send_remote_macip(struct bgp *bgp, struct bgpevpn *vpn, +- const struct prefix_evpn *p, +- const struct ethaddr *mac, +- struct in_addr remote_vtep_ip, int add, +- uint8_t flags, uint32_t seq, esi_t *esi) ++static enum zclient_send_status bgp_zebra_send_remote_macip( ++ struct bgp *bgp, struct bgpevpn *vpn, const struct prefix_evpn *p, ++ const struct ethaddr *mac, struct in_addr remote_vtep_ip, int add, ++ uint8_t flags, uint32_t seq, esi_t *esi) + { + struct stream *s; + uint16_t ipa_len; + bool esi_valid; + + /* Check socket. */ +- if (!zclient || zclient->sock < 0) +- return 0; ++ if (!zclient || zclient->sock < 0) { ++ if (BGP_DEBUG(zebra, ZEBRA)) ++ zlog_debug("%s: No zclient or zclient->sock exists", ++ __func__); ++ return ZCLIENT_SEND_SUCCESS; ++ } + + /* Don't try to register if Zebra doesn't know of this instance. */ + if (!IS_BGP_INST_KNOWN_TO_ZEBRA(bgp)) { +@@ -884,7 +887,7 @@ static int bgp_zebra_send_remote_macip(struct bgp *bgp, struct bgpevpn *vpn, + zlog_debug( + "%s: No zebra instance to talk to, not installing remote macip", + __func__); +- return 0; ++ return ZCLIENT_SEND_SUCCESS; + } + + if (!esi) +@@ -956,15 +959,20 @@ static int bgp_zebra_send_remote_macip(struct bgp *bgp, struct bgpevpn *vpn, + /* + * Add (update) or delete remote VTEP from zebra. + */ +-static int bgp_zebra_send_remote_vtep(struct bgp *bgp, struct bgpevpn *vpn, +- const struct prefix_evpn *p, +- int flood_control, int add) ++static enum zclient_send_status ++bgp_zebra_send_remote_vtep(struct bgp *bgp, struct bgpevpn *vpn, ++ const struct prefix_evpn *p, int flood_control, ++ int add) + { + struct stream *s; + + /* Check socket. */ +- if (!zclient || zclient->sock < 0) +- return 0; ++ if (!zclient || zclient->sock < 0) { ++ if (BGP_DEBUG(zebra, ZEBRA)) ++ zlog_debug("%s: No zclient or zclient->sock exists", ++ __func__); ++ return ZCLIENT_SEND_SUCCESS; ++ } + + /* Don't try to register if Zebra doesn't know of this instance. */ + if (!IS_BGP_INST_KNOWN_TO_ZEBRA(bgp)) { +@@ -972,7 +980,7 @@ static int bgp_zebra_send_remote_vtep(struct bgp *bgp, struct bgpevpn *vpn, + zlog_debug( + "%s: No zebra instance to talk to, not installing remote vtep", + __func__); +- return 0; ++ return ZCLIENT_SEND_SUCCESS; + } + + s = zclient->obuf; +@@ -989,7 +997,7 @@ static int bgp_zebra_send_remote_vtep(struct bgp *bgp, struct bgpevpn *vpn, + EC_BGP_VTEP_INVALID, + "Bad remote IP when trying to %s remote VTEP for VNI %u", + add ? "ADD" : "DEL", vpn->vni); +- return -1; ++ return ZCLIENT_SEND_FAILURE; + } + stream_putl(s, flood_control); + +@@ -1222,14 +1230,15 @@ static void add_mac_mobility_to_attr(uint32_t seq_num, struct attr *attr) + } + + /* Install EVPN route into zebra. 
*/ +-static int evpn_zebra_install(struct bgp *bgp, struct bgpevpn *vpn, +- const struct prefix_evpn *p, +- struct bgp_path_info *pi) ++enum zclient_send_status evpn_zebra_install(struct bgp *bgp, ++ struct bgpevpn *vpn, ++ const struct prefix_evpn *p, ++ struct bgp_path_info *pi) + { +- int ret; + uint8_t flags; + int flood_control; + uint32_t seq; ++ enum zclient_send_status ret = ZCLIENT_SEND_SUCCESS; + + if (p->prefix.route_type == BGP_EVPN_MAC_IP_ROUTE) { + flags = 0; +@@ -1302,6 +1311,7 @@ static int evpn_zebra_install(struct bgp *bgp, struct bgpevpn *vpn, + flood_control = VXLAN_FLOOD_DISABLED; + break; + } ++ + ret = bgp_zebra_send_remote_vtep(bgp, vpn, p, flood_control, 1); + } + +@@ -1309,11 +1319,13 @@ static int evpn_zebra_install(struct bgp *bgp, struct bgpevpn *vpn, + } + + /* Uninstall EVPN route from zebra. */ +-static int evpn_zebra_uninstall(struct bgp *bgp, struct bgpevpn *vpn, +- const struct prefix_evpn *p, +- struct bgp_path_info *pi, bool is_sync) ++enum zclient_send_status evpn_zebra_uninstall(struct bgp *bgp, ++ struct bgpevpn *vpn, ++ const struct prefix_evpn *p, ++ struct bgp_path_info *pi, ++ bool is_sync) + { +- int ret; ++ enum zclient_send_status ret = ZCLIENT_SEND_SUCCESS; + + if (p->prefix.route_type == BGP_EVPN_MAC_IP_ROUTE) + ret = bgp_zebra_send_remote_macip( +@@ -1328,7 +1340,7 @@ static int evpn_zebra_uninstall(struct bgp *bgp, struct bgpevpn *vpn, + ret = bgp_evpn_remote_es_evi_del(bgp, vpn, p); + else + ret = bgp_zebra_send_remote_vtep(bgp, vpn, p, +- VXLAN_FLOOD_DISABLED, 0); ++ VXLAN_FLOOD_DISABLED, 0); + + return ret; + } +@@ -1419,12 +1431,18 @@ int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn, + && !CHECK_FLAG(dest->flags, BGP_NODE_USER_CLEAR) + && !CHECK_FLAG(old_select->flags, BGP_PATH_ATTR_CHANGED) + && !bgp_addpath_is_addpath_used(&bgp->tx_addpath, afi, safi)) { +- if (bgp_zebra_has_route_changed(old_select)) +- ret = evpn_zebra_install( +- bgp, vpn, +- (const struct prefix_evpn *)bgp_dest_get_prefix( +- dest), +- old_select); ++ if (bgp_zebra_has_route_changed(old_select)) { ++ if (CHECK_FLAG(bgp->flags, BGP_FLAG_DELETE_IN_PROGRESS)) ++ evpn_zebra_install( ++ bgp, vpn, ++ (const struct prefix_evpn *) ++ bgp_dest_get_prefix(dest), ++ old_select); ++ else ++ bgp_zebra_route_install(dest, old_select, bgp, ++ true, vpn, false); ++ } ++ + UNSET_FLAG(old_select->flags, BGP_PATH_MULTIPATH_CHG); + UNSET_FLAG(old_select->flags, BGP_PATH_LINK_BW_CHG); + bgp_zebra_clear_route_change_flags(dest); +@@ -1456,10 +1474,14 @@ int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn, + if (new_select && new_select->type == ZEBRA_ROUTE_BGP + && (new_select->sub_type == BGP_ROUTE_IMPORTED || + bgp_evpn_attr_is_sync(new_select->attr))) { +- ret = evpn_zebra_install( +- bgp, vpn, +- (struct prefix_evpn *)bgp_dest_get_prefix(dest), +- new_select); ++ if (CHECK_FLAG(bgp->flags, BGP_FLAG_DELETE_IN_PROGRESS)) ++ evpn_zebra_install(bgp, vpn, ++ (const struct prefix_evpn *) ++ bgp_dest_get_prefix(dest), ++ new_select); ++ else ++ bgp_zebra_route_install(dest, new_select, bgp, true, ++ vpn, false); + + /* If an old best existed and it was a "local" route, the only + * reason +@@ -1476,13 +1498,20 @@ int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn, + evpn_delete_old_local_route(bgp, vpn, dest, + old_select, new_select); + } else { +- if (old_select && old_select->type == ZEBRA_ROUTE_BGP +- && old_select->sub_type == BGP_ROUTE_IMPORTED) +- ret = evpn_zebra_uninstall( +- bgp, vpn, +- (const struct prefix_evpn 
*)bgp_dest_get_prefix( +- dest), +- old_select, false); ++ if (old_select && old_select->type == ZEBRA_ROUTE_BGP && ++ old_select->sub_type == BGP_ROUTE_IMPORTED) { ++ if (CHECK_FLAG(bgp->flags, ++ BGP_FLAG_DELETE_IN_PROGRESS) || ++ CHECK_FLAG(bgp->flags, BGP_FLAG_VNI_DOWN)) ++ evpn_zebra_uninstall( ++ bgp, vpn, ++ (const struct prefix_evpn *) ++ bgp_dest_get_prefix(dest), ++ old_select, false); ++ else ++ bgp_zebra_route_install(dest, old_select, bgp, ++ false, vpn, false); ++ } + } + + /* Clear any route change flags. */ +@@ -2012,9 +2041,19 @@ static void evpn_zebra_reinstall_best_route(struct bgp *bgp, + if (curr_select && curr_select->type == ZEBRA_ROUTE_BGP + && (curr_select->sub_type == BGP_ROUTE_IMPORTED || + bgp_evpn_attr_is_sync(curr_select->attr))) +- evpn_zebra_install(bgp, vpn, +- (const struct prefix_evpn *)bgp_dest_get_prefix(dest), +- curr_select); ++ if (curr_select && curr_select->type == ZEBRA_ROUTE_BGP && ++ (curr_select->sub_type == BGP_ROUTE_IMPORTED || ++ bgp_evpn_attr_is_sync(curr_select->attr))) { ++ if (CHECK_FLAG(bgp->flags, BGP_FLAG_DELETE_IN_PROGRESS)) ++ evpn_zebra_install( ++ bgp, vpn, ++ (const struct prefix_evpn *) ++ bgp_dest_get_prefix(dest), ++ curr_select); ++ else ++ bgp_zebra_route_install(dest, curr_select, bgp, ++ true, vpn, false); ++ } + } + + /* +@@ -2189,8 +2228,16 @@ static int update_evpn_route(struct bgp *bgp, struct bgpevpn *vpn, + * has been removed. + */ + new_is_sync = bgp_evpn_attr_is_sync(pi->attr); +- if (!new_is_sync && old_is_sync) +- evpn_zebra_uninstall(bgp, vpn, p, pi, true); ++ if (!new_is_sync && old_is_sync) { ++ if (CHECK_FLAG(bgp->flags, ++ BGP_FLAG_DELETE_IN_PROGRESS)) ++ evpn_zebra_uninstall(bgp, vpn, p, pi, ++ true); ++ else ++ bgp_zebra_route_install(dest, pi, bgp, ++ false, vpn, ++ true); ++ } + } + } + bgp_path_info_unlock(pi); +@@ -2444,8 +2491,16 @@ void bgp_evpn_update_type2_route_entry(struct bgp *bgp, struct bgpevpn *vpn, + * has been removed. + */ + new_is_sync = bgp_evpn_attr_is_sync(pi->attr); +- if (!new_is_sync && old_is_sync) +- evpn_zebra_uninstall(bgp, vpn, &evp, pi, true); ++ if (!new_is_sync && old_is_sync) { ++ if (CHECK_FLAG(bgp->flags, ++ BGP_FLAG_DELETE_IN_PROGRESS)) ++ (void)evpn_zebra_uninstall( ++ bgp, vpn, &evp, pi, true); ++ else ++ bgp_zebra_route_install(dest, pi, bgp, ++ false, vpn, ++ true); ++ } + } + } + +@@ -2701,7 +2756,22 @@ static int delete_routes_for_vni(struct bgp *bgp, struct bgpevpn *vpn) + delete_all_type2_routes(bgp, vpn); + + build_evpn_type3_prefix(&p, vpn->originator_ip); ++ ++ /* ++ * To handle the following scenario: ++ * - Say, the new zebra announce fifo list has a few VNI EVPN prefixes ++ * yet to be sent to zebra. ++ * - At this point, we have triggers like "no advertise-all-vni" or ++ * "networking restart", where a vni is going down. ++ * ++ * Perform the below: ++ * 1) Send withdraw routes to zebra immediately in case they are ++ * installed. ++ * 2) Before we blow up the vni table, walk the list and pop all the ++ * dests whose za_vpn points to this vni. 
++ */ + SET_FLAG(bgp->flags, BGP_FLAG_VNI_DOWN); + ret = delete_evpn_route(bgp, vpn, &p); ++ UNSET_FLAG(bgp->flags, BGP_FLAG_VNI_DOWN); + if (ret) + return ret; + +@@ -6028,6 +6098,17 @@ struct bgpevpn *bgp_evpn_new(struct bgp *bgp, vni_t vni, + */ + void bgp_evpn_free(struct bgp *bgp, struct bgpevpn *vpn) + { ++ struct bgp_dest *dest = NULL; ++ ++ while (zebra_announce_count(&bm->zebra_announce_head)) { ++ dest = zebra_announce_pop(&bm->zebra_announce_head); ++ if (dest->za_vpn == vpn) { ++ bgp_path_info_unlock(dest->za_bgp_pi); ++ bgp_dest_unlock_node(dest); ++ } else ++ zebra_announce_add_tail(&bm->zebra_announce_head, dest); ++ } ++ + bgp_evpn_remote_ip_hash_destroy(vpn); + bgp_evpn_vni_es_cleanup(vpn); + bgpevpn_unlink_from_l3vni(vpn); +diff --git a/bgpd/bgp_evpn.h b/bgpd/bgp_evpn.h +index 3cbc5af5af..bf1943a2db 100644 +--- a/bgpd/bgp_evpn.h ++++ b/bgpd/bgp_evpn.h +@@ -230,4 +230,12 @@ extern void + bgp_evpn_handle_resolve_overlay_index_unset(struct hash_bucket *bucket, + void *arg); + ++extern enum zclient_send_status evpn_zebra_install(struct bgp *bgp, ++ struct bgpevpn *vpn, ++ const struct prefix_evpn *p, ++ struct bgp_path_info *pi); ++extern enum zclient_send_status ++evpn_zebra_uninstall(struct bgp *bgp, struct bgpevpn *vpn, ++ const struct prefix_evpn *p, struct bgp_path_info *pi, ++ bool is_sync); + #endif /* _QUAGGA_BGP_EVPN_H */ +diff --git a/bgpd/bgp_evpn_mh.c b/bgpd/bgp_evpn_mh.c +index 552365959d..40687c558d 100644 +--- a/bgpd/bgp_evpn_mh.c ++++ b/bgpd/bgp_evpn_mh.c +@@ -56,13 +56,14 @@ static void bgp_evpn_local_es_down(struct bgp *bgp, + struct bgp_evpn_es *es); + static void bgp_evpn_local_type1_evi_route_del(struct bgp *bgp, + struct bgp_evpn_es *es); +-static struct bgp_evpn_es_vtep *bgp_evpn_es_vtep_add(struct bgp *bgp, ++static struct bgp_evpn_es_vtep * ++bgp_evpn_es_vtep_add(struct bgp *bgp, struct bgp_evpn_es *es, ++ struct in_addr vtep_ip, bool esr, uint8_t df_alg, ++ uint16_t df_pref, int *zret); ++static enum zclient_send_status bgp_evpn_es_vtep_del(struct bgp *bgp, + struct bgp_evpn_es *es, + struct in_addr vtep_ip, +- bool esr, uint8_t df_alg, +- uint16_t df_pref); +-static void bgp_evpn_es_vtep_del(struct bgp *bgp, +- struct bgp_evpn_es *es, struct in_addr vtep_ip, bool esr); ++ bool esr); + static void bgp_evpn_es_cons_checks_pend_add(struct bgp_evpn_es *es); + static void bgp_evpn_es_cons_checks_pend_del(struct bgp_evpn_es *es); + static struct bgp_evpn_es_evi * +@@ -105,6 +106,7 @@ static int bgp_evpn_es_route_select_install(struct bgp *bgp, + struct bgp_dest *dest) + { + int ret = 0; ++ int zret = 0; + afi_t afi = AFI_L2VPN; + safi_t safi = SAFI_EVPN; + struct bgp_path_info *old_select; /* old best */ +@@ -131,7 +133,7 @@ static int bgp_evpn_es_route_select_install(struct bgp *bgp, + bgp_evpn_es_vtep_add(bgp, es, old_select->attr->nexthop, + true /*esr*/, + old_select->attr->df_alg, +- old_select->attr->df_pref); ++ old_select->attr->df_pref, &zret); + } + UNSET_FLAG(old_select->flags, BGP_PATH_MULTIPATH_CHG); + bgp_zebra_clear_route_change_flags(dest); +@@ -160,7 +162,7 @@ static int bgp_evpn_es_route_select_install(struct bgp *bgp, + && new_select->sub_type == BGP_ROUTE_IMPORTED) { + bgp_evpn_es_vtep_add(bgp, es, new_select->attr->nexthop, + true /*esr */, new_select->attr->df_alg, +- new_select->attr->df_pref); ++ new_select->attr->df_pref, &zret); + } else { + if (old_select && old_select->type == ZEBRA_ROUTE_BGP + && old_select->sub_type == BGP_ROUTE_IMPORTED) 
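The bgp_evpn_free() hunk above walks the global announce FIFO, dropping the entries that point at the dying VNI while re-queueing the rest. A standalone sketch of that purge walk, bounded to one full rotation so re-queued entries are not revisited; types here are illustrative stand-ins, not FRR structures:

```c
/* Sketch of purging FIFO entries that belong to a dying VNI. */
#include <assert.h>
#include <stddef.h>

struct dest {
	struct dest *next;
	void *za_vpn; /* VNI this queued announcement belongs to */
};

struct fifo { struct dest *head, *tail; size_t count; };

static void push(struct fifo *f, struct dest *d)
{
	d->next = NULL;
	*(f->tail ? &f->tail->next : &f->head) = d;
	f->tail = d;
	f->count++;
}

static struct dest *pop(struct fifo *f)
{
	struct dest *d = f->head;

	if (!d)
		return NULL;
	if (!(f->head = d->next))
		f->tail = NULL;
	f->count--;
	return d;
}

static void purge_vni(struct fifo *f, void *vpn)
{
	size_t n = f->count; /* bound the walk to one full rotation */

	while (n--) {
		struct dest *d = pop(f);

		if (d->za_vpn == vpn)
			continue; /* real code also releases dest/path refs */
		push(f, d);   /* keep entries owned by other VNIs */
	}
}

int main(void)
{
	struct fifo f = { 0 };
	int vni_a, vni_b;
	struct dest d1 = { .za_vpn = &vni_a }, d2 = { .za_vpn = &vni_b };

	push(&f, &d1);
	push(&f, &d2);
	purge_vni(&f, &vni_a);
	assert(f.count == 1 && f.head == &d2);
	return 0;
}
```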
+@@ -447,7 +449,7 @@ int bgp_evpn_mh_route_update(struct bgp *bgp, struct bgp_evpn_es *es, + &attr->mp_nexthop_global_in); + } + + /* Return back the route entry. */ + *ri = tmp_pi; + return 0; + } +@@ -1366,23 +1368,28 @@ static struct bgp_evpn_es_vtep *bgp_evpn_es_vtep_find(struct bgp_evpn_es *es, + } + + /* Send the remote ES to zebra for NHG programming */ +-static int bgp_zebra_send_remote_es_vtep(struct bgp *bgp, +- struct bgp_evpn_es_vtep *es_vtep, bool add) ++static enum zclient_send_status ++bgp_zebra_send_remote_es_vtep(struct bgp *bgp, struct bgp_evpn_es_vtep *es_vtep, ++ bool add) + { + struct bgp_evpn_es *es = es_vtep->es; + struct stream *s; + uint32_t flags = 0; + + /* Check socket. */ +- if (!zclient || zclient->sock < 0) +- return 0; ++ if (!zclient || zclient->sock < 0) { ++ if (BGP_DEBUG(zebra, ZEBRA)) ++ zlog_debug("%s: No zclient or zclient->sock exists", ++ __func__); ++ return ZCLIENT_SEND_SUCCESS; ++ } + + /* Don't try to register if Zebra doesn't know of this instance. */ + if (!IS_BGP_INST_KNOWN_TO_ZEBRA(bgp)) { + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("No zebra instance, not installing remote es %s", + es->esi_str); +- return 0; ++ return ZCLIENT_SEND_SUCCESS; + } + + if (es_vtep->flags & BGP_EVPNES_VTEP_ESR) +@@ -1413,12 +1420,12 @@ static int bgp_zebra_send_remote_es_vtep(struct bgp *bgp, + return zclient_send_message(zclient); + } + +-static void bgp_evpn_es_vtep_re_eval_active(struct bgp *bgp, +- struct bgp_evpn_es_vtep *es_vtep, +- bool param_change) ++static enum zclient_send_status bgp_evpn_es_vtep_re_eval_active( ++ struct bgp *bgp, struct bgp_evpn_es_vtep *es_vtep, bool param_change) + { + bool old_active; + bool new_active; ++ enum zclient_send_status ret = ZCLIENT_SEND_SUCCESS; + + old_active = CHECK_FLAG(es_vtep->flags, BGP_EVPNES_VTEP_ACTIVE); + /* currently we need an active EVI reference to use the VTEP as +@@ -1440,7 +1447,7 @@ static void bgp_evpn_es_vtep_re_eval_active(struct bgp *bgp, + es_vtep->df_alg, es_vtep->df_pref); + + /* send remote ES to zebra */ +- bgp_zebra_send_remote_es_vtep(bgp, es_vtep, new_active); ++ ret = bgp_zebra_send_remote_es_vtep(bgp, es_vtep, new_active); + + /* The NHG is updated first for efficient failover handling. 
+ * Note the NHG can be de-activated while there are bgp +@@ -1452,13 +1459,14 @@ static void bgp_evpn_es_vtep_re_eval_active(struct bgp *bgp, + /* queue up the es for background consistency checks */ + bgp_evpn_es_cons_checks_pend_add(es_vtep->es); + } ++ ++ return ret; + } + +-static struct bgp_evpn_es_vtep *bgp_evpn_es_vtep_add(struct bgp *bgp, +- struct bgp_evpn_es *es, +- struct in_addr vtep_ip, +- bool esr, uint8_t df_alg, +- uint16_t df_pref) ++static struct bgp_evpn_es_vtep * ++bgp_evpn_es_vtep_add(struct bgp *bgp, struct bgp_evpn_es *es, ++ struct in_addr vtep_ip, bool esr, uint8_t df_alg, ++ uint16_t df_pref, int *zret) + { + struct bgp_evpn_es_vtep *es_vtep; + bool param_change = false; +@@ -1485,15 +1493,17 @@ static struct bgp_evpn_es_vtep *bgp_evpn_es_vtep_add(struct bgp *bgp, + ++es_vtep->evi_cnt; + } + +- bgp_evpn_es_vtep_re_eval_active(bgp, es_vtep, param_change); ++ *zret = bgp_evpn_es_vtep_re_eval_active(bgp, es_vtep, param_change); + + return es_vtep; + } + +-static void bgp_evpn_es_vtep_do_del(struct bgp *bgp, +- struct bgp_evpn_es_vtep *es_vtep, bool esr) ++static enum zclient_send_status ++bgp_evpn_es_vtep_do_del(struct bgp *bgp, struct bgp_evpn_es_vtep *es_vtep, ++ bool esr) + { + bool param_change = false; ++ enum zclient_send_status ret = ZCLIENT_SEND_SUCCESS; + + if (BGP_DEBUG(evpn_mh, EVPN_MH_ES)) + zlog_debug("es %s vtep %pI4 del %s", es_vtep->es->esi_str, +@@ -1510,18 +1520,25 @@ static void bgp_evpn_es_vtep_do_del(struct bgp *bgp, + --es_vtep->evi_cnt; + } + +- bgp_evpn_es_vtep_re_eval_active(bgp, es_vtep, param_change); ++ ret = bgp_evpn_es_vtep_re_eval_active(bgp, es_vtep, param_change); + bgp_evpn_es_vtep_free(es_vtep); ++ ++ return ret; + } + +-static void bgp_evpn_es_vtep_del(struct bgp *bgp, +- struct bgp_evpn_es *es, struct in_addr vtep_ip, bool esr) ++static enum zclient_send_status bgp_evpn_es_vtep_del(struct bgp *bgp, ++ struct bgp_evpn_es *es, ++ struct in_addr vtep_ip, ++ bool esr) + { + struct bgp_evpn_es_vtep *es_vtep; ++ enum zclient_send_status ret = ZCLIENT_SEND_SUCCESS; + + es_vtep = bgp_evpn_es_vtep_find(es, vtep_ip); + if (es_vtep) +- bgp_evpn_es_vtep_do_del(bgp, es_vtep, esr); ++ ret = bgp_evpn_es_vtep_do_del(bgp, es_vtep, esr); ++ ++ return ret; + } + + /********************** ES MAC-IP paths ************************************* +@@ -3382,12 +3399,14 @@ static struct bgp_evpn_es_evi_vtep *bgp_evpn_es_evi_vtep_find( + /* A VTEP can be added as "active" attach to an ES if EAD-per-ES and + * EAD-per-EVI routes are rxed from it. + */ +-static void bgp_evpn_es_evi_vtep_re_eval_active(struct bgp *bgp, +- struct bgp_evpn_es_evi_vtep *evi_vtep) ++static enum zclient_send_status ++bgp_evpn_es_evi_vtep_re_eval_active(struct bgp *bgp, ++ struct bgp_evpn_es_evi_vtep *evi_vtep) + { + bool old_active; + bool new_active; + uint32_t ead_activity_flags; ++ enum zclient_send_status ret = ZCLIENT_SEND_SUCCESS; + + old_active = CHECK_FLAG(evi_vtep->flags, BGP_EVPN_EVI_VTEP_ACTIVE); + +@@ -3408,7 +3427,7 @@ static void bgp_evpn_es_evi_vtep_re_eval_active(struct bgp *bgp, + new_active = CHECK_FLAG(evi_vtep->flags, BGP_EVPN_EVI_VTEP_ACTIVE); + + if (old_active == new_active) +- return; ++ return ret; + + if (BGP_DEBUG(evpn_mh, EVPN_MH_ES)) + zlog_debug("es %s evi %u vtep %pI4 %s", +@@ -3417,24 +3436,26 @@ static void bgp_evpn_es_evi_vtep_re_eval_active(struct bgp *bgp, + new_active ? 
"active" : "inactive"); + + /* add VTEP to parent es */ +- if (new_active) ++ if (new_active) { + evi_vtep->es_vtep = bgp_evpn_es_vtep_add( + bgp, evi_vtep->es_evi->es, evi_vtep->vtep_ip, +- false /*esr*/, 0, 0); +- else { ++ false /*esr*/, 0, 0, &ret); ++ } else { + if (evi_vtep->es_vtep) { +- bgp_evpn_es_vtep_do_del(bgp, evi_vtep->es_vtep, +- false /*esr*/); ++ ret = bgp_evpn_es_vtep_do_del(bgp, evi_vtep->es_vtep, ++ false /*esr*/); + evi_vtep->es_vtep = NULL; + } + } + /* queue up the parent es for background consistency checks */ + bgp_evpn_es_cons_checks_pend_add(evi_vtep->es_evi->es); ++ ++ return ret; + } + +-static void bgp_evpn_es_evi_vtep_add(struct bgp *bgp, +- struct bgp_evpn_es_evi *es_evi, struct in_addr vtep_ip, +- bool ead_es) ++static enum zclient_send_status ++bgp_evpn_es_evi_vtep_add(struct bgp *bgp, struct bgp_evpn_es_evi *es_evi, ++ struct in_addr vtep_ip, bool ead_es) + { + struct bgp_evpn_es_evi_vtep *evi_vtep; + +@@ -3454,18 +3475,19 @@ static void bgp_evpn_es_evi_vtep_add(struct bgp *bgp, + else + SET_FLAG(evi_vtep->flags, BGP_EVPN_EVI_VTEP_EAD_PER_EVI); + +- bgp_evpn_es_evi_vtep_re_eval_active(bgp, evi_vtep); ++ return bgp_evpn_es_evi_vtep_re_eval_active(bgp, evi_vtep); + } + +-static void bgp_evpn_es_evi_vtep_del(struct bgp *bgp, +- struct bgp_evpn_es_evi *es_evi, struct in_addr vtep_ip, +- bool ead_es) ++static enum zclient_send_status ++bgp_evpn_es_evi_vtep_del(struct bgp *bgp, struct bgp_evpn_es_evi *es_evi, ++ struct in_addr vtep_ip, bool ead_es) + { + struct bgp_evpn_es_evi_vtep *evi_vtep; ++ enum zclient_send_status ret = ZCLIENT_SEND_SUCCESS; + + evi_vtep = bgp_evpn_es_evi_vtep_find(es_evi, vtep_ip); + if (!evi_vtep) +- return; ++ return ret; + + if (BGP_DEBUG(evpn_mh, EVPN_MH_ES)) + zlog_debug("del es %s evi %u vtep %pI4 %s", +@@ -3478,8 +3500,10 @@ static void bgp_evpn_es_evi_vtep_del(struct bgp *bgp, + else + UNSET_FLAG(evi_vtep->flags, BGP_EVPN_EVI_VTEP_EAD_PER_EVI); + +- bgp_evpn_es_evi_vtep_re_eval_active(bgp, evi_vtep); ++ ret = bgp_evpn_es_evi_vtep_re_eval_active(bgp, evi_vtep); + bgp_evpn_es_evi_vtep_free(evi_vtep); ++ ++ return ret; + } + + /* compare ES-IDs for the ES-EVI RB tree maintained per-VNI */ +@@ -3755,18 +3779,20 @@ int bgp_evpn_local_es_evi_add(struct bgp *bgp, esi_t *esi, vni_t vni) + /* Add remote ES-EVI entry. This is actually the remote VTEP add and the + * ES-EVI is implicity created on first VTEP's reference. + */ +-int bgp_evpn_remote_es_evi_add(struct bgp *bgp, struct bgpevpn *vpn, +- const struct prefix_evpn *p) ++enum zclient_send_status bgp_evpn_remote_es_evi_add(struct bgp *bgp, ++ struct bgpevpn *vpn, ++ const struct prefix_evpn *p) + { + char buf[ESI_STR_LEN]; + struct bgp_evpn_es *es; + struct bgp_evpn_es_evi *es_evi; + bool ead_es; + const esi_t *esi = &p->prefix.ead_addr.esi; ++ enum zclient_send_status ret = ZCLIENT_SEND_SUCCESS; + + if (!vpn) + /* local EAD-ES need not be sent back to zebra */ +- return 0; ++ return ret; + + if (BGP_DEBUG(evpn_mh, EVPN_MH_ES)) + zlog_debug("add remote %s es %s evi %u vtep %pI4", +@@ -3783,27 +3809,29 @@ int bgp_evpn_remote_es_evi_add(struct bgp *bgp, struct bgpevpn *vpn, + es_evi = bgp_evpn_es_evi_new(es, vpn); + + ead_es = !!p->prefix.ead_addr.eth_tag; +- bgp_evpn_es_evi_vtep_add(bgp, es_evi, p->prefix.ead_addr.ip.ipaddr_v4, +- ead_es); ++ ret = bgp_evpn_es_evi_vtep_add(bgp, es_evi, ++ p->prefix.ead_addr.ip.ipaddr_v4, ead_es); + + bgp_evpn_es_evi_remote_info_re_eval(es_evi); +- return 0; ++ return ret; + } + + /* A remote VTEP has withdrawn. 
The es-evi-vtep will be deleted and the + * parent es-evi freed up implicitly in last VTEP's deref. + */ +-int bgp_evpn_remote_es_evi_del(struct bgp *bgp, struct bgpevpn *vpn, +- const struct prefix_evpn *p) ++enum zclient_send_status bgp_evpn_remote_es_evi_del(struct bgp *bgp, ++ struct bgpevpn *vpn, ++ const struct prefix_evpn *p) + { + char buf[ESI_STR_LEN]; + struct bgp_evpn_es *es; + struct bgp_evpn_es_evi *es_evi; + bool ead_es; ++ enum zclient_send_status ret = ZCLIENT_SEND_SUCCESS; + + if (!vpn) + /* local EAD-ES need not be sent back to zebra */ +- return 0; ++ return ret; + + if (BGP_DEBUG(evpn_mh, EVPN_MH_ES)) + zlog_debug( +@@ -3822,7 +3850,7 @@ int bgp_evpn_remote_es_evi_del(struct bgp *bgp, struct bgpevpn *vpn, + esi_to_str(&p->prefix.ead_addr.esi, buf, + sizeof(buf)), + vpn->vni, &p->prefix.ead_addr.ip.ipaddr_v4); +- return 0; ++ return ret; + } + es_evi = bgp_evpn_es_evi_find(es, vpn); + if (!es_evi) { +@@ -3835,14 +3863,15 @@ int bgp_evpn_remote_es_evi_del(struct bgp *bgp, struct bgpevpn *vpn, + sizeof(buf)), + vpn->vni, + &p->prefix.ead_addr.ip.ipaddr_v4); +- return 0; ++ return ret; + } + + ead_es = !!p->prefix.ead_addr.eth_tag; +- bgp_evpn_es_evi_vtep_del(bgp, es_evi, p->prefix.ead_addr.ip.ipaddr_v4, +- ead_es); ++ ret = bgp_evpn_es_evi_vtep_del(bgp, es_evi, ++ p->prefix.ead_addr.ip.ipaddr_v4, ead_es); + bgp_evpn_es_evi_remote_info_re_eval(es_evi); +- return 0; ++ ++ return ret; + } + + /* If a VNI is being deleted we need to force del all remote VTEPs */ +diff --git a/bgpd/bgp_evpn_mh.h b/bgpd/bgp_evpn_mh.h +index 11030e323f..d6e77e982f 100644 +--- a/bgpd/bgp_evpn_mh.h ++++ b/bgpd/bgp_evpn_mh.h +@@ -434,10 +434,12 @@ extern int bgp_evpn_local_es_add(struct bgp *bgp, esi_t *esi, + extern int bgp_evpn_local_es_del(struct bgp *bgp, esi_t *esi); + extern int bgp_evpn_local_es_evi_add(struct bgp *bgp, esi_t *esi, vni_t vni); + extern int bgp_evpn_local_es_evi_del(struct bgp *bgp, esi_t *esi, vni_t vni); +-extern int bgp_evpn_remote_es_evi_add(struct bgp *bgp, struct bgpevpn *vpn, +- const struct prefix_evpn *p); +-extern int bgp_evpn_remote_es_evi_del(struct bgp *bgp, struct bgpevpn *vpn, +- const struct prefix_evpn *p); ++extern enum zclient_send_status ++bgp_evpn_remote_es_evi_add(struct bgp *bgp, struct bgpevpn *vpn, ++ const struct prefix_evpn *p); ++extern enum zclient_send_status ++bgp_evpn_remote_es_evi_del(struct bgp *bgp, struct bgpevpn *vpn, ++ const struct prefix_evpn *p); + extern void bgp_evpn_mh_init(void); + extern void bgp_evpn_mh_finish(void); + void bgp_evpn_vni_es_init(struct bgpevpn *vpn); +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index c29442d96c..679abba463 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -3213,9 +3213,9 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + && (new_select->sub_type == BGP_ROUTE_NORMAL + || new_select->sub_type + == BGP_ROUTE_IMPORTED)) +- + bgp_zebra_route_install( +- dest, old_select, bgp, true); ++ dest, old_select, bgp, true, ++ NULL, false); + } + } + +@@ -3313,9 +3313,10 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + if (old_select && + is_route_parent_evpn(old_select)) + bgp_zebra_route_install(dest, old_select, bgp, +- false); ++ false, NULL, false); + +- bgp_zebra_route_install(dest, new_select, bgp, true); ++ bgp_zebra_route_install(dest, new_select, bgp, true, ++ NULL, false); + } else { + /* Withdraw the route from the kernel. 
*/ + if (old_select && old_select->type == ZEBRA_ROUTE_BGP +@@ -3324,7 +3325,7 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + || old_select->sub_type == BGP_ROUTE_IMPORTED)) + + bgp_zebra_route_install(dest, old_select, bgp, +- false); ++ false, NULL, false); + } + } + +@@ -4203,7 +4204,8 @@ void bgp_update(struct peer *peer, const struct prefix *p, uint32_t addpath_id, + if (pi && pi->attr->rmap_table_id != new_attr.rmap_table_id) { + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED)) + /* remove from RIB previous entry */ +- bgp_zebra_route_install(dest, pi, bgp, false); ++ bgp_zebra_route_install(dest, pi, bgp, false, NULL, ++ false); + } + + if (peer->sort == BGP_PEER_EBGP) { +diff --git a/bgpd/bgp_table.h b/bgpd/bgp_table.h +index 45d61f8dfd..9eb681ea3f 100644 +--- a/bgpd/bgp_table.h ++++ b/bgpd/bgp_table.h +@@ -103,6 +103,8 @@ struct bgp_node { + + struct zebra_announce_item zai; + struct bgp_path_info *za_bgp_pi; ++ struct bgpevpn *za_vpn; ++ bool za_is_sync; + + uint64_t version; + +diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c +index 1162941ef1..b81acaf8ec 100644 +--- a/bgpd/bgp_zebra.c ++++ b/bgpd/bgp_zebra.c +@@ -1713,12 +1713,11 @@ void bgp_zebra_announce_table(struct bgp *bgp, afi_t afi, safi_t safi) + for (dest = bgp_table_top(table); dest; dest = bgp_route_next(dest)) + for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED) && +- +- (pi->type == ZEBRA_ROUTE_BGP +- && (pi->sub_type == BGP_ROUTE_NORMAL +- || pi->sub_type == BGP_ROUTE_IMPORTED))) +- +- bgp_zebra_route_install(dest, pi, bgp, true); ++ (pi->type == ZEBRA_ROUTE_BGP && ++ (pi->sub_type == BGP_ROUTE_NORMAL || ++ pi->sub_type == BGP_ROUTE_IMPORTED))) ++ bgp_zebra_route_install(dest, pi, bgp, true, ++ NULL, false); + } + + /* Announce routes of any bgp subtype of a table to zebra */ +@@ -1740,7 +1739,8 @@ void bgp_zebra_announce_table_all_subtypes(struct bgp *bgp, afi_t afi, + for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED) && + pi->type == ZEBRA_ROUTE_BGP) +- bgp_zebra_route_install(dest, pi, bgp, true); ++ bgp_zebra_route_install(dest, pi, bgp, true, ++ NULL, false); + } + + enum zclient_send_status bgp_zebra_withdraw_actual(struct bgp_dest *dest, +@@ -1793,6 +1793,7 @@ enum zclient_send_status bgp_zebra_withdraw_actual(struct bgp_dest *dest, + #define ZEBRA_ANNOUNCEMENTS_LIMIT 1000 + static void bgp_handle_route_announcements_to_zebra(struct thread *e) + { ++ bool is_evpn = false; + uint32_t count = 0; + struct bgp_dest *dest = NULL; + struct bgp_table *table = NULL; +@@ -1808,6 +1809,9 @@ static void bgp_handle_route_announcements_to_zebra(struct thread *e) + table = bgp_dest_table(dest); + install = + CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL); ++ if (table && table->afi == AFI_L2VPN && ++ table->safi == SAFI_EVPN) ++ is_evpn = true; + + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug( +@@ -1816,17 +1820,33 @@ static void bgp_handle_route_announcements_to_zebra(struct thread *e) + table->bgp->name_pretty, dest, dest->flags); + + if (install) { +- status = bgp_zebra_announce_actual( +- dest, dest->za_bgp_pi, table->bgp); ++ if (is_evpn) ++ status = evpn_zebra_install( ++ table->bgp, dest->za_vpn, ++ (const struct prefix_evpn *) ++ bgp_dest_get_prefix(dest), ++ dest->za_bgp_pi); ++ else ++ status = bgp_zebra_announce_actual( ++ dest, dest->za_bgp_pi, table->bgp); + UNSET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL); + } else { +- status = 
bgp_zebra_withdraw_actual( +- dest, dest->za_bgp_pi, table->bgp); ++ if (is_evpn) ++ status = evpn_zebra_uninstall( ++ table->bgp, dest->za_vpn, ++ (const struct prefix_evpn *) ++ bgp_dest_get_prefix(dest), ++ dest->za_bgp_pi, false); ++ else ++ status = bgp_zebra_withdraw_actual( ++ dest, dest->za_bgp_pi, table->bgp); ++ + UNSET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_DELETE); + } + + bgp_path_info_unlock(dest->za_bgp_pi); + dest->za_bgp_pi = NULL; ++ dest->za_vpn = NULL; + bgp_dest_unlock_node(dest); + + if (status == ZCLIENT_SEND_BUFFERED) +@@ -1880,8 +1900,16 @@ static void bgp_zebra_buffer_write_ready(void) + * withdrawn. + */ + void bgp_zebra_route_install(struct bgp_dest *dest, struct bgp_path_info *info, +- struct bgp *bgp, bool install) ++ struct bgp *bgp, bool install, struct bgpevpn *vpn, ++ bool is_sync) + { ++ bool is_evpn = false; ++ struct bgp_table *table = NULL; ++ ++ table = bgp_dest_table(dest); ++ if (table && table->afi == AFI_L2VPN && table->safi == SAFI_EVPN) ++ is_evpn = true; ++ + /* + * BGP is installing this route and bgp has been configured + * to suppress announcements until the route has been installed +@@ -1891,7 +1919,7 @@ void bgp_zebra_route_install(struct bgp_dest *dest, struct bgp_path_info *info, + if (BGP_SUPPRESS_FIB_ENABLED(bgp)) + SET_FLAG(dest->flags, BGP_NODE_FIB_INSTALL_PENDING); + +- if (bgp->main_zebra_update_hold) ++ if (bgp->main_zebra_update_hold && !is_evpn) + return; + } else { + UNSET_FLAG(dest->flags, BGP_NODE_FIB_INSTALL_PENDING); +@@ -1901,7 +1929,7 @@ void bgp_zebra_route_install(struct bgp_dest *dest, struct bgp_path_info *info, + * Don't try to install if we're not connected to Zebra or Zebra doesn't + * know of this instance. + */ +- if (!bgp_install_info_to_zebra(bgp)) ++ if (!bgp_install_info_to_zebra(bgp) && !is_evpn) + return; + + if (!CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL) && +@@ -1922,7 +1950,7 @@ void bgp_zebra_route_install(struct bgp_dest *dest, struct bgp_path_info *info, + dest->za_bgp_pi = info; + } else if (CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_DELETE)) { + assert(dest->za_bgp_pi); +- if (install) ++ if (install & !is_evpn) + bgp_zebra_withdraw_actual(dest, dest->za_bgp_pi, bgp); + + bgp_path_info_unlock(dest->za_bgp_pi); +@@ -1930,6 +1958,11 @@ void bgp_zebra_route_install(struct bgp_dest *dest, struct bgp_path_info *info, + dest->za_bgp_pi = info; + } + ++ if (is_evpn) { ++ dest->za_vpn = vpn; ++ dest->za_is_sync = is_sync; ++ } ++ + if (install) { + UNSET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_DELETE); + SET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL); +@@ -1960,7 +1993,8 @@ void bgp_zebra_withdraw_table_all_subtypes(struct bgp *bgp, afi_t afi, safi_t sa + for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { + if (CHECK_FLAG(pi->flags, BGP_PATH_SELECTED) + && (pi->type == ZEBRA_ROUTE_BGP)) +- bgp_zebra_route_install(dest, pi, bgp, false); ++ bgp_zebra_route_install(dest, pi, bgp, false, ++ NULL, false); + } + } + } +diff --git a/bgpd/bgp_zebra.h b/bgpd/bgp_zebra.h +index 45fcf7f514..5186b7454e 100644 +--- a/bgpd/bgp_zebra.h ++++ b/bgpd/bgp_zebra.h +@@ -45,7 +45,8 @@ extern int bgp_zebra_get_table_range(uint32_t chunk_size, + extern int bgp_if_update_all(void); + extern void bgp_zebra_route_install(struct bgp_dest *dest, + struct bgp_path_info *path, struct bgp *bgp, +- bool install); ++ bool install, struct bgpevpn *vpn, ++ bool is_sync); + extern void bgp_zebra_announce_table(struct bgp *bgp, afi_t afi, safi_t safi); + + /* Announce routes of any bgp subtype of a table to 
zebra */ +diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h +index bdf31f5161..dc660fb8f0 100644 +--- a/bgpd/bgpd.h ++++ b/bgpd/bgpd.h +@@ -512,6 +512,7 @@ struct bgp { + #define BGP_FLAG_HARD_ADMIN_RESET (1ULL << 31) + /* Evaluate the AIGP attribute during the best path selection process */ + #define BGP_FLAG_COMPARE_AIGP (1ULL << 32) ++#define BGP_FLAG_VNI_DOWN (1ULL << 38) + + /* BGP default address-families. + * New peers inherit enabled afi/safis from bgp instance. +-- +2.17.1 + diff --git a/src/sonic-frr/patch/0036-zebra-backpressure-Fix-Null-ptr-access-Coverity-Issu.patch b/src/sonic-frr/patch/0036-zebra-backpressure-Fix-Null-ptr-access-Coverity-Issu.patch new file mode 100644 index 000000000000..a3b1fa5fe0da --- /dev/null +++ b/src/sonic-frr/patch/0036-zebra-backpressure-Fix-Null-ptr-access-Coverity-Issu.patch @@ -0,0 +1,29 @@ +From b6caf88bd3bce9673d49435453991c49712287aa Mon Sep 17 00:00:00 2001 +From: Rajasekar Raja +Date: Thu, 11 Apr 2024 22:27:37 -0700 +Subject: [PATCH 08/11] zebra: backpressure - Fix Null ptr access (Coverity + Issue) + +Fix dereferencing NULL ptr making coverity happy. + +Ticket :#3390099 + +Signed-off-by: Rajasekar Raja + +diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c +index b81acaf8ec..524551b1e0 100644 +--- a/bgpd/bgp_zebra.c ++++ b/bgpd/bgp_zebra.c +@@ -1809,8 +1809,7 @@ static void bgp_handle_route_announcements_to_zebra(struct thread *e) + table = bgp_dest_table(dest); + install = + CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL); +- if (table && table->afi == AFI_L2VPN && +- table->safi == SAFI_EVPN) ++ if (table->afi == AFI_L2VPN && table->safi == SAFI_EVPN) + is_evpn = true; + + if (BGP_DEBUG(zebra, ZEBRA)) +-- +2.17.1 + diff --git a/src/sonic-frr/patch/0037-bgpd-Increase-install-uninstall-speed-of-evpn-vpn-vn.patch b/src/sonic-frr/patch/0037-bgpd-Increase-install-uninstall-speed-of-evpn-vpn-vn.patch new file mode 100644 index 000000000000..a360a3101599 --- /dev/null +++ b/src/sonic-frr/patch/0037-bgpd-Increase-install-uninstall-speed-of-evpn-vpn-vn.patch @@ -0,0 +1,116 @@ +From 7166c2222cb82885510c3e8c7906c1d7de950f9b Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 11 Apr 2024 13:28:30 -0400 +Subject: [PATCH 09/11] bgpd: Increase install/uninstall speed of evpn vpn + vni's + +BGP receives notification from zebra about a vpn that +needs to be installed into the evpn tables. Unfortunately +this function was walking the entirety of evpn tables +3 times. Modify the code to walk the tree 1 time and +to just look for the needed route types as you go. + +This reduces, in a scaled environment, processing +time of the zclient_read function from 130 seconds +to 95 seconds for an up / down / up interface +scenario. + +Signed-off-by: Rajasekar Raja +Signed-off-by: Donald Sharp + +diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c +index 622fd6afd2..eb5aa9f077 100644 +--- a/bgpd/bgp_evpn.c ++++ b/bgpd/bgp_evpn.c +@@ -3752,9 +3752,7 @@ static int install_uninstall_routes_for_vrf(struct bgp *bgp_vrf, int install) + * particular VNI. 
+ */ + static int install_uninstall_routes_for_vni(struct bgp *bgp, +- struct bgpevpn *vpn, +- bgp_evpn_route_type rtype, +- int install) ++ struct bgpevpn *vpn, int install) + { + afi_t afi; + safi_t safi; +@@ -3785,7 +3783,9 @@ static int install_uninstall_routes_for_vni(struct bgp *bgp, + (const struct prefix_evpn *)bgp_dest_get_prefix( + dest); + +- if (evp->prefix.route_type != rtype) ++ if (evp->prefix.route_type != BGP_EVPN_IMET_ROUTE && ++ evp->prefix.route_type != BGP_EVPN_AD_ROUTE && ++ evp->prefix.route_type != BGP_EVPN_MAC_IP_ROUTE) + continue; + + for (pi = bgp_dest_get_bgp_path_info(dest); pi; +@@ -3812,7 +3812,8 @@ static int install_uninstall_routes_for_vni(struct bgp *bgp, + bgp->vrf_id, + install ? "install" + : "uninstall", +- rtype == BGP_EVPN_MAC_IP_ROUTE ++ evp->prefix.route_type == ++ BGP_EVPN_MAC_IP_ROUTE + ? "MACIP" + : "IMET", + vpn->vni); +@@ -3845,23 +3846,11 @@ static int install_routes_for_vrf(struct bgp *bgp_vrf) + */ + static int install_routes_for_vni(struct bgp *bgp, struct bgpevpn *vpn) + { +- int ret; +- +- /* Install type-3 routes followed by type-2 routes - the ones applicable ++ /* ++ * Install type-3 routes followed by type-2 routes - the ones applicable + * for this VNI. + */ +- ret = install_uninstall_routes_for_vni(bgp, vpn, BGP_EVPN_IMET_ROUTE, +- 1); +- if (ret) +- return ret; +- +- ret = install_uninstall_routes_for_vni(bgp, vpn, BGP_EVPN_AD_ROUTE, +- 1); +- if (ret) +- return ret; +- +- return install_uninstall_routes_for_vni(bgp, vpn, BGP_EVPN_MAC_IP_ROUTE, +- 1); ++ return install_uninstall_routes_for_vni(bgp, vpn, 1); + } + + /* uninstall routes from l3vni vrf. */ +@@ -3877,25 +3866,11 @@ static int uninstall_routes_for_vrf(struct bgp *bgp_vrf) + */ + static int uninstall_routes_for_vni(struct bgp *bgp, struct bgpevpn *vpn) + { +- int ret; +- +- /* Uninstall type-2 routes followed by type-3 routes - the ones +- * applicable +- * for this VNI. ++ /* ++ * Uninstall type-2 routes followed by type-3 routes - the ones ++ * applicable for this VNI. + */ +- ret = install_uninstall_routes_for_vni(bgp, vpn, BGP_EVPN_MAC_IP_ROUTE, +- 0); +- if (ret) +- return ret; +- +- ret = install_uninstall_routes_for_vni(bgp, vpn, BGP_EVPN_AD_ROUTE, +- 0); +- if (ret) +- return ret; +- +- +- return install_uninstall_routes_for_vni(bgp, vpn, BGP_EVPN_IMET_ROUTE, +- 0); ++ return install_uninstall_routes_for_vni(bgp, vpn, 0); + } + + /* +-- +2.17.1 + diff --git a/src/sonic-frr/patch/0038-zebra-Actually-display-I-O-buffer-sizes.patch b/src/sonic-frr/patch/0038-zebra-Actually-display-I-O-buffer-sizes.patch new file mode 100644 index 000000000000..904cb169b840 --- /dev/null +++ b/src/sonic-frr/patch/0038-zebra-Actually-display-I-O-buffer-sizes.patch @@ -0,0 +1,49 @@ +From 72781109e5dadafe55f10d72ae2b3505bf0ccb93 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 8 Apr 2024 16:24:05 -0400 +Subject: [PATCH 10/11] zebra: Actually display I/O buffer sizes + +An operator found a situation where zebra was +backing up in a significant way towards BGP +with EVPN changes taking up some serious amounts +of memory. The key lines that would have clued +us in on it were behind a dev build. Let's change +this. 
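The counters this patch un-hides follow a simple high-water-mark pattern: each push bumps the current count and, when that count exceeds the recorded peak, the peak as well. A minimal sketch of the idea, with hypothetical names rather than the FRR structures:

#include <stddef.h>

struct demo_fifo {
	size_t count;     /* items currently queued */
	size_t max_count; /* highest count ever observed */
};

static void demo_fifo_push(struct demo_fifo *fifo)
{
	fifo->count++;
	if (fifo->count > fifo->max_count)
		fifo->max_count = fifo->count; /* record the new peak */
}

The patch below makes zebra maintain and display both numbers unconditionally instead of only in developer builds.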
+ +Signed-off-by: Donald Sharp + +diff --git a/lib/stream.h b/lib/stream.h +index a3c148c9c9..7acbbd2dc7 100644 +--- a/lib/stream.h ++++ b/lib/stream.h +@@ -120,9 +120,7 @@ struct stream_fifo { + + /* number of streams in this fifo */ + atomic_size_t count; +-#if defined DEV_BUILD + atomic_size_t max_count; +-#endif + + struct stream *head; + struct stream *tail; +diff --git a/zebra/zserv.c b/zebra/zserv.c +index de6e404fc4..daccfa59d5 100644 +--- a/zebra/zserv.c ++++ b/zebra/zserv.c +@@ -1130,12 +1130,10 @@ static void zebra_show_client_detail(struct vty *vty, struct zserv *client) + vty_out(vty, "ES-EVI %-12u%-12u%-12u\n", + client->local_es_evi_add_cnt, 0, client->local_es_evi_del_cnt); + vty_out(vty, "Errors: %u\n", client->error_cnt); +- +-#if defined DEV_BUILD + vty_out(vty, "Input Fifo: %zu:%zu Output Fifo: %zu:%zu\n", + client->ibuf_fifo->count, client->ibuf_fifo->max_count, + client->obuf_fifo->count, client->obuf_fifo->max_count); +-#endif ++ + vty_out(vty, "\n"); + } + +-- +2.17.1 + diff --git a/src/sonic-frr/patch/0039-zebra-Actually-display-I-O-buffer-sizes-part-2.patch b/src/sonic-frr/patch/0039-zebra-Actually-display-I-O-buffer-sizes-part-2.patch new file mode 100644 index 000000000000..9088bfebbdad --- /dev/null +++ b/src/sonic-frr/patch/0039-zebra-Actually-display-I-O-buffer-sizes-part-2.patch @@ -0,0 +1,49 @@ +From 981bfca72414bfa7a97b6526e4736ea4f6834620 Mon Sep 17 00:00:00 2001 +From: Rajasekar Raja +Date: Mon, 22 Apr 2024 13:50:47 -0400 +Subject: [PATCH] From a88498262d8d88fb3846d1a5c1a373751fe6f381 Mon Sep 17 + 00:00:00 2001 Subject: [PATCH 11/11] zebra: Actually display I/O buffer sizes + (part-2) + +An extension of commit-8d8f12ba8e5cd11c189b8475b05539fa8415ccb9 + +Removing ifdef DEV_BUILD in stream_fifo_push as well to make the 'sh +zebra client' display the current I/O fifo along with max fifo items. + +TICKET :#3390099 + +Signed-off-by: Rajasekar Raja + +diff --git a/lib/stream.c b/lib/stream.c +index 2de3abdf45..3aef70794e 100644 +--- a/lib/stream.c ++++ b/lib/stream.c +@@ -1256,9 +1256,7 @@ void stream_fifo_init(struct stream_fifo *fifo) + /* Add new stream to fifo. 
*/ + void stream_fifo_push(struct stream_fifo *fifo, struct stream *s) + { +-#if defined DEV_BUILD + size_t max, curmax; +-#endif + + if (fifo->tail) + fifo->tail->next = s; +@@ -1267,15 +1265,11 @@ void stream_fifo_push(struct stream_fifo *fifo, struct stream *s) + + fifo->tail = s; + fifo->tail->next = NULL; +-#if !defined DEV_BUILD +- atomic_fetch_add_explicit(&fifo->count, 1, memory_order_release); +-#else + max = atomic_fetch_add_explicit(&fifo->count, 1, memory_order_release); + curmax = atomic_load_explicit(&fifo->max_count, memory_order_relaxed); + if (max > curmax) + atomic_store_explicit(&fifo->max_count, max, + memory_order_relaxed); +-#endif + } + + void stream_fifo_push_safe(struct stream_fifo *fifo, struct stream *s) +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0040-bgpd-backpressure-Fix-to-withdraw-evpn-type-5-routes.patch b/src/sonic-frr/patch/0040-bgpd-backpressure-Fix-to-withdraw-evpn-type-5-routes.patch new file mode 100644 index 000000000000..b74d86881ae5 --- /dev/null +++ b/src/sonic-frr/patch/0040-bgpd-backpressure-Fix-to-withdraw-evpn-type-5-routes.patch @@ -0,0 +1,63 @@ +From f3ec35b461a06f1278383d2347dfbc611b6a91a6 Mon Sep 17 00:00:00 2001 +From: Rajasekar Raja +Date: Fri, 17 May 2024 12:36:31 -0700 +Subject: [PATCH] bgpd: backpressure - Fix to withdraw evpn type-5 routes + immediately + +As part of backpressure changes, there is a bug where immediate withdraw +is to be sent for evpn imported type-5 prefix to clear the nh neigh and +RMAC entry. + +Fixing this by sending withdraw immediately to keep it in line with the +code today. + +Ticket: #3905571 + +Signed-off-by: Donald Sharp +Signed-off-by: Rajasekar Raja + +diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c +index 679abba463..e28069767f 100644 +--- a/bgpd/bgp_route.c ++++ b/bgpd/bgp_route.c +@@ -3312,8 +3312,7 @@ static void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, + */ + if (old_select && + is_route_parent_evpn(old_select)) +- bgp_zebra_route_install(dest, old_select, bgp, +- false, NULL, false); ++ bgp_zebra_withdraw_actual(dest, old_select, bgp); + + bgp_zebra_route_install(dest, new_select, bgp, true, + NULL, false); +diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c +index 524551b1e0..5d5525156b 100644 +--- a/bgpd/bgp_zebra.c ++++ b/bgpd/bgp_zebra.c +@@ -1889,11 +1889,9 @@ static void bgp_zebra_buffer_write_ready(void) + * save new pi, mark as going to be + * withdrawan, remove install flag + * +- * Withdrawal Install Special case, send withdrawal immediately +- * Leave dest on list, release old pi, ++ * Withdrawal Install Leave dest on list, release old pi, + * save new pi, mark as going to be +- * installed. ++ * installed. + * Withdrawal Withdrawal Leave dest on list, release old pi, + * save new pi, mark as going to be + * withdrawn. 
+@@ -1949,9 +1947,6 @@ void bgp_zebra_route_install(struct bgp_dest *dest, struct bgp_path_info *info, + dest->za_bgp_pi = info; + } else if (CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_DELETE)) { + assert(dest->za_bgp_pi); +- if (install & !is_evpn) +- bgp_zebra_withdraw_actual(dest, dest->za_bgp_pi, bgp); +- + bgp_path_info_unlock(dest->za_bgp_pi); + bgp_path_info_lock(info); + dest->za_bgp_pi = info; +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0041-bgpd-backpressure-Fix-to-avoid-CPU-hog.patch b/src/sonic-frr/patch/0041-bgpd-backpressure-Fix-to-avoid-CPU-hog.patch new file mode 100644 index 000000000000..860d928d3247 --- /dev/null +++ b/src/sonic-frr/patch/0041-bgpd-backpressure-Fix-to-avoid-CPU-hog.patch @@ -0,0 +1,53 @@ +From 4775b2b2bf39caa4bc5aed10ef44c6dcf9c7fc80 Mon Sep 17 00:00:00 2001 +From: Rajasekar Raja +Date: Fri, 17 May 2024 15:43:59 -0700 +Subject: [PATCH] bgpd: backpressure - Fix to avoid CPU hog + +In case when bgp_evpn_free or bgp_delete is called and the announce_list +has few items where vpn/bgp does not match, we add the item back to the +list. Because of this the list count is always > 0 thereby hogging CPU or +infinite loop. + +Ticket: #3905624 + +Signed-off-by: Rajasekar Raja + +diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c +index eb5aa9f077..2243ffdc77 100644 +--- a/bgpd/bgp_evpn.c ++++ b/bgpd/bgp_evpn.c +@@ -6074,9 +6074,11 @@ struct bgpevpn *bgp_evpn_new(struct bgp *bgp, vni_t vni, + void bgp_evpn_free(struct bgp *bgp, struct bgpevpn *vpn) + { + struct bgp_dest *dest = NULL; ++ uint32_t ann_count = zebra_announce_count(&bm->zebra_announce_head); + +- while (zebra_announce_count(&bm->zebra_announce_head)) { ++ while (ann_count) { + dest = zebra_announce_pop(&bm->zebra_announce_head); ++ ann_count--; + if (dest->za_vpn == vpn) { + bgp_path_info_unlock(dest->za_bgp_pi); + bgp_dest_unlock_node(dest); +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index da133d71c1..492566f8c8 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -3690,11 +3690,13 @@ int bgp_delete(struct bgp *bgp) + int i; + struct bgp_dest *dest = NULL; + struct graceful_restart_info *gr_info; ++ uint32_t ann_count = zebra_announce_count(&bm->zebra_announce_head); + + assert(bgp); + +- while (zebra_announce_count(&bm->zebra_announce_head)) { ++ while (ann_count) { + dest = zebra_announce_pop(&bm->zebra_announce_head); ++ ann_count--; + if (dest->za_bgp_pi->peer->bgp == bgp) { + bgp_path_info_unlock(dest->za_bgp_pi); + bgp_dest_unlock_node(dest); +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0042-zebra-Use-built-in-data-structure-counter.patch b/src/sonic-frr/patch/0042-zebra-Use-built-in-data-structure-counter.patch new file mode 100644 index 000000000000..3401dae27158 --- /dev/null +++ b/src/sonic-frr/patch/0042-zebra-Use-built-in-data-structure-counter.patch @@ -0,0 +1,156 @@ +From 529cdfa09065c6c77aff4a96a52f0d3bcb385b6f Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Thu, 13 Jun 2024 15:30:00 -0400 +Subject: [PATCH 1/5] zebra: Use built in data structure counter + +Instead of keeping a counter that is independent +of the queue's data structure. Just use the queue's +built-in counter. Ensure that it's pthread safe by +keeping it wrapped inside the mutex for adding/deleting +to the queue. + +Signed-off-by: Donald Sharp + +diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c +index caa2f988e2..bc9815bb10 100644 +--- a/zebra/dplane_fpm_nl.c ++++ b/zebra/dplane_fpm_nl.c +@@ -135,8 +135,6 @@ struct fpm_nl_ctx { + + /* Amount of data plane context processed. 
*/ + _Atomic uint32_t dplane_contexts; +- /* Amount of data plane contexts enqueued. */ +- _Atomic uint32_t ctxqueue_len; + /* Peak amount of data plane contexts enqueued. */ + _Atomic uint32_t ctxqueue_len_peak; + +@@ -311,6 +309,12 @@ DEFUN(fpm_show_counters, fpm_show_counters_cmd, + FPM_STR + "FPM statistic counters\n") + { ++ uint32_t curr_queue_len; ++ ++ frr_with_mutex (&gfnc->ctxqueue_mutex) { ++ curr_queue_len = dplane_ctx_queue_count(&gfnc->ctxqueue); ++ } ++ + vty_out(vty, "%30s\n%30s\n", "FPM counters", "============"); + + #define SHOW_COUNTER(label, counter) \ +@@ -324,8 +328,7 @@ DEFUN(fpm_show_counters, fpm_show_counters_cmd, + SHOW_COUNTER("Connection errors", gfnc->counters.connection_errors); + SHOW_COUNTER("Data plane items processed", + gfnc->counters.dplane_contexts); +- SHOW_COUNTER("Data plane items enqueued", +- gfnc->counters.ctxqueue_len); ++ SHOW_COUNTER("Data plane items enqueued", curr_queue_len); + SHOW_COUNTER("Data plane items queue peak", + gfnc->counters.ctxqueue_len_peak); + SHOW_COUNTER("Buffer full hits", gfnc->counters.buffer_full); +@@ -344,6 +347,12 @@ DEFUN(fpm_show_counters_json, fpm_show_counters_json_cmd, + "FPM statistic counters\n" + JSON_STR) + { ++ uint32_t curr_queue_len; ++ ++ frr_with_mutex (&gfnc->ctxqueue_mutex) { ++ curr_queue_len = dplane_ctx_queue_count(&gfnc->ctxqueue); ++ } ++ + struct json_object *jo; + + jo = json_object_new_object(); +@@ -357,8 +366,7 @@ DEFUN(fpm_show_counters_json, fpm_show_counters_json_cmd, + gfnc->counters.connection_errors); + json_object_int_add(jo, "data-plane-contexts", + gfnc->counters.dplane_contexts); +- json_object_int_add(jo, "data-plane-contexts-queue", +- gfnc->counters.ctxqueue_len); ++ json_object_int_add(jo, "data-plane-contexts-queue", curr_queue_len); + json_object_int_add(jo, "data-plane-contexts-queue-peak", + gfnc->counters.ctxqueue_len_peak); + json_object_int_add(jo, "buffer-full-hits", gfnc->counters.buffer_full); +@@ -1380,8 +1388,6 @@ static void fpm_process_queue(struct thread *t) + + /* Account the processed entries. */ + processed_contexts++; +- atomic_fetch_sub_explicit(&fnc->counters.ctxqueue_len, 1, +- memory_order_relaxed); + + dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS); + dplane_provider_enqueue_out_ctx(fnc->prov, ctx); +@@ -1550,7 +1556,7 @@ static int fpm_nl_process(struct zebra_dplane_provider *prov) + struct zebra_dplane_ctx *ctx; + struct fpm_nl_ctx *fnc; + int counter, limit; +- uint64_t cur_queue, peak_queue = 0, stored_peak_queue; ++ uint64_t cur_queue = 0, peak_queue = 0, stored_peak_queue; + + fnc = dplane_provider_get_data(prov); + limit = dplane_provider_get_work_limit(prov); +@@ -1564,20 +1570,12 @@ static int fpm_nl_process(struct zebra_dplane_provider *prov) + * anyway. + */ + if (fnc->socket != -1 && fnc->connecting == false) { +- /* +- * Update the number of queued contexts *before* +- * enqueueing, to ensure counter consistency. 
+- */ +- atomic_fetch_add_explicit(&fnc->counters.ctxqueue_len, +- 1, memory_order_relaxed); +- + frr_with_mutex (&fnc->ctxqueue_mutex) { + dplane_ctx_enqueue_tail(&fnc->ctxqueue, ctx); ++ cur_queue = ++ dplane_ctx_queue_count(&fnc->ctxqueue); + } + +- cur_queue = atomic_load_explicit( +- &fnc->counters.ctxqueue_len, +- memory_order_relaxed); + if (peak_queue < cur_queue) + peak_queue = cur_queue; + continue; +@@ -1594,9 +1592,7 @@ static int fpm_nl_process(struct zebra_dplane_provider *prov) + atomic_store_explicit(&fnc->counters.ctxqueue_len_peak, + peak_queue, memory_order_relaxed); + +- if (atomic_load_explicit(&fnc->counters.ctxqueue_len, +- memory_order_relaxed) +- > 0) ++ if (cur_queue > 0) + thread_add_timer(fnc->fthread->master, fpm_process_queue, + fnc, 0, &fnc->t_dequeue); + +diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c +index 59b8d6cc63..c252a370b8 100644 +--- a/zebra/zebra_dplane.c ++++ b/zebra/zebra_dplane.c +@@ -969,6 +969,11 @@ struct zebra_dplane_ctx *dplane_ctx_dequeue(struct dplane_ctx_list_head *q) + return ctx; + } + ++uint32_t dplane_ctx_queue_count(struct dplane_ctx_list_head *q) ++{ ++ return dplane_ctx_list_count(q); ++} ++ + /* + * Accessors for information from the context object + */ +diff --git a/zebra/zebra_dplane.h b/zebra/zebra_dplane.h +index 9f9496c8f4..c29e05bbc9 100644 +--- a/zebra/zebra_dplane.h ++++ b/zebra/zebra_dplane.h +@@ -324,6 +324,8 @@ struct zebra_dplane_ctx *dplane_ctx_get_head(struct dplane_ctx_list_head *q); + /* Init a list of contexts */ + void dplane_ctx_q_init(struct dplane_ctx_list_head *q); + ++uint32_t dplane_ctx_queue_count(struct dplane_ctx_list_head *q); ++ + /* + * Accessors for information from the context object + */ +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0043-zebra-Use-the-ctx-queue-counters.patch b/src/sonic-frr/patch/0043-zebra-Use-the-ctx-queue-counters.patch new file mode 100644 index 000000000000..e57684aee9a3 --- /dev/null +++ b/src/sonic-frr/patch/0043-zebra-Use-the-ctx-queue-counters.patch @@ -0,0 +1,108 @@ +From 59e2b19d2d08349ef0197b0adcb13d0bd7de2b79 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 17 Jun 2024 11:05:28 -0400 +Subject: [PATCH 2/5] zebra: Use the ctx queue counters + +The ctx queue data structures already have a counter +associated with them. Let's just use them instead. 
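The underlying idea of the change: a shadow counter maintained with separate atomic increments and decrements can drift from the list it mirrors, while the container's own element count cannot. A hedged sketch of the pattern with invented names (the count field stands in for the count that FRR's list macros maintain internally):

#include <pthread.h>
#include <stdint.h>

struct demo_provider {
	pthread_mutex_t lock;   /* guards in_list and its count */
	uint32_t in_list_count; /* count maintained by the container itself */
};

/* Read the queue length from the container, under the same lock that
 * guards mutation, instead of trusting a separate atomic shadow counter. */
static uint32_t demo_in_queue_len(struct demo_provider *prov)
{
	uint32_t len;

	pthread_mutex_lock(&prov->lock);
	len = prov->in_list_count;
	pthread_mutex_unlock(&prov->lock);

	return len;
}
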
+ +Signed-off-by: Donald Sharp + +diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c +index c252a370b8..c52e032660 100644 +--- a/zebra/zebra_dplane.c ++++ b/zebra/zebra_dplane.c +@@ -492,10 +492,8 @@ struct zebra_dplane_provider { + int (*dp_fini)(struct zebra_dplane_provider *prov, bool early_p); + + _Atomic uint32_t dp_in_counter; +- _Atomic uint32_t dp_in_queued; + _Atomic uint32_t dp_in_max; + _Atomic uint32_t dp_out_counter; +- _Atomic uint32_t dp_out_queued; + _Atomic uint32_t dp_out_max; + _Atomic uint32_t dp_error_counter; + +@@ -6008,17 +6006,19 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed) + + /* Show counters, useful info from each registered provider */ + while (prov) { ++ dplane_provider_lock(prov); ++ in_q = dplane_ctx_queue_count(&prov->dp_ctx_in_list); ++ out_q = dplane_ctx_queue_count(&prov->dp_ctx_out_list); ++ dplane_provider_unlock(prov); + + in = atomic_load_explicit(&prov->dp_in_counter, + memory_order_relaxed); +- in_q = atomic_load_explicit(&prov->dp_in_queued, +- memory_order_relaxed); ++ + in_max = atomic_load_explicit(&prov->dp_in_max, + memory_order_relaxed); + out = atomic_load_explicit(&prov->dp_out_counter, + memory_order_relaxed); +- out_q = atomic_load_explicit(&prov->dp_out_queued, +- memory_order_relaxed); ++ + out_max = atomic_load_explicit(&prov->dp_out_max, + memory_order_relaxed); + +@@ -6169,10 +6169,6 @@ struct zebra_dplane_ctx *dplane_provider_dequeue_in_ctx( + dplane_provider_lock(prov); + + ctx = dplane_ctx_list_pop(&(prov->dp_ctx_in_list)); +- if (ctx) { +- atomic_fetch_sub_explicit(&prov->dp_in_queued, 1, +- memory_order_relaxed); +- } + + dplane_provider_unlock(prov); + +@@ -6200,10 +6196,6 @@ int dplane_provider_dequeue_in_list(struct zebra_dplane_provider *prov, + break; + } + +- if (ret > 0) +- atomic_fetch_sub_explicit(&prov->dp_in_queued, ret, +- memory_order_relaxed); +- + dplane_provider_unlock(prov); + + return ret; +@@ -6228,10 +6220,7 @@ void dplane_provider_enqueue_out_ctx(struct zebra_dplane_provider *prov, + dplane_ctx_list_add_tail(&(prov->dp_ctx_out_list), ctx); + + /* Maintain out-queue counters */ +- atomic_fetch_add_explicit(&(prov->dp_out_queued), 1, +- memory_order_relaxed); +- curr = atomic_load_explicit(&prov->dp_out_queued, +- memory_order_relaxed); ++ curr = dplane_ctx_queue_count(&prov->dp_ctx_out_list); + high = atomic_load_explicit(&prov->dp_out_max, + memory_order_relaxed); + if (curr > high) +@@ -6253,9 +6242,6 @@ dplane_provider_dequeue_out_ctx(struct zebra_dplane_provider *prov) + if (!ctx) + return NULL; + +- atomic_fetch_sub_explicit(&(prov->dp_out_queued), 1, +- memory_order_relaxed); +- + return ctx; + } + +@@ -7260,10 +7246,7 @@ static void dplane_thread_loop(struct thread *event) + + atomic_fetch_add_explicit(&prov->dp_in_counter, counter, + memory_order_relaxed); +- atomic_fetch_add_explicit(&prov->dp_in_queued, counter, +- memory_order_relaxed); +- curr = atomic_load_explicit(&prov->dp_in_queued, +- memory_order_relaxed); ++ curr = dplane_ctx_queue_count(&prov->dp_ctx_in_list); + high = atomic_load_explicit(&prov->dp_in_max, + memory_order_relaxed); + if (curr > high) +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0044-zebra-Modify-dplane-loop-to-allow-backpressure-to-fi.patch b/src/sonic-frr/patch/0044-zebra-Modify-dplane-loop-to-allow-backpressure-to-fi.patch new file mode 100644 index 000000000000..67b76d03424a --- /dev/null +++ b/src/sonic-frr/patch/0044-zebra-Modify-dplane-loop-to-allow-backpressure-to-fi.patch @@ -0,0 +1,199 @@ +From 4671ddf4920553b663fda129f7c4366839347645 
Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Wed, 12 Jun 2024 14:14:48 -0400 +Subject: [PATCH 3/5] zebra: Modify dplane loop to allow backpressure to filter + up + +Currently when the dplane_thread_loop is run, it moves contexts +from the dg_update_list and puts the contexts on the input queue +of the first provider. This provider is given a chance to run +and then the items on the output queue are pulled off and placed +on the input queue of the next provider. Rinse/Repeat down through +the entire list of providers. Now imagine that we have a list +of multiple providers and the last provider is getting backed up. +Contexts will end up sticking in the input queue of the `slow` +provider. This can grow without bounds. This is a real problem +when you have a situation where an interface is flapping and an +upper level protocol is sending a continuous stream of route +updates to reflect the change in ecmp. You can end up with +a very very large backlog of contexts. This is bad because +zebra can easily grow to a very very large memory size and on +restricted systems you can run out of memory. Fortunately +for us, the MetaQ already participates with this process +by not doing more route processing until the dg_update_list +goes below the working limit of dg_updates_per_cycle. Thus, +if FRR modifies the behavior of this loop to not move more +contexts onto the input queue when either the input queue +or output queue of the next provider has reached this limit, +FRR will naturally start auto-handling backpressure for the dplane +context system and memory will not go out of control. + +Signed-off-by: Donald Sharp + +diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c +index c52e032660..f0e1ff6f27 100644 +--- a/zebra/zebra_dplane.c ++++ b/zebra/zebra_dplane.c +@@ -7155,10 +7155,10 @@ static void dplane_thread_loop(struct thread *event) + { + struct dplane_ctx_list_head work_list; + struct dplane_ctx_list_head error_list; +- struct zebra_dplane_provider *prov; ++ struct zebra_dplane_provider *prov, *next_prov; + struct zebra_dplane_ctx *ctx; + int limit, counter, error_counter; +- uint64_t curr, high; ++ uint64_t curr, out_curr, high; + bool reschedule = false; + + /* Capture work limit per cycle */ +@@ -7182,18 +7182,48 @@ static void dplane_thread_loop(struct thread *event) + /* Locate initial registered provider */ + prov = dplane_prov_list_first(&zdplane_info.dg_providers); + +- /* Move new work from incoming list to temp list */ +- for (counter = 0; counter < limit; counter++) { +- ctx = dplane_ctx_list_pop(&zdplane_info.dg_update_list); +- if (ctx) { +- ctx->zd_provider = prov->dp_id; ++ curr = dplane_ctx_queue_count(&prov->dp_ctx_in_list); ++ out_curr = dplane_ctx_queue_count(&prov->dp_ctx_out_list); + +- dplane_ctx_list_add_tail(&work_list, ctx); +- } else { +- break; ++ if (curr >= (uint64_t)limit) { ++ if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) ++ zlog_debug("%s: Current first provider(%s) Input queue is %" PRIu64 ++ ", holding off work", ++ __func__, prov->dp_name, curr); ++ counter = 0; ++ } else if (out_curr >= (uint64_t)limit) { ++ if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) ++ zlog_debug("%s: Current first provider(%s) Output queue is %" PRIu64 ++ ", holding off work", ++ __func__, prov->dp_name, out_curr); ++ counter = 0; ++ } else { ++ int tlimit; ++ /* ++ * Let's limit the work to what can be put on the ++ * in or out queue without going over ++ */ ++ tlimit = limit - MAX(curr, out_curr); ++ /* Move new work from incoming list to temp list */ ++ for (counter = 0; counter < tlimit; 
counter++) { ++ ctx = dplane_ctx_list_pop(&zdplane_info.dg_update_list); ++ if (ctx) { ++ ctx->zd_provider = prov->dp_id; ++ ++ dplane_ctx_list_add_tail(&work_list, ctx); ++ } else { ++ break; ++ } + } + } + ++ /* ++ * If there is anything still on the two input queues, reschedule ++ */ ++ if (dplane_ctx_queue_count(&prov->dp_ctx_in_list) > 0 || ++ dplane_ctx_queue_count(&zdplane_info.dg_update_list) > 0) ++ reschedule = true; ++ + DPLANE_UNLOCK(); + + atomic_fetch_sub_explicit(&zdplane_info.dg_routes_queued, counter, +@@ -7212,8 +7242,9 @@ static void dplane_thread_loop(struct thread *event) + * items. + */ + if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) +- zlog_debug("dplane enqueues %d new work to provider '%s'", +- counter, dplane_provider_get_name(prov)); ++ zlog_debug("dplane enqueues %d new work to provider '%s' curr is %" PRIu64, ++ counter, dplane_provider_get_name(prov), ++ curr); + + /* Capture current provider id in each context; check for + * error status. +@@ -7271,18 +7302,61 @@ static void dplane_thread_loop(struct thread *event) + if (!zdplane_info.dg_run) + break; + ++ /* Locate next provider */ ++ next_prov = dplane_prov_list_next(&zdplane_info.dg_providers, ++ prov); ++ if (next_prov) { ++ curr = dplane_ctx_queue_count( ++ &next_prov->dp_ctx_in_list); ++ out_curr = dplane_ctx_queue_count( ++ &next_prov->dp_ctx_out_list); ++ } else ++ out_curr = curr = 0; ++ + /* Dequeue completed work from the provider */ + dplane_provider_lock(prov); + +- while (counter < limit) { +- ctx = dplane_provider_dequeue_out_ctx(prov); +- if (ctx) { +- dplane_ctx_list_add_tail(&work_list, ctx); +- counter++; +- } else +- break; ++ if (curr >= (uint64_t)limit) { ++ if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) ++ zlog_debug("%s: Next Provider(%s) Input queue is %" PRIu64 ++ ", holding off work", ++ __func__, next_prov->dp_name, curr); ++ counter = 0; ++ } else if (out_curr >= (uint64_t)limit) { ++ if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) ++ zlog_debug("%s: Next Provider(%s) Output queue is %" PRIu64 ++ ", holding off work", ++ __func__, next_prov->dp_name, ++ out_curr); ++ counter = 0; ++ } else { ++ int tlimit; ++ ++ /* ++ * Let's limit the work to what can be put on the ++ * in or out queue without going over ++ */ ++ tlimit = limit - MAX(curr, out_curr); ++ while (counter < tlimit) { ++ ctx = dplane_provider_dequeue_out_ctx(prov); ++ if (ctx) { ++ dplane_ctx_list_add_tail(&work_list, ++ ctx); ++ counter++; ++ } else ++ break; ++ } + } + ++ /* ++ * Let's check if there are still any items on the ++ * input or output queues of the current provider; ++ * if so then we know we need to reschedule. 
++ */ ++ if (dplane_ctx_queue_count(&prov->dp_ctx_in_list) > 0 || ++ dplane_ctx_queue_count(&prov->dp_ctx_out_list) > 0) ++ reschedule = true; ++ + dplane_provider_unlock(prov); + + if (counter >= limit) +@@ -7293,7 +7367,7 @@ static void dplane_thread_loop(struct thread *event) + counter, dplane_provider_get_name(prov)); + + /* Locate next provider */ +- prov = dplane_prov_list_next(&zdplane_info.dg_providers, prov); ++ prov = next_prov; + } + + /* +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0045-zebra-Limit-queue-depth-in-dplane_fpm_nl.patch b/src/sonic-frr/patch/0045-zebra-Limit-queue-depth-in-dplane_fpm_nl.patch new file mode 100644 index 000000000000..7f25a773a101 --- /dev/null +++ b/src/sonic-frr/patch/0045-zebra-Limit-queue-depth-in-dplane_fpm_nl.patch @@ -0,0 +1,52 @@ +From 50f606c158f6c89abd0d3f531905005d3a48a5b6 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Wed, 12 Jun 2024 15:16:08 -0400 +Subject: [PATCH 4/5] zebra: Limit queue depth in dplane_fpm_nl + +The dplane providers have a concept of input queues +and output queues. These queues are chained together +during normal operation. The code in zebra also has +a feedback mechanism where the MetaQ will not run when +the first input queue is backed up. Having the dplane_fpm_nl +code grab all contexts when it is backed up prevents +this system from behaving appropriately. + +Modify the code to not add to the dplane_fpm_nl's internal +queue when it is already full. This will allow the backpressure +to work appropriately in zebra proper. + +Signed-off-by: Donald Sharp + +diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c +index bc9815bb10..4fd42f64a2 100644 +--- a/zebra/dplane_fpm_nl.c ++++ b/zebra/dplane_fpm_nl.c +@@ -1560,6 +1560,25 @@ static int fpm_nl_process(struct zebra_dplane_provider *prov) + + fnc = dplane_provider_get_data(prov); + limit = dplane_provider_get_work_limit(prov); ++ ++ frr_with_mutex (&fnc->ctxqueue_mutex) { ++ cur_queue = dplane_ctx_queue_count(&fnc->ctxqueue); ++ } ++ ++ if (cur_queue >= (uint64_t)limit) { ++ if (IS_ZEBRA_DEBUG_FPM) ++ zlog_debug("%s: Already at a limit(%" PRIu64 ++ ") of internal work, hold off", ++ __func__, cur_queue); ++ limit = 0; ++ } else { ++ if (IS_ZEBRA_DEBUG_FPM) ++ zlog_debug("%s: current queue is %" PRIu64 ++ ", limiting to lesser amount of %" PRIu64, ++ __func__, cur_queue, limit - cur_queue); ++ limit -= cur_queue; ++ } ++ + for (counter = 0; counter < limit; counter++) { + ctx = dplane_provider_dequeue_in_ctx(prov); + if (ctx == NULL) +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0046-zebra-Modify-show-zebra-dplane-providers-to-give-mor.patch b/src/sonic-frr/patch/0046-zebra-Modify-show-zebra-dplane-providers-to-give-mor.patch new file mode 100644 index 000000000000..e865b861d192 --- /dev/null +++ b/src/sonic-frr/patch/0046-zebra-Modify-show-zebra-dplane-providers-to-give-mor.patch @@ -0,0 +1,102 @@ +From d695dfb88418834f0054255dd4385b293efb5a17 Mon Sep 17 00:00:00 2001 +From: Donald Sharp +Date: Mon, 17 Jun 2024 10:42:41 -0400 +Subject: [PATCH 5/5] zebra: Modify show `zebra dplane providers` to give more + data + +The show zebra dplane provider command was omitting +the input and output queues to the dplane itself. +It would be nice to have this insight as well. 
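Conceptually, each provider row in the improved output reports three numbers per direction: total items ever handled, items currently queued, and the queue's peak depth. A toy illustration of the display logic (hypothetical struct and names, not zebra's):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct demo_prov_stats {
	uint64_t in, in_q, in_max;    /* input side: total, queued, peak */
	uint64_t out, out_q, out_max; /* output side: total, queued, peak */
};

static void demo_show_provider(const char *name, uint32_t id,
			       const struct demo_prov_stats *s)
{
	/* Mirrors the "name (id): in: N, q: N, q_max: N, out: ..." shape */
	printf("  %s (%" PRIu32 "): in: %" PRIu64 ", q: %" PRIu64
	       ", q_max: %" PRIu64 ", out: %" PRIu64 ", q: %" PRIu64
	       ", q_max: %" PRIu64 "\n",
	       name, id, s->in, s->in_q, s->in_max, s->out, s->out_q,
	       s->out_max);
}
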
+ +New output: +r1# show zebra dplane providers +dataplane Incoming Queue from Zebra: 100 +Zebra dataplane providers: + Kernel (1): in: 6, q: 0, q_max: 3, out: 6, q: 14, q_max: 3 + dplane_fpm_nl (2): in: 6, q: 10, q_max: 3, out: 6, q: 0, q_max: 3 +dataplane Outgoing Queue to Zebra: 43 +r1# + +Signed-off-by: Donald Sharp + +diff --git a/zebra/rib.h b/zebra/rib.h +index 2e62148ea0..b78cd218f6 100644 +--- a/zebra/rib.h ++++ b/zebra/rib.h +@@ -630,6 +630,7 @@ static inline struct nexthop_group *rib_get_fib_backup_nhg( + } + + extern void zebra_vty_init(void); ++extern uint32_t zebra_rib_dplane_results_count(void); + + extern pid_t pid; + +diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c +index f0e1ff6f27..21c73a3796 100644 +--- a/zebra/zebra_dplane.c ++++ b/zebra/zebra_dplane.c +@@ -5998,12 +5998,14 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed) + struct zebra_dplane_provider *prov; + uint64_t in, in_q, in_max, out, out_q, out_max; + +- vty_out(vty, "Zebra dataplane providers:\n"); +- + DPLANE_LOCK(); + prov = dplane_prov_list_first(&zdplane_info.dg_providers); ++ in = dplane_ctx_queue_count(&zdplane_info.dg_update_list); + DPLANE_UNLOCK(); + ++ vty_out(vty, "dataplane Incoming Queue from Zebra: %" PRIu64 "\n", in); ++ vty_out(vty, "Zebra dataplane providers:\n"); ++ + /* Show counters, useful info from each registered provider */ + while (prov) { + dplane_provider_lock(prov); +@@ -6022,13 +6024,19 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed) + out_max = atomic_load_explicit(&prov->dp_out_max, + memory_order_relaxed); + +- vty_out(vty, "%s (%u): in: %"PRIu64", q: %"PRIu64", q_max: %"PRIu64", out: %"PRIu64", q: %"PRIu64", q_max: %"PRIu64"\n", +- prov->dp_name, prov->dp_id, in, in_q, in_max, +- out, out_q, out_max); ++ vty_out(vty, ++ " %s (%u): in: %" PRIu64 ", q: %" PRIu64 ++ ", q_max: %" PRIu64 ", out: %" PRIu64 ", q: %" PRIu64 ++ ", q_max: %" PRIu64 "\n", ++ prov->dp_name, prov->dp_id, in, in_q, in_max, out, ++ out_q, out_max); + + prov = dplane_prov_list_next(&zdplane_info.dg_providers, prov); + } + ++ out = zebra_rib_dplane_results_count(); ++ vty_out(vty, "dataplane Outgoing Queue to Zebra: %" PRIu64 "\n", out); ++ + return CMD_SUCCESS; + } + +diff --git a/zebra/zebra_rib.c b/zebra/zebra_rib.c +index 1ff3d98545..ae051d37eb 100644 +--- a/zebra/zebra_rib.c ++++ b/zebra/zebra_rib.c +@@ -4874,6 +4874,17 @@ static int rib_dplane_results(struct dplane_ctx_list_head *ctxlist) + return 0; + } + ++uint32_t zebra_rib_dplane_results_count(void) ++{ ++ uint32_t count; ++ ++ frr_with_mutex (&dplane_mutex) { ++ count = dplane_ctx_queue_count(&rib_dplane_q); ++ } ++ ++ return count; ++} ++ + /* + * Ensure there are no empty slots in the route_info array. + * Every route type in zebra should be present there. +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0047-bgpd-backpressure-fix-evpn-route-sync-to-zebra.patch b/src/sonic-frr/patch/0047-bgpd-backpressure-fix-evpn-route-sync-to-zebra.patch new file mode 100644 index 000000000000..c8e3de24ef33 --- /dev/null +++ b/src/sonic-frr/patch/0047-bgpd-backpressure-fix-evpn-route-sync-to-zebra.patch @@ -0,0 +1,33 @@ +From bed7636621589c139ed8d83842df3ce438b8493f Mon Sep 17 00:00:00 2001 +From: Chirag Shah +Date: Mon, 17 Jun 2024 13:58:03 -0700 +Subject: [PATCH] bgpd: backpressure - fix evpn route sync to zebra + +In scaled EVPN + ipv4/ipv6 uni route sync to zebra, +some of the ipv4/ipv6 routes skipped reinstallation +due to a local variable's stale value. 
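The bug class here is a loop-carried flag: a variable initialized once before the loop keeps whatever value the previous iteration left in it. A reduced, self-contained example of the hazard and the fix (hypothetical data, not the bgpd code):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	/* 1 stands for an EVPN-style item, 0 for a plain unicast route. */
	int items[] = { 1, 0, 0 };
	bool is_evpn = false;

	for (int i = 0; i < 3; i++) {
		is_evpn = false; /* the fix: reset per-iteration state */
		if (items[i] == 1)
			is_evpn = true;
		/* Without the reset above, items 1 and 2 would wrongly
		 * take the EVPN path because item 0 set the flag. */
		printf("item %d handled as %s\n", i,
		       is_evpn ? "evpn" : "unicast");
	}

	return 0;
}

The patch below applies exactly this reset at the top of the announcement loop.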
+ +Once the local variable value is reset in each loop +iteration, all skipped routes are synced to zebra properly. + +Ticket: #3948828 + +Signed-off-by: Rajasekar Raja +Signed-off-by: Chirag Shah + +diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c +index 5d5525156b..278e228d66 100644 +--- a/bgpd/bgp_zebra.c ++++ b/bgpd/bgp_zebra.c +@@ -1801,6 +1801,8 @@ static void bgp_handle_route_announcements_to_zebra(struct thread *e) + bool install; + + while (count < ZEBRA_ANNOUNCEMENTS_LIMIT) { ++ is_evpn = false; ++ + dest = zebra_announce_pop(&bm->zebra_announce_head); + + if (!dest) +-- +2.43.2 + diff --git a/src/sonic-frr/patch/0048-bgpd-backpressure-fix-to-properly-remove-dest-for-bg.patch b/src/sonic-frr/patch/0048-bgpd-backpressure-fix-to-properly-remove-dest-for-bg.patch new file mode 100644 index 000000000000..1731b6d21f31 --- /dev/null +++ b/src/sonic-frr/patch/0048-bgpd-backpressure-fix-to-properly-remove-dest-for-bg.patch @@ -0,0 +1,90 @@ +From 05d2c5b3ba6f83cd42a4dd5f9e40959fc438b0a6 Mon Sep 17 00:00:00 2001 +From: Rajasekar Raja +Date: Wed, 10 Jul 2024 16:46:29 -0700 +Subject: [PATCH 1/2] bgpd: backpressure - fix to properly remove dest for bgp + under deletion + +In case of imported routes (L3vni/vrf leaks), when a bgp instance is +being deleted, the peer->bgp comparison with the incoming bgp to remove +the dest from the pending fifo is wrong. This can lead to the fifo +having stale entries, resulting in a crash. + +Two changes are done here. + - Instead of pop/push items in list if the struct bgp doesn't match, + simply iterate the list and remove the expected ones. + + - Corrected the way bgp is fetched from dest rather than relying on + path_info->peer so that it works for all kinds of routes. + +Ticket :#3980988 + +Signed-off-by: Chirag Shah +Signed-off-by: Rajasekar Raja + +diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c +index 2243ffdc77..b1f8f19594 100644 +--- a/bgpd/bgp_evpn.c ++++ b/bgpd/bgp_evpn.c +@@ -6074,16 +6074,16 @@ struct bgpevpn *bgp_evpn_new(struct bgp *bgp, vni_t vni, + void bgp_evpn_free(struct bgp *bgp, struct bgpevpn *vpn) + { + struct bgp_dest *dest = NULL; +- uint32_t ann_count = zebra_announce_count(&bm->zebra_announce_head); ++ struct bgp_dest *dest_next = NULL; + +- while (ann_count) { +- dest = zebra_announce_pop(&bm->zebra_announce_head); +- ann_count--; ++ for (dest = zebra_announce_first(&bm->zebra_announce_head); dest; ++ dest = dest_next) { ++ dest_next = zebra_announce_next(&bm->zebra_announce_head, dest); + if (dest->za_vpn == vpn) { + bgp_path_info_unlock(dest->za_bgp_pi); + bgp_dest_unlock_node(dest); +- } else +- zebra_announce_add_tail(&bm->zebra_announce_head, dest); ++ zebra_announce_del(&bm->zebra_announce_head, dest); ++ } + } + + bgp_evpn_remote_ip_hash_destroy(vpn); +diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c +index 492566f8c8..e16a58b443 100644 +--- a/bgpd/bgpd.c ++++ b/bgpd/bgpd.c +@@ -3689,19 +3689,25 @@ int bgp_delete(struct bgp *bgp) + safi_t safi; + int i; + struct bgp_dest *dest = NULL; ++ struct bgp_dest *dest_next = NULL; ++ struct bgp_table *dest_table = NULL; + struct graceful_restart_info *gr_info; +- uint32_t ann_count = zebra_announce_count(&bm->zebra_announce_head); + + assert(bgp); + +- while (ann_count) { +- dest = zebra_announce_pop(&bm->zebra_announce_head); +- ann_count--; +- if (dest->za_bgp_pi->peer->bgp == bgp) { ++ /* ++ * Iterate the pending dest list and remove all the dest pertaining to 
diff --git a/src/sonic-frr/patch/0048-bgpd-backpressure-fix-to-properly-remove-dest-for-bg.patch b/src/sonic-frr/patch/0048-bgpd-backpressure-fix-to-properly-remove-dest-for-bg.patch
new file mode 100644
index 000000000000..1731b6d21f31
--- /dev/null
+++ b/src/sonic-frr/patch/0048-bgpd-backpressure-fix-to-properly-remove-dest-for-bg.patch
@@ -0,0 +1,90 @@
+From 05d2c5b3ba6f83cd42a4dd5f9e40959fc438b0a6 Mon Sep 17 00:00:00 2001
+From: Rajasekar Raja
+Date: Wed, 10 Jul 2024 16:46:29 -0700
+Subject: [PATCH 1/2] bgpd: backpressure - fix to properly remove dest for bgp
+ under deletion
+
+In case of imported routes (L3vni/vrf leaks), when a bgp instance is
+being deleted, the peer->bgp comparison with the incoming bgp to remove
+the dest from the pending fifo is wrong. This can lead to the fifo
+having stale entries, resulting in a crash.
+
+Two changes are done here.
+ - Instead of popping/pushing items in the list when the struct bgp
+   doesn't match, simply iterate the list and remove the matching
+   entries.
+
+ - Corrected the way bgp is fetched from the dest rather than relying
+   on path_info->peer, so that it works for all kinds of routes.
+
+Ticket: #3980988
+
+Signed-off-by: Chirag Shah
+Signed-off-by: Rajasekar Raja
+
+diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c
+index 2243ffdc77..b1f8f19594 100644
+--- a/bgpd/bgp_evpn.c
++++ b/bgpd/bgp_evpn.c
+@@ -6074,16 +6074,16 @@ struct bgpevpn *bgp_evpn_new(struct bgp *bgp, vni_t vni,
+ void bgp_evpn_free(struct bgp *bgp, struct bgpevpn *vpn)
+ {
+ 	struct bgp_dest *dest = NULL;
+-	uint32_t ann_count = zebra_announce_count(&bm->zebra_announce_head);
++	struct bgp_dest *dest_next = NULL;
+ 
+-	while (ann_count) {
+-		dest = zebra_announce_pop(&bm->zebra_announce_head);
+-		ann_count--;
++	for (dest = zebra_announce_first(&bm->zebra_announce_head); dest;
++	     dest = dest_next) {
++		dest_next = zebra_announce_next(&bm->zebra_announce_head, dest);
+ 		if (dest->za_vpn == vpn) {
+ 			bgp_path_info_unlock(dest->za_bgp_pi);
+ 			bgp_dest_unlock_node(dest);
+-		} else
+-			zebra_announce_add_tail(&bm->zebra_announce_head, dest);
++			zebra_announce_del(&bm->zebra_announce_head, dest);
++		}
+ 	}
+ 
+ 	bgp_evpn_remote_ip_hash_destroy(vpn);
+diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c
+index 492566f8c8..e16a58b443 100644
+--- a/bgpd/bgpd.c
++++ b/bgpd/bgpd.c
+@@ -3689,19 +3689,25 @@ int bgp_delete(struct bgp *bgp)
+ 	safi_t safi;
+ 	int i;
+ 	struct bgp_dest *dest = NULL;
++	struct bgp_dest *dest_next = NULL;
++	struct bgp_table *dest_table = NULL;
+ 	struct graceful_restart_info *gr_info;
+-	uint32_t ann_count = zebra_announce_count(&bm->zebra_announce_head);
+ 
+ 	assert(bgp);
+ 
+-	while (ann_count) {
+-		dest = zebra_announce_pop(&bm->zebra_announce_head);
+-		ann_count--;
+-		if (dest->za_bgp_pi->peer->bgp == bgp) {
++	/*
++	 * Iterate the pending dest list and remove all the dest pertaining to
++	 * the bgp under delete.
++	 */
++	for (dest = zebra_announce_first(&bm->zebra_announce_head); dest;
++	     dest = dest_next) {
++		dest_next = zebra_announce_next(&bm->zebra_announce_head, dest);
++		dest_table = bgp_dest_table(dest);
++		if (dest_table->bgp == bgp) {
+ 			bgp_path_info_unlock(dest->za_bgp_pi);
+ 			bgp_dest_unlock_node(dest);
+-		} else
+-			zebra_announce_add_tail(&bm->zebra_announce_head, dest);
++			zebra_announce_del(&bm->zebra_announce_head, dest);
++		}
+ 	}
+ 
+ 	bgp_soft_reconfig_table_task_cancel(bgp, NULL, NULL);
+-- 
+2.43.2
+
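The rewritten loops above depend on fetching the next element before the current one can be unlinked, which is what makes deletion during iteration safe. Here is a minimal sketch of the same iterate-and-remove pattern on a plain singly linked list; all names are hypothetical and independent of FRR's typesafe list API.

/*
 * Sketch: save the next pointer before any unlink/free, so removing
 * the current node never invalidates the iterator.
 */
#include <stdio.h>
#include <stdlib.h>

struct node {
	int owner; /* stands in for the owning struct bgp */
	struct node *next;
};

static struct node *head;

static void push(int owner)
{
	struct node *n = malloc(sizeof(*n));

	n->owner = owner;
	n->next = head;
	head = n;
}

static void remove_owner(int owner)
{
	struct node **pp = &head, *cur, *next;

	for (cur = head; cur; cur = next) {
		next = cur->next; /* fetch before possibly unlinking cur */
		if (cur->owner == owner) {
			*pp = next;
			free(cur);
		} else
			pp = &cur->next;
	}
}

int main(void)
{
	push(1);
	push(2);
	push(1);
	remove_owner(1); /* drops both entries owned by 1 in one pass */
	for (struct node *n = head; n; n = n->next)
		printf("remaining owner: %d\n", n->owner);
	return 0;
}

Compared with the old pop/push approach, this touches each element exactly once and never reorders the surviving entries.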
diff --git a/src/sonic-frr/patch/0049-bgpd-backpressure-Improve-debuggability.patch b/src/sonic-frr/patch/0049-bgpd-backpressure-Improve-debuggability.patch
new file mode 100644
index 000000000000..e74a2a7c3114
--- /dev/null
+++ b/src/sonic-frr/patch/0049-bgpd-backpressure-Improve-debuggability.patch
@@ -0,0 +1,46 @@
+From 0dd44dc0d99b69e6c1853f46dbae4a30fc4b9aed Mon Sep 17 00:00:00 2001
+From: Rajasekar Raja
+Date: Wed, 10 Jul 2024 20:17:14 -0700
+Subject: [PATCH 2/2] bgpd: backpressure - Improve debuggability
+
+Improve debuggability in backpressure code.
+
+Ticket: #3980988
+
+Signed-off-by: Rajasekar Raja
+
+diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c
+index e16a58b443..2e1c5e555b 100644
+--- a/bgpd/bgpd.c
++++ b/bgpd/bgpd.c
+@@ -3692,6 +3692,7 @@ int bgp_delete(struct bgp *bgp)
+ 	struct bgp_dest *dest_next = NULL;
+ 	struct bgp_table *dest_table = NULL;
+ 	struct graceful_restart_info *gr_info;
++	uint32_t cnt_before, cnt_after;
+ 
+ 	assert(bgp);
+ 
+@@ -3699,6 +3700,7 @@ int bgp_delete(struct bgp *bgp)
+ 	 * Iterate the pending dest list and remove all the dest pertaining to
+ 	 * the bgp under delete.
+ 	 */
++	cnt_before = zebra_announce_count(&bm->zebra_announce_head);
+ 	for (dest = zebra_announce_first(&bm->zebra_announce_head); dest;
+ 	     dest = dest_next) {
+ 		dest_next = zebra_announce_next(&bm->zebra_announce_head, dest);
+@@ -3710,6 +3712,11 @@ int bgp_delete(struct bgp *bgp)
+ 		}
+ 	}
+ 
++	cnt_after = zebra_announce_count(&bm->zebra_announce_head);
++	if (BGP_DEBUG(zebra, ZEBRA))
++		zlog_debug("Zebra Announce Fifo cleanup count before %u and after %u during BGP %s deletion",
++			   cnt_before, cnt_after, bgp->name_pretty);
++
+ 	bgp_soft_reconfig_table_task_cancel(bgp, NULL, NULL);
+ 
+ 	/* make sure we withdraw any exported routes */
+-- 
+2.43.2
+
diff --git a/src/sonic-frr/patch/0050-bgpd-backpressure-Avoid-use-after-free.patch b/src/sonic-frr/patch/0050-bgpd-backpressure-Avoid-use-after-free.patch
new file mode 100644
index 000000000000..878d60eb5e37
--- /dev/null
+++ b/src/sonic-frr/patch/0050-bgpd-backpressure-Avoid-use-after-free.patch
@@ -0,0 +1,48 @@
+From df1d28fcc12d4f5541c9335115887d31e6197b80 Mon Sep 17 00:00:00 2001
+From: Rajasekar Raja
+Date: Mon, 22 Jul 2024 10:13:19 -0700
+Subject: [PATCH] bgpd: backpressure - Avoid use after free
+
+Coverity complains there is a use after free (1598495 and 1598496).
+At this point, most likely dest->refcount cannot go to 1 and free up
+the dest, but there might be some code path where this can happen.
+
+Fixing this with a simple order change (no harm fix).
+
+Ticket: #4001204
+
+Signed-off-by: Rajasekar Raja
+
+diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c
+index b1f8f19594..bb3cd62950 100644
+--- a/bgpd/bgp_evpn.c
++++ b/bgpd/bgp_evpn.c
+@@ -6080,9 +6080,9 @@ void bgp_evpn_free(struct bgp *bgp, struct bgpevpn *vpn)
+ 	     dest = dest_next) {
+ 		dest_next = zebra_announce_next(&bm->zebra_announce_head, dest);
+ 		if (dest->za_vpn == vpn) {
++			zebra_announce_del(&bm->zebra_announce_head, dest);
+ 			bgp_path_info_unlock(dest->za_bgp_pi);
+ 			bgp_dest_unlock_node(dest);
+-			zebra_announce_del(&bm->zebra_announce_head, dest);
+ 		}
+ 	}
+ 
+diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c
+index 2e1c5e555b..342982069b 100644
+--- a/bgpd/bgpd.c
++++ b/bgpd/bgpd.c
+@@ -3706,9 +3706,9 @@ int bgp_delete(struct bgp *bgp)
+ 		dest_next = zebra_announce_next(&bm->zebra_announce_head, dest);
+ 		dest_table = bgp_dest_table(dest);
+ 		if (dest_table->bgp == bgp) {
++			zebra_announce_del(&bm->zebra_announce_head, dest);
+ 			bgp_path_info_unlock(dest->za_bgp_pi);
+ 			bgp_dest_unlock_node(dest);
+-			zebra_announce_del(&bm->zebra_announce_head, dest);
+ 		}
+ 	}
+ 
+-- 
+2.43.2
+
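The reorder in patch 0050 above is the standard defense against use-after-free in refcounted containers: unlink from the shared structure first, drop references last. A minimal sketch with hypothetical types:

/*
 * Sketch: if node_unref() ran first and happened to drop the last
 * reference, list_del() would then walk a freed node. Removing from
 * the list before unref means the list never points at freed memory.
 */
#include <assert.h>
#include <stdlib.h>

struct node {
	int refcnt;
	struct node *next;
};

static struct node *head;

static void node_unref(struct node *n)
{
	if (--n->refcnt == 0)
		free(n); /* after this, n must not be on any list */
}

static void list_del(struct node *n)
{
	struct node **pp;

	for (pp = &head; *pp; pp = &(*pp)->next)
		if (*pp == n) {
			*pp = n->next;
			return;
		}
}

int main(void)
{
	struct node *n = calloc(1, sizeof(*n));

	n->refcnt = 1; /* the list holds the only reference */
	n->next = head;
	head = n;

	/* the safe order: remove from the list first, then unref */
	list_del(n);
	node_unref(n); /* may free n; head no longer references it */

	assert(head == NULL);
	return 0;
}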
diff --git a/src/sonic-frr/patch/0051-bgpd-backpressure-fix-ret-value-evpn_route_select_in.patch b/src/sonic-frr/patch/0051-bgpd-backpressure-fix-ret-value-evpn_route_select_in.patch
new file mode 100644
index 000000000000..53d12a6a83d9
--- /dev/null
+++ b/src/sonic-frr/patch/0051-bgpd-backpressure-fix-ret-value-evpn_route_select_in.patch
@@ -0,0 +1,73 @@
+From cc56963da0e8f0ca606bc9b932e9180ad059f8c5 Mon Sep 17 00:00:00 2001
+From: Rajasekar Raja
+Date: Tue, 16 Jul 2024 23:34:15 -0700
+Subject: [PATCH 1/2] bgpd: backpressure - fix ret value
+ evpn_route_select_install
+
+The return value of evpn_route_select_install is ignored in all cases
+except during vni route table install/uninstall, and based on the
+returned value an error is logged. Fixing this.
+
+Ticket: #3992392
+
+Signed-off-by: Rajasekar Raja
+
+diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c
+index bb3cd62950..34128e7c19 100644
+--- a/bgpd/bgp_evpn.c
++++ b/bgpd/bgp_evpn.c
+@@ -1433,11 +1433,12 @@ int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn,
+ 	    && !bgp_addpath_is_addpath_used(&bgp->tx_addpath, afi, safi)) {
+ 		if (bgp_zebra_has_route_changed(old_select)) {
+ 			if (CHECK_FLAG(bgp->flags, BGP_FLAG_DELETE_IN_PROGRESS))
+-				evpn_zebra_install(
+-					bgp, vpn,
+-					(const struct prefix_evpn *)
+-						bgp_dest_get_prefix(dest),
+-					old_select);
++				ret = evpn_zebra_install(bgp, vpn,
++							 (const struct prefix_evpn
++								  *)
++							 bgp_dest_get_prefix(
++								  dest),
++							 old_select);
+ 			else
+ 				bgp_zebra_route_install(dest, old_select, bgp,
+ 							true, vpn, false);
+@@ -1475,10 +1476,11 @@ int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn,
+ 	    && (new_select->sub_type == BGP_ROUTE_IMPORTED ||
+ 		bgp_evpn_attr_is_sync(new_select->attr))) {
+ 		if (CHECK_FLAG(bgp->flags, BGP_FLAG_DELETE_IN_PROGRESS))
+-			evpn_zebra_install(bgp, vpn,
+-					   (const struct prefix_evpn *)
+-						   bgp_dest_get_prefix(dest),
+-					   new_select);
++			ret = evpn_zebra_install(bgp, vpn,
++						 (const struct prefix_evpn *)
++						 bgp_dest_get_prefix(
++							 dest),
++						 new_select);
+ 		else
+ 			bgp_zebra_route_install(dest, new_select, bgp, true,
+ 						vpn, false);
+@@ -1503,11 +1505,12 @@ int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn,
+ 			if (CHECK_FLAG(bgp->flags,
+ 				       BGP_FLAG_DELETE_IN_PROGRESS) ||
+ 			    CHECK_FLAG(bgp->flags, BGP_FLAG_VNI_DOWN))
+-				evpn_zebra_uninstall(
+-					bgp, vpn,
+-					(const struct prefix_evpn *)
+-						bgp_dest_get_prefix(dest),
+-					old_select, false);
++				ret = evpn_zebra_uninstall(bgp, vpn,
++							   (const struct prefix_evpn
++								    *)
++							   bgp_dest_get_prefix(
++								    dest),
++							   old_select, false);
+ 			else
+ 				bgp_zebra_route_install(dest, old_select, bgp,
+ 							false, vpn, false);
+-- 
+2.43.2
+
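The fix above is simply to capture a status that was previously discarded, so the error handling that keys off the return value can actually fire. A minimal sketch with hypothetical helpers (install_route stands in for evpn_zebra_install/uninstall):

/*
 * Sketch: every branch stores its status into ret instead of dropping
 * it, so the single error check at the end sees install failures.
 */
#include <stdio.h>

static int install_route(int id)
{
	return id == 2 ? -1 : 0; /* pretend route 2 fails to install */
}

static int select_and_install(int id, int synchronous)
{
	int ret = 0;

	if (synchronous)
		ret = install_route(id); /* was previously called with the result discarded */
	else
		ret = 0; /* async path: enqueue only, nothing to fail yet */

	if (ret != 0)
		fprintf(stderr, "install of route %d failed\n", id);
	return ret;
}

int main(void)
{
	select_and_install(1, 1); /* succeeds silently */
	select_and_install(2, 1); /* now visibly fails instead of silently */
	return 0;
}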
diff --git a/src/sonic-frr/patch/0052-bgpd-backpressure-log-error-for-evpn-when-route-inst.patch b/src/sonic-frr/patch/0052-bgpd-backpressure-log-error-for-evpn-when-route-inst.patch
new file mode 100644
index 000000000000..62ba8a3b739e
--- /dev/null
+++ b/src/sonic-frr/patch/0052-bgpd-backpressure-log-error-for-evpn-when-route-inst.patch
@@ -0,0 +1,61 @@
+From dd591de04b0e25c74a9936c854bb6dbe7839bd39 Mon Sep 17 00:00:00 2001
+From: Rajasekar Raja
+Date: Thu, 18 Jul 2024 22:23:23 -0700
+Subject: [PATCH 2/2] bgpd: backpressure - log error for evpn when route
+ install to zebra fails.
+
+log error for evpn in case route install to zebra fails.
+
+Ticket: #3992392
+
+Signed-off-by: Rajasekar Raja
+
+diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c
+index 278e228d66..038d328a60 100644
+--- a/bgpd/bgp_zebra.c
++++ b/bgpd/bgp_zebra.c
+@@ -1799,6 +1799,7 @@ static void bgp_handle_route_announcements_to_zebra(struct thread *e)
+ 	struct bgp_table *table = NULL;
+ 	enum zclient_send_status status = ZCLIENT_SEND_SUCCESS;
+ 	bool install;
++	const struct prefix_evpn *evp = NULL;
+ 
+ 	while (count < ZEBRA_ANNOUNCEMENTS_LIMIT) {
+ 		is_evpn = false;
+@@ -1809,10 +1810,12 @@ static void bgp_handle_route_announcements_to_zebra(struct thread *e)
+ 			break;
+ 
+ 		table = bgp_dest_table(dest);
+-		install =
+-			CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL);
+-		if (table->afi == AFI_L2VPN && table->safi == SAFI_EVPN)
++		install = CHECK_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_INSTALL);
++		if (table->afi == AFI_L2VPN && table->safi == SAFI_EVPN) {
+ 			is_evpn = true;
++			evp = (const struct prefix_evpn *)bgp_dest_get_prefix(
++				dest);
++		}
+ 
+ 		if (BGP_DEBUG(zebra, ZEBRA))
+ 			zlog_debug(
+@@ -1845,6 +1848,17 @@ static void bgp_handle_route_announcements_to_zebra(struct thread *e)
+ 			UNSET_FLAG(dest->flags, BGP_NODE_SCHEDULE_FOR_DELETE);
+ 		}
+ 
++		if (is_evpn && status == ZCLIENT_SEND_FAILURE)
++			flog_err(EC_BGP_EVPN_FAIL,
++				 "%s (%u): Failed to %s EVPN %pFX %s route in VNI %u",
++				 vrf_id_to_name(table->bgp->vrf_id),
++				 table->bgp->vrf_id,
++				 install ? "install" : "uninstall", evp,
++				 evp->prefix.route_type == BGP_EVPN_MAC_IP_ROUTE
++					 ? "MACIP"
++					 : "IMET",
++				 dest->za_vpn->vni);
++
+ 		bgp_path_info_unlock(dest->za_bgp_pi);
+ 		dest->za_bgp_pi = NULL;
+ 		dest->za_vpn = NULL;
+-- 
+2.43.2
+
diff --git a/src/sonic-frr/patch/series b/src/sonic-frr/patch/series
index 445bc9dd9653..7f1d79d8199d 100644
--- a/src/sonic-frr/patch/series
+++ b/src/sonic-frr/patch/series
@@ -27,3 +27,26 @@
 0027-lib-Do-not-convert-EVPN-prefixes-into-IPv4-IPv6-if-n.patch
 0028-zebra-fix-parse-attr-problems-for-encap.patch
 0029-zebra-nhg-fix-on-intf-up.patch
+0030-zebra-backpressure-Zebra-push-back-on-Buffer-Stream-.patch
+0031-bgpd-backpressure-Add-a-typesafe-list-for-Zebra-Anno.patch
+0032-bgpd-fix-flushing-ipv6-flowspec-entries-when-peering.patch
+0033-bgpd-backpressure-cleanup-bgp_zebra_XX-func-args.patch
+0034-gpd-backpressure-Handle-BGP-Zebra-Install-evt-Creat.patch
+0035-bgpd-backpressure-Handle-BGP-Zebra-EPVN-Install-evt-.patch
+0036-zebra-backpressure-Fix-Null-ptr-access-Coverity-Issu.patch
+0037-bgpd-Increase-install-uninstall-speed-of-evpn-vpn-vn.patch
+0038-zebra-Actually-display-I-O-buffer-sizes.patch
+0039-zebra-Actually-display-I-O-buffer-sizes-part-2.patch
+0040-bgpd-backpressure-Fix-to-withdraw-evpn-type-5-routes.patch
+0041-bgpd-backpressure-Fix-to-avoid-CPU-hog.patch
+0042-zebra-Use-built-in-data-structure-counter.patch
+0043-zebra-Use-the-ctx-queue-counters.patch
+0044-zebra-Modify-dplane-loop-to-allow-backpressure-to-fi.patch
+0045-zebra-Limit-queue-depth-in-dplane_fpm_nl.patch
+0046-zebra-Modify-show-zebra-dplane-providers-to-give-mor.patch
+0047-bgpd-backpressure-fix-evpn-route-sync-to-zebra.patch
+0048-bgpd-backpressure-fix-to-properly-remove-dest-for-bg.patch
+0049-bgpd-backpressure-Improve-debuggability.patch
+0050-bgpd-backpressure-Avoid-use-after-free.patch
+0051-bgpd-backpressure-fix-ret-value-evpn_route_select_in.patch
+0052-bgpd-backpressure-log-error-for-evpn-when-route-inst.patch
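Patch 0052, the last functional patch in the series above, closes the loop: when the asynchronous send to zebra fails, the failure is logged with enough context (VRF, prefix, route type, VNI) to debug the drop instead of silently continuing. A minimal sketch of that pattern with hypothetical types (send_to_zebra stands in for the real zclient send path):

/*
 * Sketch: on a failed async send, log the identifying context of the
 * announcement that was lost rather than just the failure itself.
 */
#include <stdio.h>

enum send_status { SEND_SUCCESS, SEND_FAILURE };

struct announcement {
	const char *vrf;
	const char *prefix;
	int install; /* 1 = install, 0 = uninstall */
};

static enum send_status send_to_zebra(const struct announcement *a)
{
	(void)a;
	return SEND_FAILURE; /* simulate a full socket buffer */
}

int main(void)
{
	struct announcement a = { "default", "10.0.0.0/24", 1 };

	if (send_to_zebra(&a) == SEND_FAILURE)
		fprintf(stderr, "%s: failed to %s route %s\n", a.vrf,
			a.install ? "install" : "uninstall", a.prefix);
	return 0;
}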