diff options
35 files changed, 1567 insertions, 256 deletions
diff --git a/bgpd/bgp_attr.c b/bgpd/bgp_attr.c index c64d153f1..e21c84355 100644 --- a/bgpd/bgp_attr.c +++ b/bgpd/bgp_attr.c @@ -2956,7 +2956,8 @@ void bgp_packet_mpattr_prefix(struct stream *s, afi_t afi, safi_t safi, addpath_encode, addpath_tx_id); } else if (safi == SAFI_LABELED_UNICAST) { /* Prefix write with label. */ - stream_put_labeled_prefix(s, p, label); + stream_put_labeled_prefix(s, p, label, addpath_encode, + addpath_tx_id); } else if (safi == SAFI_FLOWSPEC) { if (PSIZE (p->prefixlen)+2 < FLOWSPEC_NLRI_SIZELIMIT) stream_putc(s, PSIZE (p->prefixlen)+2); diff --git a/bgpd/bgp_evpn_vty.c b/bgpd/bgp_evpn_vty.c index a22082c07..d031d34f1 100644 --- a/bgpd/bgp_evpn_vty.c +++ b/bgpd/bgp_evpn_vty.c @@ -1069,11 +1069,9 @@ static int bgp_show_ethernet_vpn(struct vty *vty, struct prefix_rd *prd, pi = pi->next) { total_count++; if (type == bgp_show_type_neighbor) { - union sockunion *su = output_arg; + struct peer *peer = output_arg; - if (pi->peer->su_remote == NULL - || !sockunion_same( - pi->peer->su_remote, su)) + if (peer_cmp(peer, pi->peer) != 0) continue; } if (header) { @@ -1292,29 +1290,41 @@ DEFUN(show_ip_bgp_l2vpn_evpn_rd_tags, 0); } -DEFUN(show_ip_bgp_l2vpn_evpn_all_neighbor_routes, - show_ip_bgp_l2vpn_evpn_all_neighbor_routes_cmd, - "show [ip] bgp l2vpn evpn all neighbors A.B.C.D routes [json]", +DEFUN(show_ip_bgp_l2vpn_evpn_neighbor_routes, + show_ip_bgp_l2vpn_evpn_neighbor_routes_cmd, + "show [ip] bgp l2vpn evpn neighbors <A.B.C.D|X:X::X:X|WORD> routes [json]", SHOW_STR IP_STR BGP_STR L2VPN_HELP_STR EVPN_HELP_STR - "Display information about all EVPN NLRIs\n" "Detailed information on TCP and BGP neighbor connections\n" - "Neighbor to display information about\n" + "IPv4 Neighbor to display information about\n" + "IPv6 Neighbor to display information about\n" + "Neighbor on BGP configured interface\n" "Display routes learned from neighbor\n" JSON_STR) { - int idx_ipv4 = 0; - union sockunion su; + int idx = 0; struct peer *peer; - int ret; + char *peerstr = NULL; bool uj = use_json(argc, argv); + afi_t afi = AFI_L2VPN; + safi_t safi = SAFI_EVPN; + struct bgp *bgp = NULL; - argv_find(argv, argc, "A.B.C.D", &idx_ipv4); + bgp_vty_find_and_parse_afi_safi_bgp(vty, argv, argc, &idx, &afi, &safi, + &bgp, uj); + if (!idx) { + vty_out(vty, "No index\n"); + return CMD_WARNING; + } + + /* neighbors <A.B.C.D|X:X::X:X|WORD> */ + argv_find(argv, argc, "neighbors", &idx); + peerstr = argv[++idx]->arg; - ret = str2sockunion(argv[idx_ipv4]->arg, &su); - if (ret < 0) { + peer = peer_lookup_in_view(vty, bgp, peerstr, uj); + if (!peer) { if (uj) { json_object *json_no = NULL; json_no = json_object_new_object(); @@ -1325,11 +1335,9 @@ DEFUN(show_ip_bgp_l2vpn_evpn_all_neighbor_routes, json_object_free(json_no); } else vty_out(vty, "Malformed address: %s\n", - argv[idx_ipv4]->arg); + argv[idx]->arg); return CMD_WARNING; } - - peer = peer_lookup(NULL, &su); if (!peer || !peer->afc[AFI_L2VPN][SAFI_EVPN]) { if (uj) { json_object *json_no = NULL; @@ -1345,13 +1353,13 @@ DEFUN(show_ip_bgp_l2vpn_evpn_all_neighbor_routes, return CMD_WARNING; } - return bgp_show_ethernet_vpn(vty, NULL, bgp_show_type_neighbor, &su, 0, + return bgp_show_ethernet_vpn(vty, NULL, bgp_show_type_neighbor, peer, 0, uj); } DEFUN(show_ip_bgp_l2vpn_evpn_rd_neighbor_routes, show_ip_bgp_l2vpn_evpn_rd_neighbor_routes_cmd, - "show [ip] bgp l2vpn evpn rd ASN:NN_OR_IP-ADDRESS:NN neighbors A.B.C.D routes [json]", + "show [ip] bgp l2vpn evpn rd ASN:NN_OR_IP-ADDRESS:NN neighbors <A.B.C.D|X:X::X:X|WORD> routes [json]", SHOW_STR IP_STR BGP_STR @@ -1360,20 +1368,30 @@ DEFUN(show_ip_bgp_l2vpn_evpn_rd_neighbor_routes, "Display information for a route distinguisher\n" "VPN Route Distinguisher\n" "Detailed information on TCP and BGP neighbor connections\n" - "Neighbor to display information about\n" + "IPv4 Neighbor to display information about\n" + "IPv6 Neighbor to display information about\n" + "Neighbor on BGP configured interface\n" "Display routes learned from neighbor\n" JSON_STR) { int idx_ext_community = 0; - int idx_ipv4 = 0; + int idx = 0; int ret; - union sockunion su; struct peer *peer; + char *peerstr = NULL; struct prefix_rd prd; bool uj = use_json(argc, argv); + afi_t afi = AFI_L2VPN; + safi_t safi = SAFI_EVPN; + struct bgp *bgp = NULL; - argv_find(argv, argc, "ASN:NN_OR_IP-ADDRESS:NN", &idx_ext_community); - argv_find(argv, argc, "A.B.C.D", &idx_ipv4); + bgp_vty_find_and_parse_afi_safi_bgp(vty, argv, argc, &idx, &afi, &safi, + &bgp, uj); + if (!idx) { + vty_out(vty, "No index\n"); + return CMD_WARNING; + } + argv_find(argv, argc, "ASN:NN_OR_IP-ADDRESS:NN", &idx_ext_community); ret = str2prefix_rd(argv[idx_ext_community]->arg, &prd); if (!ret) { if (uj) { @@ -1389,8 +1407,12 @@ DEFUN(show_ip_bgp_l2vpn_evpn_rd_neighbor_routes, return CMD_WARNING; } - ret = str2sockunion(argv[idx_ipv4]->arg, &su); - if (ret < 0) { + /* neighbors <A.B.C.D|X:X::X:X|WORD> */ + argv_find(argv, argc, "neighbors", &idx); + peerstr = argv[++idx]->arg; + + peer = peer_lookup_in_view(vty, bgp, peerstr, uj); + if (!peer) { if (uj) { json_object *json_no = NULL; json_no = json_object_new_object(); @@ -1401,11 +1423,9 @@ DEFUN(show_ip_bgp_l2vpn_evpn_rd_neighbor_routes, json_object_free(json_no); } else vty_out(vty, "Malformed address: %s\n", - argv[idx_ext_community]->arg); + argv[idx]->arg); return CMD_WARNING; } - - peer = peer_lookup(NULL, &su); if (!peer || !peer->afc[AFI_L2VPN][SAFI_EVPN]) { if (uj) { json_object *json_no = NULL; @@ -1421,33 +1441,48 @@ DEFUN(show_ip_bgp_l2vpn_evpn_rd_neighbor_routes, return CMD_WARNING; } - return bgp_show_ethernet_vpn(vty, &prd, bgp_show_type_neighbor, &su, 0, + return bgp_show_ethernet_vpn(vty, &prd, bgp_show_type_neighbor, peer, 0, uj); } -DEFUN(show_ip_bgp_l2vpn_evpn_all_neighbor_advertised_routes, - show_ip_bgp_l2vpn_evpn_all_neighbor_advertised_routes_cmd, - "show [ip] bgp l2vpn evpn all neighbors A.B.C.D advertised-routes [json]", +DEFUN(show_ip_bgp_l2vpn_evpn_neighbor_advertised_routes, + show_ip_bgp_l2vpn_evpn_neighbor_advertised_routes_cmd, + "show [ip] bgp l2vpn evpn neighbors <A.B.C.D|X:X::X:X|WORD> advertised-routes [json]", SHOW_STR IP_STR BGP_STR L2VPN_HELP_STR EVPN_HELP_STR - "Display information about all EVPN NLRIs\n" "Detailed information on TCP and BGP neighbor connections\n" - "Neighbor to display information about\n" + "IPv4 Neighbor to display information about\n" + "IPv6 Neighbor to display information about\n" + "Neighbor on BGP configured interface\n" "Display the routes advertised to a BGP neighbor\n" JSON_STR) { - int idx_ipv4 = 0; - int ret; + int idx = 0; struct peer *peer; - union sockunion su; bool uj = use_json(argc, argv); + struct bgp *bgp = NULL; + afi_t afi = AFI_L2VPN; + safi_t safi = SAFI_EVPN; + char *peerstr = NULL; + + if (uj) + argc--; - argv_find(argv, argc, "A.B.C.D", &idx_ipv4); + bgp_vty_find_and_parse_afi_safi_bgp(vty, argv, argc, &idx, &afi, &safi, + &bgp, uj); + if (!idx) { + vty_out(vty, "No index\n"); + return CMD_WARNING; + } + + /* neighbors <A.B.C.D|X:X::X:X|WORD> */ + argv_find(argv, argc, "neighbors", &idx); + peerstr = argv[++idx]->arg; - ret = str2sockunion(argv[idx_ipv4]->arg, &su); - if (ret < 0) { + peer = peer_lookup_in_view(vty, bgp, peerstr, uj); + if (!peer) { if (uj) { json_object *json_no = NULL; json_no = json_object_new_object(); @@ -1458,10 +1493,9 @@ DEFUN(show_ip_bgp_l2vpn_evpn_all_neighbor_advertised_routes, json_object_free(json_no); } else vty_out(vty, "Malformed address: %s\n", - argv[idx_ipv4]->arg); + argv[idx]->arg); return CMD_WARNING; } - peer = peer_lookup(NULL, &su); if (!peer || !peer->afc[AFI_L2VPN][SAFI_EVPN]) { if (uj) { json_object *json_no = NULL; @@ -1482,7 +1516,7 @@ DEFUN(show_ip_bgp_l2vpn_evpn_all_neighbor_advertised_routes, DEFUN(show_ip_bgp_l2vpn_evpn_rd_neighbor_advertised_routes, show_ip_bgp_l2vpn_evpn_rd_neighbor_advertised_routes_cmd, - "show [ip] bgp l2vpn evpn rd ASN:NN_OR_IP-ADDRESS:NN neighbors A.B.C.D advertised-routes [json]", + "show [ip] bgp l2vpn evpn rd ASN:NN_OR_IP-ADDRESS:NN neighbors <A.B.C.D|X:X::X:X|WORD> advertised-routes [json]", SHOW_STR IP_STR BGP_STR @@ -1491,22 +1525,43 @@ DEFUN(show_ip_bgp_l2vpn_evpn_rd_neighbor_advertised_routes, "Display information for a route distinguisher\n" "VPN Route Distinguisher\n" "Detailed information on TCP and BGP neighbor connections\n" - "Neighbor to display information about\n" + "IPv4 Neighbor to display information about\n" + "IPv6 Neighbor to display information about\n" + "Neighbor on BGP configured interface\n" "Display the routes advertised to a BGP neighbor\n" JSON_STR) { int idx_ext_community = 0; - int idx_ipv4 = 0; + int idx = 0; int ret; struct peer *peer; struct prefix_rd prd; - union sockunion su; + struct bgp *bgp = NULL; bool uj = use_json(argc, argv); + char *peerstr = NULL; + afi_t afi = AFI_L2VPN; + safi_t safi = SAFI_EVPN; + + if (uj) + argc--; + + if (uj) + argc--; + + bgp_vty_find_and_parse_afi_safi_bgp(vty, argv, argc, &idx, &afi, &safi, + &bgp, uj); + if (!idx) { + vty_out(vty, "No index\n"); + return CMD_WARNING; + } argv_find(argv, argc, "ASN:NN_OR_IP-ADDRESS:NN", &idx_ext_community); - argv_find(argv, argc, "A.B.C.D", &idx_ipv4); - ret = str2sockunion(argv[idx_ipv4]->arg, &su); - if (ret < 0) { + /* neighbors <A.B.C.D|X:X::X:X|WORD> */ + argv_find(argv, argc, "neighbors", &idx); + peerstr = argv[++idx]->arg; + + peer = peer_lookup_in_view(vty, bgp, peerstr, uj); + if (!peer) { if (uj) { json_object *json_no = NULL; json_no = json_object_new_object(); @@ -1517,10 +1572,9 @@ DEFUN(show_ip_bgp_l2vpn_evpn_rd_neighbor_advertised_routes, json_object_free(json_no); } else vty_out(vty, "Malformed address: %s\n", - argv[idx_ext_community]->arg); + argv[idx]->arg); return CMD_WARNING; } - peer = peer_lookup(NULL, &su); if (!peer || !peer->afc[AFI_L2VPN][SAFI_EVPN]) { if (uj) { json_object *json_no = NULL; @@ -1599,7 +1653,7 @@ DEFUN(show_ip_bgp_evpn_rd_overlay, use_json(argc, argv)); } -/* For testing purpose, static route of MPLS-VPN. */ +/* For testing purpose, static route of EVPN RT-5. */ DEFUN(evpnrt5_network, evpnrt5_network_cmd, "network <A.B.C.D/M|X:X::X:X/M> rd ASN:NN_OR_IP-ADDRESS:NN ethtag WORD label WORD esi WORD gwip <A.B.C.D|X:X::X:X> routermac WORD [route-map WORD]", @@ -1638,7 +1692,7 @@ DEFUN(evpnrt5_network, argv[idx_routermac]->arg); } -/* For testing purpose, static route of MPLS-VPN. */ +/* For testing purpose, static route of EVPN RT-5. */ DEFUN(no_evpnrt5_network, no_evpnrt5_network_cmd, "no network <A.B.C.D/M|X:X::X:X/M> rd ASN:NN_OR_IP-ADDRESS:NN ethtag WORD label WORD esi WORD gwip <A.B.C.D|X:X::X:X>", @@ -5323,12 +5377,12 @@ void bgp_ethernetvpn_init(void) install_element(VIEW_NODE, &show_ip_bgp_l2vpn_evpn_all_tags_cmd); install_element(VIEW_NODE, &show_ip_bgp_l2vpn_evpn_rd_tags_cmd); install_element(VIEW_NODE, - &show_ip_bgp_l2vpn_evpn_all_neighbor_routes_cmd); + &show_ip_bgp_l2vpn_evpn_neighbor_routes_cmd); install_element(VIEW_NODE, &show_ip_bgp_l2vpn_evpn_rd_neighbor_routes_cmd); install_element( VIEW_NODE, - &show_ip_bgp_l2vpn_evpn_all_neighbor_advertised_routes_cmd); + &show_ip_bgp_l2vpn_evpn_neighbor_advertised_routes_cmd); install_element( VIEW_NODE, &show_ip_bgp_l2vpn_evpn_rd_neighbor_advertised_routes_cmd); diff --git a/bgpd/bgp_lcommunity.c b/bgpd/bgp_lcommunity.c index 2b09a2954..aeb290719 100644 --- a/bgpd/bgp_lcommunity.c +++ b/bgpd/bgp_lcommunity.c @@ -439,7 +439,8 @@ struct lcommunity *lcommunity_str2com(const char *str) enum lcommunity_token token = lcommunity_token_unknown; struct lcommunity_val lval; - while ((str = lcommunity_gettoken(str, &lval, &token))) { + do { + str = lcommunity_gettoken(str, &lval, &token); switch (token) { case lcommunity_token_val: if (lcom == NULL) @@ -452,7 +453,8 @@ struct lcommunity *lcommunity_str2com(const char *str) lcommunity_free(&lcom); return NULL; } - } + } while (str); + return lcom; } diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c index a5591e29f..a37256837 100644 --- a/bgpd/bgp_route.c +++ b/bgpd/bgp_route.c @@ -5332,6 +5332,13 @@ static void bgp_purge_af_static_redist_routes(struct bgp *bgp, afi_t afi, struct bgp_node *rn; struct bgp_path_info *pi; + /* Do not install the aggregate route if BGP is in the + * process of termination. + */ + if (bgp_flag_check(bgp, BGP_FLAG_DELETE_IN_PROGRESS) || + (bgp->peer_self == NULL)) + return; + table = bgp->rib[afi][safi]; for (rn = bgp_table_top(table); rn; rn = bgp_route_next(rn)) { for (pi = bgp_node_get_bgp_path_info(rn); pi; pi = pi->next) { @@ -10493,63 +10500,6 @@ static int bgp_show_prefix_longer(struct vty *vty, struct bgp *bgp, return ret; } -static struct peer *peer_lookup_in_view(struct vty *vty, struct bgp *bgp, - const char *ip_str, bool use_json) -{ - int ret; - struct peer *peer; - union sockunion su; - - /* Get peer sockunion. */ - ret = str2sockunion(ip_str, &su); - if (ret < 0) { - peer = peer_lookup_by_conf_if(bgp, ip_str); - if (!peer) { - peer = peer_lookup_by_hostname(bgp, ip_str); - - if (!peer) { - if (use_json) { - json_object *json_no = NULL; - json_no = json_object_new_object(); - json_object_string_add( - json_no, - "malformedAddressOrName", - ip_str); - vty_out(vty, "%s\n", - json_object_to_json_string_ext( - json_no, - JSON_C_TO_STRING_PRETTY)); - json_object_free(json_no); - } else - vty_out(vty, - "%% Malformed address or name: %s\n", - ip_str); - return NULL; - } - } - return peer; - } - - /* Peer structure lookup. */ - peer = peer_lookup(bgp, &su); - if (!peer) { - if (use_json) { - json_object *json_no = NULL; - json_no = json_object_new_object(); - json_object_string_add(json_no, "warning", - "No such neighbor in this view/vrf"); - vty_out(vty, "%s\n", - json_object_to_json_string_ext( - json_no, JSON_C_TO_STRING_PRETTY)); - json_object_free(json_no); - } else - vty_out(vty, "No such neighbor in this view/vrf\n"); - return NULL; - } - - return peer; -} - enum bgp_stats { BGP_STATS_MAXBITLEN = 0, BGP_STATS_RIB, diff --git a/bgpd/bgp_routemap.c b/bgpd/bgp_routemap.c index 5ffc416dc..7f1a9b71c 100644 --- a/bgpd/bgp_routemap.c +++ b/bgpd/bgp_routemap.c @@ -3406,31 +3406,34 @@ int bgp_route_map_update_timer(struct thread *thread) static void bgp_route_map_mark_update(const char *rmap_name) { - if (bm->t_rmap_update == NULL) { - struct listnode *node, *nnode; - struct bgp *bgp; - - /* rmap_update_timer of 0 means don't do route updates */ - if (bm->rmap_update_timer) { - bm->t_rmap_update = NULL; - thread_add_timer(bm->master, bgp_route_map_update_timer, - NULL, bm->rmap_update_timer, - &bm->t_rmap_update); - - /* Signal the groups that a route-map update event has - * started */ - for (ALL_LIST_ELEMENTS(bm->bgp, node, nnode, bgp)) - update_group_policy_update(bgp, - BGP_POLICY_ROUTE_MAP, - rmap_name, 1, 1); - } else { - for (ALL_LIST_ELEMENTS(bm->bgp, node, nnode, bgp)) - bgp_route_map_process_update(bgp, rmap_name, 0); + struct listnode *node, *nnode; + struct bgp *bgp; + + /* If new update is received before the current timer timed out, + * turn it off and start a new timer. + */ + if (bm->t_rmap_update != NULL) + THREAD_OFF(bm->t_rmap_update); + + /* rmap_update_timer of 0 means don't do route updates */ + if (bm->rmap_update_timer) { + thread_add_timer(bm->master, bgp_route_map_update_timer, + NULL, bm->rmap_update_timer, + &bm->t_rmap_update); + + /* Signal the groups that a route-map update event has + * started */ + for (ALL_LIST_ELEMENTS(bm->bgp, node, nnode, bgp)) + update_group_policy_update(bgp, + BGP_POLICY_ROUTE_MAP, + rmap_name, 1, 1); + } else { + for (ALL_LIST_ELEMENTS(bm->bgp, node, nnode, bgp)) + bgp_route_map_process_update(bgp, rmap_name, 0); #if ENABLE_BGP_VNC - zlog_debug("%s: calling vnc_routemap_update", __func__); - vnc_routemap_update(bgp, __func__); + zlog_debug("%s: calling vnc_routemap_update", __func__); + vnc_routemap_update(bgp, __func__); #endif - } } } diff --git a/bgpd/bgp_vpn.c b/bgpd/bgp_vpn.c index 54ca980ca..09b1cb429 100644 --- a/bgpd/bgp_vpn.c +++ b/bgpd/bgp_vpn.c @@ -74,7 +74,7 @@ int show_adj_route_vpn(struct vty *vty, struct peer *peer, json_object_string_add(json_ocode, "incomplete", "?"); } - for (rn = bgp_table_top(bgp->rib[afi][SAFI_MPLS_VPN]); rn; + for (rn = bgp_table_top(bgp->rib[afi][safi]); rn; rn = bgp_route_next(rn)) { if (prd && memcmp(rn->p.u.val, prd->val, 8) != 0) continue; @@ -200,7 +200,7 @@ int show_adj_route_vpn(struct vty *vty, struct peer *peer, json_array); } else { route_vty_out_tmp(vty, &rm->p, path->attr, - SAFI_MPLS_VPN, use_json, + safi, use_json, json_array); } } diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c index d79a68dca..b5f267cc3 100644 --- a/bgpd/bgpd.c +++ b/bgpd/bgpd.c @@ -8047,3 +8047,61 @@ void bgp_terminate(void) bgp_mac_finish(); } + +struct peer *peer_lookup_in_view(struct vty *vty, struct bgp *bgp, + const char *ip_str, bool use_json) +{ + int ret; + struct peer *peer; + union sockunion su; + + /* Get peer sockunion. */ + ret = str2sockunion(ip_str, &su); + if (ret < 0) { + peer = peer_lookup_by_conf_if(bgp, ip_str); + if (!peer) { + peer = peer_lookup_by_hostname(bgp, ip_str); + + if (!peer) { + if (use_json) { + json_object *json_no = NULL; + json_no = json_object_new_object(); + json_object_string_add( + json_no, + "malformedAddressOrName", + ip_str); + vty_out(vty, "%s\n", + json_object_to_json_string_ext( + json_no, + JSON_C_TO_STRING_PRETTY)); + json_object_free(json_no); + } else + vty_out(vty, + "%% Malformed address or name: %s\n", + ip_str); + return NULL; + } + } + return peer; + } + + /* Peer structure lookup. */ + peer = peer_lookup(bgp, &su); + if (!peer) { + if (use_json) { + json_object *json_no = NULL; + json_no = json_object_new_object(); + json_object_string_add(json_no, "warning", + "No such neighbor in this view/vrf"); + vty_out(vty, "%s\n", + json_object_to_json_string_ext( + json_no, JSON_C_TO_STRING_PRETTY)); + json_object_free(json_no); + } else + vty_out(vty, "No such neighbor in this view/vrf\n"); + return NULL; + } + + return peer; +} + diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h index 777db0ce2..9e05fd562 100644 --- a/bgpd/bgpd.h +++ b/bgpd/bgpd.h @@ -1927,4 +1927,8 @@ extern void bgp_unset_redist_vrf_bitmaps(struct bgp *, vrf_id_t); /* For benefit of rfapi */ extern struct peer *peer_new(struct bgp *bgp); + +extern struct peer *peer_lookup_in_view(struct vty *vty, struct bgp *bgp, + const char *ip_str, bool use_json); + #endif /* _QUAGGA_BGPD_H */ diff --git a/configure.ac b/configure.ac index 961336fbd..134c8692d 100755 --- a/configure.ac +++ b/configure.ac @@ -572,6 +572,20 @@ AC_ARG_ENABLE([thread-sanitizer], AS_HELP_STRING([--enable-thread-sanitizer], [enable ThreadSanitizer support for detecting data races])) AC_ARG_ENABLE([memory-sanitizer], AS_HELP_STRING([--enable-memory-sanitizer], [enable MemorySanitizer support for detecting uninitialized memory reads])) +AC_ARG_WITH([crypto], + AS_HELP_STRING([--with-crypto=<internal|openssl>], [choose between different implementations of cryptographic functions(default value is --with-crypto=internal)])) + +#if openssl, else use the internal +AS_IF([test x"${with_crypto}" = x"openssl"], [ +AC_CHECK_LIB([crypto], [EVP_DigestInit], [LIBS="$LIBS -lcrypto"], [], []) +if test $ac_cv_lib_crypto_EVP_DigestInit = no; then + AC_MSG_ERROR([build with openssl has been specified but openssl library was not found on your system]) +else + AC_DEFINE([CRYPTO_OPENSSL], [1], [Compile with openssl support]) +fi +], [test x"${with_crypto}" = x"internal" || test x"${with_crypto}" = x"" ], [AC_DEFINE([CRYPTO_INTERNAL], [1], [Compile with internal cryptographic implementation]) +], [AC_MSG_ERROR([Unknown value for --with-crypto])] +) AS_IF([test "${enable_clippy_only}" != "yes"], [ AC_CHECK_HEADERS([json-c/json.h]) diff --git a/doc/developer/library.rst b/doc/developer/library.rst index 4ba0c0ebc..7cd493ccc 100644 --- a/doc/developer/library.rst +++ b/doc/developer/library.rst @@ -8,6 +8,7 @@ Library Facilities (libfrr) :maxdepth: 2 memtypes + rcu lists logging hooks diff --git a/doc/developer/rcu.rst b/doc/developer/rcu.rst new file mode 100644 index 000000000..c2ddf93f5 --- /dev/null +++ b/doc/developer/rcu.rst @@ -0,0 +1,269 @@ +.. highlight:: c + +RCU +=== + +Introduction +------------ + +RCU (Read-Copy-Update) is, fundamentally, a paradigm of multithreaded +operation (and not a set of APIs.) The core ideas are: + +* longer, complicated updates to structures are made only on private, + "invisible" copies. Other threads, when they access the structure, see an + older (but consistent) copy. + +* once done, the updated copy is swapped in in a single operation so that + other threads see either the old or the new data but no inconsistent state + between. + +* the old instance is only released after making sure that it is impossible + any other thread might still be reading it. + +For more information, please search for general or Linux kernel RCU +documentation; there is no way this doc can be comprehensive in explaining the +interactions: + +* https://en.wikipedia.org/wiki/Read-copy-update +* https://www.kernel.org/doc/html/latest/kernel-hacking/locking.html#avoiding-locks-read-copy-update +* https://lwn.net/Articles/262464/ +* http://www.rdrop.com/users/paulmck/RCU/rclock_OLS.2001.05.01c.pdf +* http://lse.sourceforge.net/locking/rcupdate.html + +RCU, the TL;DR +^^^^^^^^^^^^^^ + +#. data structures are always consistent for reading. That's the "R" part. +#. reading never blocks / takes a lock. +#. rcu_read_lock is not a lock in the traditional sense. Think of it as a + "reservation"; it notes what the *oldest* possible thing the thread might + be seeing is, and which thus can't be deleted yet. +#. you create some object, finish it up, and then publish it. +#. publishing is an ``atomic_*`` call with ``memory_order_release``, which + tells the compiler to make sure prior memory writes have completed before + doing the atomic op. +#. ``ATOMLIST_*`` ``add`` operations do the ``memory_order_release`` for you. +#. you can't touch the object after it is published, except with atomic ops. +#. because you can't touch it, if you want to change it you make a new copy, + work on that, and then publish the new copy. That's the "CU" part. +#. deleting the object is also an atomic op. +#. other threads that started working before you published / deleted an object + might not see the new object / still see the deleted object. +#. because other threads may still see deleted objects, the ``free()`` needs + to be delayed. That's what :c:func:`rcu_free()` is for. + + +When (not) to use RCU +^^^^^^^^^^^^^^^^^^^^^ + +RCU is designed for read-heavy workloads where objects are updated relatively +rarely, but frequently accessed. Do *not* indiscriminately replace locking by +RCU patterns. + +The "copy" part of RCU implies that, while updating, several copies of a given +object exist in parallel. Even after the updated copy is swapped in, the old +object remains queued for freeing until all other threads are guaranteed to +not be accessing it anymore, due to passing a sequence point. In addition to +the increased memory usage, there may be some bursted (due to batching) malloc +contention when the RCU cleanup thread does its thing and frees memory. + +Other useful patterns +^^^^^^^^^^^^^^^^^^^^^ + +In addition to the full "copy object, apply changes, atomically update" +approach, there are 2 "reduced" usage cases that can be done: + +* atomically updating single pieces of a particular object, e.g. some flags + or configuration piece + +* straight up read-only / immutable objects + +Both of these cases can be considered RCU "subsets". For example, when +maintaining an atomic list of items, but these items only have a single +integer value that needs to be updated, that value can be atomically updated +without copying the entire object. However, the object still needs to be +free'd through :c:func:`rcu_free()` since reading/updating and deleting might +be happening concurrently. The same applies for immutable objects; deletion +might still race with reading so they need to be free'd through RCU. + +FRR API +------- + +Before diving into detail on the provided functions, it is important to note +that the FRR RCU API covers the **cleanup part of RCU, not the read-copy-update +paradigm itself**. These parts are handled by standard C11 atomic operations, +and by extension through the atomic data structures (ATOMLIST, ATOMSORT & co.) + +The ``rcu_*`` functions only make sense in conjunction with these RCU access +patterns. If you're calling the RCU API but not using these, something is +wrong. The other way around is not necessarily true; it is possible to use +atomic ops & datastructures with other types of locking, e.g. rwlocks. + +.. c:function:: void rcu_read_lock() +.. c:function:: void rcu_read_unlock() + + These functions acquire / release the RCU read-side lock. All access to + RCU-guarded data must be inside a block guarded by these. Any number of + threads may hold the RCU read-side lock at a given point in time, including + both no threads at all and all threads. + + The functions implement a depth counter, i.e. can be nested. The nested + calls are cheap, since they only increment/decrement the counter. + Therefore, any place that uses RCU data and doesn't have a guarantee that + the caller holds RCU (e.g. ``lib/`` code) should just have its own + rcu_read_lock/rcu_read_unlock pair. + + At the "root" level (e.g. un-nested), these calls can incur the cost of one + syscall (to ``futex()``). That puts them on about the same cost as a + mutex lock/unlock. + + The ``thread_master`` code currently always holds RCU everywhere, except + while doing the actual ``poll()`` syscall. This is both an optimization as + well as an "easement" into getting RCU going. The current implementation + contract is that any ``struct thread *`` callback is called with a RCU + holding depth of 1, and that this is owned by the thread so it may (should) + drop and reacquire it when doing some longer-running work. + + .. warning:: + + The RCU read-side lock must be held **continuously** for the entire time + any piece of RCU data is used. This includes any access to RCU data + after the initial ``atomic_load``. If the RCU read-side lock is + released, any RCU-protected pointers as well as the data they refer to + become invalid, as another thread may have called :c:func:`rcu_free` on + them. + +.. c:type:: struct rcu_head +.. c:type:: struct rcu_head_close +.. c:type:: struct rcu_action + + The ``rcu_head`` structures are small (16-byte) bits that contain the + queueing machinery for the RCU sweeper/cleanup mechanisms. + + Any piece of data that is cleaned up by RCU needs to have a matching + ``rcu_head`` embedded in it. If there is more than one cleanup operation + to be done (e.g. closing a file descriptor), more than one ``rcu_head`` may + be embedded. + + .. warning:: + + It is not possible to reuse a ``rcu_head``. It is owned by the RCU code + as soon as ``rcu_*`` is called on it. + + The ``_close`` variant carries an extra ``int fd`` field to store the fd to + be closed. + + To minimize the amount of memory used for ``rcu_head``, details about the + RCU operation to be performed are moved into the ``rcu_action`` structure. + It contains e.g. the MTYPE for :c:func:`rcu_free` calls. The pointer to be + freed is stored as an offset relative to the ``rcu_head``, which means it + must be embedded as a struct field so the offset is constant. + + The ``rcu_action`` structure is an implementation detail. Using + ``rcu_free`` or ``rcu_close`` will set it up correctly without further + code needed. + + The ``rcu_head`` may be put in an union with other data if the other data + is only used during "life" of the data, since the ``rcu_head`` is used only + for the "death" of data. But note that other threads may still be reading + a piece of data while a thread is working to free it. + +.. c:function:: void rcu_free(struct memtype *mtype, struct X *ptr, field) + + Free a block of memory after RCU has ensured no other thread can be + accessing it anymore. The pointer remains valid for any other thread that + has called :c:func:`rcu_read_lock` before the ``rcu_free`` call. + + .. warning:: + + In some other RCU implementations, the pointer remains valid to the + *calling* thread if it is holding the RCU read-side lock. This is not + the case in FRR, particularly when running single-threaded. Enforcing + this rule also allows static analysis to find use-after-free issues. + + ``mtype`` is the libfrr ``MTYPE_FOO`` allocation type to pass to + :c:func:`XFREE`. + + ``field`` must be the name of a ``struct rcu_head`` member field in ``ptr``. + The offset of this field (which must be constant) is used to reduce the + memory size of ``struct rcu_head``. + + .. note:: + + ``rcu_free`` (and ``rcu_close``) calls are more efficient if they are + put close to each other. When freeing several RCU'd resources, try to + move the calls next to each other (even if the data structures do not + directly point to each other.) + + Having the calls bundled reduces the cost of adding the ``rcu_head`` to + the RCU queue; the RCU queue is an atomic data structure whose usage + will require the CPU to acquire an exclusive hold on relevant cache + lines. + +.. c:function:: void rcu_close(struct rcu_head_close *head, int fd) + + Close a file descriptor after ensuring no other thread might be using it + anymore. Same as :c:func:`rcu_free`, except it calls ``close`` instead of + ``free``. + +Internals +^^^^^^^^^ + +.. c:type:: struct rcu_thread + + Per-thread state maintained by the RCU code, set up by the following + functions. A pointer to a thread's own ``rcu_thread`` is saved in + thread-local storage. + +.. c:function:: struct rcu_thread *rcu_thread_prepare(void) +.. c:function:: void rcu_thread_unprepare(struct rcu_thread *rcu_thread) +.. c:function:: void rcu_thread_start(struct rcu_thread *rcu_thread) + + Since the RCU code needs to have a list of all active threads, these + functions are used by the ``frr_pthread`` code to set up threads. Teardown + is automatic. It should not be necessary to call these functions. + + Any thread that accesses RCU-protected data needs to be registered with + these functions. Threads that do not access RCU-protected data may call + these functions but do not need to. + + Note that passing a pointer to RCU-protected data to some library which + accesses that pointer makes the library "access RCU-protected data". In + that case, either all of the library's threads must be registered for RCU, + or the code must instead pass a (non-RCU) copy of the data to the library. + +.. c:function:: void rcu_shutdown(void) + + Stop the RCU sweeper thread and make sure all cleanup has finished. + + This function is called on daemon exit by the libfrr code to ensure pending + RCU operations are completed. This is mostly to get a clean exit without + memory leaks from queued RCU operations. It should not be necessary to + call this function as libfrr handles this. + +FRR specifics and implementation details +---------------------------------------- + +The FRR RCU infrastructure has the following characteristics: + +* it is Epoch-based with a 32-bit wrapping counter. (This is somewhat + different from other Epoch-based approaches which may be designed to only + use 3 counter values, but works out to a simple implementation.) + +* instead of tracking CPUs as the Linux kernel does, threads are tracked. This + has exactly zero semantic impact, RCU just cares about "threads of + execution", which the kernel can optimize to CPUs but we can't. But it + really boils down to the same thing. + +* there are no ``rcu_dereference`` and ``rcu_assign_pointer`` - use + ``atomic_load`` and ``atomic_store`` instead. (These didn't exist when the + Linux RCU code was created.) + +* there is no ``synchronize_rcu``; this is a design choice but may be revisited + at a later point. ``synchronize_rcu`` blocks a thread until it is guaranteed + that no other threads might still be accessing data structures that they may + have access to at the beginning of the function call. This is a blocking + design and probably not appropriate for FRR. Instead, ``rcu_call`` can be + used to have the RCU sweeper thread make a callback after the same constraint + is fulfilled in an asynchronous way. Most needs should be covered by + ``rcu_free`` and ``rcu_close``. diff --git a/doc/developer/subdir.am b/doc/developer/subdir.am index 996f12d47..1fc593e56 100644 --- a/doc/developer/subdir.am +++ b/doc/developer/subdir.am @@ -42,6 +42,7 @@ dev_RSTFILES = \ doc/developer/packaging-debian.rst \ doc/developer/packaging-redhat.rst \ doc/developer/packaging.rst \ + doc/developer/rcu.rst \ doc/developer/testing.rst \ doc/developer/topotests-snippets.rst \ doc/developer/topotests.rst \ diff --git a/lib/frr_pthread.c b/lib/frr_pthread.c index e588571c0..bdb6c2a39 100644 --- a/lib/frr_pthread.c +++ b/lib/frr_pthread.c @@ -133,18 +133,29 @@ int frr_pthread_set_name(struct frr_pthread *fpt) return ret; } +static void *frr_pthread_inner(void *arg) +{ + struct frr_pthread *fpt = arg; + + rcu_thread_start(fpt->rcu_thread); + return fpt->attr.start(fpt); +} + int frr_pthread_run(struct frr_pthread *fpt, const pthread_attr_t *attr) { int ret; - ret = pthread_create(&fpt->thread, attr, fpt->attr.start, fpt); + fpt->rcu_thread = rcu_thread_prepare(); + ret = pthread_create(&fpt->thread, attr, frr_pthread_inner, fpt); /* * Per pthread_create(3), the contents of fpt->thread are undefined if * pthread_create() did not succeed. Reset this value to zero. */ - if (ret < 0) + if (ret < 0) { + rcu_thread_unprepare(fpt->rcu_thread); memset(&fpt->thread, 0x00, sizeof(fpt->thread)); + } return ret; } diff --git a/lib/frr_pthread.h b/lib/frr_pthread.h index 3afe7ba96..6096a5037 100644 --- a/lib/frr_pthread.h +++ b/lib/frr_pthread.h @@ -23,6 +23,7 @@ #include <pthread.h> #include "frratomic.h" #include "memory.h" +#include "frrcu.h" #include "thread.h" #ifdef __cplusplus @@ -50,6 +51,8 @@ struct frr_pthread { /* pthread id */ pthread_t thread; + struct rcu_thread *rcu_thread; + /* thread master for this pthread's thread.c event loop */ struct thread_master *master; diff --git a/lib/frrcu.c b/lib/frrcu.c new file mode 100644 index 000000000..7e6475b64 --- /dev/null +++ b/lib/frrcu.c @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2017-19 David Lamparter, for NetDEF, Inc. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* implementation notes: this is an epoch-based RCU implementation. rcu_seq + * (global variable) counts the current epoch. Threads hold a specific epoch + * in rcu_read_lock(). This is the oldest epoch a thread might be accessing + * data from. + * + * The rcu_seq global is only pushed forward on rcu_read_lock() and + * rcu_read_unlock() calls. This makes things a tad more efficient since + * those are the only places it matters: + * - on rcu_read_lock, we don't want to hold an old epoch pointlessly + * - on rcu_read_unlock, we want to make sure we're not stuck on an old epoch + * when heading into a long idle period where no thread holds RCU + * + * rcu_thread structures themselves are RCU-free'd. + * + * rcu_head structures are the most iffy; normally for an ATOMLIST we would + * need to make sure we use rcu_free or pthread_rwlock to deallocate old items + * to prevent ABA or use-after-free problems. However, our ATOMLIST code + * guarantees that if the list remains non-empty in all cases, we only need + * the "last" pointer to do an "add_tail()", i.e. we can't run into ABA/UAF + * issues - but we do need to keep at least 1 item on the list. + * + * (Search the atomlist code for all uses of "last") + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <pthread.h> +#ifdef HAVE_PTHREAD_NP_H +#include <pthread_np.h> +#endif +#include <string.h> +#include <unistd.h> +#include <signal.h> + +#include "frrcu.h" +#include "seqlock.h" +#include "atomlist.h" + +DEFINE_MTYPE_STATIC(LIB, RCU_THREAD, "RCU thread") +DEFINE_MTYPE_STATIC(LIB, RCU_NEXT, "RCU sequence barrier") + +DECLARE_ATOMLIST(rcu_heads, struct rcu_head, head) + +PREDECL_ATOMLIST(rcu_threads) +struct rcu_thread { + struct rcu_threads_item head; + + struct rcu_head rcu_head; + + struct seqlock rcu; + + /* only accessed by thread itself, not atomic */ + unsigned depth; +}; +DECLARE_ATOMLIST(rcu_threads, struct rcu_thread, head) + +static const struct rcu_action rcua_next = { .type = RCUA_NEXT }; +static const struct rcu_action rcua_end = { .type = RCUA_END }; +static const struct rcu_action rcua_close = { .type = RCUA_CLOSE }; + +struct rcu_next { + struct rcu_head head_free; + struct rcu_head head_next; +}; + +#define rcu_free_internal(mtype, ptr, field) \ + do { \ + typeof(ptr) _ptr = (ptr); \ + struct rcu_head *_rcu_head = &_ptr->field; \ + static const struct rcu_action _rcu_action = { \ + .type = RCUA_FREE, \ + .u.free = { \ + .mt = mtype, \ + .offset = offsetof(typeof(*_ptr), field), \ + }, \ + }; \ + _rcu_head->action = &_rcu_action; \ + rcu_heads_add_tail(&rcu_heads, _rcu_head); \ + } while (0) + +/* primary global RCU position */ +static struct seqlock rcu_seq; +/* this is set to rcu_seq whenever something is added on the RCU queue. + * rcu_read_lock() and rcu_read_unlock() will then bump rcu_seq up one step. + */ +static _Atomic seqlock_val_t rcu_dirty; + +static struct rcu_threads_head rcu_threads; +static struct rcu_heads_head rcu_heads; + +/* main thread & RCU sweeper have pre-setup rcu_thread structures. The + * reasons are different: + * + * - rcu_thread_main is there because the main thread isn't started like + * other threads, it's implicitly created when the program is started. So + * rcu_thread_main matches up implicitly. + * + * - rcu_thread_rcu isn't actually put on the rcu_threads list (makes no + * sense really), it only exists so we can call RCU-using functions from + * the RCU thread without special handling in rcu_read_lock/unlock. + */ +static struct rcu_thread rcu_thread_main; +static struct rcu_thread rcu_thread_rcu; + +static pthread_t rcu_pthread; +static pthread_key_t rcu_thread_key; +static bool rcu_active; + +static void rcu_start(void); +static void rcu_bump(void); + +/* + * preinitialization for main thread + */ +static void rcu_thread_end(void *rcu_thread); + +static void rcu_preinit(void) __attribute__((constructor)); +static void rcu_preinit(void) +{ + struct rcu_thread *rt; + + rt = &rcu_thread_main; + rt->depth = 1; + seqlock_init(&rt->rcu); + seqlock_acquire_val(&rt->rcu, SEQLOCK_STARTVAL); + + pthread_key_create(&rcu_thread_key, rcu_thread_end); + pthread_setspecific(rcu_thread_key, rt); + + rcu_threads_add_tail(&rcu_threads, rt); + + /* RCU sweeper's rcu_thread is a dummy, NOT added to rcu_threads */ + rt = &rcu_thread_rcu; + rt->depth = 1; + + seqlock_init(&rcu_seq); + seqlock_acquire_val(&rcu_seq, SEQLOCK_STARTVAL); +} + +static struct rcu_thread *rcu_self(void) +{ + return (struct rcu_thread *)pthread_getspecific(rcu_thread_key); +} + +/* + * thread management (for the non-main thread) + */ +struct rcu_thread *rcu_thread_prepare(void) +{ + struct rcu_thread *rt, *cur; + + rcu_assert_read_locked(); + + if (!rcu_active) + rcu_start(); + + cur = rcu_self(); + assert(cur->depth); + + /* new thread always starts with rcu_read_lock held at depth 1, and + * holding the same epoch as the parent (this makes it possible to + * use RCU for things passed into the thread through its arg) + */ + rt = XCALLOC(MTYPE_RCU_THREAD, sizeof(*rt)); + rt->depth = 1; + + seqlock_init(&rt->rcu); + seqlock_acquire(&rt->rcu, &cur->rcu); + + rcu_threads_add_tail(&rcu_threads, rt); + + return rt; +} + +void rcu_thread_start(struct rcu_thread *rt) +{ + pthread_setspecific(rcu_thread_key, rt); +} + +void rcu_thread_unprepare(struct rcu_thread *rt) +{ + if (rt == &rcu_thread_rcu) + return; + + rt->depth = 1; + seqlock_acquire(&rt->rcu, &rcu_seq); + + rcu_bump(); + if (rt != &rcu_thread_main) + /* this free() happens after seqlock_release() below */ + rcu_free_internal(MTYPE_RCU_THREAD, rt, rcu_head); + + rcu_threads_del(&rcu_threads, rt); + seqlock_release(&rt->rcu); +} + +static void rcu_thread_end(void *rtvoid) +{ + struct rcu_thread *rt = rtvoid; + rcu_thread_unprepare(rt); +} + +/* + * main RCU control aspects + */ + +static void rcu_bump(void) +{ + struct rcu_next *rn; + + rn = XMALLOC(MTYPE_RCU_NEXT, sizeof(*rn)); + + /* note: each RCUA_NEXT item corresponds to exactly one seqno bump. + * This means we don't need to communicate which seqno is which + * RCUA_NEXT, since we really don't care. + */ + + /* + * Important race condition: while rcu_heads_add_tail is executing, + * there is an intermediate point where the rcu_heads "last" pointer + * already points to rn->head_next, but rn->head_next isn't added to + * the list yet. That means any other "add_tail" calls append to this + * item, which isn't fully on the list yet. Freeze this thread at + * that point and look at another thread doing a rcu_bump. It adds + * these two items and then does a seqlock_bump. But the rcu_heads + * list is still "interrupted" and there's no RCUA_NEXT on the list + * yet (from either the frozen thread or the second thread). So + * rcu_main() might actually hit the end of the list at the + * "interrupt". + * + * This situation is prevented by requiring that rcu_read_lock is held + * for any calls to rcu_bump, since if we're holding the current RCU + * epoch, that means rcu_main can't be chewing on rcu_heads and hit + * that interruption point. Only by the time the thread has continued + * to rcu_read_unlock() - and therefore completed the add_tail - the + * RCU sweeper gobbles up the epoch and can be sure to find at least + * the RCUA_NEXT and RCUA_FREE items on rcu_heads. + */ + rn->head_next.action = &rcua_next; + rcu_heads_add_tail(&rcu_heads, &rn->head_next); + + /* free rn that we allocated above. + * + * This is INTENTIONALLY not built into the RCUA_NEXT action. This + * ensures that after the action above is popped off the queue, there + * is still at least 1 item on the RCU queue. This means we never + * delete the last item, which is extremely important since it keeps + * the atomlist ->last pointer alive and well. + * + * If we were to "run dry" on the RCU queue, add_tail may run into the + * "last item is being deleted - start over" case, and then we may end + * up accessing old RCU queue items that are already free'd. + */ + rcu_free_internal(MTYPE_RCU_NEXT, rn, head_free); + + /* Only allow the RCU sweeper to run after these 2 items are queued. + * + * If another thread enqueues some RCU action in the intermediate + * window here, nothing bad happens - the queued action is associated + * with a larger seq# than strictly necessary. Thus, it might get + * executed a bit later, but that's not a problem. + * + * If another thread acquires the read lock in this window, it holds + * the previous epoch, but its RCU queue actions will be in the next + * epoch. This isn't a problem either, just a tad inefficient. + */ + seqlock_bump(&rcu_seq); +} + +static void rcu_bump_maybe(void) +{ + seqlock_val_t dirty; + + dirty = atomic_load_explicit(&rcu_dirty, memory_order_relaxed); + /* no problem if we race here and multiple threads bump rcu_seq; + * bumping too much causes no issues while not bumping enough will + * result in delayed cleanup + */ + if (dirty == seqlock_cur(&rcu_seq)) + rcu_bump(); +} + +void rcu_read_lock(void) +{ + struct rcu_thread *rt = rcu_self(); + + assert(rt); + if (rt->depth++ > 0) + return; + + seqlock_acquire(&rt->rcu, &rcu_seq); + /* need to hold RCU for bump ... */ + rcu_bump_maybe(); + /* ... but no point in holding the old epoch if we just bumped */ + seqlock_acquire(&rt->rcu, &rcu_seq); +} + +void rcu_read_unlock(void) +{ + struct rcu_thread *rt = rcu_self(); + + assert(rt && rt->depth); + if (--rt->depth > 0) + return; + rcu_bump_maybe(); + seqlock_release(&rt->rcu); +} + +void rcu_assert_read_locked(void) +{ + struct rcu_thread *rt = rcu_self(); + assert(rt && rt->depth && seqlock_held(&rt->rcu)); +} + +void rcu_assert_read_unlocked(void) +{ + struct rcu_thread *rt = rcu_self(); + assert(rt && !rt->depth && !seqlock_held(&rt->rcu)); +} + +/* + * RCU resource-release thread + */ + +static void *rcu_main(void *arg); + +static void rcu_start(void) +{ + /* ensure we never handle signals on the RCU thread by blocking + * everything here (new thread inherits signal mask) + */ + sigset_t oldsigs, blocksigs; + + sigfillset(&blocksigs); + pthread_sigmask(SIG_BLOCK, &blocksigs, &oldsigs); + + rcu_active = true; + + assert(!pthread_create(&rcu_pthread, NULL, rcu_main, NULL)); + + pthread_sigmask(SIG_SETMASK, &oldsigs, NULL); + +#ifdef HAVE_PTHREAD_SETNAME_NP +# ifdef GNU_LINUX + pthread_setname_np(rcu_pthread, "RCU sweeper"); +# elif defined(__NetBSD__) + pthread_setname_np(rcu_pthread, "RCU sweeper", NULL); +# endif +#elif defined(HAVE_PTHREAD_SET_NAME_NP) + pthread_set_name_np(rcu_pthread, "RCU sweeper"); +#endif +} + +static void rcu_do(struct rcu_head *rh) +{ + struct rcu_head_close *rhc; + void *p; + + switch (rh->action->type) { + case RCUA_FREE: + p = (char *)rh - rh->action->u.free.offset; + if (rh->action->u.free.mt) + qfree(rh->action->u.free.mt, p); + else + free(p); + break; + case RCUA_CLOSE: + rhc = container_of(rh, struct rcu_head_close, + rcu_head); + close(rhc->fd); + break; + case RCUA_CALL: + p = (char *)rh - rh->action->u.call.offset; + rh->action->u.call.fptr(p); + break; + + case RCUA_INVALID: + case RCUA_NEXT: + case RCUA_END: + default: + assert(0); + } +} + +static void rcu_watchdog(struct rcu_thread *rt) +{ +#if 0 + /* future work: print a backtrace for the thread that's holding up + * RCU. The only (good) way of doing that is to send a signal to the + * other thread, save away the backtrace in the signal handler, and + * block here until the signal is done processing. + * + * Just haven't implemented that yet. + */ + fprintf(stderr, "RCU watchdog %p\n", rt); +#endif +} + +static void *rcu_main(void *arg) +{ + struct rcu_thread *rt; + struct rcu_head *rh = NULL; + bool end = false; + struct timespec maxwait; + + seqlock_val_t rcuval = SEQLOCK_STARTVAL; + + pthread_setspecific(rcu_thread_key, &rcu_thread_rcu); + + while (!end) { + seqlock_wait(&rcu_seq, rcuval); + + /* RCU watchdog timeout, TODO: configurable value */ + clock_gettime(CLOCK_MONOTONIC, &maxwait); + maxwait.tv_nsec += 100 * 1000 * 1000; + if (maxwait.tv_nsec >= 1000000000) { + maxwait.tv_sec++; + maxwait.tv_nsec -= 1000000000; + } + + frr_each (rcu_threads, &rcu_threads, rt) + if (!seqlock_timedwait(&rt->rcu, rcuval, &maxwait)) { + rcu_watchdog(rt); + seqlock_wait(&rt->rcu, rcuval); + } + + while ((rh = rcu_heads_pop(&rcu_heads))) { + if (rh->action->type == RCUA_NEXT) + break; + else if (rh->action->type == RCUA_END) + end = true; + else + rcu_do(rh); + } + + rcuval += SEQLOCK_INCR; + } + + /* rcu_shutdown can only be called singlethreaded, and it does a + * pthread_join, so it should be impossible that anything ended up + * on the queue after RCUA_END + */ +#if 1 + assert(!rcu_heads_first(&rcu_heads)); +#else + while ((rh = rcu_heads_pop(&rcu_heads))) + if (rh->action->type >= RCUA_FREE) + rcu_do(rh); +#endif + return NULL; +} + +void rcu_shutdown(void) +{ + static struct rcu_head rcu_head_end; + struct rcu_thread *rt = rcu_self(); + void *retval; + + if (!rcu_active) + return; + + rcu_assert_read_locked(); + assert(rcu_threads_count(&rcu_threads) == 1); + + rcu_enqueue(&rcu_head_end, &rcua_end); + + rt->depth = 0; + seqlock_release(&rt->rcu); + seqlock_release(&rcu_seq); + rcu_active = false; + + /* clearing rcu_active is before pthread_join in case we hang in + * pthread_join & get a SIGTERM or something - in that case, just + * ignore the maybe-still-running RCU thread + */ + if (pthread_join(rcu_pthread, &retval) == 0) { + seqlock_acquire_val(&rcu_seq, SEQLOCK_STARTVAL); + seqlock_acquire_val(&rt->rcu, SEQLOCK_STARTVAL); + rt->depth = 1; + } +} + +/* + * RCU'd free functions + */ + +void rcu_enqueue(struct rcu_head *rh, const struct rcu_action *action) +{ + /* refer to rcu_bump() for why we need to hold RCU when adding items + * to rcu_heads + */ + rcu_assert_read_locked(); + + rh->action = action; + + if (!rcu_active) { + rcu_do(rh); + return; + } + rcu_heads_add_tail(&rcu_heads, rh); + atomic_store_explicit(&rcu_dirty, seqlock_cur(&rcu_seq), + memory_order_relaxed); +} + +void rcu_close(struct rcu_head_close *rhc, int fd) +{ + rhc->fd = fd; + rcu_enqueue(&rhc->rcu_head, &rcua_close); +} diff --git a/lib/frrcu.h b/lib/frrcu.h new file mode 100644 index 000000000..8f789303c --- /dev/null +++ b/lib/frrcu.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2017-19 David Lamparter, for NetDEF, Inc. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _FRRCU_H +#define _FRRCU_H + +#include "memory.h" +#include "atomlist.h" +#include "seqlock.h" + +/* quick RCU primer: + * There's a global sequence counter. Whenever a thread does a + * rcu_read_lock(), it is marked as holding the current sequence counter. + * When something is cleaned with RCU, the global sequence counter is + * increased and the item is queued for cleanup - *after* all threads are + * at a more recent sequence counter (or no sequence counter / unheld). + * + * So, by delaying resource cleanup, RCU ensures that things don't go away + * while another thread may hold a (stale) reference. + * + * Note that even if a thread is in rcu_read_lock(), it is invalid for that + * thread to access bits after rcu_free() & co on them. This is a design + * choice to allow no-op'ing out the entire RCU mechanism if we're running + * singlethreaded. (Also allows some optimization on the counter bumping.) + * + * differences from Linux Kernel RCU: + * - there's no rcu_synchronize(), if you really need to defer something + * use rcu_call() (and double check it's really necessary) + * - rcu_dereference() and rcu_assign_pointer() don't exist, use atomic_* + * instead (ATOM* list structures do the right thing) + */ + +/* opaque */ +struct rcu_thread; + +/* called before new thread creation, sets up rcu thread info for new thread + * before it actually exits. This ensures possible RCU references are held + * for thread startup. + * + * return value must be passed into the new thread's call to rcu_thread_start() + */ +extern struct rcu_thread *rcu_thread_prepare(void); + +/* cleanup in case pthread_create() fails */ +extern void rcu_thread_unprepare(struct rcu_thread *rcu_thread); + +/* called early in the new thread, with the return value from the above. + * NB: new thread is initially in RCU-held state! (at depth 1) + * + * TBD: maybe inherit RCU state from rcu_thread_prepare()? + */ +extern void rcu_thread_start(struct rcu_thread *rcu_thread); + +/* thread exit is handled through pthread_key_create's destructor function */ + +/* global RCU shutdown - must be called with only 1 active thread left. waits + * until remaining RCU actions are done & RCU thread has exited. + * + * This is mostly here to get a clean exit without memleaks. + */ +extern void rcu_shutdown(void); + +/* enter / exit RCU-held state. counter-based, so can be called nested. */ +extern void rcu_read_lock(void); +extern void rcu_read_unlock(void); + +/* for debugging / safety checks */ +extern void rcu_assert_read_locked(void); +extern void rcu_assert_read_unlocked(void); + +enum rcu_action_type { + RCUA_INVALID = 0, + /* used internally by the RCU code, shouldn't ever show up outside */ + RCUA_NEXT, + RCUA_END, + /* normal RCU actions, for outside use */ + RCUA_FREE, + RCUA_CLOSE, + RCUA_CALL, +}; + +/* since rcu_head is intended to be embedded into structs which may exist + * with lots of copies, rcu_head is shrunk down to its absolute minimum - + * the atomlist pointer + a pointer to this action struct. + */ +struct rcu_action { + enum rcu_action_type type; + + union { + struct { + struct memtype *mt; + ptrdiff_t offset; + } free; + + struct { + void (*fptr)(void *arg); + ptrdiff_t offset; + } call; + } u; +}; + +/* RCU cleanup function queue item */ +PREDECL_ATOMLIST(rcu_heads) +struct rcu_head { + struct rcu_heads_item head; + const struct rcu_action *action; +}; + +/* special RCU head for delayed fd-close */ +struct rcu_head_close { + struct rcu_head rcu_head; + int fd; +}; + +/* enqueue RCU action - use the macros below to get the rcu_action set up */ +extern void rcu_enqueue(struct rcu_head *head, const struct rcu_action *action); + +/* RCU free() and file close() operations. + * + * freed memory / closed fds become _immediately_ unavailable to the calling + * thread, but will remain available for other threads until they have passed + * into RCU-released state. + */ + +/* may be called with NULL mt to do non-MTYPE free() */ +#define rcu_free(mtype, ptr, field) \ + do { \ + typeof(ptr) _ptr = (ptr); \ + struct rcu_head *_rcu_head = &_ptr->field; \ + static const struct rcu_action _rcu_action = { \ + .type = RCUA_FREE, \ + .u.free = { \ + .mt = mtype, \ + .offset = offsetof(typeof(*_ptr), field), \ + }, \ + }; \ + rcu_enqueue(_rcu_head, &_rcu_action); \ + } while (0) + +/* use this sparingly, it runs on (and blocks) the RCU thread */ +#define rcu_call(func, ptr, field) \ + do { \ + typeof(ptr) _ptr = (ptr); \ + void (*fptype)(typeof(ptr)); \ + struct rcu_head *_rcu_head = &_ptr->field; \ + static const struct rcu_action _rcu_action = { \ + .type = RCUA_CALL, \ + .u.call = { \ + .fptr = (void *)func, \ + .offset = offsetof(typeof(*_ptr), field), \ + }, \ + }; \ + (void)(_fptype = func); \ + rcu_enqueue(_rcu_head, &_rcu_action); \ + } while (0) + +extern void rcu_close(struct rcu_head_close *head, int fd); + +#endif /* _FRRCU_H */ diff --git a/lib/libfrr.c b/lib/libfrr.c index 0fc321d6e..35c609214 100644 --- a/lib/libfrr.c +++ b/lib/libfrr.c @@ -41,6 +41,7 @@ #include "northbound_cli.h" #include "northbound_db.h" #include "debug.h" +#include "frrcu.h" DEFINE_HOOK(frr_late_init, (struct thread_master * tm), (tm)) DEFINE_KOOH(frr_early_fini, (), ()) @@ -1081,6 +1082,7 @@ void frr_fini(void) master = NULL; closezlog(); /* frrmod_init -> nothing needed / hooks */ + rcu_shutdown(); if (!debug_memstats_at_exit) return; @@ -559,9 +559,7 @@ static void crash_write(struct fbuf *fb, char *msgstart) void zlog_signal(int signo, const char *action, void *siginfo_v, void *program_counter) { -#ifdef SA_SIGINFO siginfo_t *siginfo = siginfo_v; -#endif time_t now; char buf[sizeof("DEFAULT: Received signal S at T (si_addr 0xP, PC 0xP); aborting...") + 100]; @@ -575,7 +573,6 @@ void zlog_signal(int signo, const char *action, void *siginfo_v, msgstart = fb.pos; bprintfrr(&fb, "Received signal %d at %lld", signo, (long long)now); -#ifdef SA_SIGINFO if (program_counter) bprintfrr(&fb, " (si_addr 0x%tx, PC 0x%tx)", (ptrdiff_t)siginfo->si_addr, @@ -583,7 +580,6 @@ void zlog_signal(int signo, const char *action, void *siginfo_v, else bprintfrr(&fb, " (si_addr 0x%tx)", (ptrdiff_t)siginfo->si_addr); -#endif /* SA_SIGINFO */ bprintfrr(&fb, "; %s\n", action); crash_write(&fb, msgstart); diff --git a/lib/seqlock.c b/lib/seqlock.c index 223d14952..c05ec19db 100644 --- a/lib/seqlock.c +++ b/lib/seqlock.c @@ -25,6 +25,7 @@ #include "config.h" #endif +#include <string.h> #include <unistd.h> #include <limits.h> #include <errno.h> @@ -35,44 +36,75 @@ #include "seqlock.h" +/**************************************** + * OS specific synchronization wrappers * + ****************************************/ + +/* + * Linux: sys_futex() + */ #ifdef HAVE_SYNC_LINUX_FUTEX -/* Linux-specific - sys_futex() */ #include <sys/syscall.h> #include <linux/futex.h> -static long sys_futex(void *addr1, int op, int val1, struct timespec *timeout, - void *addr2, int val3) +static long sys_futex(void *addr1, int op, int val1, + const struct timespec *timeout, void *addr2, int val3) { return syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3); } #define wait_once(sqlo, val) \ sys_futex((int *)&sqlo->pos, FUTEX_WAIT, (int)val, NULL, NULL, 0) +#define wait_time(sqlo, val, time, reltime) \ + sys_futex((int *)&sqlo->pos, FUTEX_WAIT_BITSET, (int)val, time, \ + NULL, ~0U) #define wait_poke(sqlo) \ sys_futex((int *)&sqlo->pos, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) +/* + * OpenBSD: sys_futex(), almost the same as on Linux + */ #elif defined(HAVE_SYNC_OPENBSD_FUTEX) -/* OpenBSD variant of the above. untested, not upstream in OpenBSD. */ #include <sys/syscall.h> #include <sys/futex.h> +#define TIME_RELATIVE 1 + #define wait_once(sqlo, val) \ futex((int *)&sqlo->pos, FUTEX_WAIT, (int)val, NULL, NULL, 0) +#define wait_time(sqlo, val, time, reltime) \ + futex((int *)&sqlo->pos, FUTEX_WAIT, (int)val, reltime, NULL, 0) #define wait_poke(sqlo) \ futex((int *)&sqlo->pos, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) +/* + * FreeBSD: _umtx_op() + */ #elif defined(HAVE_SYNC_UMTX_OP) -/* FreeBSD-specific: umtx_op() */ #include <sys/umtx.h> #define wait_once(sqlo, val) \ _umtx_op((void *)&sqlo->pos, UMTX_OP_WAIT_UINT, val, NULL, NULL) +static int wait_time(struct seqlock *sqlo, uint32_t val, + const struct timespec *abstime, + const struct timespec *reltime) +{ + struct _umtx_time t; + t._flags = UMTX_ABSTIME; + t._clockid = CLOCK_MONOTONIC; + memcpy(&t._timeout, abstime, sizeof(t._timeout)); + return _umtx_op((void *)&sqlo->pos, UMTX_OP_WAIT_UINT, val, + (void *)(uintptr_t) sizeof(t), &t); +} #define wait_poke(sqlo) \ _umtx_op((void *)&sqlo->pos, UMTX_OP_WAKE, INT_MAX, NULL, NULL) -#else -/* generic version. used on *BSD, Solaris and OSX. +/* + * generic version. used on NetBSD, Solaris and OSX. really shitty. */ +#else + +#define TIME_ABS_REALTIME 1 #define wait_init(sqlo) do { \ pthread_mutex_init(&sqlo->lock, NULL); \ @@ -80,6 +112,9 @@ static long sys_futex(void *addr1, int op, int val1, struct timespec *timeout, } while (0) #define wait_prep(sqlo) pthread_mutex_lock(&sqlo->lock) #define wait_once(sqlo, val) pthread_cond_wait(&sqlo->wake, &sqlo->lock) +#define wait_time(sqlo, val, time, reltime) \ + pthread_cond_timedwait(&sqlo->wake, \ + &sqlo->lock, time); #define wait_done(sqlo) pthread_mutex_unlock(&sqlo->lock) #define wait_poke(sqlo) do { \ pthread_mutex_lock(&sqlo->lock); \ @@ -103,18 +138,112 @@ void seqlock_wait(struct seqlock *sqlo, seqlock_val_t val) seqlock_assert_valid(val); wait_prep(sqlo); - while (1) { - cur = atomic_load_explicit(&sqlo->pos, memory_order_acquire); - if (!(cur & 1)) + cur = atomic_load_explicit(&sqlo->pos, memory_order_relaxed); + + while (cur & SEQLOCK_HELD) { + cal = SEQLOCK_VAL(cur) - val - 1; + assert(cal < 0x40000000 || cal > 0xc0000000); + if (cal < 0x80000000) break; - cal = cur - val - 1; + + if ((cur & SEQLOCK_WAITERS) + || atomic_compare_exchange_weak_explicit( + &sqlo->pos, &cur, cur | SEQLOCK_WAITERS, + memory_order_relaxed, memory_order_relaxed)) { + wait_once(sqlo, cur | SEQLOCK_WAITERS); + cur = atomic_load_explicit(&sqlo->pos, + memory_order_relaxed); + } + /* else: we failed to swap in cur because it just changed */ + } + wait_done(sqlo); +} + +bool seqlock_timedwait(struct seqlock *sqlo, seqlock_val_t val, + const struct timespec *abs_monotime_limit) +{ +/* + * ABS_REALTIME - used on NetBSD, Solaris and OSX + */ +#if TIME_ABS_REALTIME +#define time_arg1 &abs_rt +#define time_arg2 NULL +#define time_prep + struct timespec curmono, abs_rt; + + clock_gettime(CLOCK_MONOTONIC, &curmono); + clock_gettime(CLOCK_REALTIME, &abs_rt); + + abs_rt.tv_nsec += abs_monotime_limit->tv_nsec - curmono.tv_nsec; + if (abs_rt.tv_nsec < 0) { + abs_rt.tv_sec--; + abs_rt.tv_nsec += 1000000000; + } else if (abs_rt.tv_nsec >= 1000000000) { + abs_rt.tv_sec++; + abs_rt.tv_nsec -= 1000000000; + } + abs_rt.tv_sec += abs_monotime_limit->tv_sec - curmono.tv_sec; + +/* + * RELATIVE - used on OpenBSD (might get a patch to get absolute monotime) + */ +#elif TIME_RELATIVE + struct timespec reltime; + +#define time_arg1 abs_monotime_limit +#define time_arg2 &reltime +#define time_prep \ + clock_gettime(CLOCK_MONOTONIC, &reltime); \ + reltime.tv_sec = abs_monotime_limit.tv_sec - reltime.tv_sec; \ + reltime.tv_nsec = abs_monotime_limit.tv_nsec - reltime.tv_nsec; \ + if (reltime.tv_nsec < 0) { \ + reltime.tv_sec--; \ + reltime.tv_nsec += 1000000000; \ + } +/* + * FreeBSD & Linux: absolute time re. CLOCK_MONOTONIC + */ +#else +#define time_arg1 abs_monotime_limit +#define time_arg2 NULL +#define time_prep +#endif + + bool ret = true; + seqlock_val_t cur, cal; + + seqlock_assert_valid(val); + + wait_prep(sqlo); + cur = atomic_load_explicit(&sqlo->pos, memory_order_relaxed); + + while (cur & SEQLOCK_HELD) { + cal = SEQLOCK_VAL(cur) - val - 1; assert(cal < 0x40000000 || cal > 0xc0000000); if (cal < 0x80000000) break; - wait_once(sqlo, cur); + if ((cur & SEQLOCK_WAITERS) + || atomic_compare_exchange_weak_explicit( + &sqlo->pos, &cur, cur | SEQLOCK_WAITERS, + memory_order_relaxed, memory_order_relaxed)) { + int rv; + + time_prep + + rv = wait_time(sqlo, cur | SEQLOCK_WAITERS, time_arg1, + time_arg2); + if (rv) { + ret = false; + break; + } + cur = atomic_load_explicit(&sqlo->pos, + memory_order_relaxed); + } } wait_done(sqlo); + + return ret; } bool seqlock_check(struct seqlock *sqlo, seqlock_val_t val) @@ -123,26 +252,32 @@ bool seqlock_check(struct seqlock *sqlo, seqlock_val_t val) seqlock_assert_valid(val); - cur = atomic_load_explicit(&sqlo->pos, memory_order_acquire); - if (!(cur & 1)) + cur = atomic_load_explicit(&sqlo->pos, memory_order_relaxed); + if (!(cur & SEQLOCK_HELD)) return 1; - cur -= val; + cur = SEQLOCK_VAL(cur) - val - 1; assert(cur < 0x40000000 || cur > 0xc0000000); return cur < 0x80000000; } void seqlock_acquire_val(struct seqlock *sqlo, seqlock_val_t val) { + seqlock_val_t prev; + seqlock_assert_valid(val); - atomic_store_explicit(&sqlo->pos, val, memory_order_release); - wait_poke(sqlo); + prev = atomic_exchange_explicit(&sqlo->pos, val, memory_order_relaxed); + if (prev & SEQLOCK_WAITERS) + wait_poke(sqlo); } void seqlock_release(struct seqlock *sqlo) { - atomic_store_explicit(&sqlo->pos, 0, memory_order_release); - wait_poke(sqlo); + seqlock_val_t prev; + + prev = atomic_exchange_explicit(&sqlo->pos, 0, memory_order_relaxed); + if (prev & SEQLOCK_WAITERS) + wait_poke(sqlo); } void seqlock_init(struct seqlock *sqlo) @@ -154,14 +289,23 @@ void seqlock_init(struct seqlock *sqlo) seqlock_val_t seqlock_cur(struct seqlock *sqlo) { - return atomic_load_explicit(&sqlo->pos, memory_order_acquire); + return SEQLOCK_VAL(atomic_load_explicit(&sqlo->pos, + memory_order_relaxed)); } seqlock_val_t seqlock_bump(struct seqlock *sqlo) { - seqlock_val_t val; + seqlock_val_t val, cur; + + cur = atomic_load_explicit(&sqlo->pos, memory_order_relaxed); + seqlock_assert_valid(cur); + + do { + val = SEQLOCK_VAL(cur) + SEQLOCK_INCR; + } while (!atomic_compare_exchange_weak_explicit(&sqlo->pos, &cur, val, + memory_order_relaxed, memory_order_relaxed)); - val = atomic_fetch_add_explicit(&sqlo->pos, 2, memory_order_release); - wait_poke(sqlo); + if (cur & SEQLOCK_WAITERS) + wait_poke(sqlo); return val; } diff --git a/lib/seqlock.h b/lib/seqlock.h index eef05a430..b551e3ffc 100644 --- a/lib/seqlock.h +++ b/lib/seqlock.h @@ -54,12 +54,28 @@ */ /* use sequentially increasing "ticket numbers". lowest bit will always - * be 1 to have a 'cleared' indication (i.e., counts 1,3,5,7,etc. ) + * be 1 to have a 'cleared' indication (i.e., counts 1,5,9,13,etc. ) + * 2nd lowest bit is used to indicate we have waiters. */ typedef _Atomic uint32_t seqlock_ctr_t; typedef uint32_t seqlock_val_t; -#define seqlock_assert_valid(val) assert(val & 1) +#define seqlock_assert_valid(val) assert((val) & SEQLOCK_HELD) +/* NB: SEQLOCK_WAITERS is only allowed if SEQLOCK_HELD is also set; can't + * have waiters on an unheld seqlock + */ +#define SEQLOCK_HELD (1U << 0) +#define SEQLOCK_WAITERS (1U << 1) +#define SEQLOCK_VAL(n) ((n) & ~SEQLOCK_WAITERS) +#define SEQLOCK_STARTVAL 1U +#define SEQLOCK_INCR 4U + +/* TODO: originally, this was using "atomic_fetch_add", which is the reason + * bit 0 is used to indicate held state. With SEQLOCK_WAITERS added, there's + * no fetch_add anymore (cmpxchg loop instead), so we don't need to use bit 0 + * for this anymore & can just special-case the value 0 for it and skip it in + * counting. + */ struct seqlock { /* always used */ @@ -74,8 +90,16 @@ struct seqlock { extern void seqlock_init(struct seqlock *sqlo); -/* while (sqlo <= val) - wait until seqlock->pos > val, or seqlock unheld */ +/* basically: "while (sqlo <= val) wait();" + * returns when sqlo > val || !seqlock_held(sqlo) + */ extern void seqlock_wait(struct seqlock *sqlo, seqlock_val_t val); + +/* same, but time-limited (limit is an absolute CLOCK_MONOTONIC value) */ +extern bool seqlock_timedwait(struct seqlock *sqlo, seqlock_val_t val, + const struct timespec *abs_monotime_limit); + +/* one-shot test, returns true if seqlock_wait would return immediately */ extern bool seqlock_check(struct seqlock *sqlo, seqlock_val_t val); static inline bool seqlock_held(struct seqlock *sqlo) @@ -85,12 +109,20 @@ static inline bool seqlock_held(struct seqlock *sqlo) /* sqlo - get seqlock position -- for the "counter" seqlock */ extern seqlock_val_t seqlock_cur(struct seqlock *sqlo); -/* sqlo++ - note: like x++, returns previous value, before bumping */ + +/* ++sqlo (but atomic & wakes waiters) - returns value that we bumped to. + * + * guarantees: + * - each seqlock_bump call bumps the position by exactly one SEQLOCK_INCR. + * There are no skipped/missed or multiple increments. + * - each return value is only returned from one seqlock_bump() call + */ extern seqlock_val_t seqlock_bump(struct seqlock *sqlo); /* sqlo = val - can be used on held seqlock. */ extern void seqlock_acquire_val(struct seqlock *sqlo, seqlock_val_t val); + /* sqlo = ref - standard pattern: acquire relative to other seqlock */ static inline void seqlock_acquire(struct seqlock *sqlo, struct seqlock *ref) { diff --git a/lib/sigevent.c b/lib/sigevent.c index d02b07422..fcd85d0d4 100644 --- a/lib/sigevent.c +++ b/lib/sigevent.c @@ -24,7 +24,6 @@ #include <memory.h> #include <lib_errors.h> -#ifdef SA_SIGINFO #ifdef HAVE_UCONTEXT_H #ifdef GNU_LINUX /* get REG_EIP from ucontext.h */ @@ -34,7 +33,6 @@ #endif /* GNU_LINUX */ #include <ucontext.h> #endif /* HAVE_UCONTEXT_H */ -#endif /* SA_SIGINFO */ /* master signals descriptor struct */ @@ -158,8 +156,6 @@ static int signal_set(int signo) return 0; } -#ifdef SA_SIGINFO - /* XXX This function should be enhanced to support more platforms (it currently works only on Linux/x86). */ static void *program_counter(void *context) @@ -199,41 +195,19 @@ static void *program_counter(void *context) return NULL; } -#endif /* SA_SIGINFO */ - static void __attribute__((noreturn)) -exit_handler(int signo -#ifdef SA_SIGINFO - , - siginfo_t *siginfo, void *context -#endif - ) +exit_handler(int signo, siginfo_t *siginfo, void *context) { -#ifndef SA_SIGINFO - void *siginfo = NULL; - void *pc = NULL; -#else void *pc = program_counter(context); -#endif zlog_signal(signo, "exiting...", siginfo, pc); _exit(128 + signo); } static void __attribute__((noreturn)) -core_handler(int signo -#ifdef SA_SIGINFO - , - siginfo_t *siginfo, void *context -#endif - ) +core_handler(int signo, siginfo_t *siginfo, void *context) { -#ifndef SA_SIGINFO - void *siginfo = NULL; - void *pc = NULL; -#else void *pc = program_counter(context); -#endif /* make sure we don't hang in here. default for SIGALRM is terminate. * - if we're in backtrace for more than a second, abort. */ @@ -290,12 +264,7 @@ static void trap_default_signals(void) static const struct { const int *sigs; unsigned int nsigs; - void (*handler)(int signo -#ifdef SA_SIGINFO - , - siginfo_t *info, void *context -#endif - ); + void (*handler)(int signo, siginfo_t *info, void *context); } sigmap[] = { {core_signals, array_size(core_signals), core_handler}, {exit_signals, array_size(exit_signals), exit_handler}, @@ -316,15 +285,10 @@ static void trap_default_signals(void) act.sa_handler = SIG_IGN; act.sa_flags = 0; } else { -#ifdef SA_SIGINFO /* Request extra arguments to signal * handler. */ act.sa_sigaction = sigmap[i].handler; act.sa_flags = SA_SIGINFO; -#else - act.sa_handler = sigmap[i].handler; - act.sa_flags = 0; -#endif #ifdef SA_RESETHAND /* don't try to print backtraces * recursively */ diff --git a/lib/stream.c b/lib/stream.c index 6c187bd35..c67bc3c99 100644 --- a/lib/stream.c +++ b/lib/stream.c @@ -904,20 +904,30 @@ int stream_put_prefix(struct stream *s, struct prefix *p) /* Put NLRI with label */ int stream_put_labeled_prefix(struct stream *s, struct prefix *p, - mpls_label_t *label) + mpls_label_t *label, int addpath_encode, + uint32_t addpath_tx_id) { size_t psize; + size_t psize_with_addpath; uint8_t *label_pnt = (uint8_t *)label; STREAM_VERIFY_SANE(s); psize = PSIZE(p->prefixlen); + psize_with_addpath = psize + (addpath_encode ? 4 : 0); - if (STREAM_WRITEABLE(s) < (psize + 3)) { + if (STREAM_WRITEABLE(s) < (psize_with_addpath + 3)) { STREAM_BOUND_WARN(s, "put"); return 0; } + if (addpath_encode) { + s->data[s->endp++] = (uint8_t)(addpath_tx_id >> 24); + s->data[s->endp++] = (uint8_t)(addpath_tx_id >> 16); + s->data[s->endp++] = (uint8_t)(addpath_tx_id >> 8); + s->data[s->endp++] = (uint8_t)addpath_tx_id; + } + stream_putc(s, (p->prefixlen + 24)); stream_putc(s, label_pnt[0]); stream_putc(s, label_pnt[1]); diff --git a/lib/stream.h b/lib/stream.h index 5341bfa40..a903ec0ea 100644 --- a/lib/stream.h +++ b/lib/stream.h @@ -199,7 +199,8 @@ extern int stream_put_prefix_addpath(struct stream *, struct prefix *, uint32_t addpath_tx_id); extern int stream_put_prefix(struct stream *, struct prefix *); extern int stream_put_labeled_prefix(struct stream *, struct prefix *, - mpls_label_t *); + mpls_label_t *, int addpath_encode, + uint32_t addpath_tx_id); extern void stream_get(void *, struct stream *, size_t); extern bool stream_get2(void *data, struct stream *s, size_t size); extern void stream_get_from(void *, struct stream *, size_t, size_t); diff --git a/lib/subdir.am b/lib/subdir.am index 3754d270a..2be7537bc 100644 --- a/lib/subdir.am +++ b/lib/subdir.am @@ -21,6 +21,7 @@ lib_libfrr_la_SOURCES = \ lib/distribute.c \ lib/ferr.c \ lib/filter.c \ + lib/frrcu.c \ lib/frrlua.c \ lib/frr_pthread.c \ lib/frrstr.c \ @@ -160,6 +161,7 @@ pkginclude_HEADERS += \ lib/frrlua.h \ lib/frr_pthread.h \ lib/frratomic.h \ + lib/frrcu.h \ lib/frrstr.h \ lib/getopt.h \ lib/graph.h \ diff --git a/lib/thread.c b/lib/thread.c index f862ce5eb..5756ebc1f 100644 --- a/lib/thread.c +++ b/lib/thread.c @@ -25,6 +25,7 @@ #include "thread.h" #include "memory.h" +#include "frrcu.h" #include "log.h" #include "hash.h" #include "command.h" @@ -737,6 +738,9 @@ static int fd_poll(struct thread_master *m, struct pollfd *pfds, nfds_t pfdsize, < 0) // effect a poll (return immediately) timeout = 0; + rcu_read_unlock(); + rcu_assert_read_unlocked(); + /* add poll pipe poker */ assert(count + 1 < pfdsize); pfds[count].fd = m->io_pipe[0]; @@ -750,6 +754,8 @@ static int fd_poll(struct thread_master *m, struct pollfd *pfds, nfds_t pfdsize, while (read(m->io_pipe[0], &trash, sizeof(trash)) > 0) ; + rcu_read_lock(); + return num; } diff --git a/lib/zebra.h b/lib/zebra.h index 22239f8e6..352887eca 100644 --- a/lib/zebra.h +++ b/lib/zebra.h @@ -134,6 +134,10 @@ typedef unsigned char uint8_t; #endif #endif +#ifdef CRYPTO_OPENSSL +#include <openssl/evp.h> +#endif + #include "openbsd-tree.h" #include <netinet/in.h> diff --git a/ospfd/ospf_packet.c b/ospfd/ospf_packet.c index 50c30a6fa..62b044479 100644 --- a/ospfd/ospf_packet.c +++ b/ospfd/ospf_packet.c @@ -33,7 +33,9 @@ #include "log.h" #include "sockopt.h" #include "checksum.h" +#ifdef CRYPTO_INTERNAL #include "md5.h" +#endif #include "vrf.h" #include "lib_errors.h" @@ -332,7 +334,11 @@ static unsigned int ospf_packet_max(struct ospf_interface *oi) static int ospf_check_md5_digest(struct ospf_interface *oi, struct ospf_header *ospfh) { +#ifdef CRYPTO_OPENSSL + EVP_MD_CTX *ctx; +#elif CRYPTO_INTERNAL MD5_CTX ctx; +#endif unsigned char digest[OSPF_AUTH_MD5_SIZE]; struct crypt_key *ck; struct ospf_neighbor *nbr; @@ -361,11 +367,21 @@ static int ospf_check_md5_digest(struct ospf_interface *oi, } /* Generate a digest for the ospf packet - their digest + our digest. */ +#ifdef CRYPTO_OPENSSL + unsigned int md5_size = OSPF_AUTH_MD5_SIZE; + ctx = EVP_MD_CTX_new(); + EVP_DigestInit(ctx, EVP_md5()); + EVP_DigestUpdate(ctx, ospfh, length); + EVP_DigestUpdate(ctx, ck->auth_key, OSPF_AUTH_MD5_SIZE); + EVP_DigestFinal(ctx, digest, &md5_size); + EVP_MD_CTX_free(ctx); +#elif CRYPTO_INTERNAL memset(&ctx, 0, sizeof(ctx)); MD5Init(&ctx); MD5Update(&ctx, ospfh, length); MD5Update(&ctx, ck->auth_key, OSPF_AUTH_MD5_SIZE); MD5Final(digest, &ctx); +#endif /* compare the two */ if (memcmp((caddr_t)ospfh + length, digest, OSPF_AUTH_MD5_SIZE)) { @@ -389,7 +405,11 @@ static int ospf_make_md5_digest(struct ospf_interface *oi, { struct ospf_header *ospfh; unsigned char digest[OSPF_AUTH_MD5_SIZE] = {0}; +#ifdef CRYPTO_OPENSSL + EVP_MD_CTX *ctx; +#elif CRYPTO_INTERNAL MD5_CTX ctx; +#endif void *ibuf; uint32_t t; struct crypt_key *ck; @@ -422,11 +442,21 @@ static int ospf_make_md5_digest(struct ospf_interface *oi, } /* Generate a digest for the entire packet + our secret key. */ +#ifdef CRYPTO_OPENSSL + unsigned int md5_size = OSPF_AUTH_MD5_SIZE; + ctx = EVP_MD_CTX_new(); + EVP_DigestInit(ctx, EVP_md5()); + EVP_DigestUpdate(ctx, ibuf, ntohs(ospfh->length)); + EVP_DigestUpdate(ctx, auth_key, OSPF_AUTH_MD5_SIZE); + EVP_DigestFinal(ctx, digest, &md5_size); + EVP_MD_CTX_free(ctx); +#elif CRYPTO_INTERNAL memset(&ctx, 0, sizeof(ctx)); MD5Init(&ctx); MD5Update(&ctx, ibuf, ntohs(ospfh->length)); MD5Update(&ctx, auth_key, OSPF_AUTH_MD5_SIZE); MD5Final(digest, &ctx); +#endif /* Append md5 digest to the end of the stream. */ stream_put(op->s, digest, OSPF_AUTH_MD5_SIZE); diff --git a/pimd/pim_neighbor.c b/pimd/pim_neighbor.c index a63b09fc1..722ecb2a7 100644 --- a/pimd/pim_neighbor.c +++ b/pimd/pim_neighbor.c @@ -424,10 +424,11 @@ struct pim_neighbor *pim_neighbor_find_by_secondary(struct interface *ifp, struct pim_neighbor *neigh; struct prefix *p; - pim_ifp = ifp->info; - if (!pim_ifp) + if (!ifp || !ifp->info) return NULL; + pim_ifp = ifp->info; + for (ALL_LIST_ELEMENTS_RO(pim_ifp->pim_neighbor_list, node, neigh)) { for (ALL_LIST_ELEMENTS_RO(neigh->prefix_list, pnode, p)) { if (prefix_same(p, src)) diff --git a/pimd/pim_nht.c b/pimd/pim_nht.c index 65ea858cb..39dc8ad2f 100644 --- a/pimd/pim_nht.c +++ b/pimd/pim_nht.c @@ -842,6 +842,14 @@ int pim_parse_nexthop_update(ZAPI_CALLBACK_ARGS) } if (!ifp->info) { + /* + * Though Multicast is not enabled on this + * Interface store it in database otheriwse we + * may miss this update and this will not cause + * any issue, because while choosing the path we + * are ommitting the Interfaces which are not + * multicast enabled + */ if (PIM_DEBUG_PIM_NHT) { char buf[NEXTHOP_STRLEN]; @@ -853,8 +861,6 @@ int pim_parse_nexthop_update(ZAPI_CALLBACK_ARGS) nexthop2str(nexthop, buf, sizeof(buf))); } - nexthop_free(nexthop); - continue; } if (nhlist_tail) { diff --git a/ripd/ripd.c b/ripd/ripd.c index e0ff0430f..561fbcb52 100644 --- a/ripd/ripd.c +++ b/ripd/ripd.c @@ -37,7 +37,9 @@ #include "if_rmap.h" #include "plist.h" #include "distribute.h" +#ifdef CRYPTO_INTERNAL #include "md5.h" +#endif #include "keychain.h" #include "privs.h" #include "lib_errors.h" @@ -870,7 +872,11 @@ static int rip_auth_md5(struct rip_packet *packet, struct sockaddr_in *from, struct rip_md5_data *md5data; struct keychain *keychain; struct key *key; +#ifdef CRYPTO_OPENSSL + EVP_MD_CTX *ctx; +#elif CRYPTO_INTERNAL MD5_CTX ctx; +#endif uint8_t digest[RIP_AUTH_MD5_SIZE]; uint16_t packet_len; char auth_str[RIP_AUTH_MD5_SIZE] = {}; @@ -934,11 +940,21 @@ static int rip_auth_md5(struct rip_packet *packet, struct sockaddr_in *from, return 0; /* MD5 digest authentication. */ +#ifdef CRYPTO_OPENSSL + unsigned int md5_size = RIP_AUTH_MD5_SIZE; + ctx = EVP_MD_CTX_new(); + EVP_DigestInit(ctx, EVP_md5()); + EVP_DigestUpdate(ctx, packet, packet_len + RIP_HEADER_SIZE); + EVP_DigestUpdate(ctx, auth_str, RIP_AUTH_MD5_SIZE); + EVP_DigestFinal(ctx, digest, &md5_size); + EVP_MD_CTX_free(ctx); +#elif CRYPTO_INTERNAL memset(&ctx, 0, sizeof(ctx)); MD5Init(&ctx); MD5Update(&ctx, packet, packet_len + RIP_HEADER_SIZE); MD5Update(&ctx, auth_str, RIP_AUTH_MD5_SIZE); MD5Final(digest, &ctx); +#endif if (memcmp(md5data->digest, digest, RIP_AUTH_MD5_SIZE) == 0) return packet_len; @@ -1063,7 +1079,11 @@ static void rip_auth_md5_set(struct stream *s, struct rip_interface *ri, size_t doff, char *auth_str, int authlen) { unsigned long len; +#ifdef CRYPTO_OPENSSL + EVP_MD_CTX *ctx; +#elif CRYPTO_INTERNAL MD5_CTX ctx; +#endif unsigned char digest[RIP_AUTH_MD5_SIZE]; /* Make it sure this interface is configured as MD5 @@ -1092,11 +1112,21 @@ static void rip_auth_md5_set(struct stream *s, struct rip_interface *ri, stream_putw(s, RIP_AUTH_DATA); /* Generate a digest for the RIP packet. */ +#ifdef CRYPTO_OPENSSL + unsigned int md5_size = RIP_AUTH_MD5_SIZE; + ctx = EVP_MD_CTX_new(); + EVP_DigestInit(ctx, EVP_md5()); + EVP_DigestUpdate(ctx, STREAM_DATA(s), stream_get_endp(s)); + EVP_DigestUpdate(ctx, auth_str, RIP_AUTH_MD5_SIZE); + EVP_DigestFinal(ctx, digest, &md5_size); + EVP_MD_CTX_free(ctx); +#elif CRYPTO_INTERNAL memset(&ctx, 0, sizeof(ctx)); MD5Init(&ctx); MD5Update(&ctx, STREAM_DATA(s), stream_get_endp(s)); MD5Update(&ctx, auth_str, RIP_AUTH_MD5_SIZE); MD5Final(digest, &ctx); +#endif /* Copy the digest to the packet. */ stream_write(s, digest, RIP_AUTH_MD5_SIZE); diff --git a/tests/lib/test_atomlist.c b/tests/lib/test_atomlist.c index 249fff8ed..238ee9539 100644 --- a/tests/lib/test_atomlist.c +++ b/tests/lib/test_atomlist.c @@ -253,7 +253,7 @@ static void *thr1func(void *arg) struct testrun *tr; for (tr = runs; tr; tr = tr->next) { - sv = seqlock_bump(&p->sqlo); + sv = seqlock_bump(&p->sqlo) - SEQLOCK_INCR; seqlock_wait(&sqlo, sv); tr->func(offset); @@ -288,14 +288,14 @@ static void run_tr(struct testrun *tr) size_t c = 0, s = 0, n = 0; struct item *item, *prev, dummy; - printf("[%02u] %35s %s\n", seqlock_cur(&sqlo) >> 1, "", desc); + printf("[%02u] %35s %s\n", seqlock_cur(&sqlo) >> 2, "", desc); fflush(stdout); if (tr->prefill != NOCLEAR) clear_list(tr->prefill); monotime(&tv); - sv = seqlock_bump(&sqlo); + sv = seqlock_bump(&sqlo) - SEQLOCK_INCR; for (size_t i = 0; i < NTHREADS; i++) { seqlock_wait(&thr[i].sqlo, seqlock_cur(&sqlo)); s += thr[i].counter; @@ -325,7 +325,7 @@ static void run_tr(struct testrun *tr) assert(c == alist_count(&ahead)); } printf("\033[1A[%02u] %9"PRId64"us c=%5zu s=%5zu n=%5zu %s\n", - sv >> 1, delta, c, s, n, desc); + sv >> 2, delta, c, s, n, desc); } #ifdef BASIC_TESTS @@ -381,7 +381,7 @@ int main(int argc, char **argv) basic_tests(); seqlock_init(&sqlo); - seqlock_acquire_val(&sqlo, 1); + seqlock_acquire_val(&sqlo, SEQLOCK_STARTVAL); for (i = 0; i < NTHREADS; i++) { seqlock_init(&thr[i].sqlo); diff --git a/tests/lib/test_seqlock.c b/tests/lib/test_seqlock.c index 9cc6f8070..639c2bdc2 100644 --- a/tests/lib/test_seqlock.c +++ b/tests/lib/test_seqlock.c @@ -66,20 +66,20 @@ static void *thr1func(void *arg) seqlock_wait(&sqlo, 1); writestr("thr1 @1\n"); - seqlock_wait(&sqlo, 3); - writestr("thr1 @3\n"); - seqlock_wait(&sqlo, 5); writestr("thr1 @5\n"); - seqlock_wait(&sqlo, 7); - writestr("thr1 @7\n"); - seqlock_wait(&sqlo, 9); writestr("thr1 @9\n"); - seqlock_wait(&sqlo, 11); - writestr("thr1 @11\n"); + seqlock_wait(&sqlo, 13); + writestr("thr1 @13\n"); + + seqlock_wait(&sqlo, 17); + writestr("thr1 @17\n"); + + seqlock_wait(&sqlo, 21); + writestr("thr1 @21\n"); return NULL; } @@ -95,11 +95,11 @@ int main(int argc, char **argv) assert(seqlock_cur(&sqlo) == 1); assert(seqlock_bump(&sqlo) == 1); - assert(seqlock_cur(&sqlo) == 3); - assert(seqlock_bump(&sqlo) == 3); + assert(seqlock_cur(&sqlo) == 5); assert(seqlock_bump(&sqlo) == 5); - assert(seqlock_bump(&sqlo) == 7); - assert(seqlock_cur(&sqlo) == 9); + assert(seqlock_bump(&sqlo) == 9); + assert(seqlock_bump(&sqlo) == 13); + assert(seqlock_cur(&sqlo) == 17); assert(seqlock_held(&sqlo)); seqlock_release(&sqlo); @@ -108,16 +108,16 @@ int main(int argc, char **argv) pthread_create(&thr1, NULL, thr1func, NULL); sleep(1); - writestr("main @3\n"); - seqlock_acquire_val(&sqlo, 3); + writestr("main @5\n"); + seqlock_acquire_val(&sqlo, 5); sleep(2); - writestr("main @5\n"); + writestr("main @9\n"); seqlock_bump(&sqlo); sleep(1); - writestr("main @9\n"); - seqlock_acquire_val(&sqlo, 9); + writestr("main @17\n"); + seqlock_acquire_val(&sqlo, 17); sleep(1); writestr("main @release\n"); diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c index f93562b31..c90e027f0 100644 --- a/zebra/zebra_dplane.c +++ b/zebra/zebra_dplane.c @@ -3023,7 +3023,7 @@ void zebra_dplane_start(void) /* Start dataplane pthread */ zdplane_info.dg_pthread = frr_pthread_new(&pattr, "Zebra dplane thread", - "Zebra dplane"); + "zebra_dplane"); zdplane_info.dg_master = zdplane_info.dg_pthread->master; diff --git a/zebra/zebra_fpm_netlink.c b/zebra/zebra_fpm_netlink.c index 822def318..f347d3955 100644 --- a/zebra/zebra_fpm_netlink.c +++ b/zebra/zebra_fpm_netlink.c @@ -238,7 +238,7 @@ static int netlink_route_info_add_nh(netlink_route_info_t *ri, if (re && CHECK_FLAG(re->flags, ZEBRA_FLAG_EVPN_ROUTE)) { nhi.encap_info.encap_type = FPM_NH_ENCAP_VXLAN; - zl3vni = zl3vni_from_vrf(ri->rtm_table); + zl3vni = zl3vni_from_vrf(nexthop->vrf_id); if (zl3vni && is_l3vni_oper_up(zl3vni)) { /* Add VNI to VxLAN encap info */ diff --git a/zebra/zebra_vxlan.c b/zebra/zebra_vxlan.c index 2e8c81bdd..1450072aa 100644 --- a/zebra/zebra_vxlan.c +++ b/zebra/zebra_vxlan.c @@ -2787,27 +2787,40 @@ static int zvni_gw_macip_add(struct interface *ifp, zebra_vni_t *zvni, /* Set "local" forwarding info. */ SET_FLAG(n->flags, ZEBRA_NEIGH_LOCAL); - SET_FLAG(n->flags, ZEBRA_NEIGH_DEF_GW); ZEBRA_NEIGH_SET_ACTIVE(n); - /* Set Router flag (R-bit) */ - if (ip->ipa_type == IPADDR_V6) - SET_FLAG(n->flags, ZEBRA_NEIGH_ROUTER_FLAG); memcpy(&n->emac, macaddr, ETH_ALEN); n->ifindex = ifp->ifindex; /* Only advertise in BGP if the knob is enabled */ - if (!advertise_gw_macip_enabled(zvni)) - return 0; + if (advertise_gw_macip_enabled(zvni)) { - if (IS_ZEBRA_DEBUG_VXLAN) - zlog_debug( + SET_FLAG(mac->flags, ZEBRA_MAC_DEF_GW); + SET_FLAG(n->flags, ZEBRA_NEIGH_DEF_GW); + /* Set Router flag (R-bit) */ + if (ip->ipa_type == IPADDR_V6) + SET_FLAG(n->flags, ZEBRA_NEIGH_ROUTER_FLAG); + + if (IS_ZEBRA_DEBUG_VXLAN) + zlog_debug( "SVI %s(%u) L2-VNI %u, sending GW MAC %s IP %s add to BGP with flags 0x%x", ifp->name, ifp->ifindex, zvni->vni, prefix_mac2str(macaddr, buf, sizeof(buf)), ipaddr2str(ip, buf2, sizeof(buf2)), n->flags); - zvni_neigh_send_add_to_client(zvni->vni, ip, macaddr, - n->flags, n->loc_seq); + zvni_neigh_send_add_to_client(zvni->vni, ip, macaddr, + n->flags, n->loc_seq); + } else if (advertise_svi_macip_enabled(zvni)) { + + if (IS_ZEBRA_DEBUG_VXLAN) + zlog_debug( + "SVI %s(%u) L2-VNI %u, sending SVI MAC %s IP %s add to BGP with flags 0x%x", + ifp->name, ifp->ifindex, zvni->vni, + prefix_mac2str(macaddr, buf, sizeof(buf)), + ipaddr2str(ip, buf2, sizeof(buf2)), n->flags); + + zvni_neigh_send_add_to_client(zvni->vni, ip, macaddr, + n->flags, n->loc_seq); + } return 0; } @@ -9015,7 +9028,7 @@ void zebra_vxlan_advertise_svi_macip(ZAPI_HANDLER_ARGS) if (IS_ZEBRA_DEBUG_VXLAN) zlog_debug("EVPN SVI-MACIP Adv %s, currently %s", advertise ? "enabled" : "disabled", - advertise_gw_macip_enabled(NULL) + advertise_svi_macip_enabled(NULL) ? "enabled" : "disabled"); |