diff --git a/doc/bird.sgml b/doc/bird.sgml index 4b593ef6..648b4a1c 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -2377,6 +2377,7 @@ avoid routing loops. - BGP Administrative Shutdown Communication - Default EBGP Route Propagation Behavior without Policies - Revised Validation Procedure for BGP Flow Specifications + - Route Leak Prevention and Detection Using Roles Route selection rules @@ -2817,6 +2818,29 @@ using the following configuration parameters: protocol itself (for example, if a route is received through eBGP and therefore does not have such attribute). Default: 100 (0 in pre-1.2.0 versions of BIRD). + + + This attribute is defined in . OTC is a flag that marks + routes that should be sent only to customers. If is configured it set automatically. Example diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 2f6cf640..a56aeb19 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -903,6 +903,18 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad); } + +static void +bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to) +{ + if (len != 4) + WITHDRAW(BAD_LENGTH, "OTC", len); + + u32 val = get_u32(data); + bgp_set_attr_u32(to, s->pool, BA_ONLY_TO_CUSTOMER, flags, val); +} + + static void bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) { @@ -1116,6 +1128,13 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .encode = bgp_encode_u32s, .decode = bgp_decode_large_community, }, + [BA_ONLY_TO_CUSTOMER] = { + .name = "otc", + .type = EAF_TYPE_INT, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_u32, + .decode = bgp_decode_otc, + }, [BA_MPLS_LABEL_STACK] = { .name = "mpls_label_stack", .type = EAF_TYPE_INT_SET, @@ -1452,6 +1471,29 @@ bgp_finish_attrs(struct bgp_parse_state *s, rta *a) REPORT("Discarding AIGP attribute received on non-AIGP session"); bgp_unset_attr(&a->eattrs, s->pool, BA_AIGP); } + + /* Handle OTC ingress procedure, RFC 9234 */ + if (bgp_channel_is_role_applicable(s->channel)) + { + struct bgp_proto *p = s->proto; + eattr *e = bgp_find_attr(a->eattrs, BA_ONLY_TO_CUSTOMER); + + /* Reject routes from downstream if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + WITHDRAW("Route leak detected - OTC attribute from downstream"); + + /* Reject routes from peers if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PEER) && (e->u.data != p->cf->remote_as)) + WITHDRAW("Route leak detected - OTC attribute with mismatched ASN (%u)", + (uint) e->u.data); + + /* Mark routes from upstream if it did not happened before */ + if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_CLIENT)) + bgp_set_attr_u32(&a->eattrs, s->pool, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as); + } } @@ -1681,6 +1723,7 @@ bgp_preexport(struct channel *C, rte *e) struct proto *SRC = e->src->proto; struct bgp_proto *p = (struct bgp_proto *) C->proto; struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (struct bgp_proto *) SRC : NULL; + struct bgp_channel *c = (struct bgp_channel *) C; /* Reject our routes */ if (src == p) @@ -1708,11 +1751,11 @@ bgp_preexport(struct channel *C, rte *e) } /* Handle well-known communities, RFC 1997 */ - struct eattr *c; + struct eattr *a; if (p->cf->interpret_communities && - (c = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)))) + (a = bgp_find_attr(e->attrs->eattrs, BA_COMMUNITY))) { - const struct adata *d = c->u.ptr; + const struct adata *d = a->u.ptr; /* Do not export anywhere */ if (int_set_contains(d, BGP_COMM_NO_ADVERTISE)) @@ -1731,6 +1774,16 @@ bgp_preexport(struct channel *C, rte *e) return -1; } + /* Do not export routes marked with OTC to upstream, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(e->attrs->eattrs, BA_ONLY_TO_CUSTOMER); + if (a && (p->cf->local_role==BGP_ROLE_CUSTOMER || + p->cf->local_role==BGP_ROLE_PEER || + p->cf->local_role==BGP_ROLE_RS_CLIENT)) + return -1; + } + return 0; } @@ -1840,6 +1893,16 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at } } + /* Mark routes for downstream with OTC, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(attrs, BA_ONLY_TO_CUSTOMER); + if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + bgp_set_attr_u32(&attrs, pool, BA_ONLY_TO_CUSTOMER, 0, p->public_as); + } + /* * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above * conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 2b97cc4b..f07fcc16 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -102,6 +102,7 @@ * RFC 8212 - Default EBGP Route Propagation Behavior without Policies * RFC 8654 - Extended Message Support for BGP * RFC 9117 - Revised Validation Procedure for BGP Flow Specifications + * RFC 9234 - Route Leak Prevention and Detection Using Roles * draft-ietf-idr-ext-opt-param-07 * draft-uttaro-idr-bgp-persistence-04 * draft-walton-bgp-hostname-capability-02 @@ -1963,6 +1964,15 @@ bgp_postconfig(struct proto_config *CF) if (internal && cf->rs_client) cf_error("Only external neighbor can be RS client"); + if (internal && (cf->local_role != BGP_ROLE_UNDEFINED)) + cf_error("Local role cannot be set on IBGP sessions"); + + if (interior && (cf->local_role != BGP_ROLE_UNDEFINED)) + log(L_WARN "BGP roles are not recommended to be used within AS confederations"); + + if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED)) + cf_error("Local role must be set if roles are required"); + if (!cf->confederation && cf->confederation_member) cf_error("Confederation ID must be set for member sessions"); @@ -2325,6 +2335,15 @@ bgp_show_afis(int code, char *s, u32 *afis, uint count) cli_msg(code, b.start); } +static const char * +bgp_format_role_name(u8 role) +{ + static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" }; + if (role == BGP_ROLE_UNDEFINED) return "undefined"; + if (role < ARRAY_SIZE(bgp_role_names)) return bgp_role_names[role]; + return "?"; +} + static void bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) { @@ -2453,6 +2472,9 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) if (caps->hostname) cli_msg(-1006, " Hostname: %s", caps->hostname); + + if (caps->role != BGP_ROLE_UNDEFINED) + cli_msg(-1006, " Role: %s", bgp_format_role_name(caps->role)); } static void diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 13102857..15a759fb 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -113,6 +113,8 @@ struct bgp_config { int gr_mode; /* Graceful restart mode (BGP_GR_*) */ int llgr_mode; /* Long-lived graceful restart mode (BGP_LLGR_*) */ int setkey; /* Set MD5 password to system SA/SP database */ + u8 local_role; /* Set peering role with neighbor [RFC 9234] */ + int require_roles; /* Require configured roles on both sides */ /* Times below are in seconds */ unsigned gr_time; /* Graceful restart timeout */ unsigned llgr_time; /* Long-lived graceful restart stale time */ @@ -166,6 +168,13 @@ struct bgp_channel_config { #define BGP_PT_INTERNAL 1 #define BGP_PT_EXTERNAL 2 +#define BGP_ROLE_UNDEFINED 255 +#define BGP_ROLE_PROVIDER 0 +#define BGP_ROLE_RS_SERVER 1 +#define BGP_ROLE_RS_CLIENT 2 +#define BGP_ROLE_CUSTOMER 3 +#define BGP_ROLE_PEER 4 + #define NH_NO 0 #define NH_ALL 1 #define NH_IBGP 2 @@ -226,6 +235,7 @@ struct bgp_caps { u8 ext_messages; /* Extended message length, RFC draft */ u8 route_refresh; /* Route refresh capability, RFC 2918 */ u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */ + u8 role; /* BGP role capability, RFC 9234 */ u8 gr_aware; /* Graceful restart capability, RFC 4724 */ u8 gr_flags; /* Graceful restart flags */ @@ -487,6 +497,12 @@ static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c) static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c) { return BGP_AFI(c->afi) == BGP_AFI_IPV6; } +static inline int bgp_channel_is_role_applicable(struct bgp_channel *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + +static inline int bgp_cc_is_role_applicable(struct bgp_channel_config *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + static inline uint bgp_max_packet_length(struct bgp_conn *conn) { return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } @@ -658,6 +674,7 @@ void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to); #define BA_AS4_AGGREGATOR 0x12 /* RFC 6793 */ #define BA_AIGP 0x1a /* RFC 7311 */ #define BA_LARGE_COMMUNITY 0x20 /* RFC 8092 */ +#define BA_ONLY_TO_CUSTOMER 0x23 /* RFC 9234 */ /* Bird's private internal BGP attributes */ #define BA_MPLS_LABEL_STACK 0xfe /* MPLS label stack transfer attribute */ diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 241aa7c2..cb410a5e 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -31,7 +31,8 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG, LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS, DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE, - FIRST, FREE, VALIDATE, BASE) + FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER, + RS_SERVER, RS_CLIENT, REQUIRE, BGP_OTC) %type bgp_nh %type bgp_afi @@ -40,7 +41,7 @@ CF_KEYWORDS(CEASE, PREFIX, LIMIT, HIT, ADMINISTRATIVE, SHUTDOWN, RESET, PEER, CONFIGURATION, CHANGE, DECONFIGURED, CONNECTION, REJECTED, COLLISION, OUT, OF, RESOURCES) -%type bgp_cease_mask bgp_cease_list bgp_cease_flag +%type bgp_cease_mask bgp_cease_list bgp_cease_flag bgp_role_name CF_GRAMMAR @@ -72,6 +73,7 @@ bgp_proto_start: proto_start BGP { BGP_CFG->llgr_mode = -1; BGP_CFG->llgr_time = 3600; BGP_CFG->setkey = 1; + BGP_CFG->local_role = BGP_ROLE_UNDEFINED; BGP_CFG->dynamic_name = "dynbgp"; BGP_CFG->check_link = -1; } @@ -114,6 +116,14 @@ bgp_cease_flag: | OUT OF RESOURCES { $$ = 1 << 8; } ; +bgp_role_name: + PEER { $$ = BGP_ROLE_PEER; } + | PROVIDER { $$ = BGP_ROLE_PROVIDER; } + | CUSTOMER { $$ = BGP_ROLE_CUSTOMER; } + | RS_SERVER { $$ = BGP_ROLE_RS_SERVER; } + | RS_CLIENT { $$ = BGP_ROLE_RS_CLIENT; } + ; + bgp_proto: bgp_proto_start proto_name '{' | bgp_proto proto_item ';' @@ -197,6 +207,8 @@ bgp_proto: | bgp_proto BFD GRACEFUL ';' { init_bfd_opts(&BGP_CFG->bfd); BGP_CFG->bfd->mode = BGP_BFD_GRACEFUL; } | bgp_proto BFD { open_bfd_opts(&BGP_CFG->bfd); } bfd_opts { close_bfd_opts(); } ';' | bgp_proto ENFORCE FIRST AS bool ';' { BGP_CFG->enforce_first_as = $5; } + | bgp_proto LOCAL ROLE bgp_role_name ';' { BGP_CFG->local_role = $4; } + | bgp_proto REQUIRE ROLES bool ';' { BGP_CFG->require_roles = $4; } ; bgp_afi: @@ -343,6 +355,8 @@ dynamic_attr: BGP_AIGP { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_AIGP)); } ; dynamic_attr: BGP_LARGE_COMMUNITY { $$ = f_new_dynamic_attr(EAF_TYPE_LC_SET, T_LCLIST, EA_CODE(PROTOCOL_BGP, BA_LARGE_COMMUNITY)); } ; +dynamic_attr: BGP_OTC + { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_ONLY_TO_CUSTOMER)); } ; diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index a805cdf0..3aa62c39 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -238,6 +238,7 @@ bgp_prepare_capabilities(struct bgp_conn *conn) caps->ext_messages = p->cf->enable_extended_messages; caps->route_refresh = p->cf->enable_refresh; caps->enhanced_refresh = p->cf->enable_refresh; + caps->role = p->cf->local_role; if (caps->as4_support) caps->as4_number = p->public_as; @@ -350,6 +351,13 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf) *buf++ = 0; /* Capability data length */ } + if (caps->role != BGP_ROLE_UNDEFINED) + { + *buf++ = 9; /* Capability 9: Announce chosen BGP role */ + *buf++ = 1; /* Capability data length */ + *buf++ = caps->role; + } + if (caps->gr_aware) { *buf++ = 64; /* Capability 64: Support for graceful restart */ @@ -449,11 +457,15 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) struct bgp_proto *p = conn->bgp; struct bgp_caps *caps; struct bgp_af_caps *ac; + uint err_subcode = 0; int i, cl; u32 af; if (!conn->remote_caps) + { caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + sizeof(struct bgp_af_caps)); + caps->role = BGP_ROLE_UNDEFINED; + } else { caps = conn->remote_caps; @@ -513,6 +525,21 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) caps->ext_messages = 1; break; + case 9: /* BGP role capability, RFC 9234 */ + if (cl != 1) + goto err; + + /* Reserved value */ + if (pos[2] == BGP_ROLE_UNDEFINED) + { err_subcode = 11; goto err; } + + /* Multiple inconsistent values */ + if ((caps->role != BGP_ROLE_UNDEFINED) && (caps->role != pos[2])) + { err_subcode = 11; goto err; } + + caps->role = pos[2]; + break; + case 64: /* Graceful restart capability, RFC 4724 */ if (cl % 4 != 2) goto err; @@ -638,7 +665,7 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) err: mb_free(caps); - bgp_error(conn, 2, 0, NULL, 0); + bgp_error(conn, 2, err_subcode, NULL, 0); return -1; } @@ -854,6 +881,22 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) conn->received_as = asn; } + /* RFC 9234 4.2 - check role agreement */ + u8 local_role = p->cf->local_role; + u8 neigh_role = caps->role; + + if ((local_role != BGP_ROLE_UNDEFINED) && + (neigh_role != BGP_ROLE_UNDEFINED) && + !((local_role == BGP_ROLE_PEER && neigh_role == BGP_ROLE_PEER) || + (local_role == BGP_ROLE_CUSTOMER && neigh_role == BGP_ROLE_PROVIDER) || + (local_role == BGP_ROLE_PROVIDER && neigh_role == BGP_ROLE_CUSTOMER) || + (local_role == BGP_ROLE_RS_CLIENT && neigh_role == BGP_ROLE_RS_SERVER) || + (local_role == BGP_ROLE_RS_SERVER && neigh_role == BGP_ROLE_RS_CLIENT))) + { bgp_error(conn, 2, 11, NULL, 0); return; } + + if ((p->cf->require_roles) && (neigh_role == BGP_ROLE_UNDEFINED)) + { bgp_error(conn, 2, 11, NULL, 0); return; } + /* Check the other connection */ other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; switch (other->state) @@ -2985,6 +3028,7 @@ static struct { { 2, 6, "Unacceptable hold time" }, { 2, 7, "Required capability missing" }, /* [RFC5492] */ { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */ + { 2,11, "Role mismatch" }, /* From Open Policy, RFC 9234 */ { 3, 0, "Invalid UPDATE message" }, { 3, 1, "Malformed attribute list" }, { 3, 2, "Unrecognized well-known attribute" }, diff --git a/sysdep/cf/README b/sysdep/cf/README index 9a7a4afa..68078bbe 100644 --- a/sysdep/cf/README +++ b/sysdep/cf/README @@ -4,7 +4,6 @@ Available configuration variables: CONFIG_AUTO_ROUTES Device routes are added automagically by the kernel CONFIG_SELF_CONSCIOUS We're able to recognize whether route was installed by us CONFIG_MULTIPLE_TABLES The kernel supports multiple routing tables -CONFIG_ALL_TABLES_AT_ONCE Kernel scanner wants to process all tables at once CONFIG_SINGLE_ROUTE There is only one route per network CONFIG_MC_PROPER_SRC Multicast packets have source address according to socket saddr field diff --git a/sysdep/cf/linux.h b/sysdep/cf/linux.h index 047d3764..c640bef4 100644 --- a/sysdep/cf/linux.h +++ b/sysdep/cf/linux.h @@ -9,7 +9,6 @@ #define CONFIG_AUTO_ROUTES #define CONFIG_SELF_CONSCIOUS #define CONFIG_MULTIPLE_TABLES -#define CONFIG_ALL_TABLES_AT_ONCE #define CONFIG_IP6_SADR_KERNEL #define CONFIG_MC_PROPER_SRC diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index e103c8ef..342b746c 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -74,51 +74,16 @@ #endif #define krt_ipv4(p) ((p)->af == AF_INET) -#define krt_ecmp6(p) ((p)->af == AF_INET6) const int rt_default_ecmp = 16; -/* - * Structure nl_parse_state keeps state of received route processing. Ideally, - * we could just independently parse received Netlink messages and immediately - * propagate received routes to the rest of BIRD, but older Linux kernel (before - * version 4.11) represents and announces IPv6 ECMP routes not as one route with - * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of - * routes with the same prefix. More recent kernels work as with IPv4. - * - * Therefore, BIRD keeps currently processed route in nl_parse_state structure - * and postpones its propagation until we expect it to be final; i.e., when - * non-matching route is received or when the scan ends. When another matching - * route is received, it is merged with the already processed route to form an - * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the - * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6 - * routes with RTA_MULTIPATH set are just considered non-matching. - * - * This is ignored for asynchronous notifications (every notification is handled - * as a separate route). It is not an issue for our routes, as we ignore such - * notifications anyways. But importing alien IPv6 ECMP routes does not work - * properly with older kernels. - * - * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes - * for the same prefix. - */ - struct nl_parse_state { + struct krt_proto *proto; struct linpool *pool; int scan; - int merge; - net *net; - rta *attrs; - struct krt_proto *proto; - s8 new; - s8 krt_src; - u8 krt_type; - u8 krt_proto; - u32 krt_metric; - - u32 rta_flow; /* Used during parsing */ + u32 rta_flow; }; /* @@ -161,16 +126,13 @@ nl_open_sock(struct nl_sock *nl) } } -static void +static int nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED) { - /* - * Strict checking is not necessary, it improves behavior on newer kernels. - * If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT - * run-time), we can just ignore it. - */ #ifdef SOL_NETLINK - setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); + return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +#else + return -1; #endif } @@ -198,10 +160,17 @@ nl_cfg_rx_buffer_size(struct config *cfg) static void nl_open(void) { + if ((nl_scan.fd >= 0) && (nl_req.fd >= 0)) + return; + nl_open_sock(&nl_scan); nl_open_sock(&nl_req); - nl_set_strict_dump(&nl_scan, 1); + if (nl_set_strict_dump(&nl_scan, 1) < 0) + { + log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once"); + krt_use_shared_scan(); + } } static void @@ -256,11 +225,13 @@ nl_request_dump_addr(int af) } static void -nl_request_dump_route(int af) +nl_request_dump_route(int af, int table_id) { struct { struct nlmsghdr nh; struct rtmsg rtm; + struct rtattr rta; + u32 table_id; } req = { .nh.nlmsg_type = RTM_GETROUTE, .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), @@ -269,7 +240,17 @@ nl_request_dump_route(int af) .rtm.rtm_family = af, }; - send(nl_scan.fd, &req, sizeof(req), 0); + if (table_id < 256) + req.rtm.rtm_table = table_id; + else + { + req.rta.rta_type = RTA_TABLE; + req.rta.rta_len = RTA_LENGTH(4); + req.table_id = table_id; + req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len; + } + + send(nl_scan.fd, &req, req.nh.nlmsg_len, 0); nl_scan.last_hdr = NULL; } @@ -1325,7 +1306,7 @@ nh_bufsize(struct nexthop *nh) } static int -nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh) +nl_send_route(struct krt_proto *p, rte *e, int op) { eattr *ea; net *net = e->net; @@ -1407,15 +1388,17 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh) /* For route delete, we do not specify remaining route attributes */ if (op == NL_OP_DELETE) - goto dest; + goto done; /* Default scope is LINK for device routes, UNIVERSE otherwise */ if (p->af == AF_MPLS) r->r.rtm_scope = RT_SCOPE_UNIVERSE; else if (ea = ea_find(eattrs, EA_KRT_SCOPE)) r->r.rtm_scope = ea->u.data; + else if (a->dest == RTD_UNICAST && ipa_zero(a->nh.gw)) + r->r.rtm_scope = RT_SCOPE_LINK; else - r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + r->r.rtm_scope = RT_SCOPE_UNIVERSE; if (ea = ea_find(eattrs, EA_KRT_PREFSRC)) nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data); @@ -1438,13 +1421,12 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh) if (metrics[0]) nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX); - -dest: - switch (dest) + switch (a->dest) { case RTD_UNICAST: r->r.rtm_type = RTN_UNICAST; - if (nh->next && !krt_ecmp6(p)) + struct nexthop *nh = &(a->nh); + if (nh->next) nl_add_multipath(&r->h, rsize, nh, p->af, eattrs); else { @@ -1470,82 +1452,53 @@ dest: bug("krt_capable inconsistent with nl_send_route"); } +done: /* Ignore missing for DELETE */ return nl_exchange(&r->h, (op == NL_OP_DELETE)); } static inline int -nl_add_rte(struct krt_proto *p, rte *e) +nl_allow_replace(struct krt_proto *p, rte *new) { - rta *a = e->attrs; - int err = 0; + /* + * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for + * matching rtm_protocol, but that is OK when dedicated priority is used. + * + * For IPv6, the NL_OP_REPLACE is still broken even in Linux 4.19 LTS + * (although it seems to be fixed in Linux 5.10 LTS) for sequence: + * + * ip route add 2001:db8::/32 via fe80::1 dev eth0 + * ip route replace 2001:db8::/32 dev eth0 + * + * (it ends with two routes instead of replacing the first by the second one) + * + * Replacing with direct and special type (e.g. unreachable) routes does not + * work, but replacing with regular routes work reliably + */ - if (krt_ecmp6(p) && a->nh.next) - { - struct nexthop *nh = &(a->nh); + if (krt_ipv4(p)) + return 1; - err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, nh); - if (err < 0) - return err; - - for (nh = nh->next; nh; nh = nh->next) - err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, nh); - - return err; - } - - return nl_send_route(p, e, NL_OP_ADD, a->dest, &(a->nh)); + rta *a = new->attrs; + return (a->dest == RTD_UNICAST) && ipa_nonzero(a->nh.gw); } -static inline int -nl_delete_rte(struct krt_proto *p, rte *e) -{ - int err = 0; - - /* For IPv6, we just repeatedly request DELETE until we get error */ - do - err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL); - while (krt_ecmp6(p) && !err); - - return err; -} - -static inline int -nl_replace_rte(struct krt_proto *p, rte *e) -{ - rta *a = e->attrs; - return nl_send_route(p, e, NL_OP_REPLACE, a->dest, &(a->nh)); -} - - void krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old) { int err = 0; - /* - * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for - * matching rtm_protocol, but that is OK when dedicated priority is used. - * - * We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP - * and with some kernel versions ECMP replace crashes kernel. Would need more - * testing and checks for kernel versions. - * - * For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the - * old route value, so we do not try to optimize IPv6 ECMP reconfigurations. - */ - - if (krt_ipv4(p) && old && new) + if (old && new && nl_allow_replace(p, new)) { - err = nl_replace_rte(p, new); + err = nl_send_route(p, new, NL_OP_REPLACE); } else { if (old) - nl_delete_rte(p, old); + nl_send_route(p, old, NL_OP_DELETE); if (new) - err = nl_add_rte(p, new); + err = nl_send_route(p, new, NL_OP_ADD); } if (new) @@ -1557,71 +1510,6 @@ krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old) } } -static int -nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family) -{ - /* Route merging is used for IPv6 scans */ - if (!s->scan || (rtm_family != AF_INET6)) - return 0; - - /* Saved and new route must have same network, proto/table, and priority */ - if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority)) - return 0; - - /* Both must be regular unicast routes */ - if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST)) - return 0; - - return 1; -} - -static void -nl_announce_route(struct nl_parse_state *s) -{ - rte *e = rte_get_temp(s->attrs, s->proto->p.main_source); - e->net = s->net; - - ea_list *ea = alloca(sizeof(ea_list) + 2 * sizeof(eattr)); - *ea = (ea_list) { .count = 2, .next = e->attrs->eattrs }; - e->attrs->eattrs = ea; - - ea->attrs[0] = (eattr) { - .id = EA_KRT_SOURCE, - .type = EAF_TYPE_INT, - .u.data = s->krt_proto, - }; - ea->attrs[1] = (eattr) { - .id = EA_KRT_METRIC, - .type = EAF_TYPE_INT, - .u.data = s->krt_metric, - }; - - if (s->scan) - krt_got_route(s->proto, e, s->krt_src); - else - krt_got_route_async(s->proto, e, s->new, s->krt_src); - - s->net = NULL; - s->attrs = NULL; - s->proto = NULL; - lp_flush(s->pool); -} - -static inline void -nl_parse_begin(struct nl_parse_state *s, int scan) -{ - memset(s, 0, sizeof (struct nl_parse_state)); - s->pool = nl_linpool; - s->scan = scan; -} - -static inline void -nl_parse_end(struct nl_parse_state *s) -{ - if (s->net) - nl_announce_route(s); -} - #define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0) #define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0) @@ -1759,13 +1647,29 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) net *net = net_get(p->p.main_channel->table, n); - if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family)) - nl_announce_route(s); - rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE); ra->source = RTS_INHERIT; ra->scope = SCOPE_UNIVERSE; + { + ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + 2 * sizeof(eattr)); + *ea = (ea_list) { .flags = EALF_SORTED, .count = 2 }; + ea->next = ra->eattrs; + ra->eattrs = ea; + + ea->attrs[0] = (eattr) { + .id = EA_KRT_SOURCE, + .type = EAF_TYPE_INT, + .u.data = i->rtm_protocol + }; + + ea->attrs[1] = (eattr) { + .id = EA_KRT_METRIC, + .type = EAF_TYPE_INT, + .u.data = priority, + }; + } + if (a[RTA_FLOW]) s->rta_flow = rta_get_u32(a[RTA_FLOW]); else @@ -1942,60 +1846,40 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) } } - /* - * Ideally, now we would send the received route to the rest of kernel code. - * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we - * postpone it and merge next hops until the end of the sequence. Note that - * when doing merging of next hops, we expect the new route to be unipath. - * Otherwise, we ignore additional next hops in nexthop_insert(). - */ + rte *e = rte_get_temp(ra, p->p.main_source); + e->net = net; - if (!s->net) - { - /* Store the new route */ - s->net = net; - s->attrs = ra; - s->proto = p; - s->new = new; - s->krt_src = krt_src; - s->krt_type = i->rtm_type; - s->krt_proto = i->rtm_protocol; - s->krt_metric = priority; - } + if (s->scan) + krt_got_route(p, e, krt_src); else - { - /* Merge next hops with the stored route */ - rta *oa = s->attrs; + krt_got_route_async(p, e, new, krt_src); - struct nexthop *nhs = &oa->nh; - nexthop_insert(&nhs, &ra->nh); - - /* Perhaps new nexthop is inserted at the first position */ - if (nhs == &ra->nh) - { - /* Swap rtas */ - s->attrs = ra; - - /* Keep old eattrs */ - ra->eattrs = oa->eattrs; - } - } + lp_flush(s->pool); } void -krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */ +krt_do_scan(struct krt_proto *p) { - struct nlmsghdr *h; - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = p, + .pool = nl_linpool, + .scan = 1, + }; - nl_parse_begin(&s, 1); - nl_request_dump_route(AF_UNSPEC); + /* Table-specific scan or shared scan */ + if (p) + nl_request_dump_route(p->af, krt_table_id(p)); + else + nl_request_dump_route(AF_UNSPEC, 0); + + struct nlmsghdr *h; while (h = nl_get_scan()) + { if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(&s, h); else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); - nl_parse_end(&s); + } } /* @@ -2010,16 +1894,18 @@ static struct config *nl_last_config; /* For tracking changes to nl_async_bufsiz static void nl_async_msg(struct nlmsghdr *h) { - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = NULL, + .pool = nl_linpool, + .scan = 0, + }; switch (h->nlmsg_type) { case RTM_NEWROUTE: case RTM_DELROUTE: DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type); - nl_parse_begin(&s, 0); nl_parse_route(&s, h); - nl_parse_end(&s); break; case RTM_NEWLINK: case RTM_DELLINK: diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index be547a93..c4a3a4a8 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -785,18 +785,17 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src) rte_free(e); } + /* * Periodic scanning */ - -#ifdef CONFIG_ALL_TABLES_AT_ONCE - -static timer *krt_scan_timer; -static int krt_scan_count; +static timer *krt_scan_all_timer; +static int krt_scan_all_count; +static _Bool krt_scan_all_tables; static void -krt_scan(timer *t UNUSED) +krt_scan_all(timer *t UNUSED) { struct krt_proto *p; node *n; @@ -817,35 +816,42 @@ krt_scan(timer *t UNUSED) } static void -krt_scan_timer_start(struct krt_proto *p) +krt_scan_all_timer_start(struct krt_proto *p) { - if (!krt_scan_count) - krt_scan_timer = tm_new_init(krt_pool, krt_scan, NULL, KRT_CF->scan_time, 0); + if (!krt_scan_all_count) + krt_scan_all_timer = tm_new_init(krt_pool, krt_scan_all, NULL, KRT_CF->scan_time, 0); - krt_scan_count++; + krt_scan_all_count++; - tm_start(krt_scan_timer, 1 S); + tm_start(krt_scan_all_timer, 1 S); } static void -krt_scan_timer_stop(struct krt_proto *p UNUSED) +krt_scan_all_timer_stop(void) { - krt_scan_count--; + ASSERT(krt_scan_all_count > 0); - if (!krt_scan_count) + krt_scan_all_count--; + + if (!krt_scan_all_count) { - rfree(krt_scan_timer); - krt_scan_timer = NULL; + rfree(krt_scan_all_timer); + krt_scan_all_timer = NULL; } } static void -krt_scan_timer_kick(struct krt_proto *p UNUSED) +krt_scan_all_timer_kick(void) { - tm_start(krt_scan_timer, 0); + tm_start(krt_scan_all_timer, 0); +} + +void +krt_use_shared_scan(void) +{ + krt_scan_all_tables = 1; } -#else static void krt_scan(timer *t) @@ -863,26 +869,33 @@ krt_scan(timer *t) static void krt_scan_timer_start(struct krt_proto *p) { - p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); - tm_start(p->scan_timer, 1 S); + if (krt_scan_all_tables) + krt_scan_all_timer_start(p); + else + { + p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); + tm_start(p->scan_timer, 1 S); + } } static void krt_scan_timer_stop(struct krt_proto *p) { - tm_stop(p->scan_timer); + if (krt_scan_all_tables) + krt_scan_all_timer_stop(); + else + tm_stop(p->scan_timer); } static void krt_scan_timer_kick(struct krt_proto *p) { - tm_start(p->scan_timer, 0); + if (krt_scan_all_tables) + krt_scan_all_timer_kick(); + else + tm_start(p->scan_timer, 0); } -#endif - - - /* * Updates @@ -992,11 +1005,6 @@ krt_postconfig(struct proto_config *CF) if (! proto_cf_main_channel(CF)) cf_error("Channel not specified"); -#ifdef CONFIG_ALL_TABLES_AT_ONCE - if (krt_cf->scan_time != cf->scan_time) - cf_error("All kernel syncers must use the same table scan interval"); -#endif - struct channel_config *cc = proto_cf_main_channel(CF); struct rtable_config *tab = cc->table; if (tab->krt_attached) diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index 20858cd7..18a206e6 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -55,10 +55,7 @@ struct krt_proto { struct rtable *krt_table; /* Internal table of inherited routes */ #endif -#ifndef CONFIG_ALL_TABLES_AT_ONCE timer *scan_timer; -#endif - struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */ struct bmap seen_map; /* Routes seen during last periodic scan */ node krt_node; /* Node in krt_proto_list */ @@ -79,6 +76,7 @@ extern pool *krt_pool; struct proto_config * kif_init_config(int class); void kif_request_scan(void); +void krt_use_shared_scan(void); void krt_got_route(struct krt_proto *p, struct rte *e, s8 src); void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src);