diff --git a/sysdep/cf/README b/sysdep/cf/README index 9a7a4afa..68078bbe 100644 --- a/sysdep/cf/README +++ b/sysdep/cf/README @@ -4,7 +4,6 @@ Available configuration variables: CONFIG_AUTO_ROUTES Device routes are added automagically by the kernel CONFIG_SELF_CONSCIOUS We're able to recognize whether route was installed by us CONFIG_MULTIPLE_TABLES The kernel supports multiple routing tables -CONFIG_ALL_TABLES_AT_ONCE Kernel scanner wants to process all tables at once CONFIG_SINGLE_ROUTE There is only one route per network CONFIG_MC_PROPER_SRC Multicast packets have source address according to socket saddr field diff --git a/sysdep/cf/linux.h b/sysdep/cf/linux.h index 047d3764..c640bef4 100644 --- a/sysdep/cf/linux.h +++ b/sysdep/cf/linux.h @@ -9,7 +9,6 @@ #define CONFIG_AUTO_ROUTES #define CONFIG_SELF_CONSCIOUS #define CONFIG_MULTIPLE_TABLES -#define CONFIG_ALL_TABLES_AT_ONCE #define CONFIG_IP6_SADR_KERNEL #define CONFIG_MC_PROPER_SRC diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index e103c8ef..342b746c 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -74,51 +74,16 @@ #endif #define krt_ipv4(p) ((p)->af == AF_INET) -#define krt_ecmp6(p) ((p)->af == AF_INET6) const int rt_default_ecmp = 16; -/* - * Structure nl_parse_state keeps state of received route processing. Ideally, - * we could just independently parse received Netlink messages and immediately - * propagate received routes to the rest of BIRD, but older Linux kernel (before - * version 4.11) represents and announces IPv6 ECMP routes not as one route with - * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of - * routes with the same prefix. More recent kernels work as with IPv4. - * - * Therefore, BIRD keeps currently processed route in nl_parse_state structure - * and postpones its propagation until we expect it to be final; i.e., when - * non-matching route is received or when the scan ends. When another matching - * route is received, it is merged with the already processed route to form an - * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the - * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6 - * routes with RTA_MULTIPATH set are just considered non-matching. - * - * This is ignored for asynchronous notifications (every notification is handled - * as a separate route). It is not an issue for our routes, as we ignore such - * notifications anyways. But importing alien IPv6 ECMP routes does not work - * properly with older kernels. - * - * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes - * for the same prefix. - */ - struct nl_parse_state { + struct krt_proto *proto; struct linpool *pool; int scan; - int merge; - net *net; - rta *attrs; - struct krt_proto *proto; - s8 new; - s8 krt_src; - u8 krt_type; - u8 krt_proto; - u32 krt_metric; - - u32 rta_flow; /* Used during parsing */ + u32 rta_flow; }; /* @@ -161,16 +126,13 @@ nl_open_sock(struct nl_sock *nl) } } -static void +static int nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED) { - /* - * Strict checking is not necessary, it improves behavior on newer kernels. - * If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT - * run-time), we can just ignore it. - */ #ifdef SOL_NETLINK - setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); + return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +#else + return -1; #endif } @@ -198,10 +160,17 @@ nl_cfg_rx_buffer_size(struct config *cfg) static void nl_open(void) { + if ((nl_scan.fd >= 0) && (nl_req.fd >= 0)) + return; + nl_open_sock(&nl_scan); nl_open_sock(&nl_req); - nl_set_strict_dump(&nl_scan, 1); + if (nl_set_strict_dump(&nl_scan, 1) < 0) + { + log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once"); + krt_use_shared_scan(); + } } static void @@ -256,11 +225,13 @@ nl_request_dump_addr(int af) } static void -nl_request_dump_route(int af) +nl_request_dump_route(int af, int table_id) { struct { struct nlmsghdr nh; struct rtmsg rtm; + struct rtattr rta; + u32 table_id; } req = { .nh.nlmsg_type = RTM_GETROUTE, .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), @@ -269,7 +240,17 @@ nl_request_dump_route(int af) .rtm.rtm_family = af, }; - send(nl_scan.fd, &req, sizeof(req), 0); + if (table_id < 256) + req.rtm.rtm_table = table_id; + else + { + req.rta.rta_type = RTA_TABLE; + req.rta.rta_len = RTA_LENGTH(4); + req.table_id = table_id; + req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len; + } + + send(nl_scan.fd, &req, req.nh.nlmsg_len, 0); nl_scan.last_hdr = NULL; } @@ -1325,7 +1306,7 @@ nh_bufsize(struct nexthop *nh) } static int -nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh) +nl_send_route(struct krt_proto *p, rte *e, int op) { eattr *ea; net *net = e->net; @@ -1407,15 +1388,17 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh) /* For route delete, we do not specify remaining route attributes */ if (op == NL_OP_DELETE) - goto dest; + goto done; /* Default scope is LINK for device routes, UNIVERSE otherwise */ if (p->af == AF_MPLS) r->r.rtm_scope = RT_SCOPE_UNIVERSE; else if (ea = ea_find(eattrs, EA_KRT_SCOPE)) r->r.rtm_scope = ea->u.data; + else if (a->dest == RTD_UNICAST && ipa_zero(a->nh.gw)) + r->r.rtm_scope = RT_SCOPE_LINK; else - r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + r->r.rtm_scope = RT_SCOPE_UNIVERSE; if (ea = ea_find(eattrs, EA_KRT_PREFSRC)) nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data); @@ -1438,13 +1421,12 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh) if (metrics[0]) nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX); - -dest: - switch (dest) + switch (a->dest) { case RTD_UNICAST: r->r.rtm_type = RTN_UNICAST; - if (nh->next && !krt_ecmp6(p)) + struct nexthop *nh = &(a->nh); + if (nh->next) nl_add_multipath(&r->h, rsize, nh, p->af, eattrs); else { @@ -1470,82 +1452,53 @@ dest: bug("krt_capable inconsistent with nl_send_route"); } +done: /* Ignore missing for DELETE */ return nl_exchange(&r->h, (op == NL_OP_DELETE)); } static inline int -nl_add_rte(struct krt_proto *p, rte *e) +nl_allow_replace(struct krt_proto *p, rte *new) { - rta *a = e->attrs; - int err = 0; + /* + * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for + * matching rtm_protocol, but that is OK when dedicated priority is used. + * + * For IPv6, the NL_OP_REPLACE is still broken even in Linux 4.19 LTS + * (although it seems to be fixed in Linux 5.10 LTS) for sequence: + * + * ip route add 2001:db8::/32 via fe80::1 dev eth0 + * ip route replace 2001:db8::/32 dev eth0 + * + * (it ends with two routes instead of replacing the first by the second one) + * + * Replacing with direct and special type (e.g. unreachable) routes does not + * work, but replacing with regular routes work reliably + */ - if (krt_ecmp6(p) && a->nh.next) - { - struct nexthop *nh = &(a->nh); + if (krt_ipv4(p)) + return 1; - err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, nh); - if (err < 0) - return err; - - for (nh = nh->next; nh; nh = nh->next) - err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, nh); - - return err; - } - - return nl_send_route(p, e, NL_OP_ADD, a->dest, &(a->nh)); + rta *a = new->attrs; + return (a->dest == RTD_UNICAST) && ipa_nonzero(a->nh.gw); } -static inline int -nl_delete_rte(struct krt_proto *p, rte *e) -{ - int err = 0; - - /* For IPv6, we just repeatedly request DELETE until we get error */ - do - err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL); - while (krt_ecmp6(p) && !err); - - return err; -} - -static inline int -nl_replace_rte(struct krt_proto *p, rte *e) -{ - rta *a = e->attrs; - return nl_send_route(p, e, NL_OP_REPLACE, a->dest, &(a->nh)); -} - - void krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old) { int err = 0; - /* - * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for - * matching rtm_protocol, but that is OK when dedicated priority is used. - * - * We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP - * and with some kernel versions ECMP replace crashes kernel. Would need more - * testing and checks for kernel versions. - * - * For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the - * old route value, so we do not try to optimize IPv6 ECMP reconfigurations. - */ - - if (krt_ipv4(p) && old && new) + if (old && new && nl_allow_replace(p, new)) { - err = nl_replace_rte(p, new); + err = nl_send_route(p, new, NL_OP_REPLACE); } else { if (old) - nl_delete_rte(p, old); + nl_send_route(p, old, NL_OP_DELETE); if (new) - err = nl_add_rte(p, new); + err = nl_send_route(p, new, NL_OP_ADD); } if (new) @@ -1557,71 +1510,6 @@ krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old) } } -static int -nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family) -{ - /* Route merging is used for IPv6 scans */ - if (!s->scan || (rtm_family != AF_INET6)) - return 0; - - /* Saved and new route must have same network, proto/table, and priority */ - if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority)) - return 0; - - /* Both must be regular unicast routes */ - if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST)) - return 0; - - return 1; -} - -static void -nl_announce_route(struct nl_parse_state *s) -{ - rte *e = rte_get_temp(s->attrs, s->proto->p.main_source); - e->net = s->net; - - ea_list *ea = alloca(sizeof(ea_list) + 2 * sizeof(eattr)); - *ea = (ea_list) { .count = 2, .next = e->attrs->eattrs }; - e->attrs->eattrs = ea; - - ea->attrs[0] = (eattr) { - .id = EA_KRT_SOURCE, - .type = EAF_TYPE_INT, - .u.data = s->krt_proto, - }; - ea->attrs[1] = (eattr) { - .id = EA_KRT_METRIC, - .type = EAF_TYPE_INT, - .u.data = s->krt_metric, - }; - - if (s->scan) - krt_got_route(s->proto, e, s->krt_src); - else - krt_got_route_async(s->proto, e, s->new, s->krt_src); - - s->net = NULL; - s->attrs = NULL; - s->proto = NULL; - lp_flush(s->pool); -} - -static inline void -nl_parse_begin(struct nl_parse_state *s, int scan) -{ - memset(s, 0, sizeof (struct nl_parse_state)); - s->pool = nl_linpool; - s->scan = scan; -} - -static inline void -nl_parse_end(struct nl_parse_state *s) -{ - if (s->net) - nl_announce_route(s); -} - #define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0) #define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0) @@ -1759,13 +1647,29 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) net *net = net_get(p->p.main_channel->table, n); - if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family)) - nl_announce_route(s); - rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE); ra->source = RTS_INHERIT; ra->scope = SCOPE_UNIVERSE; + { + ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + 2 * sizeof(eattr)); + *ea = (ea_list) { .flags = EALF_SORTED, .count = 2 }; + ea->next = ra->eattrs; + ra->eattrs = ea; + + ea->attrs[0] = (eattr) { + .id = EA_KRT_SOURCE, + .type = EAF_TYPE_INT, + .u.data = i->rtm_protocol + }; + + ea->attrs[1] = (eattr) { + .id = EA_KRT_METRIC, + .type = EAF_TYPE_INT, + .u.data = priority, + }; + } + if (a[RTA_FLOW]) s->rta_flow = rta_get_u32(a[RTA_FLOW]); else @@ -1942,60 +1846,40 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) } } - /* - * Ideally, now we would send the received route to the rest of kernel code. - * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we - * postpone it and merge next hops until the end of the sequence. Note that - * when doing merging of next hops, we expect the new route to be unipath. - * Otherwise, we ignore additional next hops in nexthop_insert(). - */ + rte *e = rte_get_temp(ra, p->p.main_source); + e->net = net; - if (!s->net) - { - /* Store the new route */ - s->net = net; - s->attrs = ra; - s->proto = p; - s->new = new; - s->krt_src = krt_src; - s->krt_type = i->rtm_type; - s->krt_proto = i->rtm_protocol; - s->krt_metric = priority; - } + if (s->scan) + krt_got_route(p, e, krt_src); else - { - /* Merge next hops with the stored route */ - rta *oa = s->attrs; + krt_got_route_async(p, e, new, krt_src); - struct nexthop *nhs = &oa->nh; - nexthop_insert(&nhs, &ra->nh); - - /* Perhaps new nexthop is inserted at the first position */ - if (nhs == &ra->nh) - { - /* Swap rtas */ - s->attrs = ra; - - /* Keep old eattrs */ - ra->eattrs = oa->eattrs; - } - } + lp_flush(s->pool); } void -krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */ +krt_do_scan(struct krt_proto *p) { - struct nlmsghdr *h; - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = p, + .pool = nl_linpool, + .scan = 1, + }; - nl_parse_begin(&s, 1); - nl_request_dump_route(AF_UNSPEC); + /* Table-specific scan or shared scan */ + if (p) + nl_request_dump_route(p->af, krt_table_id(p)); + else + nl_request_dump_route(AF_UNSPEC, 0); + + struct nlmsghdr *h; while (h = nl_get_scan()) + { if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(&s, h); else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); - nl_parse_end(&s); + } } /* @@ -2010,16 +1894,18 @@ static struct config *nl_last_config; /* For tracking changes to nl_async_bufsiz static void nl_async_msg(struct nlmsghdr *h) { - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = NULL, + .pool = nl_linpool, + .scan = 0, + }; switch (h->nlmsg_type) { case RTM_NEWROUTE: case RTM_DELROUTE: DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type); - nl_parse_begin(&s, 0); nl_parse_route(&s, h); - nl_parse_end(&s); break; case RTM_NEWLINK: case RTM_DELLINK: diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index be547a93..c4a3a4a8 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -785,18 +785,17 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src) rte_free(e); } + /* * Periodic scanning */ - -#ifdef CONFIG_ALL_TABLES_AT_ONCE - -static timer *krt_scan_timer; -static int krt_scan_count; +static timer *krt_scan_all_timer; +static int krt_scan_all_count; +static _Bool krt_scan_all_tables; static void -krt_scan(timer *t UNUSED) +krt_scan_all(timer *t UNUSED) { struct krt_proto *p; node *n; @@ -817,35 +816,42 @@ krt_scan(timer *t UNUSED) } static void -krt_scan_timer_start(struct krt_proto *p) +krt_scan_all_timer_start(struct krt_proto *p) { - if (!krt_scan_count) - krt_scan_timer = tm_new_init(krt_pool, krt_scan, NULL, KRT_CF->scan_time, 0); + if (!krt_scan_all_count) + krt_scan_all_timer = tm_new_init(krt_pool, krt_scan_all, NULL, KRT_CF->scan_time, 0); - krt_scan_count++; + krt_scan_all_count++; - tm_start(krt_scan_timer, 1 S); + tm_start(krt_scan_all_timer, 1 S); } static void -krt_scan_timer_stop(struct krt_proto *p UNUSED) +krt_scan_all_timer_stop(void) { - krt_scan_count--; + ASSERT(krt_scan_all_count > 0); - if (!krt_scan_count) + krt_scan_all_count--; + + if (!krt_scan_all_count) { - rfree(krt_scan_timer); - krt_scan_timer = NULL; + rfree(krt_scan_all_timer); + krt_scan_all_timer = NULL; } } static void -krt_scan_timer_kick(struct krt_proto *p UNUSED) +krt_scan_all_timer_kick(void) { - tm_start(krt_scan_timer, 0); + tm_start(krt_scan_all_timer, 0); +} + +void +krt_use_shared_scan(void) +{ + krt_scan_all_tables = 1; } -#else static void krt_scan(timer *t) @@ -863,26 +869,33 @@ krt_scan(timer *t) static void krt_scan_timer_start(struct krt_proto *p) { - p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); - tm_start(p->scan_timer, 1 S); + if (krt_scan_all_tables) + krt_scan_all_timer_start(p); + else + { + p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); + tm_start(p->scan_timer, 1 S); + } } static void krt_scan_timer_stop(struct krt_proto *p) { - tm_stop(p->scan_timer); + if (krt_scan_all_tables) + krt_scan_all_timer_stop(); + else + tm_stop(p->scan_timer); } static void krt_scan_timer_kick(struct krt_proto *p) { - tm_start(p->scan_timer, 0); + if (krt_scan_all_tables) + krt_scan_all_timer_kick(); + else + tm_start(p->scan_timer, 0); } -#endif - - - /* * Updates @@ -992,11 +1005,6 @@ krt_postconfig(struct proto_config *CF) if (! proto_cf_main_channel(CF)) cf_error("Channel not specified"); -#ifdef CONFIG_ALL_TABLES_AT_ONCE - if (krt_cf->scan_time != cf->scan_time) - cf_error("All kernel syncers must use the same table scan interval"); -#endif - struct channel_config *cc = proto_cf_main_channel(CF); struct rtable_config *tab = cc->table; if (tab->krt_attached) diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index 20858cd7..18a206e6 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -55,10 +55,7 @@ struct krt_proto { struct rtable *krt_table; /* Internal table of inherited routes */ #endif -#ifndef CONFIG_ALL_TABLES_AT_ONCE timer *scan_timer; -#endif - struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */ struct bmap seen_map; /* Routes seen during last periodic scan */ node krt_node; /* Node in krt_proto_list */ @@ -79,6 +76,7 @@ extern pool *krt_pool; struct proto_config * kif_init_config(int class); void kif_request_scan(void); +void krt_use_shared_scan(void); void krt_got_route(struct krt_proto *p, struct rte *e, s8 src); void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src);