Merge commit '082905a8' into HEAD

This commit is contained in:
Maria Matejka 2022-08-03 15:04:42 +02:00
commit bc4ad83dac
11 changed files with 342 additions and 263 deletions

View File

@ -2377,6 +2377,7 @@ avoid routing loops.
<item> <rfc id="8203"> - BGP Administrative Shutdown Communication
<item> <rfc id="8212"> - Default EBGP Route Propagation Behavior without Policies
<item> <rfc id="9117"> - Revised Validation Procedure for BGP Flow Specifications
<item> <rfc id="9234"> - Route Leak Prevention and Detection Using Roles
</itemize>
<sect1>Route selection rules
@ -2817,6 +2818,29 @@ using the following configuration parameters:
protocol itself (for example, if a route is received through eBGP and
therefore does not have such attribute). Default: 100 (0 in pre-1.2.0
versions of BIRD).
<tag><label id="bgp-local-role">local role <m/role-name/</tag>
BGP roles are a mechanism for route leak prevention and automatic route
filtering based on common BGP topology relationships. They are defined
in <rfc id="9234">. Instead of manually configuring filters and
communities, automatic filtering is done with the help of the OTC
attribute - a flag for routes that should be sent only to customers.
The same attribute is also used to automatically detect and filter route
leaks created by third parties.
This option is valid for EBGP sessions, but it is not recommended to be
used within AS confederations (which would require manual filtering of
<cf/bgp_otc/ attribute on confederation boundaries).
Possible <cf><m/role-name/</cf> values are: <cf/provider/,
<cf/rs_server/, <cf/rs_client/, <cf/customer/ and <cf/peer/.
Default: No local role assigned.
<tag><label id="bgp-require-roles">require roles <m/switch/</tag>
If this option is set, the BGP roles must be defined on both sides,
otherwise the session will not be established. This behavior is defined
in <rfc id="9234"> as "strict mode" and is used to enforce corresponding
configuration at your conterpart side. Default: disabled.
</descrip>
<sect1>Channel configuration
@ -3124,6 +3148,11 @@ some of them (marked with `<tt/O/') are optional.
This attribute contains accumulated IGP metric, which is a total
distance to the destination through multiple autonomous systems.
Currently, the attribute is not accessible from filters.
<tag><label id="bgp-otc">int bgp_otc [O]</tag>
This attribute is defined in <rfc id="9234">. OTC is a flag that marks
routes that should be sent only to customers. If <ref id="bgp-role"
name="local Role"> is configured it set automatically.
</descrip>
<sect1>Example

View File

@ -903,6 +903,18 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla
bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad);
}
static void
bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to)
{
if (len != 4)
WITHDRAW(BAD_LENGTH, "OTC", len);
u32 val = get_u32(data);
bgp_set_attr_u32(to, s->pool, BA_ONLY_TO_CUSTOMER, flags, val);
}
static void
bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a)
{
@ -1116,6 +1128,13 @@ static const struct bgp_attr_desc bgp_attr_table[] = {
.encode = bgp_encode_u32s,
.decode = bgp_decode_large_community,
},
[BA_ONLY_TO_CUSTOMER] = {
.name = "otc",
.type = EAF_TYPE_INT,
.flags = BAF_OPTIONAL | BAF_TRANSITIVE,
.encode = bgp_encode_u32,
.decode = bgp_decode_otc,
},
[BA_MPLS_LABEL_STACK] = {
.name = "mpls_label_stack",
.type = EAF_TYPE_INT_SET,
@ -1452,6 +1471,29 @@ bgp_finish_attrs(struct bgp_parse_state *s, rta *a)
REPORT("Discarding AIGP attribute received on non-AIGP session");
bgp_unset_attr(&a->eattrs, s->pool, BA_AIGP);
}
/* Handle OTC ingress procedure, RFC 9234 */
if (bgp_channel_is_role_applicable(s->channel))
{
struct bgp_proto *p = s->proto;
eattr *e = bgp_find_attr(a->eattrs, BA_ONLY_TO_CUSTOMER);
/* Reject routes from downstream if they are leaked */
if (e && (p->cf->local_role == BGP_ROLE_PROVIDER ||
p->cf->local_role == BGP_ROLE_RS_SERVER))
WITHDRAW("Route leak detected - OTC attribute from downstream");
/* Reject routes from peers if they are leaked */
if (e && (p->cf->local_role == BGP_ROLE_PEER) && (e->u.data != p->cf->remote_as))
WITHDRAW("Route leak detected - OTC attribute with mismatched ASN (%u)",
(uint) e->u.data);
/* Mark routes from upstream if it did not happened before */
if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER ||
p->cf->local_role == BGP_ROLE_PEER ||
p->cf->local_role == BGP_ROLE_RS_CLIENT))
bgp_set_attr_u32(&a->eattrs, s->pool, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as);
}
}
@ -1681,6 +1723,7 @@ bgp_preexport(struct channel *C, rte *e)
struct proto *SRC = e->src->proto;
struct bgp_proto *p = (struct bgp_proto *) C->proto;
struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (struct bgp_proto *) SRC : NULL;
struct bgp_channel *c = (struct bgp_channel *) C;
/* Reject our routes */
if (src == p)
@ -1708,11 +1751,11 @@ bgp_preexport(struct channel *C, rte *e)
}
/* Handle well-known communities, RFC 1997 */
struct eattr *c;
struct eattr *a;
if (p->cf->interpret_communities &&
(c = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY))))
(a = bgp_find_attr(e->attrs->eattrs, BA_COMMUNITY)))
{
const struct adata *d = c->u.ptr;
const struct adata *d = a->u.ptr;
/* Do not export anywhere */
if (int_set_contains(d, BGP_COMM_NO_ADVERTISE))
@ -1731,6 +1774,16 @@ bgp_preexport(struct channel *C, rte *e)
return -1;
}
/* Do not export routes marked with OTC to upstream, RFC 9234 */
if (bgp_channel_is_role_applicable(c))
{
a = bgp_find_attr(e->attrs->eattrs, BA_ONLY_TO_CUSTOMER);
if (a && (p->cf->local_role==BGP_ROLE_CUSTOMER ||
p->cf->local_role==BGP_ROLE_PEER ||
p->cf->local_role==BGP_ROLE_RS_CLIENT))
return -1;
}
return 0;
}
@ -1840,6 +1893,16 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at
}
}
/* Mark routes for downstream with OTC, RFC 9234 */
if (bgp_channel_is_role_applicable(c))
{
a = bgp_find_attr(attrs, BA_ONLY_TO_CUSTOMER);
if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER ||
p->cf->local_role == BGP_ROLE_PEER ||
p->cf->local_role == BGP_ROLE_RS_SERVER))
bgp_set_attr_u32(&attrs, pool, BA_ONLY_TO_CUSTOMER, 0, p->public_as);
}
/*
* Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above
* conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute

View File

@ -102,6 +102,7 @@
* RFC 8212 - Default EBGP Route Propagation Behavior without Policies
* RFC 8654 - Extended Message Support for BGP
* RFC 9117 - Revised Validation Procedure for BGP Flow Specifications
* RFC 9234 - Route Leak Prevention and Detection Using Roles
* draft-ietf-idr-ext-opt-param-07
* draft-uttaro-idr-bgp-persistence-04
* draft-walton-bgp-hostname-capability-02
@ -1963,6 +1964,15 @@ bgp_postconfig(struct proto_config *CF)
if (internal && cf->rs_client)
cf_error("Only external neighbor can be RS client");
if (internal && (cf->local_role != BGP_ROLE_UNDEFINED))
cf_error("Local role cannot be set on IBGP sessions");
if (interior && (cf->local_role != BGP_ROLE_UNDEFINED))
log(L_WARN "BGP roles are not recommended to be used within AS confederations");
if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED))
cf_error("Local role must be set if roles are required");
if (!cf->confederation && cf->confederation_member)
cf_error("Confederation ID must be set for member sessions");
@ -2325,6 +2335,15 @@ bgp_show_afis(int code, char *s, u32 *afis, uint count)
cli_msg(code, b.start);
}
static const char *
bgp_format_role_name(u8 role)
{
static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" };
if (role == BGP_ROLE_UNDEFINED) return "undefined";
if (role < ARRAY_SIZE(bgp_role_names)) return bgp_role_names[role];
return "?";
}
static void
bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
{
@ -2453,6 +2472,9 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
if (caps->hostname)
cli_msg(-1006, " Hostname: %s", caps->hostname);
if (caps->role != BGP_ROLE_UNDEFINED)
cli_msg(-1006, " Role: %s", bgp_format_role_name(caps->role));
}
static void

View File

@ -113,6 +113,8 @@ struct bgp_config {
int gr_mode; /* Graceful restart mode (BGP_GR_*) */
int llgr_mode; /* Long-lived graceful restart mode (BGP_LLGR_*) */
int setkey; /* Set MD5 password to system SA/SP database */
u8 local_role; /* Set peering role with neighbor [RFC 9234] */
int require_roles; /* Require configured roles on both sides */
/* Times below are in seconds */
unsigned gr_time; /* Graceful restart timeout */
unsigned llgr_time; /* Long-lived graceful restart stale time */
@ -166,6 +168,13 @@ struct bgp_channel_config {
#define BGP_PT_INTERNAL 1
#define BGP_PT_EXTERNAL 2
#define BGP_ROLE_UNDEFINED 255
#define BGP_ROLE_PROVIDER 0
#define BGP_ROLE_RS_SERVER 1
#define BGP_ROLE_RS_CLIENT 2
#define BGP_ROLE_CUSTOMER 3
#define BGP_ROLE_PEER 4
#define NH_NO 0
#define NH_ALL 1
#define NH_IBGP 2
@ -226,6 +235,7 @@ struct bgp_caps {
u8 ext_messages; /* Extended message length, RFC draft */
u8 route_refresh; /* Route refresh capability, RFC 2918 */
u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */
u8 role; /* BGP role capability, RFC 9234 */
u8 gr_aware; /* Graceful restart capability, RFC 4724 */
u8 gr_flags; /* Graceful restart flags */
@ -487,6 +497,12 @@ static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c)
static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c)
{ return BGP_AFI(c->afi) == BGP_AFI_IPV6; }
static inline int bgp_channel_is_role_applicable(struct bgp_channel *c)
{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); }
static inline int bgp_cc_is_role_applicable(struct bgp_channel_config *c)
{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); }
static inline uint bgp_max_packet_length(struct bgp_conn *conn)
{ return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; }
@ -658,6 +674,7 @@ void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to);
#define BA_AS4_AGGREGATOR 0x12 /* RFC 6793 */
#define BA_AIGP 0x1a /* RFC 7311 */
#define BA_LARGE_COMMUNITY 0x20 /* RFC 8092 */
#define BA_ONLY_TO_CUSTOMER 0x23 /* RFC 9234 */
/* Bird's private internal BGP attributes */
#define BA_MPLS_LABEL_STACK 0xfe /* MPLS label stack transfer attribute */

View File

@ -31,7 +31,8 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE,
STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG,
LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS,
DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE,
FIRST, FREE, VALIDATE, BASE)
FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER,
RS_SERVER, RS_CLIENT, REQUIRE, BGP_OTC)
%type <i> bgp_nh
%type <i32> bgp_afi
@ -40,7 +41,7 @@ CF_KEYWORDS(CEASE, PREFIX, LIMIT, HIT, ADMINISTRATIVE, SHUTDOWN, RESET, PEER,
CONFIGURATION, CHANGE, DECONFIGURED, CONNECTION, REJECTED, COLLISION,
OUT, OF, RESOURCES)
%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag
%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag bgp_role_name
CF_GRAMMAR
@ -72,6 +73,7 @@ bgp_proto_start: proto_start BGP {
BGP_CFG->llgr_mode = -1;
BGP_CFG->llgr_time = 3600;
BGP_CFG->setkey = 1;
BGP_CFG->local_role = BGP_ROLE_UNDEFINED;
BGP_CFG->dynamic_name = "dynbgp";
BGP_CFG->check_link = -1;
}
@ -114,6 +116,14 @@ bgp_cease_flag:
| OUT OF RESOURCES { $$ = 1 << 8; }
;
bgp_role_name:
PEER { $$ = BGP_ROLE_PEER; }
| PROVIDER { $$ = BGP_ROLE_PROVIDER; }
| CUSTOMER { $$ = BGP_ROLE_CUSTOMER; }
| RS_SERVER { $$ = BGP_ROLE_RS_SERVER; }
| RS_CLIENT { $$ = BGP_ROLE_RS_CLIENT; }
;
bgp_proto:
bgp_proto_start proto_name '{'
| bgp_proto proto_item ';'
@ -197,6 +207,8 @@ bgp_proto:
| bgp_proto BFD GRACEFUL ';' { init_bfd_opts(&BGP_CFG->bfd); BGP_CFG->bfd->mode = BGP_BFD_GRACEFUL; }
| bgp_proto BFD { open_bfd_opts(&BGP_CFG->bfd); } bfd_opts { close_bfd_opts(); } ';'
| bgp_proto ENFORCE FIRST AS bool ';' { BGP_CFG->enforce_first_as = $5; }
| bgp_proto LOCAL ROLE bgp_role_name ';' { BGP_CFG->local_role = $4; }
| bgp_proto REQUIRE ROLES bool ';' { BGP_CFG->require_roles = $4; }
;
bgp_afi:
@ -343,6 +355,8 @@ dynamic_attr: BGP_AIGP
{ $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_AIGP)); } ;
dynamic_attr: BGP_LARGE_COMMUNITY
{ $$ = f_new_dynamic_attr(EAF_TYPE_LC_SET, T_LCLIST, EA_CODE(PROTOCOL_BGP, BA_LARGE_COMMUNITY)); } ;
dynamic_attr: BGP_OTC
{ $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_ONLY_TO_CUSTOMER)); } ;

View File

@ -238,6 +238,7 @@ bgp_prepare_capabilities(struct bgp_conn *conn)
caps->ext_messages = p->cf->enable_extended_messages;
caps->route_refresh = p->cf->enable_refresh;
caps->enhanced_refresh = p->cf->enable_refresh;
caps->role = p->cf->local_role;
if (caps->as4_support)
caps->as4_number = p->public_as;
@ -350,6 +351,13 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
*buf++ = 0; /* Capability data length */
}
if (caps->role != BGP_ROLE_UNDEFINED)
{
*buf++ = 9; /* Capability 9: Announce chosen BGP role */
*buf++ = 1; /* Capability data length */
*buf++ = caps->role;
}
if (caps->gr_aware)
{
*buf++ = 64; /* Capability 64: Support for graceful restart */
@ -449,11 +457,15 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len)
struct bgp_proto *p = conn->bgp;
struct bgp_caps *caps;
struct bgp_af_caps *ac;
uint err_subcode = 0;
int i, cl;
u32 af;
if (!conn->remote_caps)
{
caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + sizeof(struct bgp_af_caps));
caps->role = BGP_ROLE_UNDEFINED;
}
else
{
caps = conn->remote_caps;
@ -513,6 +525,21 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len)
caps->ext_messages = 1;
break;
case 9: /* BGP role capability, RFC 9234 */
if (cl != 1)
goto err;
/* Reserved value */
if (pos[2] == BGP_ROLE_UNDEFINED)
{ err_subcode = 11; goto err; }
/* Multiple inconsistent values */
if ((caps->role != BGP_ROLE_UNDEFINED) && (caps->role != pos[2]))
{ err_subcode = 11; goto err; }
caps->role = pos[2];
break;
case 64: /* Graceful restart capability, RFC 4724 */
if (cl % 4 != 2)
goto err;
@ -638,7 +665,7 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len)
err:
mb_free(caps);
bgp_error(conn, 2, 0, NULL, 0);
bgp_error(conn, 2, err_subcode, NULL, 0);
return -1;
}
@ -854,6 +881,22 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
conn->received_as = asn;
}
/* RFC 9234 4.2 - check role agreement */
u8 local_role = p->cf->local_role;
u8 neigh_role = caps->role;
if ((local_role != BGP_ROLE_UNDEFINED) &&
(neigh_role != BGP_ROLE_UNDEFINED) &&
!((local_role == BGP_ROLE_PEER && neigh_role == BGP_ROLE_PEER) ||
(local_role == BGP_ROLE_CUSTOMER && neigh_role == BGP_ROLE_PROVIDER) ||
(local_role == BGP_ROLE_PROVIDER && neigh_role == BGP_ROLE_CUSTOMER) ||
(local_role == BGP_ROLE_RS_CLIENT && neigh_role == BGP_ROLE_RS_SERVER) ||
(local_role == BGP_ROLE_RS_SERVER && neigh_role == BGP_ROLE_RS_CLIENT)))
{ bgp_error(conn, 2, 11, NULL, 0); return; }
if ((p->cf->require_roles) && (neigh_role == BGP_ROLE_UNDEFINED))
{ bgp_error(conn, 2, 11, NULL, 0); return; }
/* Check the other connection */
other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
switch (other->state)
@ -2985,6 +3028,7 @@ static struct {
{ 2, 6, "Unacceptable hold time" },
{ 2, 7, "Required capability missing" }, /* [RFC5492] */
{ 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
{ 2,11, "Role mismatch" }, /* From Open Policy, RFC 9234 */
{ 3, 0, "Invalid UPDATE message" },
{ 3, 1, "Malformed attribute list" },
{ 3, 2, "Unrecognized well-known attribute" },

View File

@ -4,7 +4,6 @@ Available configuration variables:
CONFIG_AUTO_ROUTES Device routes are added automagically by the kernel
CONFIG_SELF_CONSCIOUS We're able to recognize whether route was installed by us
CONFIG_MULTIPLE_TABLES The kernel supports multiple routing tables
CONFIG_ALL_TABLES_AT_ONCE Kernel scanner wants to process all tables at once
CONFIG_SINGLE_ROUTE There is only one route per network
CONFIG_MC_PROPER_SRC Multicast packets have source address according to socket saddr field

View File

@ -9,7 +9,6 @@
#define CONFIG_AUTO_ROUTES
#define CONFIG_SELF_CONSCIOUS
#define CONFIG_MULTIPLE_TABLES
#define CONFIG_ALL_TABLES_AT_ONCE
#define CONFIG_IP6_SADR_KERNEL
#define CONFIG_MC_PROPER_SRC

View File

@ -74,51 +74,16 @@
#endif
#define krt_ipv4(p) ((p)->af == AF_INET)
#define krt_ecmp6(p) ((p)->af == AF_INET6)
const int rt_default_ecmp = 16;
/*
* Structure nl_parse_state keeps state of received route processing. Ideally,
* we could just independently parse received Netlink messages and immediately
* propagate received routes to the rest of BIRD, but older Linux kernel (before
* version 4.11) represents and announces IPv6 ECMP routes not as one route with
* multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of
* routes with the same prefix. More recent kernels work as with IPv4.
*
* Therefore, BIRD keeps currently processed route in nl_parse_state structure
* and postpones its propagation until we expect it to be final; i.e., when
* non-matching route is received or when the scan ends. When another matching
* route is received, it is merged with the already processed route to form an
* ECMP route. Note that merging is done only for IPv6 (merge == 1), but the
* postponing is done in both cases (for simplicity). All IPv4 routes or IPv6
* routes with RTA_MULTIPATH set are just considered non-matching.
*
* This is ignored for asynchronous notifications (every notification is handled
* as a separate route). It is not an issue for our routes, as we ignore such
* notifications anyways. But importing alien IPv6 ECMP routes does not work
* properly with older kernels.
*
* Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes
* for the same prefix.
*/
struct nl_parse_state
{
struct krt_proto *proto;
struct linpool *pool;
int scan;
int merge;
net *net;
rta *attrs;
struct krt_proto *proto;
s8 new;
s8 krt_src;
u8 krt_type;
u8 krt_proto;
u32 krt_metric;
u32 rta_flow; /* Used during parsing */
u32 rta_flow;
};
/*
@ -161,16 +126,13 @@ nl_open_sock(struct nl_sock *nl)
}
}
static void
static int
nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED)
{
/*
* Strict checking is not necessary, it improves behavior on newer kernels.
* If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT
* run-time), we can just ignore it.
*/
#ifdef SOL_NETLINK
setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict));
return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict));
#else
return -1;
#endif
}
@ -198,10 +160,17 @@ nl_cfg_rx_buffer_size(struct config *cfg)
static void
nl_open(void)
{
if ((nl_scan.fd >= 0) && (nl_req.fd >= 0))
return;
nl_open_sock(&nl_scan);
nl_open_sock(&nl_req);
nl_set_strict_dump(&nl_scan, 1);
if (nl_set_strict_dump(&nl_scan, 1) < 0)
{
log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once");
krt_use_shared_scan();
}
}
static void
@ -256,11 +225,13 @@ nl_request_dump_addr(int af)
}
static void
nl_request_dump_route(int af)
nl_request_dump_route(int af, int table_id)
{
struct {
struct nlmsghdr nh;
struct rtmsg rtm;
struct rtattr rta;
u32 table_id;
} req = {
.nh.nlmsg_type = RTM_GETROUTE,
.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
@ -269,7 +240,17 @@ nl_request_dump_route(int af)
.rtm.rtm_family = af,
};
send(nl_scan.fd, &req, sizeof(req), 0);
if (table_id < 256)
req.rtm.rtm_table = table_id;
else
{
req.rta.rta_type = RTA_TABLE;
req.rta.rta_len = RTA_LENGTH(4);
req.table_id = table_id;
req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len;
}
send(nl_scan.fd, &req, req.nh.nlmsg_len, 0);
nl_scan.last_hdr = NULL;
}
@ -1325,7 +1306,7 @@ nh_bufsize(struct nexthop *nh)
}
static int
nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
nl_send_route(struct krt_proto *p, rte *e, int op)
{
eattr *ea;
net *net = e->net;
@ -1407,15 +1388,17 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
/* For route delete, we do not specify remaining route attributes */
if (op == NL_OP_DELETE)
goto dest;
goto done;
/* Default scope is LINK for device routes, UNIVERSE otherwise */
if (p->af == AF_MPLS)
r->r.rtm_scope = RT_SCOPE_UNIVERSE;
else if (ea = ea_find(eattrs, EA_KRT_SCOPE))
r->r.rtm_scope = ea->u.data;
else if (a->dest == RTD_UNICAST && ipa_zero(a->nh.gw))
r->r.rtm_scope = RT_SCOPE_LINK;
else
r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
r->r.rtm_scope = RT_SCOPE_UNIVERSE;
if (ea = ea_find(eattrs, EA_KRT_PREFSRC))
nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data);
@ -1438,13 +1421,12 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
if (metrics[0])
nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX);
dest:
switch (dest)
switch (a->dest)
{
case RTD_UNICAST:
r->r.rtm_type = RTN_UNICAST;
if (nh->next && !krt_ecmp6(p))
struct nexthop *nh = &(a->nh);
if (nh->next)
nl_add_multipath(&r->h, rsize, nh, p->af, eattrs);
else
{
@ -1470,82 +1452,53 @@ dest:
bug("krt_capable inconsistent with nl_send_route");
}
done:
/* Ignore missing for DELETE */
return nl_exchange(&r->h, (op == NL_OP_DELETE));
}
static inline int
nl_add_rte(struct krt_proto *p, rte *e)
nl_allow_replace(struct krt_proto *p, rte *new)
{
rta *a = e->attrs;
int err = 0;
/*
* We use NL_OP_REPLACE for IPv4, it has an issue with not checking for
* matching rtm_protocol, but that is OK when dedicated priority is used.
*
* For IPv6, the NL_OP_REPLACE is still broken even in Linux 4.19 LTS
* (although it seems to be fixed in Linux 5.10 LTS) for sequence:
*
* ip route add 2001:db8::/32 via fe80::1 dev eth0
* ip route replace 2001:db8::/32 dev eth0
*
* (it ends with two routes instead of replacing the first by the second one)
*
* Replacing with direct and special type (e.g. unreachable) routes does not
* work, but replacing with regular routes work reliably
*/
if (krt_ecmp6(p) && a->nh.next)
{
struct nexthop *nh = &(a->nh);
if (krt_ipv4(p))
return 1;
err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, nh);
if (err < 0)
return err;
for (nh = nh->next; nh; nh = nh->next)
err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, nh);
return err;
}
return nl_send_route(p, e, NL_OP_ADD, a->dest, &(a->nh));
rta *a = new->attrs;
return (a->dest == RTD_UNICAST) && ipa_nonzero(a->nh.gw);
}
static inline int
nl_delete_rte(struct krt_proto *p, rte *e)
{
int err = 0;
/* For IPv6, we just repeatedly request DELETE until we get error */
do
err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL);
while (krt_ecmp6(p) && !err);
return err;
}
static inline int
nl_replace_rte(struct krt_proto *p, rte *e)
{
rta *a = e->attrs;
return nl_send_route(p, e, NL_OP_REPLACE, a->dest, &(a->nh));
}
void
krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old)
{
int err = 0;
/*
* We use NL_OP_REPLACE for IPv4, it has an issue with not checking for
* matching rtm_protocol, but that is OK when dedicated priority is used.
*
* We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP
* and with some kernel versions ECMP replace crashes kernel. Would need more
* testing and checks for kernel versions.
*
* For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the
* old route value, so we do not try to optimize IPv6 ECMP reconfigurations.
*/
if (krt_ipv4(p) && old && new)
if (old && new && nl_allow_replace(p, new))
{
err = nl_replace_rte(p, new);
err = nl_send_route(p, new, NL_OP_REPLACE);
}
else
{
if (old)
nl_delete_rte(p, old);
nl_send_route(p, old, NL_OP_DELETE);
if (new)
err = nl_add_rte(p, new);
err = nl_send_route(p, new, NL_OP_ADD);
}
if (new)
@ -1557,71 +1510,6 @@ krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old)
}
}
static int
nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family)
{
/* Route merging is used for IPv6 scans */
if (!s->scan || (rtm_family != AF_INET6))
return 0;
/* Saved and new route must have same network, proto/table, and priority */
if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority))
return 0;
/* Both must be regular unicast routes */
if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST))
return 0;
return 1;
}
static void
nl_announce_route(struct nl_parse_state *s)
{
rte *e = rte_get_temp(s->attrs, s->proto->p.main_source);
e->net = s->net;
ea_list *ea = alloca(sizeof(ea_list) + 2 * sizeof(eattr));
*ea = (ea_list) { .count = 2, .next = e->attrs->eattrs };
e->attrs->eattrs = ea;
ea->attrs[0] = (eattr) {
.id = EA_KRT_SOURCE,
.type = EAF_TYPE_INT,
.u.data = s->krt_proto,
};
ea->attrs[1] = (eattr) {
.id = EA_KRT_METRIC,
.type = EAF_TYPE_INT,
.u.data = s->krt_metric,
};
if (s->scan)
krt_got_route(s->proto, e, s->krt_src);
else
krt_got_route_async(s->proto, e, s->new, s->krt_src);
s->net = NULL;
s->attrs = NULL;
s->proto = NULL;
lp_flush(s->pool);
}
static inline void
nl_parse_begin(struct nl_parse_state *s, int scan)
{
memset(s, 0, sizeof (struct nl_parse_state));
s->pool = nl_linpool;
s->scan = scan;
}
static inline void
nl_parse_end(struct nl_parse_state *s)
{
if (s->net)
nl_announce_route(s);
}
#define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0)
#define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0)
@ -1759,13 +1647,29 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
net *net = net_get(p->p.main_channel->table, n);
if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family))
nl_announce_route(s);
rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
ra->source = RTS_INHERIT;
ra->scope = SCOPE_UNIVERSE;
{
ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + 2 * sizeof(eattr));
*ea = (ea_list) { .flags = EALF_SORTED, .count = 2 };
ea->next = ra->eattrs;
ra->eattrs = ea;
ea->attrs[0] = (eattr) {
.id = EA_KRT_SOURCE,
.type = EAF_TYPE_INT,
.u.data = i->rtm_protocol
};
ea->attrs[1] = (eattr) {
.id = EA_KRT_METRIC,
.type = EAF_TYPE_INT,
.u.data = priority,
};
}
if (a[RTA_FLOW])
s->rta_flow = rta_get_u32(a[RTA_FLOW]);
else
@ -1942,60 +1846,40 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
}
}
/*
* Ideally, now we would send the received route to the rest of kernel code.
* But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we
* postpone it and merge next hops until the end of the sequence. Note that
* when doing merging of next hops, we expect the new route to be unipath.
* Otherwise, we ignore additional next hops in nexthop_insert().
*/
rte *e = rte_get_temp(ra, p->p.main_source);
e->net = net;
if (!s->net)
{
/* Store the new route */
s->net = net;
s->attrs = ra;
s->proto = p;
s->new = new;
s->krt_src = krt_src;
s->krt_type = i->rtm_type;
s->krt_proto = i->rtm_protocol;
s->krt_metric = priority;
}
if (s->scan)
krt_got_route(p, e, krt_src);
else
{
/* Merge next hops with the stored route */
rta *oa = s->attrs;
krt_got_route_async(p, e, new, krt_src);
struct nexthop *nhs = &oa->nh;
nexthop_insert(&nhs, &ra->nh);
/* Perhaps new nexthop is inserted at the first position */
if (nhs == &ra->nh)
{
/* Swap rtas */
s->attrs = ra;
/* Keep old eattrs */
ra->eattrs = oa->eattrs;
}
}
lp_flush(s->pool);
}
void
krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
krt_do_scan(struct krt_proto *p)
{
struct nlmsghdr *h;
struct nl_parse_state s;
struct nl_parse_state s = {
.proto = p,
.pool = nl_linpool,
.scan = 1,
};
nl_parse_begin(&s, 1);
nl_request_dump_route(AF_UNSPEC);
/* Table-specific scan or shared scan */
if (p)
nl_request_dump_route(p->af, krt_table_id(p));
else
nl_request_dump_route(AF_UNSPEC, 0);
struct nlmsghdr *h;
while (h = nl_get_scan())
{
if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
nl_parse_route(&s, h);
else
log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
nl_parse_end(&s);
}
}
/*
@ -2010,16 +1894,18 @@ static struct config *nl_last_config; /* For tracking changes to nl_async_bufsiz
static void
nl_async_msg(struct nlmsghdr *h)
{
struct nl_parse_state s;
struct nl_parse_state s = {
.proto = NULL,
.pool = nl_linpool,
.scan = 0,
};
switch (h->nlmsg_type)
{
case RTM_NEWROUTE:
case RTM_DELROUTE:
DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
nl_parse_begin(&s, 0);
nl_parse_route(&s, h);
nl_parse_end(&s);
break;
case RTM_NEWLINK:
case RTM_DELLINK:

View File

@ -785,18 +785,17 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src)
rte_free(e);
}
/*
* Periodic scanning
*/
#ifdef CONFIG_ALL_TABLES_AT_ONCE
static timer *krt_scan_timer;
static int krt_scan_count;
static timer *krt_scan_all_timer;
static int krt_scan_all_count;
static _Bool krt_scan_all_tables;
static void
krt_scan(timer *t UNUSED)
krt_scan_all(timer *t UNUSED)
{
struct krt_proto *p;
node *n;
@ -817,35 +816,42 @@ krt_scan(timer *t UNUSED)
}
static void
krt_scan_timer_start(struct krt_proto *p)
krt_scan_all_timer_start(struct krt_proto *p)
{
if (!krt_scan_count)
krt_scan_timer = tm_new_init(krt_pool, krt_scan, NULL, KRT_CF->scan_time, 0);
if (!krt_scan_all_count)
krt_scan_all_timer = tm_new_init(krt_pool, krt_scan_all, NULL, KRT_CF->scan_time, 0);
krt_scan_count++;
krt_scan_all_count++;
tm_start(krt_scan_timer, 1 S);
tm_start(krt_scan_all_timer, 1 S);
}
static void
krt_scan_timer_stop(struct krt_proto *p UNUSED)
krt_scan_all_timer_stop(void)
{
krt_scan_count--;
ASSERT(krt_scan_all_count > 0);
if (!krt_scan_count)
krt_scan_all_count--;
if (!krt_scan_all_count)
{
rfree(krt_scan_timer);
krt_scan_timer = NULL;
rfree(krt_scan_all_timer);
krt_scan_all_timer = NULL;
}
}
static void
krt_scan_timer_kick(struct krt_proto *p UNUSED)
krt_scan_all_timer_kick(void)
{
tm_start(krt_scan_timer, 0);
tm_start(krt_scan_all_timer, 0);
}
void
krt_use_shared_scan(void)
{
krt_scan_all_tables = 1;
}
#else
static void
krt_scan(timer *t)
@ -863,26 +869,33 @@ krt_scan(timer *t)
static void
krt_scan_timer_start(struct krt_proto *p)
{
p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0);
tm_start(p->scan_timer, 1 S);
if (krt_scan_all_tables)
krt_scan_all_timer_start(p);
else
{
p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0);
tm_start(p->scan_timer, 1 S);
}
}
static void
krt_scan_timer_stop(struct krt_proto *p)
{
tm_stop(p->scan_timer);
if (krt_scan_all_tables)
krt_scan_all_timer_stop();
else
tm_stop(p->scan_timer);
}
static void
krt_scan_timer_kick(struct krt_proto *p)
{
tm_start(p->scan_timer, 0);
if (krt_scan_all_tables)
krt_scan_all_timer_kick();
else
tm_start(p->scan_timer, 0);
}
#endif
/*
* Updates
@ -992,11 +1005,6 @@ krt_postconfig(struct proto_config *CF)
if (! proto_cf_main_channel(CF))
cf_error("Channel not specified");
#ifdef CONFIG_ALL_TABLES_AT_ONCE
if (krt_cf->scan_time != cf->scan_time)
cf_error("All kernel syncers must use the same table scan interval");
#endif
struct channel_config *cc = proto_cf_main_channel(CF);
struct rtable_config *tab = cc->table;
if (tab->krt_attached)

View File

@ -55,10 +55,7 @@ struct krt_proto {
struct rtable *krt_table; /* Internal table of inherited routes */
#endif
#ifndef CONFIG_ALL_TABLES_AT_ONCE
timer *scan_timer;
#endif
struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */
struct bmap seen_map; /* Routes seen during last periodic scan */
node krt_node; /* Node in krt_proto_list */
@ -79,6 +76,7 @@ extern pool *krt_pool;
struct proto_config * kif_init_config(int class);
void kif_request_scan(void);
void krt_use_shared_scan(void);
void krt_got_route(struct krt_proto *p, struct rte *e, s8 src);
void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src);