From 52e21323b6c49af9d076586241451973a7d1e7c6 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 25 Nov 2015 15:52:58 +0100 Subject: [PATCH 01/21] BGP: Update capability number from IANA for extended messages --- proto/bgp/packets.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index ed99f623..72ca3728 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -163,6 +163,14 @@ bgp_put_cap_rr(struct bgp_proto *p UNUSED, byte *buf) return buf; } +static byte * +bgp_put_cap_ext_msg(struct bgp_proto *p UNUSED, byte *buf) +{ + *buf++ = 6; /* Capability 6: Support for extended messages */ + *buf++ = 0; /* Capability data length */ + return buf; +} + static byte * bgp_put_cap_gr1(struct bgp_proto *p, byte *buf) { @@ -223,14 +231,6 @@ bgp_put_cap_err(struct bgp_proto *p UNUSED, byte *buf) return buf; } -static byte * -bgp_put_cap_ext_msg(struct bgp_proto *p UNUSED, byte *buf) -{ - *buf++ = 230; /* Capability TBD: Support for extended messages */ - *buf++ = 0; /* Capability data length */ - return buf; -} - static byte * bgp_create_open(struct bgp_conn *conn, byte *buf) @@ -827,6 +827,12 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) conn->peer_refresh_support = 1; break; + case 6: /* Extended message length capability, draft */ + if (cl != 0) + goto err; + conn->peer_ext_messages_support = 1; + break; + case 64: /* Graceful restart capability, RFC 4724 */ if (cl % 4 != 2) goto err; @@ -867,12 +873,6 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) conn->peer_enhanced_refresh_support = 1; break; - case 230: /* Extended message length capability, draft, cap number TBD */ - if (cl != 0) - goto err; - conn->peer_ext_messages_support = 1; - break; - /* We can safely ignore all other capabilities */ } len -= 2 + cl; From 487c6961cb29046dbe9560262e3e742e38691b83 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Thu, 11 Feb 2016 16:38:28 +0100 Subject: [PATCH 02/21] BGP: Fix bug in incoming connection handling When a BGP session was established by an outgoing connection with Graceful Restart behavior negotiated, a pending incoming connection in OpenSent state, and another incoming connection was received, then the outgoing connection (and whole BGP session) was closed, but the old incoming connection was just overwritten by the new one. That later caused a crash when the hold timer from the old connection fired. --- proto/bgp/bgp.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index f549b0ed..11f33b14 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -813,7 +813,13 @@ bgp_incoming_connection(sock *sk, int dummy UNUSED) return 0; } - /* We are in proper state and there is no other incoming connection */ + /* + * BIRD should keep multiple incoming connections in OpenSent state (for + * details RFC 4271 8.2.1 par 3), but it keeps just one. Duplicate incoming + * connections are rejected istead. The exception is the case where an + * incoming connection triggers a graceful restart. + */ + acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) && (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk); @@ -823,6 +829,10 @@ bgp_incoming_connection(sock *sk, int dummy UNUSED) bgp_handle_graceful_restart(p); bgp_conn_enter_idle_state(p->conn); acc = 1; + + /* There might be separate incoming connection in OpenSent state */ + if (p->incoming_conn.state > BS_ACTIVE) + bgp_close_conn(&p->incoming_conn); } BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s", From c2106b674ca632f7c0bffd7cab4b1940f74d353c Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Thu, 11 Feb 2016 21:53:55 +0100 Subject: [PATCH 03/21] Unix: Fix bug in syslog name handling Pointer to current_log_name has to be changed even if the name is the same, because the old one will be invalid/freed after reconfiguration. --- sysdep/unix/log.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c index 7cb26360..b90bbbd2 100644 --- a/sysdep/unix/log.c +++ b/sysdep/unix/log.c @@ -284,17 +284,18 @@ log_switch(int debug, list *l, char *new_syslog_name) current_log_list = l; #ifdef HAVE_SYSLOG - if (current_syslog_name && new_syslog_name && - !strcmp(current_syslog_name, new_syslog_name)) + char *old_syslog_name = current_syslog_name; + current_syslog_name = new_syslog_name; + + if (old_syslog_name && new_syslog_name && + !strcmp(old_syslog_name, new_syslog_name)) return; - if (current_syslog_name) + if (old_syslog_name) closelog(); if (new_syslog_name) openlog(new_syslog_name, LOG_CONS | LOG_NDELAY, LOG_DAEMON); - - current_syslog_name = new_syslog_name; #endif } From 9c9cc35c0273f8bcae10fb8b546d199514b2bbc5 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Tue, 16 Feb 2016 17:33:58 +0100 Subject: [PATCH 04/21] Filter: Implement last_nonaggregated operator on bgp_path --- doc/bird.sgml | 5 ++++- filter/config.Y | 3 ++- filter/filter.c | 8 ++++++++ nest/a-path.c | 31 ++++++++++++++++++++++++++++++- nest/attrs.h | 1 + 5 files changed, 45 insertions(+), 3 deletions(-) diff --git a/doc/bird.sgml b/doc/bird.sgml index 86df0456..c5316d87 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1119,9 +1119,12 @@ foot). returns the last ASN (the source ASN) in path returns the last ASN in the non-aggregated part of the path returns the length of path code = P('i','M'); $$->a1.p = $1; $$->a2.p = $5; } | term '.' FIRST { $$ = f_new_inst(); $$->code = P('a','f'); $$->a1.p = $1; } | term '.' LAST { $$ = f_new_inst(); $$->code = P('a','l'); $$->a1.p = $1; } + | term '.' LAST_NONAGGREGATED { $$ = f_new_inst(); $$->code = P('a','L'); $$->a1.p = $1; } /* Communities */ /* This causes one shift/reduce conflict diff --git a/filter/filter.c b/filter/filter.c index 55062aca..eddf4228 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -1091,6 +1091,14 @@ interpret(struct f_inst *what) res.type = T_INT; res.val.i = as; break; + case P('a','L'): /* Get last ASN from non-aggregated part of AS PATH */ + ONEARG; + if (v1.type != T_PATH) + runtime( "AS path expected" ); + + res.type = T_INT; + res.val.i = as_path_get_last_nonaggregated(v1.val.ad); + break; case 'r': ONEARG; res = v1; diff --git a/nest/a-path.c b/nest/a-path.c index c9c5aefb..32e2d27e 100644 --- a/nest/a-path.c +++ b/nest/a-path.c @@ -220,7 +220,7 @@ as_path_get_last(struct adata *path, u32 *orig_as) p += BS * len; } break; - default: bug("as_path_get_first: Invalid path segment"); + default: bug("Invalid path segment"); } } @@ -229,6 +229,35 @@ as_path_get_last(struct adata *path, u32 *orig_as) return found; } +u32 +as_path_get_last_nonaggregated(struct adata *path) +{ + u8 *p = path->data; + u8 *q = p+path->length; + u32 res = 0; + int len; + + while (p Date: Wed, 20 Jan 2016 15:23:17 +0100 Subject: [PATCH 05/21] All the current pthread implementations are OK and working with us. No more need to disable pthread for specific BSD's. --- configure.in | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/configure.in b/configure.in index c81709e6..b9220a1d 100644 --- a/configure.in +++ b/configure.in @@ -85,12 +85,12 @@ if test -z "$GCC" ; then fi # Enable threads by default just in Linux and FreeBSD -if test "$enable_pthreads" = try ; then - case "$host_os" in - (linux* | freebsd*) enable_pthreads=try ;; - (*) enable_pthreads=no ;; - esac -fi +#if test "$enable_pthreads" = try ; then +# case "$host_os" in +# (linux* | freebsd* | openbsd* | netbsd* ) enable_pthreads=try ;; +# (*) enable_pthreads=no ;; +# esac +#fi if test "$enable_pthreads" != no ; then BIRD_CHECK_PTHREADS From e3f506f9b53bd8e44976df1c935c7ec417793ace Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Thu, 25 Feb 2016 18:16:59 +0100 Subject: [PATCH 06/21] OSPF: Multicast ability is irrelevant for stub interfaces --- proto/ospf/iface.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/proto/ospf/iface.c b/proto/ospf/iface.c index 77ce839a..67ae094d 100644 --- a/proto/ospf/iface.c +++ b/proto/ospf/iface.c @@ -599,10 +599,10 @@ ospf_iface_new(struct ospf_area *oa, struct ifa *addr, struct ospf_iface_patt *i if (ospf_is_v2(p) && (ifa->type == OSPF_IT_NBMA) && (addr->flags & IA_PEER)) ifa->type = OSPF_IT_PTMP; - if ((ifa->type == OSPF_IT_BCAST) && !(iface->flags & if_multi_flag)) + if ((ifa->type == OSPF_IT_BCAST) && !(iface->flags & if_multi_flag) && !ifa->stub) ifa->type = OSPF_IT_NBMA; - if ((ifa->type == OSPF_IT_PTP) && !(iface->flags & if_multi_flag)) + if ((ifa->type == OSPF_IT_PTP) && !(iface->flags & if_multi_flag) && !ifa->stub) ifa->type = OSPF_IT_PTMP; if (ifa->type != old_type) From e1c13a5a7b86f2ba09178300bad960224658c833 Mon Sep 17 00:00:00 2001 From: Jan Moskyto Matejka Date: Wed, 9 Mar 2016 12:12:02 +0100 Subject: [PATCH 07/21] Unix: Rework of select-loop to poll-loop This should lift the limit of FD_SETSIZE and allow more than 1024 fd's. FD_SETSIZE limit doesn't matter now when creating new sockets. --- sysdep/unix/io.c | 101 ++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index b636e799..bc212de1 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -41,12 +42,12 @@ #include "lib/sysio.h" /* Maximum number of calls of tx handler for one socket in one - * select iteration. Should be small enough to not monopolize CPU by + * poll iteration. Should be small enough to not monopolize CPU by * one protocol instance. */ #define MAX_STEPS 4 -/* Maximum number of calls of rx handler for all sockets in one select +/* Maximum number of calls of rx handler for all sockets in one poll iteration. RX callbacks are often much more costly so we limit this to gen small latencies */ #define MAX_RX_STEPS 4 @@ -1022,7 +1023,6 @@ sk_log_error(sock *s, const char *p) static list sock_list; static struct birdsock *current_sock; static struct birdsock *stored_sock; -static int sock_recalc_fdsets_p; static inline sock * sk_next(sock *s) @@ -1078,7 +1078,6 @@ sk_free(resource *r) if (s == stored_sock) stored_sock = sk_next(s); rem_node(&s->n); - sock_recalc_fdsets_p = 1; } } @@ -1276,7 +1275,6 @@ static void sk_insert(sock *s) { add_tail(&sock_list, &s->n); - sock_recalc_fdsets_p = 1; } static void @@ -1328,18 +1326,6 @@ sk_passive_connected(sock *s, int type) log(L_WARN "SOCK: Cannot get remote IP address for TCP<"); } - if (fd >= FD_SETSIZE) - { - /* FIXME: Call err_hook instead ? */ - log(L_ERR "SOCK: Incoming connection from %I%J (port %d) %s", - t->daddr, ipa_is_link_local(t->daddr) ? t->iface : NULL, - t->dport, "rejected due to FD_SETSIZE limit"); - close(fd); - t->fd = -1; - rfree(t); - return 1; - } - if (sk_setup(t) < 0) { /* FIXME: Call err_hook instead ? */ @@ -1416,9 +1402,6 @@ sk_open(sock *s) if (fd < 0) ERR("socket"); - if (fd >= FD_SETSIZE) - ERR2("FD_SETSIZE limit reached"); - s->af = af; s->fd = fd; @@ -2062,15 +2045,15 @@ static int short_loops = 0; void io_loop(void) { - fd_set rd, wr; - struct timeval timo; + int poll_tout; time_t tout; - int hi, events; + int nfds, events; sock *s; node *n; + int fdmax = 256; + struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd)); watchdog_start1(); - sock_recalc_fdsets_p = 1; for(;;) { events = ev_run_list(&global_event_list); @@ -2081,43 +2064,43 @@ io_loop(void) tm_shot(); continue; } - timo.tv_sec = events ? 0 : MIN(tout - now, 3); - timo.tv_usec = 0; + poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */ io_close_event(); - if (sock_recalc_fdsets_p) - { - sock_recalc_fdsets_p = 0; - FD_ZERO(&rd); - FD_ZERO(&wr); - } - - hi = 0; + nfds = 0; WALK_LIST(n, sock_list) { + pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */ s = SKIP_BACK(sock, n, n); if (s->rx_hook) { - FD_SET(s->fd, &rd); - if (s->fd > hi) - hi = s->fd; + pfd[nfds].fd = s->fd; + pfd[nfds].events |= POLLIN; } - else - FD_CLR(s->fd, &rd); if (s->tx_hook && s->ttx != s->tpos) { - FD_SET(s->fd, &wr); - if (s->fd > hi) - hi = s->fd; + pfd[nfds].fd = s->fd; + pfd[nfds].events |= POLLOUT; + } + if (pfd[nfds].fd != -1) + { + s->index = nfds; + nfds++; } else - FD_CLR(s->fd, &wr); + s->index = -1; + + if (nfds >= fdmax) + { + fdmax *= 2; + pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd)); + } } /* * Yes, this is racy. But even if the signal comes before this test - * and entering select(), it gets caught on the next timer tick. + * and entering poll(), it gets caught on the next timer tick. */ if (async_config_flag) @@ -2142,18 +2125,18 @@ io_loop(void) continue; } - /* And finally enter select() to find active sockets */ + /* And finally enter poll() to find active sockets */ watchdog_stop(); - hi = select(hi+1, &rd, &wr, NULL, &timo); + events = poll(pfd, nfds, poll_tout); watchdog_start(); - if (hi < 0) + if (events < 0) { if (errno == EINTR || errno == EAGAIN) continue; - die("select: %m"); + die("poll: %m"); } - if (hi) + if (events) { /* guaranteed to be non-empty */ current_sock = SKIP_BACK(sock, n, HEAD(sock_list)); @@ -2161,11 +2144,17 @@ io_loop(void) while (current_sock) { sock *s = current_sock; + if (s->index == -1) + { + current_sock = sk_next(s); + goto next; + } + int e; int steps; steps = MAX_STEPS; - if ((s->type >= SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook) + if ((s->type >= SK_MAGIC) && (pfd[s->index].revents & (POLLIN | POLLHUP | POLLERR)) && s->rx_hook) do { steps--; @@ -2177,7 +2166,7 @@ io_loop(void) while (e && s->rx_hook && steps); steps = MAX_STEPS; - if (FD_ISSET(s->fd, &wr)) + if (pfd[s->index].revents & POLLOUT) do { steps--; @@ -2204,13 +2193,17 @@ io_loop(void) while (current_sock && count < MAX_RX_STEPS) { sock *s = current_sock; - int e UNUSED; + if (s->index == -1) + { + current_sock = sk_next(s); + goto next2; + } - if ((s->type < SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook) + if ((s->type < SK_MAGIC) && (pfd[s->index].revents & (POLLIN | POLLHUP | POLLERR)) && s->rx_hook) { count++; io_log_event(s->rx_hook, s->data); - e = sk_read(s); + sk_read(s); if (s != current_sock) goto next2; } From fd926ed4eea319b94bd0e09e093b90846bcb169b Mon Sep 17 00:00:00 2001 From: Jan Moskyto Matejka Date: Tue, 15 Mar 2016 14:57:49 +0100 Subject: [PATCH 08/21] Poll: Prevent the improbable case of EAGAIN after POLLIN --- proto/bfd/io.c | 4 ++-- sysdep/unix/io.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/proto/bfd/io.c b/proto/bfd/io.c index fb150040..79ed9af7 100644 --- a/proto/bfd/io.c +++ b/proto/bfd/io.c @@ -576,7 +576,7 @@ sockets_close_fds(struct birdloop *loop) loop->close_scheduled = 0; } -int sk_read(sock *s); +int sk_read(sock *s, int revents); int sk_write(sock *s); static void @@ -605,7 +605,7 @@ sockets_fire(struct birdloop *loop) if (pfd->revents & POLLIN) while (e && *psk && (*psk)->rx_hook) - e = sk_read(*psk); + e = sk_read(*psk, 0); e = 1; if (pfd->revents & POLLOUT) diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index bc212de1..b769de58 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -1760,7 +1760,7 @@ sk_send_full(sock *s, unsigned len, struct iface *ifa, /* sk_read() and sk_write() are called from BFD's event loop */ int -sk_read(sock *s) +sk_read(sock *s, int revents) { switch (s->type) { @@ -1779,6 +1779,11 @@ sk_read(sock *s) { if (errno != EINTR && errno != EAGAIN) s->err_hook(s, errno); + else if (errno == EAGAIN && !(revents & POLLIN)) + { + log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents); + s->err_hook(s, 0); + } } else if (!c) s->err_hook(s, 0); @@ -2159,7 +2164,7 @@ io_loop(void) { steps--; io_log_event(s->rx_hook, s->data); - e = sk_read(s); + e = sk_read(s, pfd[s->index].revents); if (s != current_sock) goto next; } @@ -2203,7 +2208,7 @@ io_loop(void) { count++; io_log_event(s->rx_hook, s->data); - sk_read(s); + sk_read(s, pfd[s->index].revents); if (s != current_sock) goto next2; } From 79a4f74a65941603cc42680d2e61b00ec88abe97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Tvrd=C3=ADk?= Date: Tue, 15 Mar 2016 10:29:32 +0100 Subject: [PATCH 09/21] BGP: Add documentaion for extended messages --- doc/bird.sgml | 9 +++++++-- nest/config.Y | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/bird.sgml b/doc/bird.sgml index c5316d87..73061202 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1862,6 +1862,11 @@ using the following configuration parameters: in neighbor's implementation of 4B AS extension. Even when disabled (off), BIRD behaves internally as AS4-aware BGP router. Default: on. + enable extended messages + The BGP protocol uses maximum message length of 4096 bytes. This option + provides an extension to allow extended messages with length up + to 65535 bytes. Default: off. + capabilities Use capability advertisement to advertise optional capabilities. This is standard behavior for newer BGP implementations, but there might be some @@ -2057,7 +2062,7 @@ protocol bgp { multihop; # ... which is connected indirectly export filter { # We use non-trivial export rules if source = RTS_STATIC then { # Export only static routes - # Assign our community + # Assign our community bgp_community.add((65000,64501)); # Artificially increase path length # by advertising local AS number twice @@ -2266,7 +2271,7 @@ these attributes: ip (Linux) The preferred source address. Used in source address selection for - outgoing packets. Has to be one of the IP addresses of the router. + outgoing packets. Has to be one of the IP addresses of the router. int (Linux) The realm of the route. Can be used for traffic classification. diff --git a/nest/config.Y b/nest/config.Y index 799a09f9..87827c10 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -111,7 +111,7 @@ idval: $$ = ipa_to_u32(SYM_VAL($1).px.ip); #endif else - cf_error("Number of IPv4 address constant expected"); + cf_error("Number or IPv4 address constant expected"); } ; From 9036bbf2b7cc781c87f2a6b3979198f77ec6ada1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Tvrd=C3=ADk?= Date: Tue, 15 Mar 2016 14:55:40 +0100 Subject: [PATCH 10/21] RIP: fix typo in configuration at rx length opt --- proto/rip/config.Y | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proto/rip/config.Y b/proto/rip/config.Y index 29ea7eb1..083d2e91 100644 --- a/proto/rip/config.Y +++ b/proto/rip/config.Y @@ -131,7 +131,7 @@ rip_iface_item: | TIMEOUT TIME expr { RIP_IFACE->timeout_time = $3; if ($3<=0) cf_error("Timeout time must be positive"); } | GARBAGE TIME expr { RIP_IFACE->garbage_time = $3; if ($3<=0) cf_error("Garbage time must be positive"); } | ECMP WEIGHT expr { RIP_IFACE->ecmp_weight = $3 - 1; if (($3<1) || ($3>256)) cf_error("ECMP weight must be in range 1-256"); } - | RX BUFFER expr { RIP_IFACE->rx_buffer = $3; if (($3<256) || ($3>65535)) cf_error("TX length must be in range 256-65535"); } + | RX BUFFER expr { RIP_IFACE->rx_buffer = $3; if (($3<256) || ($3>65535)) cf_error("RX length must be in range 256-65535"); } | TX LENGTH expr { RIP_IFACE->tx_length = $3; if (($3<256) || ($3>65535)) cf_error("TX length must be in range 256-65535"); } | TX tos { RIP_IFACE->tx_tos = $2; } | TX PRIORITY expr { RIP_IFACE->tx_priority = $3; } From 9c92f69272de3795f7289969e815d99a93d0d9b3 Mon Sep 17 00:00:00 2001 From: Jan Moskyto Matejka Date: Fri, 18 Mar 2016 11:44:28 +0100 Subject: [PATCH 11/21] Unix: Substituted select -> poll also in congestion checker It does strange things when even one fd larger than FD_SETSIZE is passed to select(). --- sysdep/unix/io.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index b769de58..eb1c1cad 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -1679,19 +1679,12 @@ sk_maybe_write(sock *s) int sk_rx_ready(sock *s) { - fd_set rd, wr; - struct timeval timo; int rv; - - FD_ZERO(&rd); - FD_ZERO(&wr); - FD_SET(s->fd, &rd); - - timo.tv_sec = 0; - timo.tv_usec = 0; + struct pollfd pfd = { .fd = s->fd }; + pfd.events |= POLLIN; redo: - rv = select(s->fd+1, &rd, &wr, NULL, &timo); + rv = poll(&pfd, 1, 0); if ((rv < 0) && (errno == EINTR || errno == EAGAIN)) goto redo; From a459f4df16eca156cb7c6def4a758fd89c9fa504 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 9 Mar 2016 17:37:44 +0100 Subject: [PATCH 12/21] OSPF: Fix reading from freed memory Thanks to Pavel Tvrdik for noticing it. --- proto/ospf/neighbor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/proto/ospf/neighbor.c b/proto/ospf/neighbor.c index c5d44dec..0223ccdf 100644 --- a/proto/ospf/neighbor.c +++ b/proto/ospf/neighbor.c @@ -108,6 +108,7 @@ ospf_neigh_down(struct ospf_neighbor *n) { struct ospf_iface *ifa = n->ifa; struct ospf_proto *p = ifa->oa->po; + u32 rid = n->rid; if ((ifa->type == OSPF_IT_NBMA) || (ifa->type == OSPF_IT_PTMP)) { @@ -121,7 +122,7 @@ ospf_neigh_down(struct ospf_neighbor *n) rem_node(NODE n); rfree(n->pool); - OSPF_TRACE(D_EVENTS, "Neighbor %R on %s removed", n->rid, ifa->ifname); + OSPF_TRACE(D_EVENTS, "Neighbor %R on %s removed", rid, ifa->ifname); } /** From 0a505706bc909b2625d308ffb2eaed3dc8398538 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 9 Mar 2016 17:51:50 +0100 Subject: [PATCH 13/21] Minor changes in documentation --- doc/bird.sgml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/bird.sgml b/doc/bird.sgml index 73061202..5e5aeee4 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -318,8 +318,9 @@ protocol rip {

include " This statement causes inclusion of a new file. - On every multiple access network (e.g., the Ethernet) Designed Router - and Backup Designed router are elected. These routers have some special + On every multiple access network (e.g., the Ethernet) Designated Router + and Backup Designated router are elected. These routers have some special functions in the flooding process. Higher priority increases preferences in this election. Routers with priority 0 are not eligible. Default value is 1. From 39a6b19d6d7e420805bd75783b77bf442745bccb Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Tue, 22 Mar 2016 12:51:31 +0100 Subject: [PATCH 14/21] OSPF: Fix bogus LSA ID collisions between received and originated LSAs After restart, LSAs locally originated by the previous instance are received from neighbors. They are installed to LSA db and flushed. If export of a route triggers origination of a new external LSA before flush of the received one is complete, the check in ospf_originate_lsa() causes origination to fail (because en->nf is NULL for the old LSA and non-NULL for the new LSA). The patch fixes this by updating the en->nf for LSAs being flushed (as is already done for empty ones). Generally, en->nf field deserves some better description in the code. Thanks to Jigar Mehta for analyzing the problem. --- proto/ospf/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proto/ospf/topology.c b/proto/ospf/topology.c index 8119cfa6..7558d4a0 100644 --- a/proto/ospf/topology.c +++ b/proto/ospf/topology.c @@ -278,7 +278,7 @@ ospf_originate_lsa(struct ospf_proto *p, struct ospf_new_lsa *lsa) if (!SNODE_VALID(en)) s_add_tail(&p->lsal, SNODE en); - if (en->lsa_body == NULL) + if (!en->nf || !en->lsa_body) en->nf = lsa->nf; if (en->nf != lsa->nf) From 665b8e5283df4f64eb44d8fb434489be1474b5d4 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Tue, 22 Mar 2016 13:35:40 +0100 Subject: [PATCH 15/21] Birdlib: Do cleanups after remove/free To avoid byzantine behavior in case of some errors, linked lists are cleared after rem_node() and resource headers are cleared after rfree(). --- lib/lists.c | 19 +------------------ lib/lists.h | 1 - lib/resource.c | 12 +++--------- proto/bfd/bfd.c | 2 +- 4 files changed, 5 insertions(+), 29 deletions(-) diff --git a/lib/lists.c b/lib/lists.c index d323a4b6..20a9a072 100644 --- a/lib/lists.c +++ b/lib/lists.c @@ -88,7 +88,7 @@ insert_node(node *n, node *after) * rem_node - remove a node from a list * @n: node to be removed * - * Removes a node @n from the list it's linked in. + * Removes a node @n from the list it's linked in. Afterwards, node @n is cleared. */ LIST_INLINE void rem_node(node *n) @@ -96,23 +96,6 @@ rem_node(node *n) node *z = n->prev; node *x = n->next; - z->next = x; - x->prev = z; -} - -/** - * rem2_node - remove a node from a list, with cleanup - * @n: node to be removed - * - * Removes a node @n from the list it's linked in and resets its pointers to NULL. - * Useful if you want to distinguish between linked and unlinked nodes. - */ -LIST_INLINE void -rem2_node(node *n) -{ - node *z = n->prev; - node *x = n->next; - z->next = x; x->prev = z; n->next = NULL; diff --git a/lib/lists.h b/lib/lists.h index 80a4dc93..4204cbc5 100644 --- a/lib/lists.h +++ b/lib/lists.h @@ -61,7 +61,6 @@ typedef struct list { /* In fact two overlayed nodes */ void add_tail(list *, node *); void add_head(list *, node *); void rem_node(node *); -void rem2_node(node *); void add_tail_list(list *, list *); void init_list(list *); void insert_node(node *, node *); diff --git a/lib/resource.c b/lib/resource.c index 64f9a39c..68718dfb 100644 --- a/lib/resource.c +++ b/lib/resource.c @@ -163,6 +163,7 @@ rfree(void *res) if (r->n.next) rem_node(&r->n); r->class->free(r); + r->class = NULL; xfree(r); } @@ -383,16 +384,9 @@ mb_allocz(pool *p, unsigned size) void * mb_realloc(void *m, unsigned size) { - struct mblock *ob = NULL; + struct mblock *b = SKIP_BACK(struct mblock, data, m); - if (m) - { - ob = SKIP_BACK(struct mblock, data, m); - if (ob->r.n.next) - rem_node(&ob->r.n); - } - - struct mblock *b = xrealloc(ob, sizeof(struct mblock) + size); + b = xrealloc(b, sizeof(struct mblock) + size); replace_node(&b->r.n, &b->r.n); b->size = size; return b->data; diff --git a/proto/bfd/bfd.c b/proto/bfd/bfd.c index 7a085791..5de2f556 100644 --- a/proto/bfd/bfd.c +++ b/proto/bfd/bfd.c @@ -872,7 +872,7 @@ bfd_notify_hook(sock *sk, int len) WALK_LIST_FIRST(s, tmp_list) { bfd_lock_sessions(p); - rem2_node(&s->n); + rem_node(&s->n); state = s->loc_state; diag = s->loc_diag; bfd_unlock_sessions(p); From 54bb032d21d25a2221877e15325e79add10278ec Mon Sep 17 00:00:00 2001 From: Jan Moskyto Matejka Date: Wed, 23 Mar 2016 01:45:37 +0100 Subject: [PATCH 16/21] Birdlib: Modify lists to avoid problems with pointer aliasing rules The old linked list implementation used some wild typecasts and required GCC option -fno-strict-aliasing to work properly. This patch fixes that. However, we still keep the option due to other potential problems. (Commited by Ondrej Santiago Zajicek) --- lib/lists.c | 10 +++++----- lib/lists.h | 17 +++++++++++++++-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/lib/lists.c b/lib/lists.c index 20a9a072..12ef3cc6 100644 --- a/lib/lists.c +++ b/lib/lists.c @@ -41,7 +41,7 @@ add_tail(list *l, node *n) { node *z = l->tail; - n->next = (node *) &l->null; + n->next = &l->tail_node; n->prev = z; z->next = n; l->tail = n; @@ -60,7 +60,7 @@ add_head(list *l, node *n) node *z = l->head; n->next = z; - n->prev = (node *) &l->head; + n->prev = &l->head_node; z->prev = n; l->head = n; } @@ -133,9 +133,9 @@ replace_node(node *old, node *new) LIST_INLINE void init_list(list *l) { - l->head = (node *) &l->null; + l->head = &l->tail_node; l->null = NULL; - l->tail = (node *) &l->head; + l->tail = &l->head_node; } /** @@ -155,6 +155,6 @@ add_tail_list(list *to, list *l) p->next = q; q->prev = p; q = l->tail; - q->next = (node *) &to->null; + q->next = &to->tail_node; to->tail = q; } diff --git a/lib/lists.h b/lib/lists.h index 4204cbc5..51856b05 100644 --- a/lib/lists.h +++ b/lib/lists.h @@ -26,10 +26,23 @@ typedef struct node { struct node *next, *prev; } node; -typedef struct list { /* In fact two overlayed nodes */ - struct node *head, *null, *tail; +typedef union list { /* In fact two overlayed nodes */ + struct { /* Head node */ + struct node head_node; + void *head_padding; + }; + struct { /* Tail node */ + void *tail_padding; + struct node tail_node; + }; + struct { /* Split to separate pointers */ + struct node *head; + struct node *null; + struct node *tail; + }; } list; + #define NODE (node *) #define HEAD(list) ((void *)((list).head)) #define TAIL(list) ((void *)((list).tail)) From ea0a8be2ff5afb8385a69cc0df70984e0fd3a570 Mon Sep 17 00:00:00 2001 From: Jan Moskyto Matejka Date: Wed, 30 Mar 2016 16:21:32 +0200 Subject: [PATCH 17/21] IO/Poll: fix mistaken variable merge The events variable is used in the short loop decision. The reasons are not much clear, keeping this to keep the former behaviour. --- sysdep/unix/io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index eb1c1cad..112ebe91 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -2045,7 +2045,7 @@ io_loop(void) { int poll_tout; time_t tout; - int nfds, events; + int nfds, events, pout; sock *s; node *n; int fdmax = 256; @@ -2125,16 +2125,16 @@ io_loop(void) /* And finally enter poll() to find active sockets */ watchdog_stop(); - events = poll(pfd, nfds, poll_tout); + pout = poll(pfd, nfds, poll_tout); watchdog_start(); - if (events < 0) + if (pout < 0) { if (errno == EINTR || errno == EAGAIN) continue; die("poll: %m"); } - if (events) + if (pout) { /* guaranteed to be non-empty */ current_sock = SKIP_BACK(sock, n, HEAD(sock_list)); From e86cfd41d975122cc944db68383aef4028da9575 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 23 Mar 2016 18:25:15 +0100 Subject: [PATCH 18/21] KRT: Fix route learn scan when route changed When a kernel route changed, function krt_learn_scan() noticed that and replaced the route in internal kernel FIB, but after that, function krt_learn_prune() failed to propagate the new route to the nest, because it confused the new route with the (removed) old best route and decided that the best route did not changed. Wow, the original code (and the bug) is almost 17 years old. --- nest/route.h | 2 +- sysdep/bsd/krt-sock.c | 5 ++--- sysdep/linux/netlink.c | 3 ++- sysdep/unix/krt.c | 46 ++++++++++++++++++++++++++---------------- 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/nest/route.h b/nest/route.h index c435b9e0..9368808f 100644 --- a/nest/route.h +++ b/nest/route.h @@ -223,8 +223,8 @@ typedef struct rte { struct { /* Routes generated by krt sync (both temporary and inherited ones) */ s8 src; /* Alleged route source (see krt.h) */ u8 proto; /* Kernel source protocol ID */ - u8 type; /* Kernel route type */ u8 seen; /* Seen during last scan */ + u8 best; /* Best route in network, propagated to core */ u32 metric; /* Kernel metric */ } krt; } u; diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c index 29203d1b..6ff3b2b7 100644 --- a/sysdep/bsd/krt-sock.c +++ b/sysdep/bsd/krt-sock.c @@ -493,9 +493,8 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan) e->net = net; e->u.krt.src = src; e->u.krt.proto = src2; - - /* These are probably too Linux-specific */ - e->u.krt.type = 0; + e->u.krt.seen = 0; + e->u.krt.best = 0; e->u.krt.metric = 0; if (scan) diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 640d1877..1ffdff07 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -1102,7 +1102,8 @@ nl_parse_route(struct nlmsghdr *h, int scan) e->net = net; e->u.krt.src = src; e->u.krt.proto = i->rtm_protocol; - e->u.krt.type = i->rtm_type; + e->u.krt.seen = 0; + e->u.krt.best = 0; e->u.krt.metric = 0; if (a[RTA_PRIORITY]) diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 5e78586b..f5dee877 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -417,46 +417,58 @@ again: net *n = (net *) f; rte *e, **ee, *best, **pbest, *old_best; - old_best = n->routes; + /* + * Note that old_best may be NULL even if there was an old best route in + * the previous step, because it might be replaced in krt_learn_scan(). + * But in that case there is a new valid best route. + */ + + old_best = NULL; best = NULL; pbest = NULL; ee = &n->routes; while (e = *ee) { + if (e->u.krt.best) + old_best = e; + if (!e->u.krt.seen) { *ee = e->next; rte_free(e); continue; } + if (!best || best->u.krt.metric > e->u.krt.metric) { best = e; pbest = ee; } + e->u.krt.seen = 0; + e->u.krt.best = 0; ee = &e->next; } if (!n->routes) { DBG("%I/%d: deleting\n", n->n.prefix, n->n.pxlen); if (old_best) - { - krt_learn_announce_delete(p, n); - n->n.flags &= ~KRF_INSTALLED; - } + krt_learn_announce_delete(p, n); + FIB_ITERATE_PUT(&fit, f); fib_delete(fib, f); goto again; } + + best->u.krt.best = 1; *pbest = best->next; best->next = n->routes; n->routes = best; - if (best != old_best || !(n->n.flags & KRF_INSTALLED) || p->reload) + + if ((best != old_best) || p->reload) { DBG("%I/%d: announcing (metric=%d)\n", n->n.prefix, n->n.pxlen, best->u.krt.metric); krt_learn_announce_update(p, best); - n->n.flags |= KRF_INSTALLED; } else DBG("%I/%d: uptodate (metric=%d)\n", n->n.prefix, n->n.pxlen, best->u.krt.metric); @@ -515,31 +527,31 @@ krt_learn_async(struct krt_proto *p, rte *e, int new) best = n->routes; bestp = &n->routes; for(gg=&n->routes; g=*gg; gg=&g->next) + { if (best->u.krt.metric > g->u.krt.metric) { best = g; bestp = gg; } + + g->u.krt.best = 0; + } + if (best) { + best->u.krt.best = 1; *bestp = best->next; best->next = n->routes; n->routes = best; } + if (best != old_best) { DBG("krt_learn_async: distributing change\n"); if (best) - { - krt_learn_announce_update(p, best); - n->n.flags |= KRF_INSTALLED; - } + krt_learn_announce_update(p, best); else - { - n->routes = NULL; - krt_learn_announce_delete(p, n); - n->n.flags &= ~KRF_INSTALLED; - } + krt_learn_announce_delete(p, n); } } @@ -564,7 +576,7 @@ krt_dump(struct proto *P) static void krt_dump_attrs(rte *e) { - debug(" [m=%d,p=%d,t=%d]", e->u.krt.metric, e->u.krt.proto, e->u.krt.type); + debug(" [m=%d,p=%d]", e->u.krt.metric, e->u.krt.proto); } #endif From 9e7b3ebdf9556d7464911dd39e862b1c003319b3 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 6 Apr 2016 11:49:34 +0200 Subject: [PATCH 19/21] IO: Replace RX priority heuristic with explicit mark In BIRD, RX has lower priority than TX with the exception of RX from control socket. The patch replaces heuristic based on socket type with explicit mark and uses it for both control socket and BGP session waiting to be established. This should avoid an issue when during heavy load, outgoing connection could connect (TX event), send open, but then failed to receive OPEN / establish in time, not sending notifications between and therefore got hold timer expired error from the neighbor immediately after it finally established the connection. --- lib/socket.h | 1 + proto/bgp/bgp.c | 3 +++ sysdep/unix/io.c | 4 ++-- sysdep/unix/main.c | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/socket.h b/lib/socket.h index fbea92aa..0327e9e5 100644 --- a/lib/socket.h +++ b/lib/socket.h @@ -28,6 +28,7 @@ typedef struct birdsock { struct iface *iface; /* Interface; specify this for broad/multicast sockets */ byte *rbuf, *rpos; /* NULL=allocate automatically */ + uint fast_rx; /* RX has higher priority in event loop */ uint rbsize; int (*rx_hook)(struct birdsock *, int size); /* NULL=receiving turned off, returns 1 to clear rx buffer */ diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 11f33b14..7328cb79 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -374,6 +374,8 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) if (ipa_zero(p->source_addr)) p->source_addr = conn->sk->saddr; + conn->sk->fast_rx = 0; + p->conn = conn; p->last_error_class = 0; p->last_error_code = 0; @@ -696,6 +698,7 @@ bgp_setup_sk(struct bgp_conn *conn, sock *s) { s->data = conn; s->err_hook = bgp_sock_err; + s->fast_rx = 1; conn->sk = s; } diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 112ebe91..078fe73c 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -2152,7 +2152,7 @@ io_loop(void) int steps; steps = MAX_STEPS; - if ((s->type >= SK_MAGIC) && (pfd[s->index].revents & (POLLIN | POLLHUP | POLLERR)) && s->rx_hook) + if (s->fast_rx && (pfd[s->index].revents & (POLLIN | POLLHUP | POLLERR)) && s->rx_hook) do { steps--; @@ -2197,7 +2197,7 @@ io_loop(void) goto next2; } - if ((s->type < SK_MAGIC) && (pfd[s->index].revents & (POLLIN | POLLHUP | POLLERR)) && s->rx_hook) + if (!s->fast_rx && (pfd[s->index].revents & (POLLIN | POLLHUP | POLLERR)) && s->rx_hook) { count++; io_log_event(s->rx_hook, s->data); diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index 24d34f60..5d5586a0 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -450,6 +450,7 @@ cli_connect(sock *s, int size UNUSED) s->err_hook = cli_err; s->data = c = cli_new(s); s->pool = c->pool; /* We need to have all the socket buffers allocated in the cli pool */ + s->fast_rx = 1; c->rx_pos = c->rx_buf; c->rx_aux = NULL; rmove(s, c->pool); @@ -466,6 +467,7 @@ cli_init_unix(uid_t use_uid, gid_t use_gid) s->type = SK_UNIX_PASSIVE; s->rx_hook = cli_connect; s->rbsize = 1024; + s->fast_rx = 1; /* Return value intentionally ignored */ unlink(path_control_socket); From bd22d7f41d37dec8f7b8b845f6a18c775e66ddfc Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 6 Apr 2016 11:57:28 +0200 Subject: [PATCH 20/21] IO: Avoid multiple event cycles in one loop cycle. Event cycle may took too much time and trigger next timer events, so avoid cycling between timer and event cycles inside the loop cycle. --- proto/bgp/bgp.c | 4 ++++ sysdep/unix/io.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 7328cb79..94c8e5c2 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -668,6 +668,10 @@ bgp_keepalive_timeout(timer *t) DBG("BGP: Keepalive timer\n"); bgp_schedule_packet(conn, PKT_KEEPALIVE); + + /* Kick TX a bit faster */ + if (ev_active(conn->tx_ev)) + ev_run(conn->tx_ev); } static void diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 078fe73c..5955dbfe 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -2055,12 +2055,13 @@ io_loop(void) for(;;) { events = ev_run_list(&global_event_list); + timers: update_times(); tout = tm_first_shot(); if (tout <= now) { tm_shot(); - continue; + goto timers; } poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */ From 06edbb67ed807811654e7fd8f0f9b83766430216 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Thu, 7 Apr 2016 01:10:24 +0200 Subject: [PATCH 21/21] Nest: Reset export route counter during graceful restart Counter exp_routes is increased during initial route feed after GR recovery, so it has to start with zero, otherwise BIRD will end with double value in exp_routes. --- nest/proto.c | 1 + 1 file changed, 1 insertion(+) diff --git a/nest/proto.c b/nest/proto.c index d04da333..436377f1 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -1260,6 +1260,7 @@ proto_want_export_down(struct proto *p) rt_feed_baby_abort(p); p->export_state = ES_DOWN; + p->stats.exp_routes = 0; proto_unlink_ahooks(p); }