From e1c13a5a7b86f2ba09178300bad960224658c833 Mon Sep 17 00:00:00 2001 From: Jan Moskyto Matejka Date: Wed, 9 Mar 2016 12:12:02 +0100 Subject: [PATCH 1/3] Unix: Rework of select-loop to poll-loop This should lift the limit of FD_SETSIZE and allow more than 1024 fd's. FD_SETSIZE limit doesn't matter now when creating new sockets. --- sysdep/unix/io.c | 101 ++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index b636e799..bc212de1 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -41,12 +42,12 @@ #include "lib/sysio.h" /* Maximum number of calls of tx handler for one socket in one - * select iteration. Should be small enough to not monopolize CPU by + * poll iteration. Should be small enough to not monopolize CPU by * one protocol instance. */ #define MAX_STEPS 4 -/* Maximum number of calls of rx handler for all sockets in one select +/* Maximum number of calls of rx handler for all sockets in one poll iteration. RX callbacks are often much more costly so we limit this to gen small latencies */ #define MAX_RX_STEPS 4 @@ -1022,7 +1023,6 @@ sk_log_error(sock *s, const char *p) static list sock_list; static struct birdsock *current_sock; static struct birdsock *stored_sock; -static int sock_recalc_fdsets_p; static inline sock * sk_next(sock *s) @@ -1078,7 +1078,6 @@ sk_free(resource *r) if (s == stored_sock) stored_sock = sk_next(s); rem_node(&s->n); - sock_recalc_fdsets_p = 1; } } @@ -1276,7 +1275,6 @@ static void sk_insert(sock *s) { add_tail(&sock_list, &s->n); - sock_recalc_fdsets_p = 1; } static void @@ -1328,18 +1326,6 @@ sk_passive_connected(sock *s, int type) log(L_WARN "SOCK: Cannot get remote IP address for TCP<"); } - if (fd >= FD_SETSIZE) - { - /* FIXME: Call err_hook instead ? */ - log(L_ERR "SOCK: Incoming connection from %I%J (port %d) %s", - t->daddr, ipa_is_link_local(t->daddr) ? t->iface : NULL, - t->dport, "rejected due to FD_SETSIZE limit"); - close(fd); - t->fd = -1; - rfree(t); - return 1; - } - if (sk_setup(t) < 0) { /* FIXME: Call err_hook instead ? */ @@ -1416,9 +1402,6 @@ sk_open(sock *s) if (fd < 0) ERR("socket"); - if (fd >= FD_SETSIZE) - ERR2("FD_SETSIZE limit reached"); - s->af = af; s->fd = fd; @@ -2062,15 +2045,15 @@ static int short_loops = 0; void io_loop(void) { - fd_set rd, wr; - struct timeval timo; + int poll_tout; time_t tout; - int hi, events; + int nfds, events; sock *s; node *n; + int fdmax = 256; + struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd)); watchdog_start1(); - sock_recalc_fdsets_p = 1; for(;;) { events = ev_run_list(&global_event_list); @@ -2081,43 +2064,43 @@ io_loop(void) tm_shot(); continue; } - timo.tv_sec = events ? 0 : MIN(tout - now, 3); - timo.tv_usec = 0; + poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */ io_close_event(); - if (sock_recalc_fdsets_p) - { - sock_recalc_fdsets_p = 0; - FD_ZERO(&rd); - FD_ZERO(&wr); - } - - hi = 0; + nfds = 0; WALK_LIST(n, sock_list) { + pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */ s = SKIP_BACK(sock, n, n); if (s->rx_hook) { - FD_SET(s->fd, &rd); - if (s->fd > hi) - hi = s->fd; + pfd[nfds].fd = s->fd; + pfd[nfds].events |= POLLIN; } - else - FD_CLR(s->fd, &rd); if (s->tx_hook && s->ttx != s->tpos) { - FD_SET(s->fd, &wr); - if (s->fd > hi) - hi = s->fd; + pfd[nfds].fd = s->fd; + pfd[nfds].events |= POLLOUT; + } + if (pfd[nfds].fd != -1) + { + s->index = nfds; + nfds++; } else - FD_CLR(s->fd, &wr); + s->index = -1; + + if (nfds >= fdmax) + { + fdmax *= 2; + pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd)); + } } /* * Yes, this is racy. But even if the signal comes before this test - * and entering select(), it gets caught on the next timer tick. + * and entering poll(), it gets caught on the next timer tick. */ if (async_config_flag) @@ -2142,18 +2125,18 @@ io_loop(void) continue; } - /* And finally enter select() to find active sockets */ + /* And finally enter poll() to find active sockets */ watchdog_stop(); - hi = select(hi+1, &rd, &wr, NULL, &timo); + events = poll(pfd, nfds, poll_tout); watchdog_start(); - if (hi < 0) + if (events < 0) { if (errno == EINTR || errno == EAGAIN) continue; - die("select: %m"); + die("poll: %m"); } - if (hi) + if (events) { /* guaranteed to be non-empty */ current_sock = SKIP_BACK(sock, n, HEAD(sock_list)); @@ -2161,11 +2144,17 @@ io_loop(void) while (current_sock) { sock *s = current_sock; + if (s->index == -1) + { + current_sock = sk_next(s); + goto next; + } + int e; int steps; steps = MAX_STEPS; - if ((s->type >= SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook) + if ((s->type >= SK_MAGIC) && (pfd[s->index].revents & (POLLIN | POLLHUP | POLLERR)) && s->rx_hook) do { steps--; @@ -2177,7 +2166,7 @@ io_loop(void) while (e && s->rx_hook && steps); steps = MAX_STEPS; - if (FD_ISSET(s->fd, &wr)) + if (pfd[s->index].revents & POLLOUT) do { steps--; @@ -2204,13 +2193,17 @@ io_loop(void) while (current_sock && count < MAX_RX_STEPS) { sock *s = current_sock; - int e UNUSED; + if (s->index == -1) + { + current_sock = sk_next(s); + goto next2; + } - if ((s->type < SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook) + if ((s->type < SK_MAGIC) && (pfd[s->index].revents & (POLLIN | POLLHUP | POLLERR)) && s->rx_hook) { count++; io_log_event(s->rx_hook, s->data); - e = sk_read(s); + sk_read(s); if (s != current_sock) goto next2; } From fd926ed4eea319b94bd0e09e093b90846bcb169b Mon Sep 17 00:00:00 2001 From: Jan Moskyto Matejka Date: Tue, 15 Mar 2016 14:57:49 +0100 Subject: [PATCH 2/3] Poll: Prevent the improbable case of EAGAIN after POLLIN --- proto/bfd/io.c | 4 ++-- sysdep/unix/io.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/proto/bfd/io.c b/proto/bfd/io.c index fb150040..79ed9af7 100644 --- a/proto/bfd/io.c +++ b/proto/bfd/io.c @@ -576,7 +576,7 @@ sockets_close_fds(struct birdloop *loop) loop->close_scheduled = 0; } -int sk_read(sock *s); +int sk_read(sock *s, int revents); int sk_write(sock *s); static void @@ -605,7 +605,7 @@ sockets_fire(struct birdloop *loop) if (pfd->revents & POLLIN) while (e && *psk && (*psk)->rx_hook) - e = sk_read(*psk); + e = sk_read(*psk, 0); e = 1; if (pfd->revents & POLLOUT) diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index bc212de1..b769de58 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -1760,7 +1760,7 @@ sk_send_full(sock *s, unsigned len, struct iface *ifa, /* sk_read() and sk_write() are called from BFD's event loop */ int -sk_read(sock *s) +sk_read(sock *s, int revents) { switch (s->type) { @@ -1779,6 +1779,11 @@ sk_read(sock *s) { if (errno != EINTR && errno != EAGAIN) s->err_hook(s, errno); + else if (errno == EAGAIN && !(revents & POLLIN)) + { + log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents); + s->err_hook(s, 0); + } } else if (!c) s->err_hook(s, 0); @@ -2159,7 +2164,7 @@ io_loop(void) { steps--; io_log_event(s->rx_hook, s->data); - e = sk_read(s); + e = sk_read(s, pfd[s->index].revents); if (s != current_sock) goto next; } @@ -2203,7 +2208,7 @@ io_loop(void) { count++; io_log_event(s->rx_hook, s->data); - sk_read(s); + sk_read(s, pfd[s->index].revents); if (s != current_sock) goto next2; } From 9c92f69272de3795f7289969e815d99a93d0d9b3 Mon Sep 17 00:00:00 2001 From: Jan Moskyto Matejka Date: Fri, 18 Mar 2016 11:44:28 +0100 Subject: [PATCH 3/3] Unix: Substituted select -> poll also in congestion checker It does strange things when even one fd larger than FD_SETSIZE is passed to select(). --- sysdep/unix/io.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index b769de58..eb1c1cad 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -1679,19 +1679,12 @@ sk_maybe_write(sock *s) int sk_rx_ready(sock *s) { - fd_set rd, wr; - struct timeval timo; int rv; - - FD_ZERO(&rd); - FD_ZERO(&wr); - FD_SET(s->fd, &rd); - - timo.tv_sec = 0; - timo.tv_usec = 0; + struct pollfd pfd = { .fd = s->fd }; + pfd.events |= POLLIN; redo: - rv = select(s->fd+1, &rd, &wr, NULL, &timo); + rv = poll(&pfd, 1, 0); if ((rv < 0) && (errno == EINTR || errno == EAGAIN)) goto redo;