Merge branch 'bgp-grace'

This commit is contained in:
Ondrej Zajicek 2014-03-24 12:41:43 +01:00
commit c980f8002e
19 changed files with 1045 additions and 238 deletions

View file

@ -98,6 +98,7 @@ config_alloc(byte *name)
c->load_time = now; c->load_time = now;
c->tf_route = c->tf_proto = (struct timeformat){"%T", "%F", 20*3600}; c->tf_route = c->tf_proto = (struct timeformat){"%T", "%F", 20*3600};
c->tf_base = c->tf_log = (struct timeformat){"%F %T", NULL, 0}; c->tf_base = c->tf_log = (struct timeformat){"%F %T", NULL, 0};
c->gr_wait = DEFAULT_GR_WAIT;
return c; return c;
} }

View file

@ -38,6 +38,7 @@ struct config {
struct timeformat tf_proto; /* Time format for 'show protocol' */ struct timeformat tf_proto; /* Time format for 'show protocol' */
struct timeformat tf_log; /* Time format for the logfile */ struct timeformat tf_log; /* Time format for the logfile */
struct timeformat tf_base; /* Time format for other purposes */ struct timeformat tf_base; /* Time format for other purposes */
u32 gr_wait; /* Graceful restart wait timeout */
int cli_debug; /* Tracing of CLI connections and commands */ int cli_debug; /* Tracing of CLI connections and commands */
char *err_msg; /* Parser error message */ char *err_msg; /* Parser error message */

View file

@ -157,6 +157,9 @@ options. The most important ones are:
<tag>-f</tag> <tag>-f</tag>
run bird in foreground. run bird in foreground.
<tag>-R</tag>
apply graceful restart recovery after start.
</descrip> </descrip>
<p>BIRD writes messages about its work to log files or syslog (according to config). <p>BIRD writes messages about its work to log files or syslog (according to config).
@ -187,6 +190,7 @@ configuration, but it is generally easy -- BIRD needs just the
standard library, privileges to read the config file and create the standard library, privileges to read the config file and create the
control socket and the CAP_NET_* capabilities. control socket and the CAP_NET_* capabilities.
<chapt>About routing tables <chapt>About routing tables
<p>BIRD has one or more routing tables which may or may not be <p>BIRD has one or more routing tables which may or may not be
@ -242,6 +246,20 @@ using comparison and ordering). Minor advantage is that routes are
shown sorted in <cf/show route/, minor disadvantage is that it is shown sorted in <cf/show route/, minor disadvantage is that it is
slightly more computationally expensive. slightly more computationally expensive.
<sect>Graceful restart
<p>When BIRD is started after restart or crash, it repopulates routing tables in
an uncoordinated manner, like after clean start. This may be impractical in some
cases, because if the forwarding plane (i.e. kernel routing tables) remains
intact, then its synchronization with BIRD would temporarily disrupt packet
forwarding until protocols converge. Graceful restart is a mechanism that could
help with this issue. Generally, it works by starting protocols and letting them
repopulate routing tables while deferring route propagation until protocols
acknowledge their convergence. Note that graceful restart behavior has to be
configured for all relevant protocols and requires protocol-specific support
(currently implemented for Kernel and BGP protocols). It is activated for a
particular boot by the option <cf/-R/.
<chapt>Configuration <chapt>Configuration
@ -371,6 +389,12 @@ protocol rip {
would accept IPv6 routes only). Such behavior was default in would accept IPv6 routes only). Such behavior was default in
older versions of BIRD. older versions of BIRD.
<tag>graceful restart wait <m/number/</tag>
During graceful restart recovery, BIRD waits for convergence of routing
protocols. This option allows to specify a timeout for the recovery to
prevent waiting indefinitely if some protocols cannot converge. Default:
240 seconds.
<tag>timeformat route|protocol|base|log "<m/format1/" [<m/limit/ "<m/format2/"]</tag> <tag>timeformat route|protocol|base|log "<m/format1/" [<m/limit/ "<m/format2/"]</tag>
This option allows to specify a format of date/time used by This option allows to specify a format of date/time used by
BIRD. The first argument specifies for which purpose such BIRD. The first argument specifies for which purpose such
@ -1493,6 +1517,8 @@ extended communities
(RFC 4360<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4360.txt">), (RFC 4360<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4360.txt">),
route reflectors route reflectors
(RFC 4456<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4456.txt">), (RFC 4456<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4456.txt">),
graceful restart
(RFC 4724<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4724.txt">),
multiprotocol extensions multiprotocol extensions
(RFC 4760<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4760.txt">), (RFC 4760<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4760.txt">),
4B AS numbers 4B AS numbers
@ -1502,9 +1528,7 @@ and 4B AS numbers in extended communities
For IPv6, it uses the standard multiprotocol extensions defined in For IPv6, it uses the standard multiprotocol extensions defined in
RFC 2283<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc2283.txt"> RFC 4760<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4760.txt">
including changes described in the
latest draft<htmlurl url="ftp://ftp.rfc-editor.org/internet-drafts/draft-ietf-idr-bgp4-multiprotocol-v2-05.txt">
and applied to IPv6 according to and applied to IPv6 according to
RFC 2545<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc2545.txt">. RFC 2545<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc2545.txt">.
@ -1716,6 +1740,26 @@ for each neighbor using the following configuration parameters:
capability and accepts such requests. Even when disabled, BIRD capability and accepts such requests. Even when disabled, BIRD
can send route refresh requests. Default: on. can send route refresh requests. Default: on.
<tag>graceful restart <m/switch/|aware</tag>
When a BGP speaker restarts or crashes, neighbors will discard all
received paths from the speaker, which disrupts packet forwarding even
when the forwarding plane of the speaker remains intact. RFC 4724
specifies an optional graceful restart mechanism to alleviate this
issue. This option controls the mechanism. It has three states:
Disabled, when no support is provided. Aware, when the graceful restart
support is announced and the support for restarting neighbors is
provided, but no local graceful restart is allowed (i.e. receiving-only
role). Enabled, when the full graceful restart support is provided
(i.e. both restarting and receiving role). Note that proper support for
local graceful restart requires also configuration of other protocols.
Default: aware.
<tag>graceful restart time <m/number/</tag>
The restart time is announced in the BGP graceful restart capability
and specifies how long the neighbor would wait for the BGP session to
re-establish after a restart before deleting stale routes. Default:
120 seconds.
<tag>interpret communities <m/switch/</tag> RFC 1997 demands <tag>interpret communities <m/switch/</tag> RFC 1997 demands
that BGP speaker should process well-known communities like that BGP speaker should process well-known communities like
no-export (65535, 65281) or no-advertise (65535, 65282). For no-export (65535, 65281) or no-advertise (65535, 65282). For
@ -2063,25 +2107,36 @@ overcome using another routing table and the pipe protocol.
<sect1>Configuration <sect1>Configuration
<p><descrip> <p><descrip>
<tag>persist <m/switch/</tag> Tell BIRD to leave all its routes in the <tag>persist <m/switch/</tag>
routing tables when it exits (instead of cleaning them up). Tell BIRD to leave all its routes in the routing tables when it exits
<tag>scan time <m/number/</tag> Time in seconds between two consecutive scans of the (instead of cleaning them up).
kernel routing table.
<tag>learn <m/switch/</tag> Enable learning of routes added to the kernel
routing tables by other routing daemons or by the system administrator.
This is possible only on systems which support identification of route
authorship.
<tag>device routes <m/switch/</tag> Enable export of device <tag>scan time <m/number/</tag>
routes to the kernel routing table. By default, such routes Time in seconds between two consecutive scans of the kernel routing
are rejected (with the exception of explicitly configured table.
device routes from the static protocol) regardless of the
export filter to protect device routes in kernel routing table
(managed by OS itself) from accidental overwriting or erasing.
<tag>kernel table <m/number/</tag> Select which kernel table should <tag>learn <m/switch/</tag>
this particular instance of the Kernel protocol work with. Available Enable learning of routes added to the kernel routing tables by other
only on systems supporting multiple routing tables. routing daemons or by the system administrator. This is possible only on
systems which support identification of route authorship.
<tag>device routes <m/switch/</tag>
Enable export of device routes to the kernel routing table. By default,
such routes are rejected (with the exception of explicitly configured
device routes from the static protocol) regardless of the export filter
to protect device routes in kernel routing table (managed by OS itself)
from accidental overwriting or erasing.
<tag>kernel table <m/number/</tag>
Select which kernel table should this particular instance of the Kernel
protocol work with. Available only on systems supporting multiple
routing tables.
<tag>graceful restart <m/switch/</tag>
Participate in graceful restart recovery. If this option is enabled and
a graceful restart recovery is active, the Kernel protocol will defer
synchronization of routing tables until the end of the recovery. Note
that import of kernel routes to BIRD is not affected.
</descrip> </descrip>
<sect1>Attributes <sect1>Attributes

View file

@ -32,6 +32,7 @@ Reply codes of BIRD command-line interface
0021 Undo requested 0021 Undo requested
0022 Undo scheduled 0022 Undo scheduled
0023 Evaluation of expression 0023 Evaluation of expression
0024 Graceful restart status report
1000 BIRD version 1000 BIRD version
1001 Interface list 1001 Interface list

View file

@ -36,6 +36,8 @@ typedef struct list { /* In fact two overlayed nodes */
#define NODE_NEXT(n) ((void *)((NODE (n))->next)) #define NODE_NEXT(n) ((void *)((NODE (n))->next))
#define NODE_VALID(n) ((NODE (n))->next) #define NODE_VALID(n) ((NODE (n))->next)
#define WALK_LIST(n,list) for(n=HEAD(list); NODE_VALID(n); n=NODE_NEXT(n)) #define WALK_LIST(n,list) for(n=HEAD(list); NODE_VALID(n); n=NODE_NEXT(n))
/*
 * WALK_LIST2 walks @list using the raw node pointer @nn, starting at
 * (list).head and following nn->next while NODE_VALID(nn) holds. For every
 * visited node, @n is set to SKIP_BACK(typeof(*n), pos, nn); the walk also
 * stops if that expression evaluates to NULL, because the assignment is part
 * of the loop condition. Presumably @pos is the name of the node member inside
 * the structure @n points to and SKIP_BACK recovers the enclosing structure --
 * SKIP_BACK is defined elsewhere, confirm there.
 */
#define WALK_LIST2(n,nn,list,pos) \
for(nn=(list).head; NODE_VALID(nn) && (n=SKIP_BACK(typeof(*n),pos,nn)); nn=nn->next)
#define WALK_LIST_DELSAFE(n,nxt,list) \ #define WALK_LIST_DELSAFE(n,nxt,list) \
for(n=HEAD(list); nxt=NODE_NEXT(n); n=(void *) nxt) for(n=HEAD(list); nxt=NODE_NEXT(n); n=(void *) nxt)
/* WALK_LIST_FIRST supposes that called code removes each processed node */ /* WALK_LIST_FIRST supposes that called code removes each processed node */

View file

@ -7,6 +7,7 @@
*/ */
#include "nest/bird.h" #include "nest/bird.h"
#include "nest/protocol.h"
#include "nest/route.h" #include "nest/route.h"
#include "nest/cli.h" #include "nest/cli.h"
#include "conf/conf.h" #include "conf/conf.h"
@ -32,6 +33,8 @@ cmd_show_status(void)
tm_format_datetime(tim, &config->tf_base, config->load_time); tm_format_datetime(tim, &config->tf_base, config->load_time);
cli_msg(-1011, "Last reconfiguration on %s", tim); cli_msg(-1011, "Last reconfiguration on %s", tim);
graceful_restart_show_status();
if (shutting_down) if (shutting_down)
cli_msg(13, "Shutdown in progress"); cli_msg(13, "Shutdown in progress");
else if (configuring) else if (configuring)

View file

@ -49,6 +49,7 @@ CF_KEYWORDS(PASSWORD, FROM, PASSIVE, TO, ID, EVENTS, PACKETS, PROTOCOLS, INTERFA
CF_KEYWORDS(PRIMARY, STATS, COUNT, FOR, COMMANDS, PREEXPORT, GENERATE, ROA, MAX, FLUSH, AS) CF_KEYWORDS(PRIMARY, STATS, COUNT, FOR, COMMANDS, PREEXPORT, GENERATE, ROA, MAX, FLUSH, AS)
CF_KEYWORDS(LISTEN, BGP, V6ONLY, DUAL, ADDRESS, PORT, PASSWORDS, DESCRIPTION, SORTED) CF_KEYWORDS(LISTEN, BGP, V6ONLY, DUAL, ADDRESS, PORT, PASSWORDS, DESCRIPTION, SORTED)
CF_KEYWORDS(RELOAD, IN, OUT, MRTDUMP, MESSAGES, RESTRICT, MEMORY, IGP_METRIC, CLASS, DSCP) CF_KEYWORDS(RELOAD, IN, OUT, MRTDUMP, MESSAGES, RESTRICT, MEMORY, IGP_METRIC, CLASS, DSCP)
CF_KEYWORDS(GRACEFUL, RESTART, WAIT)
CF_ENUM(T_ENUM_RTS, RTS_, DUMMY, STATIC, INHERIT, DEVICE, STATIC_DEVICE, REDIRECT, CF_ENUM(T_ENUM_RTS, RTS_, DUMMY, STATIC, INHERIT, DEVICE, STATIC_DEVICE, REDIRECT,
RIP, OSPF, OSPF_IA, OSPF_EXT1, OSPF_EXT2, BGP, PIPE) RIP, OSPF, OSPF_IA, OSPF_EXT1, OSPF_EXT2, BGP, PIPE)
@ -110,6 +111,11 @@ listen_opt:
; ;
/* Global graceful restart options: the wait timeout is stored in new_config->gr_wait */
CF_ADDTO(conf, gr_opts)
gr_opts: GRACEFUL RESTART WAIT expr ';' { new_config->gr_wait = $4; } ;
/* Creation of routing tables */ /* Creation of routing tables */
tab_sorted: tab_sorted:

View file

@ -35,26 +35,53 @@ static struct proto *initial_device_proto;
static event *proto_flush_event; static event *proto_flush_event;
static timer *proto_shutdown_timer; static timer *proto_shutdown_timer;
static timer *gr_wait_timer;
#define GRS_NONE 0
#define GRS_INIT 1
#define GRS_ACTIVE 2
#define GRS_DONE 3
static int graceful_restart_state;
static u32 graceful_restart_locks;
static char *p_states[] = { "DOWN", "START", "UP", "STOP" }; static char *p_states[] = { "DOWN", "START", "UP", "STOP" };
static char *c_states[] = { "HUNGRY", "FEEDING", "HAPPY", "FLUSHING" }; static char *c_states[] = { "HUNGRY", "???", "HAPPY", "FLUSHING" };
static void proto_flush_loop(void *); static void proto_flush_loop(void *);
static void proto_shutdown_loop(struct timer *); static void proto_shutdown_loop(struct timer *);
static void proto_rethink_goal(struct proto *p); static void proto_rethink_goal(struct proto *p);
static void proto_want_export_up(struct proto *p);
static void proto_fell_down(struct proto *p);
static char *proto_state_name(struct proto *p); static char *proto_state_name(struct proto *p);
static void
proto_enqueue(list *l, struct proto *p)
{
add_tail(l, &p->n);
}
static void static void
proto_relink(struct proto *p) proto_relink(struct proto *p)
{ {
list *l = NULL; list *l = NULL;
switch (p->core_state)
{
case FS_HUNGRY:
l = &inactive_proto_list;
break;
case FS_HAPPY:
l = &active_proto_list;
break;
case FS_FLUSHING:
l = &flush_proto_list;
break;
default:
ASSERT(0);
}
rem_node(&p->n);
add_tail(l, &p->n);
}
static void
proto_log_state_change(struct proto *p)
{
if (p->debug & D_STATES) if (p->debug & D_STATES)
{ {
char *name = proto_state_name(p); char *name = proto_state_name(p);
@ -66,25 +93,9 @@ proto_relink(struct proto *p)
} }
else else
p->last_state_name_announced = NULL; p->last_state_name_announced = NULL;
rem_node(&p->n);
switch (p->core_state)
{
case FS_HUNGRY:
l = &inactive_proto_list;
break;
case FS_FEEDING:
case FS_HAPPY:
l = &active_proto_list;
break;
case FS_FLUSHING:
l = &flush_proto_list;
break;
default:
ASSERT(0);
}
proto_enqueue(l, p);
} }
/** /**
* proto_new - create a new protocol instance * proto_new - create a new protocol instance
* @c: protocol configuration * @c: protocol configuration
@ -126,6 +137,9 @@ proto_init_instance(struct proto *p)
p->attn = ev_new(p->pool); p->attn = ev_new(p->pool);
p->attn->data = p; p->attn->data = p;
if (graceful_restart_state == GRS_INIT)
p->gr_recovery = 1;
if (! p->proto->multitable) if (! p->proto->multitable)
rt_lock_table(p->table); rt_lock_table(p->table);
} }
@ -137,21 +151,20 @@ extern pool *rt_table_pool;
* @t: routing table to connect to * @t: routing table to connect to
* @stats: per-table protocol statistics * @stats: per-table protocol statistics
* *
* This function creates a connection between the protocol instance @p * This function creates a connection between the protocol instance @p and the
* and the routing table @t, making the protocol hear all changes in * routing table @t, making the protocol hear all changes in the table.
* the table.
* *
* The announce hook is linked in the protocol ahook list and, if the * The announce hook is linked in the protocol ahook list. Announce hooks are
* protocol accepts routes, also in the table ahook list. Announce * allocated from the routing table resource pool and when protocol accepts
* hooks are allocated from the routing table resource pool, they are * routes also in the table ahook list. They are linked to the table ahook list
* unlinked from the table ahook list after the protocol went down, * and unlinked from it depending on export_state (in proto_want_export_up() and
* (in proto_schedule_flush()) and they are automatically freed after the * proto_want_export_down()) and they are automatically freed after the protocol
* protocol is flushed (in proto_fell_down()). * is flushed (in proto_fell_down()).
* *
* Unless you want to listen to multiple routing tables (as the Pipe * Unless you want to listen to multiple routing tables (as the Pipe protocol
* protocol does), you needn't to worry about this function since the * does), you needn't to worry about this function since the connection to the
* connection to the protocol's primary routing table is initialized * protocol's primary routing table is initialized automatically by the core
* automatically by the core code. * code.
*/ */
struct announce_hook * struct announce_hook *
proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *stats) proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *stats)
@ -169,7 +182,7 @@ proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *s
h->next = p->ahooks; h->next = p->ahooks;
p->ahooks = h; p->ahooks = h;
if (p->rt_notify) if (p->rt_notify && (p->export_state != ES_DOWN))
add_tail(&t->hooks, &h->n); add_tail(&t->hooks, &h->n);
return h; return h;
} }
@ -193,6 +206,16 @@ proto_find_announce_hook(struct proto *p, struct rtable *t)
return NULL; return NULL;
} }
/*
 * Link every announce hook of protocol @p to the hook list of the table the
 * hook belongs to. Nothing is linked when the protocol has no rt_notify hook.
 */
static void
proto_link_ahooks(struct proto *p)
{
  struct announce_hook *ah;

  if (!p->rt_notify)
    return;

  ah = p->ahooks;
  while (ah)
  {
    add_tail(&ah->table->hooks, &ah->n);
    ah = ah->next;
  }
}
static void static void
proto_unlink_ahooks(struct proto *p) proto_unlink_ahooks(struct proto *p)
{ {
@ -362,9 +385,11 @@ proto_init(struct proto_config *c)
q->proto_state = PS_DOWN; q->proto_state = PS_DOWN;
q->core_state = FS_HUNGRY; q->core_state = FS_HUNGRY;
q->export_state = ES_DOWN;
q->last_state_change = now; q->last_state_change = now;
proto_enqueue(&initial_proto_list, q); add_tail(&initial_proto_list, &q->n);
if (p == &proto_unix_iface) if (p == &proto_unix_iface)
initial_device_proto = q; initial_device_proto = q;
@ -590,6 +615,7 @@ static void
proto_rethink_goal(struct proto *p) proto_rethink_goal(struct proto *p)
{ {
struct protocol *q; struct protocol *q;
byte goal;
if (p->reconfiguring && p->core_state == FS_HUNGRY && p->proto_state == PS_DOWN) if (p->reconfiguring && p->core_state == FS_HUNGRY && p->proto_state == PS_DOWN)
{ {
@ -606,22 +632,14 @@ proto_rethink_goal(struct proto *p)
/* Determine what state we want to reach */ /* Determine what state we want to reach */
if (p->disabled || p->reconfiguring) if (p->disabled || p->reconfiguring)
{ goal = PS_DOWN;
p->core_goal = FS_HUNGRY;
if (p->core_state == FS_HUNGRY && p->proto_state == PS_DOWN)
return;
}
else else
{ goal = PS_UP;
p->core_goal = FS_HAPPY;
if (p->core_state == FS_HAPPY && p->proto_state == PS_UP)
return;
}
q = p->proto; q = p->proto;
if (p->core_goal == FS_HAPPY) /* Going up */ if (goal == PS_UP) /* Going up */
{ {
if (p->core_state == FS_HUNGRY && p->proto_state == PS_DOWN) if (p->proto_state == PS_DOWN && p->core_state == FS_HUNGRY)
{ {
DBG("Kicking %s up\n", p->name); DBG("Kicking %s up\n", p->name);
PD(p, "Starting"); PD(p, "Starting");
@ -640,6 +658,178 @@ proto_rethink_goal(struct proto *p)
} }
} }
/**
* DOC: Graceful restart recovery
*
* Graceful restart of a router is a process when the routing plane (e.g. BIRD)
* restarts but both the forwarding plane (e.g. kernel routing table) and routing
* neighbors keep proper routes, and therefore uninterrupted packet forwarding
* is maintained.
*
* BIRD implements graceful restart recovery by deferring export of routes to
* protocols until routing tables are refilled with the expected content. After
* start, protocols generate routes as usual, but routes are not propagated to
* them, until protocols report that they generated all routes. After that,
* graceful restart recovery is finished and the export (and the initial feed)
* to protocols is enabled.
*
* When the need for graceful restart recovery is detected during initialization,
* enabled protocols are marked with @gr_recovery flag before start. Such
* protocols then decide how to proceed with graceful restart, participation is
* voluntary. Protocols could lock the recovery by proto_graceful_restart_lock()
* (stored in @gr_lock flag), which means that they want to postpone the end of
* the recovery until they converge and then unlock it. They also could set
* @gr_wait before advancing to %PS_UP, which means that the core should defer
* route export to that protocol until the end of the recovery. This should be
* done by protocols that expect their neighbors to keep the proper routes
* (kernel table, BGP sessions with BGP graceful restart capability).
*
* The graceful restart recovery is finished when either all graceful restart
* locks are unlocked or when graceful restart wait timer fires.
*
*/
static void graceful_restart_done(struct timer *t);
/**
 * graceful_restart_recovery - request initial graceful restart recovery
 *
 * Called by the platform initialization code if the need for recovery
 * after graceful restart is detected during boot. It has to be called
 * before protos_commit().
 *
 * The function only switches the graceful restart state to %GRS_INIT; protocol
 * instances initialized while this state is set get their @gr_recovery flag
 * (see proto_init_instance()).
 */
void
graceful_restart_recovery(void)
{
graceful_restart_state = GRS_INIT;
}
/**
* graceful_restart_init - initialize graceful restart
*
* When graceful restart recovery was requested, the function starts an active
* phase of the recovery and initializes graceful restart wait timer. The
* function have to be called after protos_commit().
*/
void
graceful_restart_init(void)
{
if (!graceful_restart_state)
return;
log(L_INFO "Graceful restart started");
if (!graceful_restart_locks)
{
graceful_restart_done(NULL);
return;
}
graceful_restart_state = GRS_ACTIVE;
gr_wait_timer = tm_new(proto_pool);
gr_wait_timer->hook = graceful_restart_done;
tm_start(gr_wait_timer, config->gr_wait);
}
/**
 * graceful_restart_done - finalize graceful restart
 * @t: unused; the function is installed as the hook of the wait timer and is
 * also called directly with NULL
 *
 * Finalizes the graceful restart recovery. It is called either when there are
 * no locks on graceful restart, or when the graceful restart wait timer fires
 * (possibly with some locks still held). Protocols that postponed route export
 * until the end of the recovery and are up get their export enabled. The
 * per-protocol recovery flags and the global lock counter are cleared.
 */
static void
graceful_restart_done(struct timer *t UNUSED)
{
  node *nn;
  struct proto *pr;

  log(L_INFO "Graceful restart done");
  graceful_restart_state = GRS_DONE;

  WALK_LIST2(pr, nn, proto_list, glob_node)
  {
    if (pr->gr_recovery)
    {
      /* Resume postponed export of routes */
      if (pr->gr_wait && (pr->proto_state == PS_UP))
      {
        proto_want_export_up(pr);
        proto_log_state_change(pr);
      }

      /* Cleanup */
      pr->gr_recovery = 0;
      pr->gr_wait = 0;
      pr->gr_lock = 0;
    }
  }

  graceful_restart_locks = 0;
}
/*
 * Print the graceful restart status to the CLI: the number of locks still held
 * and the remaining/configured wait time. Nothing is printed unless the
 * recovery is in its active phase.
 */
void
graceful_restart_show_status(void)
{
  if (graceful_restart_state == GRS_ACTIVE)
  {
    cli_msg(-24, "Graceful restart recovery in progress");
    cli_msg(-24, " Waiting for %d protocols to recover", graceful_restart_locks);
    cli_msg(-24, " Wait timer is %d/%d", tm_remains(gr_wait_timer), config->gr_wait);
  }
}
/**
 * proto_graceful_restart_lock - lock graceful restart by protocol
 * @p: protocol instance
 *
 * A protocol calls this to postpone the end of graceful restart recovery until
 * it converges. The lock is released when the protocol calls
 * proto_graceful_restart_unlock() or when the protocol is stopped. Locking an
 * already locked protocol is a no-op.
 *
 * The function has to be called during the initial phase of graceful restart
 * recovery and only for protocols that are part of graceful restart (i.e. their
 * @gr_recovery is set), which means it should be called from protocol start
 * hooks.
 */
void
proto_graceful_restart_lock(struct proto *p)
{
  ASSERT(graceful_restart_state == GRS_INIT);
  ASSERT(p->gr_recovery);

  if (!p->gr_lock)
  {
    p->gr_lock = 1;
    graceful_restart_locks++;
  }
}
/**
 * proto_graceful_restart_unlock - unlock graceful restart by protocol
 * @p: protocol instance
 *
 * Releases a lock taken by proto_graceful_restart_lock(); does nothing when @p
 * holds no lock. It is also called automatically when the lock holding protocol
 * goes down. When the last lock is released during the active phase of the
 * recovery, the wait timer is restarted with a zero timeout.
 */
void
proto_graceful_restart_unlock(struct proto *p)
{
  if (p->gr_lock)
  {
    p->gr_lock = 0;
    graceful_restart_locks--;

    /* Last lock gone: let the wait timer (hook graceful_restart_done()) expire */
    if (!graceful_restart_locks && (graceful_restart_state == GRS_ACTIVE))
      tm_start(gr_wait_timer, 0);
  }
}
/** /**
* protos_dump_all - dump status of all protocols * protos_dump_all - dump status of all protocols
* *
@ -753,41 +943,23 @@ protos_build(void)
proto_shutdown_timer->hook = proto_shutdown_loop; proto_shutdown_timer->hook = proto_shutdown_loop;
} }
static void
proto_fell_down(struct proto *p)
{
DBG("Protocol %s down\n", p->name);
u32 all_routes = p->stats.imp_routes + p->stats.filt_routes;
if (all_routes != 0)
log(L_ERR "Protocol %s is down but still has %d routes", p->name, all_routes);
bzero(&p->stats, sizeof(struct proto_stats));
proto_free_ahooks(p);
if (! p->proto->multitable)
rt_unlock_table(p->table);
if (p->proto->cleanup)
p->proto->cleanup(p);
proto_rethink_goal(p);
}
static void static void
proto_feed_more(void *P) proto_feed_more(void *P)
{ {
struct proto *p = P; struct proto *p = P;
if (p->core_state != FS_FEEDING) if (p->export_state != ES_FEEDING)
return; return;
DBG("Feeding protocol %s continued\n", p->name); DBG("Feeding protocol %s continued\n", p->name);
if (rt_feed_baby(p)) if (rt_feed_baby(p))
{ {
p->core_state = FS_HAPPY; DBG("Feeding protocol %s finished\n", p->name);
proto_relink(p); p->export_state = ES_READY;
DBG("Protocol %s up and running\n", p->name); proto_log_state_change(p);
if (p->feed_done)
p->feed_done(p);
} }
else else
{ {
@ -801,7 +973,7 @@ proto_feed_initial(void *P)
{ {
struct proto *p = P; struct proto *p = P;
if (p->core_state != FS_FEEDING) if (p->export_state != ES_FEEDING)
return; return;
DBG("Feeding protocol %s\n", p->name); DBG("Feeding protocol %s\n", p->name);
@ -814,40 +986,10 @@ static void
proto_schedule_feed(struct proto *p, int initial) proto_schedule_feed(struct proto *p, int initial)
{ {
DBG("%s: Scheduling meal\n", p->name); DBG("%s: Scheduling meal\n", p->name);
p->core_state = FS_FEEDING;
p->export_state = ES_FEEDING;
p->refeeding = !initial; p->refeeding = !initial;
/* FIXME: This should be changed for better support of multitable protos */
if (!initial)
{
struct announce_hook *ah;
for (ah = p->ahooks; ah; ah = ah->next)
proto_reset_limit(ah->out_limit);
/* Hack: reset exp_routes during refeed, and do not decrease it later */
p->stats.exp_routes = 0;
}
/* Connect protocol to routing table */
if (initial && !p->proto->multitable)
{
p->main_source = rt_get_source(p, 0);
rt_lock_source(p->main_source);
p->main_ahook = proto_add_announce_hook(p, p->table, &p->stats);
p->main_ahook->in_filter = p->cf->in_filter;
p->main_ahook->out_filter = p->cf->out_filter;
p->main_ahook->rx_limit = p->cf->rx_limit;
p->main_ahook->in_limit = p->cf->in_limit;
p->main_ahook->out_limit = p->cf->out_limit;
p->main_ahook->in_keep_filtered = p->cf->in_keep_filtered;
proto_reset_limit(p->main_ahook->rx_limit);
proto_reset_limit(p->main_ahook->in_limit);
proto_reset_limit(p->main_ahook->out_limit);
}
proto_relink(p);
p->attn->hook = initial ? proto_feed_initial : proto_feed_more; p->attn->hook = initial ? proto_feed_initial : proto_feed_more;
ev_schedule(p->attn); ev_schedule(p->attn);
} }
@ -877,7 +1019,7 @@ proto_schedule_flush_loop(void)
{ {
p->flushing = 1; p->flushing = 1;
for (h=p->ahooks; h; h=h->next) for (h=p->ahooks; h; h=h->next)
h->table->prune_state = 1; rt_mark_for_prune(h->table);
} }
ev_schedule(proto_flush_event); ev_schedule(proto_flush_event);
@ -910,6 +1052,7 @@ proto_flush_loop(void *unused UNUSED)
p->flushing = 0; p->flushing = 0;
p->core_state = FS_HUNGRY; p->core_state = FS_HUNGRY;
proto_relink(p); proto_relink(p);
proto_log_state_change(p);
if (p->proto_state == PS_DOWN) if (p->proto_state == PS_DOWN)
proto_fell_down(p); proto_fell_down(p);
goto again; goto again;
@ -921,19 +1064,6 @@ proto_flush_loop(void *unused UNUSED)
proto_schedule_flush_loop(); proto_schedule_flush_loop();
} }
static void
proto_schedule_flush(struct proto *p)
{
/* Need to abort feeding */
if (p->core_state == FS_FEEDING)
rt_feed_baby_abort(p);
DBG("%s: Scheduling flush\n", p->name);
p->core_state = FS_FLUSHING;
proto_relink(p);
proto_unlink_ahooks(p);
proto_schedule_flush_loop();
}
/* Temporary hack to propagate restart to BGP */ /* Temporary hack to propagate restart to BGP */
int proto_restart; int proto_restart;
@ -980,9 +1110,9 @@ proto_schedule_down(struct proto *p, byte restart, byte code)
* *
* Sometimes it is needed to send again all routes to the * Sometimes it is needed to send again all routes to the
* protocol. This is called feeding and can be requested by this * protocol. This is called feeding and can be requested by this
* function. This would cause protocol core state transition * function. This would cause protocol export state transition
* to FS_FEEDING (during feeding) and when completed, it will * to ES_FEEDING (during feeding) and when completed, it will
* switch back to FS_HAPPY. This function can be called even * switch back to ES_READY. This function can be called even
* when feeding is already running, in that case it is restarted. * when feeding is already running, in that case it is restarted.
*/ */
void void
@ -990,8 +1120,12 @@ proto_request_feeding(struct proto *p)
{ {
ASSERT(p->proto_state == PS_UP); ASSERT(p->proto_state == PS_UP);
/* Do nothing if we are still waiting for feeding */
if (p->export_state == ES_DOWN)
return;
/* If we are already feeding, we want to restart it */ /* If we are already feeding, we want to restart it */
if (p->core_state == FS_FEEDING) if (p->export_state == ES_FEEDING)
{ {
/* Unless feeding is in initial state */ /* Unless feeding is in initial state */
if (p->attn->hook == proto_feed_initial) if (p->attn->hook == proto_feed_initial)
@ -1000,7 +1134,16 @@ proto_request_feeding(struct proto *p)
rt_feed_baby_abort(p); rt_feed_baby_abort(p);
} }
/* FIXME: This should be changed for better support of multitable protos */
struct announce_hook *ah;
for (ah = p->ahooks; ah; ah = ah->next)
proto_reset_limit(ah->out_limit);
/* Hack: reset exp_routes during refeed, and do not decrease it later */
p->stats.exp_routes = 0;
proto_schedule_feed(p, 0); proto_schedule_feed(p, 0);
proto_log_state_change(p);
} }
static const char * static const char *
@ -1060,6 +1203,106 @@ proto_notify_limit(struct announce_hook *ah, struct proto_limit *l, int dir, u32
} }
} }
static void
proto_want_core_up(struct proto *p)
{
ASSERT(p->core_state == FS_HUNGRY);
if (!p->proto->multitable)
{
p->main_source = rt_get_source(p, 0);
rt_lock_source(p->main_source);
/* Connect protocol to routing table */
p->main_ahook = proto_add_announce_hook(p, p->table, &p->stats);
p->main_ahook->in_filter = p->cf->in_filter;
p->main_ahook->out_filter = p->cf->out_filter;
p->main_ahook->rx_limit = p->cf->rx_limit;
p->main_ahook->in_limit = p->cf->in_limit;
p->main_ahook->out_limit = p->cf->out_limit;
p->main_ahook->in_keep_filtered = p->cf->in_keep_filtered;
proto_reset_limit(p->main_ahook->rx_limit);
proto_reset_limit(p->main_ahook->in_limit);
proto_reset_limit(p->main_ahook->out_limit);
}
p->core_state = FS_HAPPY;
proto_relink(p);
}
static void
proto_want_export_up(struct proto *p)
{
ASSERT(p->core_state == CS_HAPPY);
ASSERT(p->export_state == ES_DOWN);
proto_link_ahooks(p);
proto_schedule_feed(p, 1); /* Sets ES_FEEDING */
}
/*
 * Stop route export to protocol @p.
 *
 * Must not be called when export is already down. A feed that is still in
 * progress is aborted first; then the export state becomes ES_DOWN and
 * proto_unlink_ahooks() is called.
 */
static void
proto_want_export_down(struct proto *p)
{
  ASSERT(p->export_state != ES_DOWN);

  const int feeding = (p->export_state == ES_FEEDING);

  /* Need to abort feeding */
  if (feeding)
    rt_feed_baby_abort(p);

  p->export_state = ES_DOWN;
  proto_unlink_ahooks(p);
}
static void
proto_want_core_down(struct proto *p)
{
ASSERT(p->core_state == CS_HAPPY);
ASSERT(p->export_state == ES_DOWN);
p->core_state = FS_FLUSHING;
proto_relink(p);
proto_schedule_flush_loop();
if (!p->proto->multitable)
{
rt_unlock_source(p->main_source);
p->main_source = NULL;
}
}
/*
 * Clear the graceful restart state of protocol @p when it goes down.
 *
 * Both gr_recovery and gr_wait are cleared; if the protocol still holds the
 * graceful restart lock (gr_lock), proto_graceful_restart_unlock() is called.
 */
static void
proto_falling_down(struct proto *p)
{
  p->gr_recovery = p->gr_wait = 0;

  if (!p->gr_lock)
    return;

  proto_graceful_restart_unlock(p);
}
static void
proto_fell_down(struct proto *p)
{
DBG("Protocol %s down\n", p->name);
u32 all_routes = p->stats.imp_routes + p->stats.filt_routes;
if (all_routes != 0)
log(L_ERR "Protocol %s is down but still has %d routes", p->name, all_routes);
bzero(&p->stats, sizeof(struct proto_stats));
proto_free_ahooks(p);
if (! p->proto->multitable)
rt_unlock_table(p->table);
if (p->proto->cleanup)
p->proto->cleanup(p);
proto_rethink_goal(p);
}
/** /**
* proto_notify_state - notify core about protocol state change * proto_notify_state - notify core about protocol state change
* @p: protocol the state of which has changed * @p: protocol the state of which has changed
@ -1079,6 +1322,7 @@ proto_notify_state(struct proto *p, unsigned ps)
{ {
unsigned ops = p->proto_state; unsigned ops = p->proto_state;
unsigned cs = p->core_state; unsigned cs = p->core_state;
unsigned es = p->export_state;
DBG("%s reporting state transition %s/%s -> */%s\n", p->name, c_states[cs], p_states[ops], p_states[ps]); DBG("%s reporting state transition %s/%s -> */%s\n", p->name, c_states[cs], p_states[ops], p_states[ps]);
if (ops == ps) if (ops == ps)
@ -1089,17 +1333,47 @@ proto_notify_state(struct proto *p, unsigned ps)
switch (ps) switch (ps)
{ {
case PS_START:
ASSERT(ops == PS_DOWN || ops == PS_UP);
ASSERT(cs == FS_HUNGRY || cs == FS_HAPPY);
if (es != ES_DOWN)
proto_want_export_down(p);
break;
case PS_UP:
ASSERT(ops == PS_DOWN || ops == PS_START);
ASSERT(cs == FS_HUNGRY || cs == FS_HAPPY);
ASSERT(es == ES_DOWN);
if (cs == FS_HUNGRY)
proto_want_core_up(p);
if (!p->gr_wait)
proto_want_export_up(p);
break;
case PS_STOP:
ASSERT(ops == PS_START || ops == PS_UP);
p->down_sched = 0;
if (es != ES_DOWN)
proto_want_export_down(p);
if (cs == FS_HAPPY)
proto_want_core_down(p);
proto_falling_down(p);
break;
case PS_DOWN: case PS_DOWN:
p->down_code = 0; p->down_code = 0;
p->down_sched = 0; p->down_sched = 0;
if ((cs == FS_FEEDING) || (cs == FS_HAPPY))
proto_schedule_flush(p);
if (!p->proto->multitable) if (es != ES_DOWN)
{ proto_want_export_down(p);
rt_unlock_source(p->main_source); if (cs == FS_HAPPY)
p->main_source = NULL; proto_want_core_down(p);
} if (ops != PS_STOP)
proto_falling_down(p);
neigh_prune(); // FIXME convert neighbors to resource? neigh_prune(); // FIXME convert neighbors to resource?
rfree(p->pool); rfree(p->pool);
@ -1107,27 +1381,17 @@ proto_notify_state(struct proto *p, unsigned ps)
if (cs == FS_HUNGRY) /* Shutdown finished */ if (cs == FS_HUNGRY) /* Shutdown finished */
{ {
proto_log_state_change(p);
proto_fell_down(p); proto_fell_down(p);
return; /* The protocol might have ceased to exist */ return; /* The protocol might have ceased to exist */
} }
break; break;
case PS_START:
ASSERT(ops == PS_DOWN);
ASSERT(cs == FS_HUNGRY);
break;
case PS_UP:
ASSERT(ops == PS_DOWN || ops == PS_START);
ASSERT(cs == FS_HUNGRY);
proto_schedule_feed(p, 1);
break;
case PS_STOP:
p->down_sched = 0;
if ((cs == FS_FEEDING) || (cs == FS_HAPPY))
proto_schedule_flush(p);
break;
default: default:
bug("Invalid state transition for %s from %s/%s to */%s", p->name, c_states[cs], p_states[ops], p_states[ps]); bug("%s: Invalid state %d", p->name, ps);
} }
proto_log_state_change(p);
} }
/* /*
@ -1141,12 +1405,18 @@ proto_state_name(struct proto *p)
switch (P(p->proto_state, p->core_state)) switch (P(p->proto_state, p->core_state))
{ {
case P(PS_DOWN, FS_HUNGRY): return "down"; case P(PS_DOWN, FS_HUNGRY): return "down";
case P(PS_START, FS_HUNGRY): return "start"; case P(PS_START, FS_HUNGRY):
case P(PS_UP, FS_HUNGRY): case P(PS_START, FS_HAPPY): return "start";
case P(PS_UP, FS_FEEDING): return "feed"; case P(PS_UP, FS_HAPPY):
case P(PS_STOP, FS_HUNGRY): return "stop"; switch (p->export_state)
case P(PS_UP, FS_HAPPY): return "up"; {
case P(PS_STOP, FS_FLUSHING): case ES_DOWN: return "wait";
case ES_FEEDING: return "feed";
case ES_READY: return "up";
default: return "???";
}
case P(PS_STOP, FS_HUNGRY):
case P(PS_STOP, FS_FLUSHING): return "stop";
case P(PS_DOWN, FS_FLUSHING): return "flush"; case P(PS_DOWN, FS_FLUSHING): return "flush";
default: return "???"; default: return "???";
} }
@ -1196,6 +1466,11 @@ proto_show_basic_info(struct proto *p)
cli_msg(-1006, " Input filter: %s", filter_name(p->cf->in_filter)); cli_msg(-1006, " Input filter: %s", filter_name(p->cf->in_filter));
cli_msg(-1006, " Output filter: %s", filter_name(p->cf->out_filter)); cli_msg(-1006, " Output filter: %s", filter_name(p->cf->out_filter));
if (graceful_restart_state == GRS_ACTIVE)
cli_msg(-1006, " GR recovery: %s%s",
p->gr_lock ? " pending" : "",
p->gr_wait ? " waiting" : "");
proto_show_limit(p->cf->rx_limit, "Receive limit:"); proto_show_limit(p->cf->rx_limit, "Receive limit:");
proto_show_limit(p->cf->in_limit, "Import limit:"); proto_show_limit(p->cf->in_limit, "Import limit:");
proto_show_limit(p->cf->out_limit, "Export limit:"); proto_show_limit(p->cf->out_limit, "Export limit:");

View file

@ -148,10 +148,13 @@ struct proto {
byte disabled; /* Manually disabled */ byte disabled; /* Manually disabled */
byte proto_state; /* Protocol state machine (PS_*, see below) */ byte proto_state; /* Protocol state machine (PS_*, see below) */
byte core_state; /* Core state machine (FS_*, see below) */ byte core_state; /* Core state machine (FS_*, see below) */
byte core_goal; /* State we want to reach (FS_*, see below) */ byte export_state; /* Route export state (ES_*, see below) */
byte reconfiguring; /* We're shutting down due to reconfiguration */ byte reconfiguring; /* We're shutting down due to reconfiguration */
byte refeeding; /* We are refeeding (valid only if core_state == FS_FEEDING) */ byte refeeding; /* We are refeeding (valid only if export_state == ES_FEEDING) */
byte flushing; /* Protocol is flushed in current flush loop round */ byte flushing; /* Protocol is flushed in current flush loop round */
byte gr_recovery; /* Protocol should participate in graceful restart recovery */
byte gr_lock; /* Graceful restart mechanism should wait for this proto */
byte gr_wait; /* Route export to protocol is postponed until graceful restart */
byte down_sched; /* Shutdown is scheduled for later (PDS_*) */ byte down_sched; /* Shutdown is scheduled for later (PDS_*) */
byte down_code; /* Reason for shutdown (PDC_* codes) */ byte down_code; /* Reason for shutdown (PDC_* codes) */
u32 hash_key; /* Random key used for hashing of neighbors */ u32 hash_key; /* Random key used for hashing of neighbors */
@ -175,6 +178,7 @@ struct proto {
* reload_routes Request protocol to reload all its routes to the core * reload_routes Request protocol to reload all its routes to the core
* (using rte_update()). Returns: 0=reload cannot be done, * (using rte_update()). Returns: 0=reload cannot be done,
* 1= reload is scheduled and will happen (asynchronously). * 1= reload is scheduled and will happen (asynchronously).
* feed_done Notify protocol about finish of route feeding.
*/ */
void (*if_notify)(struct proto *, unsigned flags, struct iface *i); void (*if_notify)(struct proto *, unsigned flags, struct iface *i);
@ -185,6 +189,7 @@ struct proto {
void (*store_tmp_attrs)(struct rte *rt, struct ea_list *attrs); void (*store_tmp_attrs)(struct rte *rt, struct ea_list *attrs);
int (*import_control)(struct proto *, struct rte **rt, struct ea_list **attrs, struct linpool *pool); int (*import_control)(struct proto *, struct rte **rt, struct ea_list **attrs, struct linpool *pool);
int (*reload_routes)(struct proto *); int (*reload_routes)(struct proto *);
void (*feed_done)(struct proto *);
/* /*
* Routing entry hooks (called only for routes belonging to this protocol): * Routing entry hooks (called only for routes belonging to this protocol):
@ -242,6 +247,13 @@ static inline void
proto_copy_rest(struct proto_config *dest, struct proto_config *src, unsigned size) proto_copy_rest(struct proto_config *dest, struct proto_config *src, unsigned size)
{ memcpy(dest + 1, src + 1, size - sizeof(struct proto_config)); } { memcpy(dest + 1, src + 1, size - sizeof(struct proto_config)); }
/* Graceful restart entry points; implementations are not part of this header */
void graceful_restart_recovery(void);
void graceful_restart_init(void);
void graceful_restart_show_status(void);
/* Per-protocol graceful restart lock; presumably paired with the gr_lock flag in struct proto -- confirm against the implementation */
void proto_graceful_restart_lock(struct proto *p);
void proto_graceful_restart_unlock(struct proto *p);
/* Default value of the config gr_wait option (graceful restart wait timeout); unit presumably seconds -- TODO confirm */
#define DEFAULT_GR_WAIT 240
void proto_show_limit(struct proto_limit *l, const char *dsc); void proto_show_limit(struct proto_limit *l, const char *dsc);
void proto_show_basic_info(struct proto *p); void proto_show_basic_info(struct proto *p);
@ -343,10 +355,17 @@ void proto_notify_state(struct proto *p, unsigned state);
* as a result of received ROUTE-REFRESH request). * as a result of received ROUTE-REFRESH request).
*/ */
#define FS_HUNGRY 0 #define FS_HUNGRY 0
#define FS_FEEDING 1 #define FS_FEEDING 1 /* obsolete */
#define FS_HAPPY 2 #define FS_HAPPY 2
#define FS_FLUSHING 3 #define FS_FLUSHING 3
/* Route export states, stored in the export_state field of struct proto */
#define ES_DOWN 0 /* Routes are not exported to the protocol */
#define ES_FEEDING 1 /* Routes are being fed to the protocol */
#define ES_READY 2 /* Presumably: feeding is finished and export runs normally -- confirm where the state is set */
/* /*
* Debugging flags * Debugging flags

View file

@ -148,6 +148,10 @@ typedef struct rtable {
struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */
} rtable; } rtable;
/* Routing table prune states, stored in the prune_state field of rtable */
#define RPS_NONE 0 /* No pruning requested */
#define RPS_SCHEDULED 1 /* Pruning requested, the prune iterator is not initialized yet */
#define RPS_RUNNING 2 /* Pruning in progress, the prune iterator holds the position */
typedef struct network { typedef struct network {
struct fib_node n; /* FIB flags reserved for kernel syncer */ struct fib_node n; /* FIB flags reserved for kernel syncer */
struct rte *routes; /* Available routes for this network */ struct rte *routes; /* Available routes for this network */
@ -222,6 +226,8 @@ typedef struct rte {
#define REF_COW 1 /* Copy this rte on write */ #define REF_COW 1 /* Copy this rte on write */
#define REF_FILTERED 2 /* Route is rejected by import filter */ #define REF_FILTERED 2 /* Route is rejected by import filter */
#define REF_STALE 4 /* Route is stale in a refresh cycle */
#define REF_DISCARD 8 /* Route is scheduled for discard */
/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ /* Route is valid for propagation (may depend on other flags in the future), accepts NULL */
static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); } static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); }
@ -257,6 +263,8 @@ void rte_update2(struct announce_hook *ah, net *net, rte *new, struct rte_src *s
static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2(p->main_ahook, net, new, p->main_source); } static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2(p->main_ahook, net, new, p->main_source); }
void rte_discard(rtable *tab, rte *old); void rte_discard(rtable *tab, rte *old);
int rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter *filter); int rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter *filter);
void rt_refresh_begin(rtable *t, struct announce_hook *ah);
void rt_refresh_end(rtable *t, struct announce_hook *ah);
void rte_dump(rte *); void rte_dump(rte *);
void rte_free(rte *); void rte_free(rte *);
rte *rte_do_cow(rte *); rte *rte_do_cow(rte *);
@ -268,6 +276,15 @@ void rt_feed_baby_abort(struct proto *p);
int rt_prune_loop(void); int rt_prune_loop(void);
struct rtable_config *rt_new_table(struct symbol *s); struct rtable_config *rt_new_table(struct symbol *s);
/*
 * Mark table @tab so that its next prune pass starts from the beginning.
 *
 * When a prune pass is already running, its iterator is first taken back with
 * fit_get() (presumably detaching it from the FIB -- confirm). The state is
 * then set to RPS_SCHEDULED. Scheduling of the table event is left to the
 * caller.
 */
static inline void
rt_mark_for_prune(rtable *tab)
{
  const int running = (tab->prune_state == RPS_RUNNING);

  if (running)
    {
      fit_get(&tab->fib, &tab->prune_fit);
    }

  tab->prune_state = RPS_SCHEDULED;
}
struct rt_show_data { struct rt_show_data {
ip_addr prefix; ip_addr prefix;
unsigned pxlen; unsigned pxlen;

View file

@ -55,8 +55,10 @@ static void rt_free_hostcache(rtable *tab);
static void rt_notify_hostcache(rtable *tab, net *net); static void rt_notify_hostcache(rtable *tab, net *net);
static void rt_update_hostcache(rtable *tab); static void rt_update_hostcache(rtable *tab);
static void rt_next_hop_update(rtable *tab); static void rt_next_hop_update(rtable *tab);
static inline int rt_prune_table(rtable *tab);
static inline void rt_schedule_gc(rtable *tab); static inline void rt_schedule_gc(rtable *tab);
static inline void rt_schedule_prune(rtable *tab);
static inline struct ea_list * static inline struct ea_list *
make_tmp_attrs(struct rte *rt, struct linpool *pool) make_tmp_attrs(struct rte *rt, struct linpool *pool)
@ -570,7 +572,7 @@ rte_announce(rtable *tab, unsigned type, net *net, rte *new, rte *old, rte *befo
struct announce_hook *a; struct announce_hook *a;
WALK_LIST(a, tab->hooks) WALK_LIST(a, tab->hooks)
{ {
ASSERT(a->proto->core_state == FS_HAPPY || a->proto->core_state == FS_FEEDING); ASSERT(a->proto->export_state != ES_DOWN);
if (a->proto->accept_ra_types == type) if (a->proto->accept_ra_types == type)
if (type == RA_ACCEPTED) if (type == RA_ACCEPTED)
rt_notify_accepted(a, net, new, old, before_old, tmpa, 0); rt_notify_accepted(a, net, new, old, before_old, tmpa, 0);
@ -1108,6 +1110,69 @@ rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter
return v > 0; return v > 0;
} }
/**
 * rt_refresh_begin - start a refresh cycle
 * @t: related routing table
 * @ah: related announce hook
 *
 * This function opens a refresh cycle for the given routing table and
 * announce hook. During the cycle the protocol is expected to send all its
 * valid routes to the table again (by rte_update()). Routes with @ah as
 * @sender that are still in the table from the past but were not sent during
 * the cycle are pruned afterwards. The implementation marks all related
 * routes with the REF_STALE flag here, rt_refresh_end() marks the related
 * routes that are still stale with REF_DISCARD, and the prune loop removes
 * such routes.
 */
void
rt_refresh_begin(rtable *t, struct announce_hook *ah)
{
  FIB_WALK(&t->fib, fn)
    {
      rte *r = ((net *) fn)->routes;

      /* Flag every route announced through this hook as stale */
      while (r)
	{
	  if (r->sender == ah)
	    r->flags |= REF_STALE;

	  r = r->next;
	}
    }
  FIB_WALK_END;
}
/**
 * rt_refresh_end - end a refresh cycle
 * @t: related routing table
 * @ah: related announce hook
 *
 * This function ends a refresh cycle for the given routing table and announce
 * hook. Every route with @ah as @sender that still carries the REF_STALE flag
 * is marked with REF_DISCARD, and if at least one such route was found, the
 * table prune is scheduled. See rt_refresh_begin() for description of refresh
 * cycles.
 */
void
rt_refresh_end(rtable *t, struct announce_hook *ah)
{
  int found = 0;

  FIB_WALK(&t->fib, fn)
    {
      rte *r;

      for (r = ((net *) fn)->routes; r; r = r->next)
	{
	  /* Skip routes of other hooks and routes that are not stale */
	  if ((r->sender != ah) || !(r->flags & REF_STALE))
	    continue;

	  r->flags |= REF_DISCARD;
	  found = 1;
	}
    }
  FIB_WALK_END;

  if (found)
    rt_schedule_prune(t);
}
/** /**
* rte_dump - dump a route * rte_dump - dump a route
* @e: &rte to be dumped * @e: &rte to be dumped
@ -1169,6 +1234,13 @@ rt_dump_all(void)
rt_dump(t); rt_dump(t);
} }
/*
 * Request pruning of table @tab: mark the table for prune and schedule the
 * table event.
 */
static inline void
rt_schedule_prune(rtable *tab)
{
  /* State change first, then schedule the event */
  rt_mark_for_prune(tab);

  ev_schedule(tab->rt_event);
}
static inline void static inline void
rt_schedule_gc(rtable *tab) rt_schedule_gc(rtable *tab)
{ {
@ -1199,6 +1271,7 @@ rt_schedule_nhu(rtable *tab)
tab->nhu_state |= 1; tab->nhu_state |= 1;
} }
static void static void
rt_prune_nets(rtable *tab) rt_prune_nets(rtable *tab)
{ {
@ -1242,6 +1315,14 @@ rt_event(void *ptr)
if (tab->nhu_state) if (tab->nhu_state)
rt_next_hop_update(tab); rt_next_hop_update(tab);
if (tab->prune_state)
if (!rt_prune_table(tab))
{
/* Table prune unfinished */
ev_schedule(tab->rt_event);
return;
}
if (tab->gc_scheduled) if (tab->gc_scheduled)
{ {
rt_prune_nets(tab); rt_prune_nets(tab);
@ -1283,8 +1364,8 @@ rt_init(void)
} }
static inline int static int
rt_prune_step(rtable *tab, int step, int *max_feed) rt_prune_step(rtable *tab, int step, int *limit)
{ {
static struct rate_limit rl_flush; static struct rate_limit rl_flush;
struct fib_iterator *fit = &tab->prune_fit; struct fib_iterator *fit = &tab->prune_fit;
@ -1294,13 +1375,13 @@ rt_prune_step(rtable *tab, int step, int *max_feed)
fib_check(&tab->fib); fib_check(&tab->fib);
#endif #endif
if (tab->prune_state == 0) if (tab->prune_state == RPS_NONE)
return 1; return 1;
if (tab->prune_state == 1) if (tab->prune_state == RPS_SCHEDULED)
{ {
FIB_ITERATE_INIT(fit, &tab->fib); FIB_ITERATE_INIT(fit, &tab->fib);
tab->prune_state = 2; tab->prune_state = RPS_RUNNING;
} }
again: again:
@ -1312,9 +1393,10 @@ again:
rescan: rescan:
for (e=n->routes; e; e=e->next) for (e=n->routes; e; e=e->next)
if (e->sender->proto->flushing || if (e->sender->proto->flushing ||
(e->flags & REF_DISCARD) ||
(step && e->attrs->src->proto->flushing)) (step && e->attrs->src->proto->flushing))
{ {
if (*max_feed <= 0) if (*limit <= 0)
{ {
FIB_ITERATE_PUT(fit, fn); FIB_ITERATE_PUT(fit, fn);
return 0; return 0;
@ -1325,7 +1407,7 @@ again:
n->n.prefix, n->n.pxlen, e->attrs->src->proto->name, tab->name); n->n.prefix, n->n.pxlen, e->attrs->src->proto->name, tab->name);
rte_discard(tab, e); rte_discard(tab, e);
(*max_feed)--; (*limit)--;
goto rescan; goto rescan;
} }
@ -1342,41 +1424,60 @@ again:
fib_check(&tab->fib); fib_check(&tab->fib);
#endif #endif
tab->prune_state = 0; tab->prune_state = RPS_NONE;
return 1; return 1;
} }
/**
 * rt_prune_table - prune a routing table
 * @tab: routing table to prune
 *
 * This function runs one bounded prune pass over the routing table @tab using
 * rt_prune_step() with step 0, so routes whose sender is flushing and routes
 * marked for discard are removed. At most 512 routes are removed per call.
 * Returns 1 when the pass is complete and 0 when the limit was reached and
 * the function has to be called again. Unlike rt_prune_loop(), this function
 * is not a part of the protocol flushing loop; it is called from rt_event()
 * for just one routing table.
 *
 * Note that rt_prune_table() and rt_prune_loop() share (for each table) the
 * prune state (@prune_state) and also the pruning iterator (@prune_fit).
 */
static inline int
rt_prune_table(rtable *tab)
{
  /* Number of route removals allowed in this call */
  int budget = 512;

  return rt_prune_step(tab, 0, &budget);
}
/** /**
* rt_prune_loop - prune routing tables * rt_prune_loop - prune routing tables
* *
* The prune loop scans routing tables and removes routes belonging to * The prune loop scans routing tables and removes routes belonging to flushing
* flushing protocols and also stale network entries. Returns 1 when * protocols, discarded routes and also stale network entries. Returns 1 when
* all such routes are pruned. It is a part of the protocol flushing * all such routes are pruned. It is a part of the protocol flushing loop.
* loop.
* *
* The prune loop runs in two steps. In the first step it prunes just * The prune loop runs in two steps. In the first step it prunes just the routes
* the routes with flushing senders (in explicitly marked tables) so * with flushing senders (in explicitly marked tables) so the route removal is
* the route removal is propagated as usual. In the second step, all * propagated as usual. In the second step, all remaining relevant routes are
* remaining relevant routes are removed. Ideally, there shouldn't be * removed. Ideally, there shouldn't be any, but it happens when pipe filters
* any, but it happens when pipe filters are changed. * are changed.
*/ */
int int
rt_prune_loop(void) rt_prune_loop(void)
{ {
static int step = 0; static int step = 0;
int max_feed = 512; int limit = 512;
rtable *t; rtable *t;
again: again:
WALK_LIST(t, routing_tables) WALK_LIST(t, routing_tables)
if (! rt_prune_step(t, step, &max_feed)) if (! rt_prune_step(t, step, &limit))
return 0; return 0;
if (step == 0) if (step == 0)
{ {
/* Prepare for the second step */ /* Prepare for the second step */
WALK_LIST(t, routing_tables) WALK_LIST(t, routing_tables)
t->prune_state = 1; t->prune_state = RPS_SCHEDULED;
step = 1; step = 1;
goto again; goto again;
@ -1721,7 +1822,7 @@ again:
(p->accept_ra_types == RA_ACCEPTED)) (p->accept_ra_types == RA_ACCEPTED))
if (rte_is_valid(e)) if (rte_is_valid(e))
{ {
if (p->core_state != FS_FEEDING) if (p->export_state != ES_FEEDING)
return 1; /* In the meantime, the protocol fell down. */ return 1; /* In the meantime, the protocol fell down. */
do_feed_baby(p, p->accept_ra_types, h, n, e); do_feed_baby(p, p->accept_ra_types, h, n, e);
max_feed--; max_feed--;
@ -1730,7 +1831,7 @@ again:
if (p->accept_ra_types == RA_ANY) if (p->accept_ra_types == RA_ANY)
for(e = n->routes; rte_is_valid(e); e = e->next) for(e = n->routes; rte_is_valid(e); e = e->next)
{ {
if (p->core_state != FS_FEEDING) if (p->export_state != ES_FEEDING)
return 1; /* In the meantime, the protocol fell down. */ return 1; /* In the meantime, the protocol fell down. */
do_feed_baby(p, RA_ANY, h, n, e); do_feed_baby(p, RA_ANY, h, n, e);
max_feed--; max_feed--;
@ -2223,9 +2324,7 @@ rt_show_cont(struct cli *c)
cli_printf(c, 8004, "Stopped due to reconfiguration"); cli_printf(c, 8004, "Stopped due to reconfiguration");
goto done; goto done;
} }
if (d->export_protocol && if (d->export_protocol && (d->export_protocol->export_state == ES_DOWN))
d->export_protocol->core_state != FS_HAPPY &&
d->export_protocol->core_state != FS_FEEDING)
{ {
cli_printf(c, 8005, "Protocol is down"); cli_printf(c, 8005, "Protocol is down");
goto done; goto done;

View file

@ -51,6 +51,16 @@
* and bgp_encode_attrs() which does the converse. Both functions are built around a * and bgp_encode_attrs() which does the converse. Both functions are built around a
* @bgp_attr_table array describing all important characteristics of all known attributes. * @bgp_attr_table array describing all important characteristics of all known attributes.
* Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams. * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
*
* BGP protocol implements graceful restart in both restarting (local restart)
* and receiving (neighbor restart) roles. The first is handled mostly by the
* graceful restart code in the nest, BGP protocol just handles capabilities,
* sets @gr_wait and locks graceful restart until end-of-RIB mark is received.
* The second is implemented by internal restart of the BGP state to %BS_IDLE
* and protocol state to %PS_START, but keeping the protocol up from the core
* point of view and therefore maintaining received routes. Routing table
* refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
* stale routes after reestablishment of BGP session during graceful restart.
*/ */
#undef LOCAL_DEBUG #undef LOCAL_DEBUG
@ -319,6 +329,7 @@ bgp_decision(void *vp)
DBG("BGP: Decision start\n"); DBG("BGP: Decision start\n");
if ((p->p.proto_state == PS_START) if ((p->p.proto_state == PS_START)
&& (p->outgoing_conn.state == BS_IDLE) && (p->outgoing_conn.state == BS_IDLE)
&& (p->incoming_conn.state != BS_OPENCONFIRM)
&& (!p->cf->passive)) && (!p->cf->passive))
bgp_active(p); bgp_active(p);
@ -371,6 +382,20 @@ bgp_conn_enter_established_state(struct bgp_conn *conn)
bgp_init_bucket_table(p); bgp_init_bucket_table(p);
bgp_init_prefix_table(p, 8); bgp_init_prefix_table(p, 8);
int peer_gr_ready = conn->peer_gr_aware && !(conn->peer_gr_flags & BGP_GRF_RESTART);
if (p->p.gr_recovery && !peer_gr_ready)
proto_graceful_restart_unlock(&p->p);
if (p->p.gr_recovery && (p->cf->gr_mode == BGP_GR_ABLE) && peer_gr_ready)
p->p.gr_wait = 1;
if (p->gr_active)
tm_stop(p->gr_timer);
if (p->gr_active && (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING)))
bgp_graceful_restart_done(p);
bgp_conn_set_state(conn, BS_ESTABLISHED); bgp_conn_set_state(conn, BS_ESTABLISHED);
proto_notify_state(&p->p, PS_UP); proto_notify_state(&p->p, PS_UP);
} }
@ -416,16 +441,86 @@ bgp_conn_enter_idle_state(struct bgp_conn *conn)
bgp_conn_leave_established_state(p); bgp_conn_leave_established_state(p);
} }
/**
 * bgp_handle_graceful_restart - handle detected BGP graceful restart
 * @p: BGP instance
 *
 * This function is called when a graceful restart of the neighbor is detected
 * (the TCP connection fails or a new TCP connection appears) while the
 * session is established and @gr_ready is set. It moves the protocol state
 * back to %PS_START, closes the previous refresh cycle if a restart was
 * already pending, marks the restart as active, starts the BGP restart timer
 * with the restart time announced by the peer and opens a new routing table
 * refresh cycle. Changing the BGP state back to %BS_IDLE is left for the
 * caller.
 */
void
bgp_handle_graceful_restart(struct bgp_proto *p)
{
  ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);

  const char *note = p->gr_active ? " - already pending" : "";

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s", note);
  proto_notify_state(&p->p, PS_START);

  /* A restart on top of a pending one: finish the older refresh cycle */
  if (p->gr_active)
    {
      rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
    }

  p->gr_active = 1;
  bgp_start_timer(p->gr_timer, p->conn->peer_gr_time);
  rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
}
/**
* bgp_graceful_restart_done - finish active BGP graceful restart
* @p: BGP instance
*
* This function is called when the active BGP graceful restart of the neighbor
* should be finished - either successfully (the neighbor sends all paths and
* reports end-of-RIB on the new session) or unsuccessfully (the neighbor does
* not support BGP graceful restart on the new session). The function ends
* routing table refresh cycle and stops BGP restart timer.
*/
void
bgp_graceful_restart_done(struct bgp_proto *p)
{
BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");
p->gr_active = 0;
tm_stop(p->gr_timer);
rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
}
/**
 * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer'
 * @t: timer
 *
 * This function is the timeout hook for @gr_timer, which implements the BGP
 * restart time limit for reestablishment of the BGP session after a graceful
 * restart. When it fires, the event is traced and the protocol is stopped by
 * bgp_stop() with zero as its second argument.
 */
static void
bgp_graceful_restart_timeout(timer *t)
{
  /* The timer's data pointer is the owning BGP instance; BGP_TRACE may rely on the name p */
  struct bgp_proto *p = (struct bgp_proto *) t->data;

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
  bgp_stop(p, 0);
}
static void static void
bgp_send_open(struct bgp_conn *conn) bgp_send_open(struct bgp_conn *conn)
{ {
conn->start_state = conn->bgp->start_state; conn->start_state = conn->bgp->start_state;
// Default values, possibly changed by receiving capabilities. // Default values, possibly changed by receiving capabilities.
conn->advertised_as = 0;
conn->peer_refresh_support = 0; conn->peer_refresh_support = 0;
conn->peer_as4_support = 0; conn->peer_as4_support = 0;
conn->peer_add_path = 0; conn->peer_add_path = 0;
conn->advertised_as = 0; conn->peer_gr_aware = 0;
conn->peer_gr_able = 0;
conn->peer_gr_time = 0;
conn->peer_gr_flags = 0;
conn->peer_gr_aflags = 0;
DBG("BGP: Sending open\n"); DBG("BGP: Sending open\n");
conn->sk->rx_hook = bgp_rx; conn->sk->rx_hook = bgp_rx;
@ -484,6 +579,9 @@ bgp_sock_err(sock *sk, int err)
else else
BGP_TRACE(D_EVENTS, "Connection closed"); BGP_TRACE(D_EVENTS, "Connection closed");
if ((conn->state == BS_ESTABLISHED) && p->gr_ready)
bgp_handle_graceful_restart(p);
bgp_conn_enter_idle_state(conn); bgp_conn_enter_idle_state(conn);
} }
@ -649,6 +747,14 @@ bgp_incoming_connection(sock *sk, int dummy UNUSED)
int acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) && int acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
(p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk); (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
{
bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
bgp_handle_graceful_restart(p);
bgp_conn_enter_idle_state(p->conn);
acc = 1;
}
BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s", BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
sk->daddr, ipa_has_link_scope(sk->daddr) ? sk->iface : NULL, sk->daddr, ipa_has_link_scope(sk->daddr) ? sk->iface : NULL,
sk->dport, acc ? "accepted" : "rejected"); sk->dport, acc ? "accepted" : "rejected");
@ -817,6 +923,17 @@ bgp_reload_routes(struct proto *P)
return 1; return 1;
} }
/*
 * feed_done hook of the BGP protocol, called when route feeding finishes.
 *
 * Sets send_end_mark and schedules an UPDATE packet on the current
 * connection. Nothing is done when there is no connection, when gr_mode is
 * zero in the configuration, or when the finished feed was a refeed.
 */
static void
bgp_feed_done(struct proto *P)
{
  struct bgp_proto *p = (struct bgp_proto *) P;

  if (p->conn && p->cf->gr_mode && !p->p.refeeding)
    {
      p->send_end_mark = 1;
      bgp_schedule_packet(p->conn, PKT_UPDATE);
    }
}
static void static void
bgp_start_locked(struct object_lock *lock) bgp_start_locked(struct object_lock *lock)
{ {
@ -867,6 +984,8 @@ bgp_start(struct proto *P)
p->incoming_conn.state = BS_IDLE; p->incoming_conn.state = BS_IDLE;
p->neigh = NULL; p->neigh = NULL;
p->bfd_req = NULL; p->bfd_req = NULL;
p->gr_ready = 0;
p->gr_active = 0;
rt_lock_table(p->igp_table); rt_lock_table(p->igp_table);
@ -878,6 +997,10 @@ bgp_start(struct proto *P)
p->startup_timer->hook = bgp_startup_timeout; p->startup_timer->hook = bgp_startup_timeout;
p->startup_timer->data = p; p->startup_timer->data = p;
p->gr_timer = tm_new(p->p.pool);
p->gr_timer->hook = bgp_graceful_restart_timeout;
p->gr_timer->data = p;
p->local_id = proto_get_router_id(P->cf); p->local_id = proto_get_router_id(P->cf);
if (p->rr_client) if (p->rr_client)
p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id; p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;
@ -885,6 +1008,9 @@ bgp_start(struct proto *P)
p->remote_id = 0; p->remote_id = 0;
p->source_addr = p->cf->source_addr; p->source_addr = p->cf->source_addr;
if (p->p.gr_recovery && p->cf->gr_mode)
proto_graceful_restart_lock(P);
/* /*
* Before attempting to create the connection, we need to lock the * Before attempting to create the connection, we need to lock the
* port, so that are sure we're the only instance attempting to talk * port, so that are sure we're the only instance attempting to talk
@ -985,6 +1111,7 @@ bgp_init(struct proto_config *C)
P->import_control = bgp_import_control; P->import_control = bgp_import_control;
P->neigh_notify = bgp_neigh_notify; P->neigh_notify = bgp_neigh_notify;
P->reload_routes = bgp_reload_routes; P->reload_routes = bgp_reload_routes;
P->feed_done = bgp_feed_done;
P->rte_better = bgp_rte_better; P->rte_better = bgp_rte_better;
P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL; P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL;
@ -1164,7 +1291,7 @@ bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code)
static char *bgp_state_names[] = { "Idle", "Connect", "Active", "OpenSent", "OpenConfirm", "Established", "Close" }; static char *bgp_state_names[] = { "Idle", "Connect", "Active", "OpenSent", "OpenConfirm", "Established", "Close" };
static char *bgp_err_classes[] = { "", "Error: ", "Socket: ", "Received: ", "BGP Error: ", "Automatic shutdown: ", ""}; static char *bgp_err_classes[] = { "", "Error: ", "Socket: ", "Received: ", "BGP Error: ", "Automatic shutdown: ", ""};
static char *bgp_misc_errors[] = { "", "Neighbor lost", "Invalid next hop", "Kernel MD5 auth failed", "No listening socket", "BFD session down" }; static char *bgp_misc_errors[] = { "", "Neighbor lost", "Invalid next hop", "Kernel MD5 auth failed", "No listening socket", "BFD session down", "Graceful restart"};
static char *bgp_auto_errors[] = { "", "Route limit exceeded"}; static char *bgp_auto_errors[] = { "", "Route limit exceeded"};
static const char * static const char *
@ -1225,6 +1352,9 @@ bgp_show_proto_info(struct proto *P)
cli_msg(-1006, " Neighbor address: %I%J", p->cf->remote_ip, p->cf->iface); cli_msg(-1006, " Neighbor address: %I%J", p->cf->remote_ip, p->cf->iface);
cli_msg(-1006, " Neighbor AS: %u", p->remote_as); cli_msg(-1006, " Neighbor AS: %u", p->remote_as);
if (p->gr_active)
cli_msg(-1006, " Neighbor graceful restart active");
if (P->proto_state == PS_START) if (P->proto_state == PS_START)
{ {
struct bgp_conn *oc = &p->outgoing_conn; struct bgp_conn *oc = &p->outgoing_conn;
@ -1238,12 +1368,16 @@ bgp_show_proto_info(struct proto *P)
(oc->connect_retry_timer->expires)) (oc->connect_retry_timer->expires))
cli_msg(-1006, " Start delay: %d/%d", cli_msg(-1006, " Start delay: %d/%d",
oc->connect_retry_timer->expires - now, p->cf->start_delay_time); oc->connect_retry_timer->expires - now, p->cf->start_delay_time);
if (p->gr_active && p->gr_timer->expires)
cli_msg(-1006, " Restart timer: %d/-", p->gr_timer->expires - now);
} }
else if (P->proto_state == PS_UP) else if (P->proto_state == PS_UP)
{ {
cli_msg(-1006, " Neighbor ID: %R", p->remote_id); cli_msg(-1006, " Neighbor ID: %R", p->remote_id);
cli_msg(-1006, " Neighbor caps: %s%s%s%s", cli_msg(-1006, " Neighbor caps: %s%s%s%s%s",
c->peer_refresh_support ? " refresh" : "", c->peer_refresh_support ? " refresh" : "",
c->peer_gr_able ? " restart-able" : (c->peer_gr_aware ? " restart-aware" : ""),
c->peer_as4_support ? " AS4" : "", c->peer_as4_support ? " AS4" : "",
(c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "", (c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "",
(c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : ""); (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : "");

View file

@ -48,6 +48,8 @@ struct bgp_config {
int secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */ int secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */
int add_path; /* Use ADD-PATH extension [draft] */ int add_path; /* Use ADD-PATH extension [draft] */
int allow_local_as; /* Allow that number of local ASNs in incoming AS_PATHs */ int allow_local_as; /* Allow that number of local ASNs in incoming AS_PATHs */
int gr_mode; /* Graceful restart mode (BGP_GR_*) */
unsigned gr_time; /* Graceful restart timeout */
unsigned connect_retry_time; unsigned connect_retry_time;
unsigned hold_time, initial_hold_time; unsigned hold_time, initial_hold_time;
unsigned keepalive_time; unsigned keepalive_time;
@ -73,6 +75,15 @@ struct bgp_config {
#define ADD_PATH_TX 2 #define ADD_PATH_TX 2
#define ADD_PATH_FULL 3 #define ADD_PATH_FULL 3
/* Graceful restart modes, stored in bgp_config.gr_mode */
#define BGP_GR_ABLE 1
#define BGP_GR_AWARE 2
/* For peer_gr_flags */
#define BGP_GRF_RESTART 0x80
/* For peer_gr_aflags */
#define BGP_GRF_FORWARDING 0x80
struct bgp_conn { struct bgp_conn {
struct bgp_proto *bgp; struct bgp_proto *bgp;
@ -90,6 +101,11 @@ struct bgp_conn {
u8 peer_refresh_support; /* Peer supports route refresh [RFC2918] */ u8 peer_refresh_support; /* Peer supports route refresh [RFC2918] */
u8 peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */ u8 peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */
u8 peer_add_path; /* Peer supports ADD-PATH [draft] */ u8 peer_add_path; /* Peer supports ADD-PATH [draft] */
u8 peer_gr_aware;
u8 peer_gr_able;
u16 peer_gr_time;
u8 peer_gr_flags;
u8 peer_gr_aflags;
unsigned hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */ unsigned hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */
}; };
@ -107,6 +123,8 @@ struct bgp_proto {
u32 rr_cluster_id; /* Route reflector cluster ID */ u32 rr_cluster_id; /* Route reflector cluster ID */
int rr_client; /* Whether neighbor is RR client of me */ int rr_client; /* Whether neighbor is RR client of me */
int rs_client; /* Whether neighbor is RS client of me */ int rs_client; /* Whether neighbor is RS client of me */
u8 gr_ready; /* Neighbor could do graceful restart */
u8 gr_active; /* Neighbor is doing graceful restart */
struct bgp_conn *conn; /* Connection we have established */ struct bgp_conn *conn; /* Connection we have established */
struct bgp_conn outgoing_conn; /* Outgoing connection we're working with */ struct bgp_conn outgoing_conn; /* Outgoing connection we're working with */
struct bgp_conn incoming_conn; /* Incoming connection we have neither accepted nor rejected yet */ struct bgp_conn incoming_conn; /* Incoming connection we have neither accepted nor rejected yet */
@ -117,12 +135,14 @@ struct bgp_proto {
rtable *igp_table; /* Table used for recursive next hop lookups */ rtable *igp_table; /* Table used for recursive next hop lookups */
struct event *event; /* Event for respawning and shutting process */ struct event *event; /* Event for respawning and shutting process */
struct timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */ struct timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */
struct timer *gr_timer; /* Timer waiting for reestablishment after graceful restart */
struct bgp_bucket **bucket_hash; /* Hash table of attribute buckets */ struct bgp_bucket **bucket_hash; /* Hash table of attribute buckets */
unsigned int hash_size, hash_count, hash_limit; unsigned int hash_size, hash_count, hash_limit;
HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */
slab *prefix_slab; /* Slab holding prefix nodes */ slab *prefix_slab; /* Slab holding prefix nodes */
list bucket_queue; /* Queue of buckets to send */ list bucket_queue; /* Queue of buckets to send */
struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */
unsigned send_end_mark; /* End-of-RIB mark scheduled for transmit */
unsigned startup_delay; /* Time to delay protocol startup by due to errors */ unsigned startup_delay; /* Time to delay protocol startup by due to errors */
bird_clock_t last_proto_error; /* Time of last error that leads to protocol stop */ bird_clock_t last_proto_error; /* Time of last error that leads to protocol stop */
u8 last_error_class; /* Error class of last error */ u8 last_error_class; /* Error class of last error */
@ -172,6 +192,8 @@ void bgp_conn_enter_openconfirm_state(struct bgp_conn *conn);
void bgp_conn_enter_established_state(struct bgp_conn *conn); void bgp_conn_enter_established_state(struct bgp_conn *conn);
void bgp_conn_enter_close_state(struct bgp_conn *conn); void bgp_conn_enter_close_state(struct bgp_conn *conn);
void bgp_conn_enter_idle_state(struct bgp_conn *conn); void bgp_conn_enter_idle_state(struct bgp_conn *conn);
void bgp_handle_graceful_restart(struct bgp_proto *p);
void bgp_graceful_restart_done(struct bgp_proto *p);
void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code); void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code);
void bgp_stop(struct bgp_proto *p, unsigned subcode); void bgp_stop(struct bgp_proto *p, unsigned subcode);
@ -313,6 +335,7 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi
#define BEM_INVALID_MD5 3 /* MD5 authentication kernel request failed (possibly not supported) */ #define BEM_INVALID_MD5 3 /* MD5 authentication kernel request failed (possibly not supported) */
#define BEM_NO_SOCKET 4 #define BEM_NO_SOCKET 4
#define BEM_BFD_DOWN 5 #define BEM_BFD_DOWN 5
#define BEM_GRACEFUL_RESTART 6
/* Automatic shutdown error codes */ /* Automatic shutdown error codes */

View file

@ -26,7 +26,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY,
PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH, PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH,
INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP,
TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC,
SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX) SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, GRACEFUL, RESTART, AWARE)
CF_GRAMMAR CF_GRAMMAR
@ -50,6 +50,8 @@ bgp_proto_start: proto_start BGP {
BGP_CFG->advertise_ipv4 = 1; BGP_CFG->advertise_ipv4 = 1;
BGP_CFG->interpret_communities = 1; BGP_CFG->interpret_communities = 1;
BGP_CFG->default_local_pref = 100; BGP_CFG->default_local_pref = 100;
BGP_CFG->gr_mode = BGP_GR_AWARE;
BGP_CFG->gr_time = 120;
} }
; ;
@ -115,6 +117,9 @@ bgp_proto:
| bgp_proto ADD PATHS bool ';' { BGP_CFG->add_path = $4 ? ADD_PATH_FULL : 0; } | bgp_proto ADD PATHS bool ';' { BGP_CFG->add_path = $4 ? ADD_PATH_FULL : 0; }
| bgp_proto ALLOW LOCAL AS ';' { BGP_CFG->allow_local_as = -1; } | bgp_proto ALLOW LOCAL AS ';' { BGP_CFG->allow_local_as = -1; }
| bgp_proto ALLOW LOCAL AS expr ';' { BGP_CFG->allow_local_as = $5; } | bgp_proto ALLOW LOCAL AS expr ';' { BGP_CFG->allow_local_as = $5; }
| bgp_proto GRACEFUL RESTART bool ';' { BGP_CFG->gr_mode = $4; }
| bgp_proto GRACEFUL RESTART AWARE ';' { BGP_CFG->gr_mode = BGP_GR_AWARE; }
| bgp_proto GRACEFUL RESTART TIME expr ';' { BGP_CFG->gr_time = $5; }
| bgp_proto IGP TABLE rtable ';' { BGP_CFG->igp_table = $4; } | bgp_proto IGP TABLE rtable ';' { BGP_CFG->igp_table = $4; }
| bgp_proto TTL SECURITY bool ';' { BGP_CFG->ttl_security = $4; } | bgp_proto TTL SECURITY bool ';' { BGP_CFG->ttl_security = $4; }
| bgp_proto BFD bool ';' { BGP_CFG->bfd = $3; cf_check_bfd($3); } | bgp_proto BFD bool ';' { BGP_CFG->bfd = $3; cf_check_bfd($3); }

View file

@ -122,7 +122,7 @@ bgp_create_notification(struct bgp_conn *conn, byte *buf)
#ifdef IPV6 #ifdef IPV6
static byte * static byte *
bgp_put_cap_ipv6(struct bgp_conn *conn UNUSED, byte *buf) bgp_put_cap_ipv6(struct bgp_proto *p UNUSED, byte *buf)
{ {
*buf++ = 1; /* Capability 1: Multiprotocol extensions */ *buf++ = 1; /* Capability 1: Multiprotocol extensions */
*buf++ = 4; /* Capability data length */ *buf++ = 4; /* Capability data length */
@ -136,7 +136,7 @@ bgp_put_cap_ipv6(struct bgp_conn *conn UNUSED, byte *buf)
#else #else
static byte * static byte *
bgp_put_cap_ipv4(struct bgp_conn *conn UNUSED, byte *buf) bgp_put_cap_ipv4(struct bgp_proto *p UNUSED, byte *buf)
{ {
*buf++ = 1; /* Capability 1: Multiprotocol extensions */ *buf++ = 1; /* Capability 1: Multiprotocol extensions */
*buf++ = 4; /* Capability data length */ *buf++ = 4; /* Capability data length */
@ -149,7 +149,7 @@ bgp_put_cap_ipv4(struct bgp_conn *conn UNUSED, byte *buf)
#endif #endif
static byte * static byte *
bgp_put_cap_rr(struct bgp_conn *conn UNUSED, byte *buf) bgp_put_cap_rr(struct bgp_proto *p UNUSED, byte *buf)
{ {
*buf++ = 2; /* Capability 2: Support for route refresh */ *buf++ = 2; /* Capability 2: Support for route refresh */
*buf++ = 0; /* Capability data length */ *buf++ = 0; /* Capability data length */
@ -157,16 +157,44 @@ bgp_put_cap_rr(struct bgp_conn *conn UNUSED, byte *buf)
} }
static byte * static byte *
bgp_put_cap_as4(struct bgp_conn *conn, byte *buf) bgp_put_cap_gr1(struct bgp_proto *p, byte *buf)
{
*buf++ = 64; /* Capability 64: Support for graceful restart */
*buf++ = 6; /* Capability data length */
put_u16(buf, p->cf->gr_time);
if (p->p.gr_recovery)
buf[0] |= BGP_GRF_RESTART;
buf += 2;
*buf++ = 0; /* Appropriate AF */
*buf++ = BGP_AF;
*buf++ = 1; /* and SAFI 1 */
*buf++ = p->p.gr_recovery ? BGP_GRF_FORWARDING : 0;
return buf;
}
/*
 * Write a graceful restart capability (code 64) that consists only of the
 * two-byte flags/time field, set to zero, with no per-address-family entries.
 * Returns the position just behind the four bytes written. The protocol
 * argument is not consulted.
 */
static byte *
bgp_put_cap_gr2(struct bgp_proto *p, byte *buf)
{
  buf[0] = 64;			/* Capability 64: Support for graceful restart */
  buf[1] = 2;			/* Capability data length */
  put_u16(buf + 2, 0);		/* Flags and restart time, all zero */

  return buf + 4;
}
static byte *
bgp_put_cap_as4(struct bgp_proto *p, byte *buf)
{ {
*buf++ = 65; /* Capability 65: Support for 4-octet AS number */ *buf++ = 65; /* Capability 65: Support for 4-octet AS number */
*buf++ = 4; /* Capability data length */ *buf++ = 4; /* Capability data length */
put_u32(buf, conn->bgp->local_as); put_u32(buf, p->local_as);
return buf + 4; return buf + 4;
} }
static byte * static byte *
bgp_put_cap_add_path(struct bgp_conn *conn, byte *buf) bgp_put_cap_add_path(struct bgp_proto *p, byte *buf)
{ {
*buf++ = 69; /* Capability 69: Support for ADD-PATH */ *buf++ = 69; /* Capability 69: Support for ADD-PATH */
*buf++ = 4; /* Capability data length */ *buf++ = 4; /* Capability data length */
@ -175,7 +203,7 @@ bgp_put_cap_add_path(struct bgp_conn *conn, byte *buf)
*buf++ = BGP_AF; *buf++ = BGP_AF;
*buf++ = 1; /* SAFI 1 */ *buf++ = 1; /* SAFI 1 */
*buf++ = conn->bgp->cf->add_path; *buf++ = p->cf->add_path;
return buf; return buf;
} }
@ -206,21 +234,26 @@ bgp_create_open(struct bgp_conn *conn, byte *buf)
#ifndef IPV6 #ifndef IPV6
if (p->cf->advertise_ipv4) if (p->cf->advertise_ipv4)
cap = bgp_put_cap_ipv4(conn, cap); cap = bgp_put_cap_ipv4(p, cap);
#endif #endif
#ifdef IPV6 #ifdef IPV6
cap = bgp_put_cap_ipv6(conn, cap); cap = bgp_put_cap_ipv6(p, cap);
#endif #endif
if (p->cf->enable_refresh) if (p->cf->enable_refresh)
cap = bgp_put_cap_rr(conn, cap); cap = bgp_put_cap_rr(p, cap);
if (p->cf->gr_mode == BGP_GR_ABLE)
cap = bgp_put_cap_gr1(p, cap);
else if (p->cf->gr_mode == BGP_GR_AWARE)
cap = bgp_put_cap_gr2(p, cap);
if (p->cf->enable_as4) if (p->cf->enable_as4)
cap = bgp_put_cap_as4(conn, cap); cap = bgp_put_cap_as4(p, cap);
if (p->cf->add_path) if (p->cf->add_path)
cap = bgp_put_cap_add_path(conn, cap); cap = bgp_put_cap_add_path(p, cap);
cap_len = cap - buf - 12; cap_len = cap - buf - 12;
if (cap_len > 0) if (cap_len > 0)
@ -351,6 +384,16 @@ bgp_create_update(struct bgp_conn *conn, byte *buf)
return NULL; return NULL;
} }
/*
 * Build the End-of-RIB marker: four zero bytes written at buf (in the
 * BGP UPDATE layout these are the withdrawn-routes length and the total
 * path attribute length, both zero). Returns the position behind them.
 */
static byte *
bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;	/* presumably referenced by BGP_TRACE */
  byte *end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending End-of-RIB");
  put_u32(buf, 0);

  return end;
}
#else /* IPv6 version */ #else /* IPv6 version */
static inline int static inline int
@ -520,6 +563,26 @@ bgp_create_update(struct bgp_conn *conn, byte *buf)
return NULL; return NULL;
} }
/*
 * Build the End-of-RIB marker for the IPv6 build: a zero 16-bit field,
 * a 16-bit length of 6, and a single empty MP_UNREACH_NLRI attribute
 * naming AFI BGP_AF_IPV6 / SAFI 1. Ten bytes are written in total and the
 * position behind them is returned.
 */
static byte *
bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;	/* presumably referenced by BGP_TRACE */

  BGP_TRACE(D_PACKETS, "Sending End-of-RIB");

  put_u16(buf, 0);
  put_u16(buf + 2, 6);		/* Length of bytes 4-9 */

  /* Empty MP_UNREACH_NLRI attribute */
  buf[4] = BAF_OPTIONAL;
  buf[5] = BA_MP_UNREACH_NLRI;
  buf[6] = 3;			/* Length of bytes 7-9 */
  buf[7] = 0;			/* AFI */
  buf[8] = BGP_AF_IPV6;
  buf[9] = 1;			/* SAFI */

  return buf + 10;
}
#endif #endif
static byte * static byte *
@ -606,10 +669,16 @@ bgp_fire_tx(struct bgp_conn *conn)
{ {
end = bgp_create_update(conn, pkt); end = bgp_create_update(conn, pkt);
type = PKT_UPDATE; type = PKT_UPDATE;
if (!end) if (!end)
{ {
conn->packets_to_send = 0; conn->packets_to_send = 0;
return 0;
if (!p->send_end_mark)
return 0;
p->send_end_mark = 0;
end = bgp_create_end_mark(conn, pkt);
} }
} }
else else
@ -678,6 +747,22 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
conn->peer_refresh_support = 1; conn->peer_refresh_support = 1;
break; break;
case 64: /* Graceful restart capability, RFC 4724 */
if (cl % 4 != 2)
goto err;
conn->peer_gr_aware = 1;
conn->peer_gr_able = 0;
conn->peer_gr_time = get_u16(opt + 2) & 0x0fff;
conn->peer_gr_flags = opt[2] & 0xf0;
conn->peer_gr_aflags = 0;
for (i = 2; i < cl; i += 4)
if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */
{
conn->peer_gr_able = 1;
conn->peer_gr_aflags = opt[2+i+3];
}
break;
case 65: /* AS4 capability, RFC 4893 */ case 65: /* AS4 capability, RFC 4893 */
if (cl != 4) if (cl != 4)
goto err; goto err;
@ -704,7 +789,7 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
} }
return; return;
err: err:
bgp_error(conn, 2, 0, NULL, 0); bgp_error(conn, 2, 0, NULL, 0);
return; return;
} }
@ -807,12 +892,17 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
switch (other->state) switch (other->state)
{ {
case BS_IDLE:
case BS_CONNECT: case BS_CONNECT:
case BS_ACTIVE: case BS_ACTIVE:
/* Stop outgoing connection attempts */
bgp_conn_enter_idle_state(other);
break;
case BS_IDLE:
case BS_OPENSENT: case BS_OPENSENT:
case BS_CLOSE: case BS_CLOSE:
break; break;
case BS_OPENCONFIRM: case BS_OPENCONFIRM:
if ((p->local_id < id) == (conn == &p->incoming_conn)) if ((p->local_id < id) == (conn == &p->incoming_conn))
{ {
@ -838,6 +928,7 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
p->as4_session = p->cf->enable_as4 && conn->peer_as4_support; p->as4_session = p->cf->enable_as4 && conn->peer_as4_support;
p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX); p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX);
p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX); p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX);
p->gr_ready = p->cf->gr_mode && conn->peer_gr_able;
if (p->add_path_tx) if (p->add_path_tx)
p->p.accept_ra_types = RA_ANY; p->p.accept_ra_types = RA_ANY;
@ -849,6 +940,20 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
bgp_conn_enter_openconfirm_state(conn); bgp_conn_enter_openconfirm_state(conn);
} }
/*
 * React to a received End-of-RIB marker: log it, release the graceful
 * restart lock when this protocol is in restart recovery (p->p.gr_recovery),
 * and finish the neighbor's graceful restart when one is in progress
 * (p->gr_active).
 */
static inline void
bgp_rx_end_mark(struct bgp_proto *p)
{
  BGP_TRACE(D_PACKETS, "Got End-of-RIB");

  if (p->p.gr_recovery)
    {
      proto_graceful_restart_unlock(&p->p);
    }

  if (p->gr_active)
    {
      bgp_graceful_restart_done(p);
    }
}
#define DECODE_PREFIX(pp, ll) do { \ #define DECODE_PREFIX(pp, ll) do { \
if (p->add_path_rx) \ if (p->add_path_rx) \
{ \ { \
@ -983,6 +1088,13 @@ bgp_do_rx_update(struct bgp_conn *conn,
u32 path_id = 0; u32 path_id = 0;
u32 last_id = 0; u32 last_id = 0;
/* Check for End-of-RIB marker */
if (!withdrawn_len && !attr_len && !nlri_len)
{
bgp_rx_end_mark(p);
return;
}
/* Withdraw routes */ /* Withdraw routes */
while (withdrawn_len) while (withdrawn_len)
{ {
@ -1088,6 +1200,14 @@ bgp_do_rx_update(struct bgp_conn *conn,
if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */ if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */
return; return;
/*
 * Check for End-of-RIB marker: no withdrawn routes, no NLRI, and an
 * attribute section small enough to hold nothing but an empty
 * MP_UNREACH_NLRI (payload of 3 bytes: AFI + SAFI) for the IPv6 AFI.
 * The attribute itself occupies at least 6 bytes, so attr_len is non-zero
 * for a genuine marker; it may only be bounded here, never required to be 0.
 */
if ((attr_len < 8) && !withdrawn_len && !nlri_len &&
    (p->mp_unreach_len == 3) && (get_u16(p->mp_unreach_start) == BGP_AF_IPV6))
  {
    bgp_rx_end_mark(p);
    return;
  }
DO_NLRI(mp_unreach) DO_NLRI(mp_unreach)
{ {
while (len) while (len)

View file

@ -17,7 +17,7 @@ CF_DEFINES
CF_DECLS CF_DECLS
CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, KRT_SOURCE, KRT_METRIC) CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, KRT_SOURCE, KRT_METRIC)
CF_GRAMMAR CF_GRAMMAR
@ -46,6 +46,7 @@ kern_item:
#endif #endif
} }
| DEVICE ROUTES bool { THIS_KRT->devroutes = $3; } | DEVICE ROUTES bool { THIS_KRT->devroutes = $3; }
| GRACEFUL RESTART bool { THIS_KRT->graceful_restart = $3; }
; ;
/* Kernel interface protocol */ /* Kernel interface protocol */

View file

@ -653,6 +653,13 @@ krt_got_route(struct krt_proto *p, rte *e)
return; return;
} }
if (!p->ready)
{
/* We wait for the initial feed to have correct KRF_INSTALLED flag */
verdict = KRF_IGNORE;
goto sentenced;
}
old = net->routes; old = net->routes;
if ((net->n.flags & KRF_INSTALLED) && rte_is_valid(old)) if ((net->n.flags & KRF_INSTALLED) && rte_is_valid(old))
{ {
@ -779,7 +786,9 @@ krt_prune(struct krt_proto *p)
if (KRT_CF->learn) if (KRT_CF->learn)
krt_learn_prune(p); krt_learn_prune(p);
#endif #endif
p->initialized = 1;
if (p->ready)
p->initialized = 1;
} }
void void
@ -852,7 +861,7 @@ krt_scan_timer_start(struct krt_proto *p)
krt_scan_count++; krt_scan_count++;
tm_start(krt_scan_timer, 0); tm_start(krt_scan_timer, 1);
} }
static void static void
@ -867,6 +876,12 @@ krt_scan_timer_stop(struct krt_proto *p)
} }
} }
/*
 * Restart the shared kernel scan timer with a zero delay. The protocol
 * instance is not consulted in this variant, which uses krt_scan_timer.
 */
static void
krt_scan_timer_kick(struct krt_proto *p UNUSED)
{
  const unsigned kick_delay = 0;

  tm_start(krt_scan_timer, kick_delay);
}
#else #else
static void static void
@ -885,7 +900,7 @@ static void
krt_scan_timer_start(struct krt_proto *p) krt_scan_timer_start(struct krt_proto *p)
{ {
p->scan_timer = tm_new_set(p->p.pool, krt_scan, p, 0, KRT_CF->scan_time); p->scan_timer = tm_new_set(p->p.pool, krt_scan, p, 0, KRT_CF->scan_time);
tm_start(p->scan_timer, 0); tm_start(p->scan_timer, 1);
} }
static void static void
@ -894,6 +909,12 @@ krt_scan_timer_stop(struct krt_proto *p)
tm_stop(p->scan_timer); tm_stop(p->scan_timer);
} }
static void
krt_scan_timer_kick(struct krt_proto *p UNUSED)
{
tm_start(p->scan_timer, 0);
}
#endif #endif
@ -970,6 +991,16 @@ krt_notify(struct proto *P, struct rtable *table UNUSED, net *net,
krt_replace_rte(p, net, new, old, eattrs); krt_replace_rte(p, net, new, old, eattrs);
} }
/*
 * feed_done hook of the kernel protocol: flag the instance as ready and
 * kick the scan timer.
 */
static void
krt_feed_done(struct proto *P)
{
  struct krt_proto *kp = (struct krt_proto *) P;

  kp->ready = 1;
  krt_scan_timer_kick(kp);
}
static int static int
krt_rte_same(rte *a, rte *b) krt_rte_same(rte *a, rte *b)
{ {
@ -992,6 +1023,7 @@ krt_init(struct proto_config *c)
p->p.accept_ra_types = RA_OPTIMAL; p->p.accept_ra_types = RA_OPTIMAL;
p->p.import_control = krt_import_control; p->p.import_control = krt_import_control;
p->p.rt_notify = krt_notify; p->p.rt_notify = krt_notify;
p->p.feed_done = krt_feed_done;
p->p.make_tmp_attrs = krt_make_tmp_attrs; p->p.make_tmp_attrs = krt_make_tmp_attrs;
p->p.store_tmp_attrs = krt_store_tmp_attrs; p->p.store_tmp_attrs = krt_store_tmp_attrs;
p->p.rte_same = krt_rte_same; p->p.rte_same = krt_rte_same;
@ -1015,6 +1047,9 @@ krt_start(struct proto *P)
krt_scan_timer_start(p); krt_scan_timer_start(p);
if (P->gr_recovery && KRT_CF->graceful_restart)
P->gr_wait = 1;
return PS_UP; return PS_UP;
} }
@ -1029,6 +1064,9 @@ krt_shutdown(struct proto *P)
if (p->initialized && !KRT_CF->persist) if (p->initialized && !KRT_CF->persist)
krt_flush_routes(p); krt_flush_routes(p);
p->ready = 0;
p->initialized = 0;
krt_sys_shutdown(p); krt_sys_shutdown(p);
rem_node(&p->krt_node); rem_node(&p->krt_node);
@ -1045,7 +1083,7 @@ krt_reconfigure(struct proto *p, struct proto_config *new)
if (!krt_sys_reconfigure((struct krt_proto *) p, n, o)) if (!krt_sys_reconfigure((struct krt_proto *) p, n, o))
return 0; return 0;
/* persist needn't be the same */ /* persist, graceful restart need not be the same */
return o->scan_time == n->scan_time && o->learn == n->learn && o->devroutes == n->devroutes; return o->scan_time == n->scan_time && o->learn == n->learn && o->devroutes == n->devroutes;
} }

View file

@ -48,6 +48,7 @@ struct krt_config {
int scan_time; /* How often we re-scan routes */ int scan_time; /* How often we re-scan routes */
int learn; /* Learn routes from other sources */ int learn; /* Learn routes from other sources */
int devroutes; /* Allow export of device routes */ int devroutes; /* Allow export of device routes */
int graceful_restart; /* Regard graceful restart recovery */
}; };
struct krt_proto { struct krt_proto {
@ -63,7 +64,8 @@ struct krt_proto {
#endif #endif
node krt_node; /* Node in krt_proto_list */ node krt_node; /* Node in krt_proto_list */
int initialized; /* First scan has already been finished */ byte ready; /* Initial feed has been finished */
byte initialized; /* First scan has been finished */
}; };
extern pool *krt_pool; extern pool *krt_pool;

View file

@ -602,7 +602,7 @@ signal_init(void)
* Parsing of command-line arguments * Parsing of command-line arguments
*/ */
static char *opt_list = "c:dD:ps:P:u:g:f"; static char *opt_list = "c:dD:ps:P:u:g:fR";
static int parse_and_exit; static int parse_and_exit;
char *bird_name; char *bird_name;
static char *use_user; static char *use_user;
@ -612,7 +612,7 @@ static int run_in_foreground = 0;
static void static void
usage(void) usage(void)
{ {
fprintf(stderr, "Usage: %s [-c <config-file>] [-d] [-D <debug-file>] [-p] [-s <control-socket>] [-P <pid-file>] [-u <user>] [-g <group>] [-f]\n", bird_name); fprintf(stderr, "Usage: %s [-c <config-file>] [-d] [-D <debug-file>] [-p] [-s <control-socket>] [-P <pid-file>] [-u <user>] [-g <group>] [-f] [-R]\n", bird_name);
exit(1); exit(1);
} }
@ -723,6 +723,9 @@ parse_args(int argc, char **argv)
case 'f': case 'f':
run_in_foreground = 1; run_in_foreground = 1;
break; break;
case 'R':
graceful_restart_recovery();
break;
default: default:
usage(); usage();
} }
@ -805,6 +808,8 @@ main(int argc, char **argv)
config_commit(conf, RECONFIG_HARD, 0); config_commit(conf, RECONFIG_HARD, 0);
graceful_restart_init();
#ifdef LOCAL_DEBUG #ifdef LOCAL_DEBUG
async_dump_flag = 1; async_dump_flag = 1;
#endif #endif