From 0c791f873aeb7c1052c97db7da4fe23873d69603 Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Thu, 20 Mar 2014 14:07:12 +0100 Subject: [PATCH 1/3] BGP graceful restart support. Also significant core protocol state changes needed for that, global graceful restart recovery state and kernel proto support for recovery. --- conf/conf.c | 1 + conf/conf.h | 1 + doc/reply_codes | 1 + lib/lists.h | 2 + nest/cmds.c | 3 + nest/config.Y | 6 + nest/proto.c | 388 +++++++++++++++++++++++++++++++++----------- nest/protocol.h | 31 +++- nest/route.h | 17 ++ nest/rt-table.c | 100 ++++++++++-- proto/bgp/bgp.c | 106 +++++++++++- proto/bgp/bgp.h | 23 +++ proto/bgp/config.Y | 7 +- proto/bgp/packets.c | 150 +++++++++++++++-- sysdep/unix/krt.Y | 3 +- sysdep/unix/krt.c | 46 +++++- sysdep/unix/krt.h | 4 +- sysdep/unix/main.c | 9 +- 18 files changed, 745 insertions(+), 153 deletions(-) diff --git a/conf/conf.c b/conf/conf.c index fc674ef3..67b027ce 100644 --- a/conf/conf.c +++ b/conf/conf.c @@ -98,6 +98,7 @@ config_alloc(byte *name) c->load_time = now; c->tf_route = c->tf_proto = (struct timeformat){"%T", "%F", 20*3600}; c->tf_base = c->tf_log = (struct timeformat){"%F %T", NULL, 0}; + c->gr_wait = DEFAULT_GR_WAIT; return c; } diff --git a/conf/conf.h b/conf/conf.h index 28624294..a8bba7e8 100644 --- a/conf/conf.h +++ b/conf/conf.h @@ -38,6 +38,7 @@ struct config { struct timeformat tf_proto; /* Time format for 'show protocol' */ struct timeformat tf_log; /* Time format for the logfile */ struct timeformat tf_base; /* Time format for other purposes */ + u32 gr_wait; /* Graceful restart wait timeout */ int cli_debug; /* Tracing of CLI connections and commands */ char *err_msg; /* Parser error message */ diff --git a/doc/reply_codes b/doc/reply_codes index 45b42e00..cd5f2620 100644 --- a/doc/reply_codes +++ b/doc/reply_codes @@ -32,6 +32,7 @@ Reply codes of BIRD command-line interface 0021 Undo requested 0022 Undo scheduled 0023 Evaluation of expression +0024 Graceful restart status report 
1000 BIRD version 1001 Interface list diff --git a/lib/lists.h b/lib/lists.h index 9153029c..37c56efb 100644 --- a/lib/lists.h +++ b/lib/lists.h @@ -36,6 +36,8 @@ typedef struct list { /* In fact two overlayed nodes */ #define NODE_NEXT(n) ((void *)((NODE (n))->next)) #define NODE_VALID(n) ((NODE (n))->next) #define WALK_LIST(n,list) for(n=HEAD(list); NODE_VALID(n); n=NODE_NEXT(n)) +#define WALK_LIST2(n,nn,list,pos) \ + for(nn=(list).head; NODE_VALID(nn) && (n=SKIP_BACK(typeof(*n),pos,nn)); nn=nn->next) #define WALK_LIST_DELSAFE(n,nxt,list) \ for(n=HEAD(list); nxt=NODE_NEXT(n); n=(void *) nxt) /* WALK_LIST_FIRST supposes that called code removes each processed node */ diff --git a/nest/cmds.c b/nest/cmds.c index ec6bc762..70fbdaf8 100644 --- a/nest/cmds.c +++ b/nest/cmds.c @@ -7,6 +7,7 @@ */ #include "nest/bird.h" +#include "nest/protocol.h" #include "nest/route.h" #include "nest/cli.h" #include "conf/conf.h" @@ -32,6 +33,8 @@ cmd_show_status(void) tm_format_datetime(tim, &config->tf_base, config->load_time); cli_msg(-1011, "Last reconfiguration on %s", tim); + graceful_restart_show_status(); + if (shutting_down) cli_msg(13, "Shutdown in progress"); else if (configuring) diff --git a/nest/config.Y b/nest/config.Y index e9b8a21b..59d354b8 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -49,6 +49,7 @@ CF_KEYWORDS(PASSWORD, FROM, PASSIVE, TO, ID, EVENTS, PACKETS, PROTOCOLS, INTERFA CF_KEYWORDS(PRIMARY, STATS, COUNT, FOR, COMMANDS, PREEXPORT, GENERATE, ROA, MAX, FLUSH, AS) CF_KEYWORDS(LISTEN, BGP, V6ONLY, DUAL, ADDRESS, PORT, PASSWORDS, DESCRIPTION, SORTED) CF_KEYWORDS(RELOAD, IN, OUT, MRTDUMP, MESSAGES, RESTRICT, MEMORY, IGP_METRIC, CLASS, DSCP) +CF_KEYWORDS(GRACEFUL, RESTART, WAIT) CF_ENUM(T_ENUM_RTS, RTS_, DUMMY, STATIC, INHERIT, DEVICE, STATIC_DEVICE, REDIRECT, RIP, OSPF, OSPF_IA, OSPF_EXT1, OSPF_EXT2, BGP, PIPE) @@ -110,6 +111,11 @@ listen_opt: ; +CF_ADDTO(conf, gr_opts) + +gr_opts: GRACEFUL RESTART WAIT expr ';' { new_config->gr_wait = $4; } ; + + /* Creation 
of routing tables */ tab_sorted: diff --git a/nest/proto.c b/nest/proto.c index cfa6ff4b..2bc3e319 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -35,9 +35,18 @@ static struct proto *initial_device_proto; static event *proto_flush_event; static timer *proto_shutdown_timer; +static timer *gr_wait_timer; + +#define GRS_NONE 0 +#define GRS_INIT 1 +#define GRS_ACTIVE 2 +#define GRS_DONE 3 + +static int graceful_restart_state; +static u32 graceful_restart_locks; static char *p_states[] = { "DOWN", "START", "UP", "STOP" }; -static char *c_states[] = { "HUNGRY", "FEEDING", "HAPPY", "FLUSHING" }; +static char *c_states[] = { "HUNGRY", "???", "HAPPY", "FLUSHING" }; static void proto_flush_loop(void *); static void proto_shutdown_loop(struct timer *); @@ -51,10 +60,12 @@ proto_enqueue(list *l, struct proto *p) } static void -proto_relink(struct proto *p) +proto_set_core_state(struct proto *p, uint state) { list *l = NULL; + p->core_state = state; + if (p->debug & D_STATES) { char *name = proto_state_name(p); @@ -66,13 +77,13 @@ proto_relink(struct proto *p) } else p->last_state_name_announced = NULL; + rem_node(&p->n); switch (p->core_state) { case FS_HUNGRY: l = &inactive_proto_list; break; - case FS_FEEDING: case FS_HAPPY: l = &active_proto_list; break; @@ -126,6 +137,9 @@ proto_init_instance(struct proto *p) p->attn = ev_new(p->pool); p->attn->data = p; + if (graceful_restart_state == GRS_INIT) + p->gr_recovery = 1; + if (! 
p->proto->multitable) rt_lock_table(p->table); } @@ -169,7 +183,7 @@ proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *s h->next = p->ahooks; p->ahooks = h; - if (p->rt_notify) + if (p->rt_notify && (p->export_state == ES_READY)) add_tail(&t->hooks, &h->n); return h; } @@ -193,6 +207,16 @@ proto_find_announce_hook(struct proto *p, struct rtable *t) return NULL; } +static void +proto_link_ahooks(struct proto *p) +{ + struct announce_hook *h; + + if (p->rt_notify) + for(h=p->ahooks; h; h=h->next) + add_tail(&h->table->hooks, &h->n); +} + static void proto_unlink_ahooks(struct proto *p) { @@ -362,6 +386,7 @@ proto_init(struct proto_config *c) q->proto_state = PS_DOWN; q->core_state = FS_HUNGRY; + q->export_state = ES_DOWN; q->last_state_change = now; proto_enqueue(&initial_proto_list, q); @@ -590,6 +615,7 @@ static void proto_rethink_goal(struct proto *p) { struct protocol *q; + byte goal; if (p->reconfiguring && p->core_state == FS_HUNGRY && p->proto_state == PS_DOWN) { @@ -606,22 +632,14 @@ proto_rethink_goal(struct proto *p) /* Determine what state we want to reach */ if (p->disabled || p->reconfiguring) - { - p->core_goal = FS_HUNGRY; - if (p->core_state == FS_HUNGRY && p->proto_state == PS_DOWN) - return; - } + goal = PS_DOWN; else - { - p->core_goal = FS_HAPPY; - if (p->core_state == FS_HAPPY && p->proto_state == PS_UP) - return; - } + goal = PS_UP; q = p->proto; - if (p->core_goal == FS_HAPPY) /* Going up */ + if (goal == PS_UP) /* Going up */ { - if (p->core_state == FS_HUNGRY && p->proto_state == PS_DOWN) + if (p->proto_state == PS_DOWN && p->core_state == FS_HUNGRY) { DBG("Kicking %s up\n", p->name); PD(p, "Starting"); @@ -640,6 +658,104 @@ proto_rethink_goal(struct proto *p) } } + + +static void graceful_restart_done(struct timer *t UNUSED); +static void proto_want_export_up(struct proto *p); + +void +graceful_restart_recovery(void) +{ + graceful_restart_state = GRS_INIT; +} + +void +graceful_restart_init(void) +{ + if 
(!graceful_restart_state) + return; + + log(L_INFO "Graceful restart started"); + + if (!graceful_restart_locks) + { + graceful_restart_done(NULL); + return; + } + + graceful_restart_state = GRS_ACTIVE; + gr_wait_timer = tm_new(proto_pool); + gr_wait_timer->hook = graceful_restart_done; + tm_start(gr_wait_timer, config->gr_wait); +} + +static void +graceful_restart_done(struct timer *t UNUSED) +{ + struct proto *p; + node *n; + + log(L_INFO "Graceful restart done"); + graceful_restart_state = GRS_DONE; + + WALK_LIST2(p, n, proto_list, glob_node) + { + if (!p->gr_recovery) + continue; + + /* Resume postponed export of routes */ + if ((p->proto_state == PS_UP) && p->gr_wait) + proto_want_export_up(p); + + /* Cleanup */ + p->gr_recovery = 0; + p->gr_wait = 0; + p->gr_lock = 0; + } + + graceful_restart_locks = 0; +} + +void +graceful_restart_show_status(void) +{ + if (graceful_restart_state != GRS_ACTIVE) + return; + + cli_msg(-24, "Graceful restart recovery in progress"); + cli_msg(-24, " Waiting for %d protocols to recover", graceful_restart_locks); + cli_msg(-24, " Wait timer is %d/%d", tm_remains(gr_wait_timer), config->gr_wait); +} + +/* Just from start hook */ +void +proto_graceful_restart_lock(struct proto *p) +{ + ASSERT(graceful_restart_state == GRS_INIT); + ASSERT(p->gr_recovery); + + if (p->gr_lock) + return; + + p->gr_lock = 1; + graceful_restart_locks++; +} + +void +proto_graceful_restart_unlock(struct proto *p) +{ + if (!p->gr_lock) + return; + + p->gr_lock = 0; + graceful_restart_locks--; + + if ((graceful_restart_state == GRS_ACTIVE) && !graceful_restart_locks) + tm_start(gr_wait_timer, 0); +} + + + /** * protos_dump_all - dump status of all protocols * @@ -751,6 +867,8 @@ protos_build(void) proto_flush_event->hook = proto_flush_loop; proto_shutdown_timer = tm_new(proto_pool); proto_shutdown_timer->hook = proto_shutdown_loop; + proto_shutdown_timer = tm_new(proto_pool); + proto_shutdown_timer->hook = proto_shutdown_loop; } static void @@ -779,15 +897,17 
@@ proto_feed_more(void *P) { struct proto *p = P; - if (p->core_state != FS_FEEDING) + if (p->export_state != ES_FEEDING) return; DBG("Feeding protocol %s continued\n", p->name); if (rt_feed_baby(p)) { - p->core_state = FS_HAPPY; - proto_relink(p); - DBG("Protocol %s up and running\n", p->name); + DBG("Feeding protocol %s finished\n", p->name); + p->export_state = ES_READY; + + if (p->feed_done) + p->feed_done(p); } else { @@ -801,7 +921,7 @@ proto_feed_initial(void *P) { struct proto *p = P; - if (p->core_state != FS_FEEDING) + if (p->export_state != ES_FEEDING) return; DBG("Feeding protocol %s\n", p->name); @@ -814,40 +934,10 @@ static void proto_schedule_feed(struct proto *p, int initial) { DBG("%s: Scheduling meal\n", p->name); - p->core_state = FS_FEEDING; + + p->export_state = ES_FEEDING; p->refeeding = !initial; - /* FIXME: This should be changed for better support of multitable protos */ - if (!initial) - { - struct announce_hook *ah; - for (ah = p->ahooks; ah; ah = ah->next) - proto_reset_limit(ah->out_limit); - - /* Hack: reset exp_routes during refeed, and do not decrease it later */ - p->stats.exp_routes = 0; - } - - /* Connect protocol to routing table */ - if (initial && !p->proto->multitable) - { - p->main_source = rt_get_source(p, 0); - rt_lock_source(p->main_source); - - p->main_ahook = proto_add_announce_hook(p, p->table, &p->stats); - p->main_ahook->in_filter = p->cf->in_filter; - p->main_ahook->out_filter = p->cf->out_filter; - p->main_ahook->rx_limit = p->cf->rx_limit; - p->main_ahook->in_limit = p->cf->in_limit; - p->main_ahook->out_limit = p->cf->out_limit; - p->main_ahook->in_keep_filtered = p->cf->in_keep_filtered; - - proto_reset_limit(p->main_ahook->rx_limit); - proto_reset_limit(p->main_ahook->in_limit); - proto_reset_limit(p->main_ahook->out_limit); - } - - proto_relink(p); p->attn->hook = initial ? 
proto_feed_initial : proto_feed_more; ev_schedule(p->attn); } @@ -877,7 +967,7 @@ proto_schedule_flush_loop(void) { p->flushing = 1; for (h=p->ahooks; h; h=h->next) - h->table->prune_state = 1; + rt_mark_for_prune(h->table); } ev_schedule(proto_flush_event); @@ -908,8 +998,7 @@ proto_flush_loop(void *unused UNUSED) DBG("Flushing protocol %s\n", p->name); p->flushing = 0; - p->core_state = FS_HUNGRY; - proto_relink(p); + proto_set_core_state(p, FS_HUNGRY); if (p->proto_state == PS_DOWN) proto_fell_down(p); goto again; @@ -921,19 +1010,6 @@ proto_flush_loop(void *unused UNUSED) proto_schedule_flush_loop(); } -static void -proto_schedule_flush(struct proto *p) -{ - /* Need to abort feeding */ - if (p->core_state == FS_FEEDING) - rt_feed_baby_abort(p); - - DBG("%s: Scheduling flush\n", p->name); - p->core_state = FS_FLUSHING; - proto_relink(p); - proto_unlink_ahooks(p); - proto_schedule_flush_loop(); -} /* Temporary hack to propagate restart to BGP */ int proto_restart; @@ -980,9 +1056,9 @@ proto_schedule_down(struct proto *p, byte restart, byte code) * * Sometimes it is needed to send again all routes to the * protocol. This is called feeding and can be requested by this - * function. This would cause protocol core state transition - * to FS_FEEDING (during feeding) and when completed, it will - * switch back to FS_HAPPY. This function can be called even + * function. This would cause protocol export state transition + * to ES_FEEDING (during feeding) and when completed, it will + * switch back to ES_READY. This function can be called even * when feeding is already running, in that case it is restarted. 
*/ void @@ -991,7 +1067,7 @@ proto_request_feeding(struct proto *p) ASSERT(p->proto_state == PS_UP); /* If we are already feeding, we want to restart it */ - if (p->core_state == FS_FEEDING) + if (p->export_state == ES_FEEDING) { /* Unless feeding is in initial state */ if (p->attn->hook == proto_feed_initial) @@ -1000,6 +1076,14 @@ proto_request_feeding(struct proto *p) rt_feed_baby_abort(p); } + /* FIXME: This should be changed for better support of multitable protos */ + struct announce_hook *ah; + for (ah = p->ahooks; ah; ah = ah->next) + proto_reset_limit(ah->out_limit); + + /* Hack: reset exp_routes during refeed, and do not decrease it later */ + p->stats.exp_routes = 0; + proto_schedule_feed(p, 0); } @@ -1060,6 +1144,83 @@ proto_notify_limit(struct announce_hook *ah, struct proto_limit *l, int dir, u32 } } + +static void +proto_want_core_up(struct proto *p) +{ + ASSERT(p->core_state == FS_HUNGRY); + + if (!p->proto->multitable) + { + p->main_source = rt_get_source(p, 0); + rt_lock_source(p->main_source); + + /* Connect protocol to routing table */ + p->main_ahook = proto_add_announce_hook(p, p->table, &p->stats); + p->main_ahook->in_filter = p->cf->in_filter; + p->main_ahook->out_filter = p->cf->out_filter; + p->main_ahook->rx_limit = p->cf->rx_limit; + p->main_ahook->in_limit = p->cf->in_limit; + p->main_ahook->out_limit = p->cf->out_limit; + p->main_ahook->in_keep_filtered = p->cf->in_keep_filtered; + + proto_reset_limit(p->main_ahook->rx_limit); + proto_reset_limit(p->main_ahook->in_limit); + proto_reset_limit(p->main_ahook->out_limit); + } + + proto_set_core_state(p, FS_HAPPY); +} + +static void +proto_want_export_up(struct proto *p) +{ + ASSERT(p->core_state == CS_HAPPY); + ASSERT(p->export_state == ES_DOWN); + + proto_link_ahooks(p); + proto_schedule_feed(p, 1); /* Sets ES_FEEDING */ +} + +static void +proto_want_export_down(struct proto *p) +{ + ASSERT(p->export_state != ES_DOWN); + + /* Need to abort feeding */ + if (p->export_state == ES_FEEDING) + 
rt_feed_baby_abort(p); + + p->export_state = ES_DOWN; + proto_unlink_ahooks(p); +} + +static void +proto_want_core_down(struct proto *p) +{ + ASSERT(p->core_state == CS_HAPPY); + ASSERT(p->export_state == ES_DOWN); + + proto_set_core_state(p, FS_FLUSHING); + proto_schedule_flush_loop(); + + if (!p->proto->multitable) + { + rt_unlock_source(p->main_source); + p->main_source = NULL; + } +} + +static void +proto_falling_down(struct proto *p) +{ + p->gr_recovery = 0; + p->gr_wait = 0; + if (p->gr_lock) + proto_graceful_restart_unlock(p); +} + + /** * proto_notify_state - notify core about protocol state change * @p: protocol the state of which has changed @@ -1079,6 +1240,7 @@ proto_notify_state(struct proto *p, unsigned ps) { unsigned ops = p->proto_state; unsigned cs = p->core_state; + unsigned es = p->export_state; DBG("%s reporting state transition %s/%s -> */%s\n", p->name, c_states[cs], p_states[ops], p_states[ps]); if (ops == ps) @@ -1089,17 +1251,47 @@ proto_notify_state(struct proto *p, unsigned ps) switch (ps) { + case PS_START: + ASSERT(ops == PS_DOWN || ops == PS_UP); + ASSERT(cs == FS_HUNGRY || cs == FS_HAPPY); + + if (es != ES_DOWN) + proto_want_export_down(p); + break; + + case PS_UP: + ASSERT(ops == PS_DOWN || ops == PS_START); + ASSERT(cs == FS_HUNGRY || cs == FS_HAPPY); + ASSERT(es == ES_DOWN); + + if (cs == FS_HUNGRY) + proto_want_core_up(p); + if (!p->gr_wait) + proto_want_export_up(p); + break; + + case PS_STOP: + ASSERT(ops == PS_START || ops == PS_UP); + + p->down_sched = 0; + + if (es != ES_DOWN) + proto_want_export_down(p); + if (cs == FS_HAPPY) + proto_want_core_down(p); + proto_falling_down(p); + break; + case PS_DOWN: p->down_code = 0; p->down_sched = 0; - if ((cs == FS_FEEDING) || (cs == FS_HAPPY)) - proto_schedule_flush(p); - if (p->proto->multitable) - { - rt_unlock_source(p->main_source); - p->main_source = NULL; - } + if (es != ES_DOWN) + proto_want_export_down(p); + if (cs == FS_HAPPY) + proto_want_core_down(p); + if (ops != PS_STOP) + 
proto_falling_down(p); neigh_prune(); // FIXME convert neighbors to resource? rfree(p->pool); @@ -1111,22 +1303,9 @@ proto_notify_state(struct proto *p, unsigned ps) return; /* The protocol might have ceased to exist */ } break; - case PS_START: - ASSERT(ops == PS_DOWN); - ASSERT(cs == FS_HUNGRY); - break; - case PS_UP: - ASSERT(ops == PS_DOWN || ops == PS_START); - ASSERT(cs == FS_HUNGRY); - proto_schedule_feed(p, 1); - break; - case PS_STOP: - p->down_sched = 0; - if ((cs == FS_FEEDING) || (cs == FS_HAPPY)) - proto_schedule_flush(p); - break; + default: - bug("Invalid state transition for %s from %s/%s to */%s", p->name, c_states[cs], p_states[ops], p_states[ps]); + bug("%s: Invalid state %d", p->name, ps); } } @@ -1141,11 +1320,17 @@ proto_state_name(struct proto *p) switch (P(p->proto_state, p->core_state)) { case P(PS_DOWN, FS_HUNGRY): return "down"; - case P(PS_START, FS_HUNGRY): return "start"; - case P(PS_UP, FS_HUNGRY): - case P(PS_UP, FS_FEEDING): return "feed"; + case P(PS_START, FS_HUNGRY): + case P(PS_START, FS_HAPPY): return "start"; + case P(PS_UP, FS_HAPPY): + switch (p->export_state) + { + case ES_DOWN: return "wait"; + case ES_FEEDING: return "feed"; + case ES_READY: return "up"; + default: return "???"; + } case P(PS_STOP, FS_HUNGRY): return "stop"; - case P(PS_UP, FS_HAPPY): return "up"; case P(PS_STOP, FS_FLUSHING): case P(PS_DOWN, FS_FLUSHING): return "flush"; default: return "???"; @@ -1196,6 +1381,11 @@ proto_show_basic_info(struct proto *p) cli_msg(-1006, " Input filter: %s", filter_name(p->cf->in_filter)); cli_msg(-1006, " Output filter: %s", filter_name(p->cf->out_filter)); + if (graceful_restart_state == GRS_ACTIVE) + cli_msg(-1006, " GR recovery: %s%s", + p->gr_lock ? " pending" : "", + p->gr_wait ? 
" waiting" : ""); + proto_show_limit(p->cf->rx_limit, "Receive limit:"); proto_show_limit(p->cf->in_limit, "Import limit:"); proto_show_limit(p->cf->out_limit, "Export limit:"); diff --git a/nest/protocol.h b/nest/protocol.h index b58f9e67..ec779563 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -148,10 +148,13 @@ struct proto { byte disabled; /* Manually disabled */ byte proto_state; /* Protocol state machine (PS_*, see below) */ byte core_state; /* Core state machine (FS_*, see below) */ - byte core_goal; /* State we want to reach (FS_*, see below) */ + byte export_state; /* Route export state (ES_*, see below) */ byte reconfiguring; /* We're shutting down due to reconfiguration */ - byte refeeding; /* We are refeeding (valid only if core_state == FS_FEEDING) */ + byte refeeding; /* We are refeeding (valid only if export_state == ES_FEEDING) */ byte flushing; /* Protocol is flushed in current flush loop round */ + byte gr_recovery; /* Protocol should participate in graceful restart recovery */ + byte gr_lock; /* Graceful restart mechanism should wait for this proto */ + byte gr_wait; /* Route export to protocol is postponed until graceful restart */ byte down_sched; /* Shutdown is scheduled for later (PDS_*) */ byte down_code; /* Reason for shutdown (PDC_* codes) */ u32 hash_key; /* Random key used for hashing of neighbors */ @@ -175,6 +178,7 @@ struct proto { * reload_routes Request protocol to reload all its routes to the core * (using rte_update()). Returns: 0=reload cannot be done, * 1= reload is scheduled and will happen (asynchronously). + * feed_done Notify protocol about finish of route feeding. 
*/ void (*if_notify)(struct proto *, unsigned flags, struct iface *i); @@ -185,6 +189,7 @@ struct proto { void (*store_tmp_attrs)(struct rte *rt, struct ea_list *attrs); int (*import_control)(struct proto *, struct rte **rt, struct ea_list **attrs, struct linpool *pool); int (*reload_routes)(struct proto *); + void (*feed_done)(struct proto *); /* * Routing entry hooks (called only for routes belonging to this protocol): @@ -242,6 +247,13 @@ static inline void proto_copy_rest(struct proto_config *dest, struct proto_config *src, unsigned size) { memcpy(dest + 1, src + 1, size - sizeof(struct proto_config)); } +void graceful_restart_recovery(void); +void graceful_restart_init(void); +void graceful_restart_show_status(void); +void proto_graceful_restart_lock(struct proto *p); +void proto_graceful_restart_unlock(struct proto *p); + +#define DEFAULT_GR_WAIT 240 void proto_show_limit(struct proto_limit *l, const char *dsc); void proto_show_basic_info(struct proto *p); @@ -343,10 +355,17 @@ void proto_notify_state(struct proto *p, unsigned state); * as a result of received ROUTE-REFRESH request). 
*/ -#define FS_HUNGRY 0 -#define FS_FEEDING 1 -#define FS_HAPPY 2 -#define FS_FLUSHING 3 +#define FS_HUNGRY 0 +#define FS_FEEDING 1 /* obsolete */ +#define FS_HAPPY 2 +#define FS_FLUSHING 3 + + +#define ES_DOWN 0 +#define ES_FEEDING 1 +#define ES_READY 2 + + /* * Debugging flags diff --git a/nest/route.h b/nest/route.h index f00f8b2b..82d9e202 100644 --- a/nest/route.h +++ b/nest/route.h @@ -148,6 +148,10 @@ typedef struct rtable { struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ } rtable; +#define RPS_NONE 0 +#define RPS_SCHEDULED 1 +#define RPS_RUNNING 2 + typedef struct network { struct fib_node n; /* FIB flags reserved for kernel syncer */ struct rte *routes; /* Available routes for this network */ @@ -222,6 +226,8 @@ typedef struct rte { #define REF_COW 1 /* Copy this rte on write */ #define REF_FILTERED 2 /* Route is rejected by import filter */ +#define REF_STALE 4 /* Route is stale in a refresh cycle */ +#define REF_DISCARD 8 /* Route is scheduled for discard */ /* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); } @@ -257,6 +263,8 @@ void rte_update2(struct announce_hook *ah, net *net, rte *new, struct rte_src *s static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2(p->main_ahook, net, new, p->main_source); } void rte_discard(rtable *tab, rte *old); int rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter *filter); +void rt_refresh_begin(rtable *t, struct announce_hook *ah); +void rt_refresh_end(rtable *t, struct announce_hook *ah); void rte_dump(rte *); void rte_free(rte *); rte *rte_do_cow(rte *); @@ -268,6 +276,15 @@ void rt_feed_baby_abort(struct proto *p); int rt_prune_loop(void); struct rtable_config *rt_new_table(struct symbol *s); +static inline void +rt_mark_for_prune(rtable *tab) +{ + if (tab->prune_state == RPS_RUNNING) + fit_get(&tab->fib, &tab->prune_fit); 
+ + tab->prune_state = RPS_SCHEDULED; +} + struct rt_show_data { ip_addr prefix; unsigned pxlen; diff --git a/nest/rt-table.c b/nest/rt-table.c index 8c91ea0a..bc911729 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -55,8 +55,10 @@ static void rt_free_hostcache(rtable *tab); static void rt_notify_hostcache(rtable *tab, net *net); static void rt_update_hostcache(rtable *tab); static void rt_next_hop_update(rtable *tab); - +static inline int rt_prune_table(rtable *tab); static inline void rt_schedule_gc(rtable *tab); +static inline void rt_schedule_prune(rtable *tab); + static inline struct ea_list * make_tmp_attrs(struct rte *rt, struct linpool *pool) @@ -570,7 +572,7 @@ rte_announce(rtable *tab, unsigned type, net *net, rte *new, rte *old, rte *befo struct announce_hook *a; WALK_LIST(a, tab->hooks) { - ASSERT(a->proto->core_state == FS_HAPPY || a->proto->core_state == FS_FEEDING); + ASSERT(a->proto->export_state != ES_DOWN); if (a->proto->accept_ra_types == type) if (type == RA_ACCEPTED) rt_notify_accepted(a, net, new, old, before_old, tmpa, 0); @@ -1108,6 +1110,46 @@ rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter return v > 0; } +void +rt_refresh_begin(rtable *t, struct announce_hook *ah) +{ + net *n; + rte *e; + + FIB_WALK(&t->fib, fn) + { + n = (net *) fn; + for (e = n->routes; e; e = e->next) + if (e->sender == ah) + e->flags |= REF_STALE; + } + FIB_WALK_END; +} + +void +rt_refresh_end(rtable *t, struct announce_hook *ah) +{ + int prune = 0; + net *n; + rte *e; + + FIB_WALK(&t->fib, fn) + { + n = (net *) fn; + for (e = n->routes; e; e = e->next) + if ((e->sender == ah) && (e->flags & REF_STALE)) + { + e->flags |= REF_DISCARD; + prune = 1; + } + } + FIB_WALK_END; + + if (prune) + rt_schedule_prune(t); +} + + /** * rte_dump - dump a route * @e: &rte to be dumped @@ -1169,6 +1211,13 @@ rt_dump_all(void) rt_dump(t); } +static inline void +rt_schedule_prune(rtable *tab) +{ + rt_mark_for_prune(tab); + 
ev_schedule(tab->rt_event); +} + static inline void rt_schedule_gc(rtable *tab) { @@ -1199,6 +1248,7 @@ rt_schedule_nhu(rtable *tab) tab->nhu_state |= 1; } + static void rt_prune_nets(rtable *tab) { @@ -1242,6 +1292,14 @@ rt_event(void *ptr) if (tab->nhu_state) rt_next_hop_update(tab); + if (tab->prune_state) + if (!rt_prune_table(tab)) + { + /* Table prune unfinished */ + ev_schedule(tab->rt_event); + return; + } + if (tab->gc_scheduled) { rt_prune_nets(tab); @@ -1283,8 +1341,8 @@ rt_init(void) } -static inline int -rt_prune_step(rtable *tab, int step, int *max_feed) +static int +rt_prune_step(rtable *tab, int step, int *limit) { static struct rate_limit rl_flush; struct fib_iterator *fit = &tab->prune_fit; @@ -1294,13 +1352,13 @@ rt_prune_step(rtable *tab, int step, int *max_feed) fib_check(&tab->fib); #endif - if (tab->prune_state == 0) + if (tab->prune_state == RPS_NONE) return 1; - if (tab->prune_state == 1) + if (tab->prune_state == RPS_SCHEDULED) { FIB_ITERATE_INIT(fit, &tab->fib); - tab->prune_state = 2; + tab->prune_state = RPS_RUNNING; } again: @@ -1312,9 +1370,10 @@ again: rescan: for (e=n->routes; e; e=e->next) if (e->sender->proto->flushing || + (e->flags & REF_DISCARD) || (step && e->attrs->src->proto->flushing)) { - if (*max_feed <= 0) + if (*limit <= 0) { FIB_ITERATE_PUT(fit, fn); return 0; @@ -1325,7 +1384,7 @@ again: n->n.prefix, n->n.pxlen, e->attrs->src->proto->name, tab->name); rte_discard(tab, e); - (*max_feed)--; + (*limit)--; goto rescan; } @@ -1342,10 +1401,17 @@ again: fib_check(&tab->fib); #endif - tab->prune_state = 0; + tab->prune_state = RPS_NONE; return 1; } +static inline int +rt_prune_table(rtable *tab) +{ + int limit = 512; + return rt_prune_step(tab, 0, &limit); +} + /** * rt_prune_loop - prune routing tables * @@ -1364,19 +1430,19 @@ int rt_prune_loop(void) { static int step = 0; - int max_feed = 512; + int limit = 512; rtable *t; again: WALK_LIST(t, routing_tables) - if (! rt_prune_step(t, step, &max_feed)) + if (! 
rt_prune_step(t, step, &limit)) return 0; if (step == 0) { /* Prepare for the second step */ WALK_LIST(t, routing_tables) - t->prune_state = 1; + t->prune_state = RPS_SCHEDULED; step = 1; goto again; @@ -1721,7 +1787,7 @@ again: (p->accept_ra_types == RA_ACCEPTED)) if (rte_is_valid(e)) { - if (p->core_state != FS_FEEDING) + if (p->export_state != ES_FEEDING) return 1; /* In the meantime, the protocol fell down. */ do_feed_baby(p, p->accept_ra_types, h, n, e); max_feed--; @@ -1730,7 +1796,7 @@ again: if (p->accept_ra_types == RA_ANY) for(e = n->routes; rte_is_valid(e); e = e->next) { - if (p->core_state != FS_FEEDING) + if (p->export_state != ES_FEEDING) return 1; /* In the meantime, the protocol fell down. */ do_feed_baby(p, RA_ANY, h, n, e); max_feed--; @@ -2223,9 +2289,7 @@ rt_show_cont(struct cli *c) cli_printf(c, 8004, "Stopped due to reconfiguration"); goto done; } - if (d->export_protocol && - d->export_protocol->core_state != FS_HAPPY && - d->export_protocol->core_state != FS_FEEDING) + if (d->export_protocol && (d->export_protocol->export_state == ES_DOWN)) { cli_printf(c, 8005, "Protocol is down"); goto done; diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index a748669d..ae9f6877 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -319,6 +319,7 @@ bgp_decision(void *vp) DBG("BGP: Decision start\n"); if ((p->p.proto_state == PS_START) && (p->outgoing_conn.state == BS_IDLE) + && (p->incoming_conn.state != BS_OPENCONFIRM) && (!p->cf->passive)) bgp_active(p); @@ -363,7 +364,7 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) /* For multi-hop BGP sessions */ if (ipa_zero(p->source_addr)) - p->source_addr = conn->sk->saddr; + p->source_addr = conn->sk->saddr; p->conn = conn; p->last_error_class = 0; @@ -371,6 +372,20 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) bgp_init_bucket_table(p); bgp_init_prefix_table(p, 8); + int peer_gr_ready = conn->peer_gr_aware && !(conn->peer_gr_flags & BGP_GRF_RESTART); + + if (p->p.gr_recovery && 
!peer_gr_ready) + proto_graceful_restart_unlock(&p->p); + + if (p->p.gr_recovery && (p->cf->gr_mode == BGP_GR_ABLE) && peer_gr_ready) + p->p.gr_wait = 1; + + if (p->gr_active) + tm_stop(p->gr_timer); + + if (p->gr_active && (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING))) + bgp_graceful_restart_done(p); + bgp_conn_set_state(conn, BS_ESTABLISHED); proto_notify_state(&p->p, PS_UP); } @@ -416,16 +431,56 @@ bgp_conn_enter_idle_state(struct bgp_conn *conn) bgp_conn_leave_established_state(p); } +void +bgp_handle_graceful_restart(struct bgp_proto *p) +{ + ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready); + + BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s", + p->gr_active ? " - already pending" : ""); + proto_notify_state(&p->p, PS_START); + + if (p->gr_active) + rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); + + p->gr_active = 1; + bgp_start_timer(p->gr_timer, p->conn->peer_gr_time); + rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook); +} + +void +bgp_graceful_restart_done(struct bgp_proto *p) +{ + BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); + p->gr_active = 0; + tm_stop(p->gr_timer); + rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); +} + +static void +bgp_graceful_restart_timeout(timer *t) +{ + struct bgp_proto *p = t->data; + + BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout"); + bgp_stop(p, 0); +} + static void bgp_send_open(struct bgp_conn *conn) { conn->start_state = conn->bgp->start_state; // Default values, possibly changed by receiving capabilities. 
+ conn->advertised_as = 0; conn->peer_refresh_support = 0; conn->peer_as4_support = 0; conn->peer_add_path = 0; - conn->advertised_as = 0; + conn->peer_gr_aware = 0; + conn->peer_gr_able = 0; + conn->peer_gr_time = 0; + conn->peer_gr_flags = 0; + conn->peer_gr_aflags = 0; DBG("BGP: Sending open\n"); conn->sk->rx_hook = bgp_rx; @@ -484,6 +539,9 @@ bgp_sock_err(sock *sk, int err) else BGP_TRACE(D_EVENTS, "Connection closed"); + if ((conn->state == BS_ESTABLISHED) && p->gr_ready) + bgp_handle_graceful_restart(p); + bgp_conn_enter_idle_state(conn); } @@ -649,6 +707,14 @@ bgp_incoming_connection(sock *sk, int dummy UNUSED) int acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) && (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk); + if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready) + { + bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART); + bgp_handle_graceful_restart(p); + bgp_conn_enter_idle_state(p->conn); + acc = 1; + } + BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s", sk->daddr, ipa_has_link_scope(sk->daddr) ? sk->iface : NULL, sk->dport, acc ? 
"accepted" : "rejected"); @@ -817,6 +883,17 @@ bgp_reload_routes(struct proto *P) return 1; } +static void +bgp_feed_done(struct proto *P) +{ + struct bgp_proto *p = (struct bgp_proto *) P; + if (!p->conn || !p->cf->gr_mode) + return; + + p->send_end_mark = 1; + bgp_schedule_packet(p->conn, PKT_UPDATE); +} + static void bgp_start_locked(struct object_lock *lock) { @@ -867,6 +944,8 @@ bgp_start(struct proto *P) p->incoming_conn.state = BS_IDLE; p->neigh = NULL; p->bfd_req = NULL; + p->gr_ready = 0; + p->gr_active = 0; rt_lock_table(p->igp_table); @@ -878,6 +957,10 @@ bgp_start(struct proto *P) p->startup_timer->hook = bgp_startup_timeout; p->startup_timer->data = p; + p->gr_timer = tm_new(p->p.pool); + p->gr_timer->hook = bgp_graceful_restart_timeout; + p->gr_timer->data = p; + p->local_id = proto_get_router_id(P->cf); if (p->rr_client) p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id; @@ -885,6 +968,9 @@ bgp_start(struct proto *P) p->remote_id = 0; p->source_addr = p->cf->source_addr; + if (P->gr_recovery) + proto_graceful_restart_lock(P); + /* * Before attempting to create the connection, we need to lock the * port, so that are sure we're the only instance attempting to talk @@ -985,6 +1071,7 @@ bgp_init(struct proto_config *C) P->import_control = bgp_import_control; P->neigh_notify = bgp_neigh_notify; P->reload_routes = bgp_reload_routes; + P->feed_done = bgp_feed_done; P->rte_better = bgp_rte_better; P->rte_recalculate = c->deterministic_med ? 
bgp_rte_recalculate : NULL; @@ -1164,7 +1251,7 @@ bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code) static char *bgp_state_names[] = { "Idle", "Connect", "Active", "OpenSent", "OpenConfirm", "Established", "Close" }; static char *bgp_err_classes[] = { "", "Error: ", "Socket: ", "Received: ", "BGP Error: ", "Automatic shutdown: ", ""}; -static char *bgp_misc_errors[] = { "", "Neighbor lost", "Invalid next hop", "Kernel MD5 auth failed", "No listening socket", "BFD session down" }; +static char *bgp_misc_errors[] = { "", "Neighbor lost", "Invalid next hop", "Kernel MD5 auth failed", "No listening socket", "BFD session down", "Graceful restart"}; static char *bgp_auto_errors[] = { "", "Route limit exceeded"}; static const char * @@ -1225,25 +1312,32 @@ bgp_show_proto_info(struct proto *P) cli_msg(-1006, " Neighbor address: %I%J", p->cf->remote_ip, p->cf->iface); cli_msg(-1006, " Neighbor AS: %u", p->remote_as); + if (p->gr_active) + cli_msg(-1006, " Neighbor graceful restart active"); + if (P->proto_state == PS_START) { struct bgp_conn *oc = &p->outgoing_conn; if ((p->start_state < BSS_CONNECT) && (p->startup_timer->expires)) - cli_msg(-1006, " Error wait: %d/%d", + cli_msg(-1006, " Error wait: %d/%d", p->startup_timer->expires - now, p->startup_delay); if ((oc->state == BS_ACTIVE) && (oc->connect_retry_timer->expires)) - cli_msg(-1006, " Start delay: %d/%d", + cli_msg(-1006, " Start delay: %d/%d", oc->connect_retry_timer->expires - now, p->cf->start_delay_time); + + if (p->gr_active && p->gr_timer->expires) + cli_msg(-1006, " Restart timer: %d/-", p->gr_timer->expires - now); } else if (P->proto_state == PS_UP) { cli_msg(-1006, " Neighbor ID: %R", p->remote_id); - cli_msg(-1006, " Neighbor caps: %s%s%s%s", + cli_msg(-1006, " Neighbor caps: %s%s%s%s%s", c->peer_refresh_support ? " refresh" : "", + c->peer_gr_able ? " restart-able" : (c->peer_gr_aware ? " restart-aware" : ""), c->peer_as4_support ? 
" AS4" : "", (c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "", (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : ""); diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 170b6bbe..da0114c2 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -48,6 +48,8 @@ struct bgp_config { int secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */ int add_path; /* Use ADD-PATH extension [draft] */ int allow_local_as; /* Allow that number of local ASNs in incoming AS_PATHs */ + int gr_mode; /* Graceful restart mode (BGP_GR_*) */ + unsigned gr_time; /* Graceful restart timeout */ unsigned connect_retry_time; unsigned hold_time, initial_hold_time; unsigned keepalive_time; @@ -73,6 +75,15 @@ struct bgp_config { #define ADD_PATH_TX 2 #define ADD_PATH_FULL 3 +#define BGP_GR_ABLE 1 +#define BGP_GR_AWARE 2 + +/* For peer_gr_flags */ +#define BGP_GRF_RESTART 0x80 + +/* For peer_gr_aflags */ +#define BGP_GRF_FORWARDING 0x80 + struct bgp_conn { struct bgp_proto *bgp; @@ -90,6 +101,11 @@ struct bgp_conn { u8 peer_refresh_support; /* Peer supports route refresh [RFC2918] */ u8 peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */ u8 peer_add_path; /* Peer supports ADD-PATH [draft] */ + u8 peer_gr_aware; + u8 peer_gr_able; + u16 peer_gr_time; + u8 peer_gr_flags; + u8 peer_gr_aflags; unsigned hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */ }; @@ -107,6 +123,8 @@ struct bgp_proto { u32 rr_cluster_id; /* Route reflector cluster ID */ int rr_client; /* Whether neighbor is RR client of me */ int rs_client; /* Whether neighbor is RS client of me */ + u8 gr_ready; /* Neighbor could do graceful restart */ + u8 gr_active; /* Neighbor is doing graceful restart */ struct bgp_conn *conn; /* Connection we have established */ struct bgp_conn outgoing_conn; /* Outgoing connection we're working with */ struct bgp_conn incoming_conn; /* Incoming connection we have neither accepted nor rejected yet */ @@ -117,12 +135,14 @@ struct bgp_proto { 
rtable *igp_table; /* Table used for recursive next hop lookups */ struct event *event; /* Event for respawning and shutting process */ struct timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */ + struct timer *gr_timer; /* Timer waiting for reestablishment after graceful restart */ struct bgp_bucket **bucket_hash; /* Hash table of attribute buckets */ unsigned int hash_size, hash_count, hash_limit; HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ slab *prefix_slab; /* Slab holding prefix nodes */ list bucket_queue; /* Queue of buckets to send */ struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ + unsigned send_end_mark; /* End-of-RIB mark scheduled for transmit */ unsigned startup_delay; /* Time to delay protocol startup by due to errors */ bird_clock_t last_proto_error; /* Time of last error that leads to protocol stop */ u8 last_error_class; /* Error class of last error */ @@ -172,6 +192,8 @@ void bgp_conn_enter_openconfirm_state(struct bgp_conn *conn); void bgp_conn_enter_established_state(struct bgp_conn *conn); void bgp_conn_enter_close_state(struct bgp_conn *conn); void bgp_conn_enter_idle_state(struct bgp_conn *conn); +void bgp_handle_graceful_restart(struct bgp_proto *p); +void bgp_graceful_restart_done(struct bgp_proto *p); void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code); void bgp_stop(struct bgp_proto *p, unsigned subcode); @@ -313,6 +335,7 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi #define BEM_INVALID_MD5 3 /* MD5 authentication kernel request failed (possibly not supported) */ #define BEM_NO_SOCKET 4 #define BEM_BFD_DOWN 5 +#define BEM_GRACEFUL_RESTART 6 /* Automatic shutdown error codes */ diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 76a76470..6b885032 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -26,7 +26,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, 
PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC, - SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX) + SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, GRACEFUL, RESTART, AWARE) CF_GRAMMAR @@ -50,6 +50,8 @@ bgp_proto_start: proto_start BGP { BGP_CFG->advertise_ipv4 = 1; BGP_CFG->interpret_communities = 1; BGP_CFG->default_local_pref = 100; + BGP_CFG->gr_mode = BGP_GR_AWARE; + BGP_CFG->gr_time = 120; } ; @@ -115,6 +117,9 @@ bgp_proto: | bgp_proto ADD PATHS bool ';' { BGP_CFG->add_path = $4 ? ADD_PATH_FULL : 0; } | bgp_proto ALLOW LOCAL AS ';' { BGP_CFG->allow_local_as = -1; } | bgp_proto ALLOW LOCAL AS expr ';' { BGP_CFG->allow_local_as = $5; } + | bgp_proto GRACEFUL RESTART bool ';' { BGP_CFG->gr_mode = $4; } + | bgp_proto GRACEFUL RESTART AWARE ';' { BGP_CFG->gr_mode = BGP_GR_AWARE; } + | bgp_proto GRACEFUL RESTART TIME expr ';' { BGP_CFG->gr_time = $5; } | bgp_proto IGP TABLE rtable ';' { BGP_CFG->igp_table = $4; } | bgp_proto TTL SECURITY bool ';' { BGP_CFG->ttl_security = $4; } | bgp_proto BFD bool ';' { BGP_CFG->bfd = $3; cf_check_bfd($3); } diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index 649d8078..2d4da8c9 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -122,7 +122,7 @@ bgp_create_notification(struct bgp_conn *conn, byte *buf) #ifdef IPV6 static byte * -bgp_put_cap_ipv6(struct bgp_conn *conn UNUSED, byte *buf) +bgp_put_cap_ipv6(struct bgp_proto *p UNUSED, byte *buf) { *buf++ = 1; /* Capability 1: Multiprotocol extensions */ *buf++ = 4; /* Capability data length */ @@ -136,7 +136,7 @@ bgp_put_cap_ipv6(struct bgp_conn *conn UNUSED, byte *buf) #else static byte * -bgp_put_cap_ipv4(struct bgp_conn *conn UNUSED, byte *buf) +bgp_put_cap_ipv4(struct bgp_proto *p UNUSED, byte *buf) { *buf++ = 1; /* Capability 1: Multiprotocol extensions */ *buf++ = 4; /* Capability data length */ @@ -149,7 +149,7 @@ 
bgp_put_cap_ipv4(struct bgp_conn *conn UNUSED, byte *buf) #endif static byte * -bgp_put_cap_rr(struct bgp_conn *conn UNUSED, byte *buf) +bgp_put_cap_rr(struct bgp_proto *p UNUSED, byte *buf) { *buf++ = 2; /* Capability 2: Support for route refresh */ *buf++ = 0; /* Capability data length */ @@ -157,16 +157,44 @@ bgp_put_cap_rr(struct bgp_conn *conn UNUSED, byte *buf) } static byte * -bgp_put_cap_as4(struct bgp_conn *conn, byte *buf) +bgp_put_cap_gr1(struct bgp_proto *p, byte *buf) +{ + *buf++ = 64; /* Capability 64: Support for graceful restart */ + *buf++ = 6; /* Capability data length */ + + put_u16(buf, p->cf->gr_time); + if (p->p.gr_recovery) + buf[0] |= BGP_GRF_RESTART; + buf += 2; + + *buf++ = 0; /* Appropriate AF */ + *buf++ = BGP_AF; + *buf++ = 1; /* and SAFI 1 */ + *buf++ = p->p.gr_recovery ? BGP_GRF_FORWARDING : 0; + + return buf; +} + +static byte * +bgp_put_cap_gr2(struct bgp_proto *p, byte *buf) +{ + *buf++ = 64; /* Capability 64: Support for graceful restart */ + *buf++ = 2; /* Capability data length */ + put_u16(buf, 0); + return buf + 2; +} + +static byte * +bgp_put_cap_as4(struct bgp_proto *p, byte *buf) { *buf++ = 65; /* Capability 65: Support for 4-octet AS number */ *buf++ = 4; /* Capability data length */ - put_u32(buf, conn->bgp->local_as); + put_u32(buf, p->local_as); return buf + 4; } static byte * -bgp_put_cap_add_path(struct bgp_conn *conn, byte *buf) +bgp_put_cap_add_path(struct bgp_proto *p, byte *buf) { *buf++ = 69; /* Capability 69: Support for ADD-PATH */ *buf++ = 4; /* Capability data length */ @@ -175,7 +203,7 @@ bgp_put_cap_add_path(struct bgp_conn *conn, byte *buf) *buf++ = BGP_AF; *buf++ = 1; /* SAFI 1 */ - *buf++ = conn->bgp->cf->add_path; + *buf++ = p->cf->add_path; return buf; } @@ -206,21 +234,26 @@ bgp_create_open(struct bgp_conn *conn, byte *buf) #ifndef IPV6 if (p->cf->advertise_ipv4) - cap = bgp_put_cap_ipv4(conn, cap); + cap = bgp_put_cap_ipv4(p, cap); #endif #ifdef IPV6 - cap = bgp_put_cap_ipv6(conn, cap); + cap = 
bgp_put_cap_ipv6(p, cap); #endif if (p->cf->enable_refresh) - cap = bgp_put_cap_rr(conn, cap); + cap = bgp_put_cap_rr(p, cap); + + if (p->cf->gr_mode == BGP_GR_ABLE) + cap = bgp_put_cap_gr1(p, cap); + else if (p->cf->gr_mode == BGP_GR_AWARE) + cap = bgp_put_cap_gr2(p, cap); if (p->cf->enable_as4) - cap = bgp_put_cap_as4(conn, cap); + cap = bgp_put_cap_as4(p, cap); if (p->cf->add_path) - cap = bgp_put_cap_add_path(conn, cap); + cap = bgp_put_cap_add_path(p, cap); cap_len = cap - buf - 12; if (cap_len > 0) @@ -351,6 +384,16 @@ bgp_create_update(struct bgp_conn *conn, byte *buf) return NULL; } +static byte * +bgp_create_end_mark(struct bgp_conn *conn, byte *buf) +{ + struct bgp_proto *p = conn->bgp; + BGP_TRACE(D_PACKETS, "Sending End-of-RIB"); + + put_u32(buf, 0); + return buf+4; +} + #else /* IPv6 version */ static inline int @@ -520,6 +563,26 @@ bgp_create_update(struct bgp_conn *conn, byte *buf) return NULL; } +static byte * +bgp_create_end_mark(struct bgp_conn *conn, byte *buf) +{ + struct bgp_proto *p = conn->bgp; + BGP_TRACE(D_PACKETS, "Sending End-of-RIB"); + + put_u16(buf+0, 0); + put_u16(buf+2, 6); /* length 4-9 */ + buf += 4; + + /* Empty MP_UNREACH_NLRI atribute */ + *buf++ = BAF_OPTIONAL; + *buf++ = BA_MP_UNREACH_NLRI; + *buf++ = 3; /* Length 7-9 */ + *buf++ = 0; /* AFI */ + *buf++ = BGP_AF_IPV6; + *buf++ = 1; /* SAFI */ + return buf; +} + #endif static byte * @@ -606,10 +669,16 @@ bgp_fire_tx(struct bgp_conn *conn) { end = bgp_create_update(conn, pkt); type = PKT_UPDATE; + if (!end) { conn->packets_to_send = 0; - return 0; + + if (!p->send_end_mark) + return 0; + + p->send_end_mark = 0; + end = bgp_create_end_mark(conn, pkt); } } else @@ -678,6 +747,22 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) conn->peer_refresh_support = 1; break; + case 64: /* Graceful restart capability, RFC 4724 */ + if (cl % 4 != 2) + goto err; + conn->peer_gr_aware = 1; + conn->peer_gr_able = 0; + conn->peer_gr_time = get_u16(opt + 2) & 0x0fff; + 
conn->peer_gr_flags = opt[2] & 0xf0; + conn->peer_gr_aflags = 0; + for (i = 2; i < cl; i += 4) + if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */ + { + conn->peer_gr_able = 1; + conn->peer_gr_aflags = opt[2+i+3]; + } + break; + case 65: /* AS4 capability, RFC 4893 */ if (cl != 4) goto err; @@ -704,7 +789,7 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) } return; - err: + err: bgp_error(conn, 2, 0, NULL, 0); return; } @@ -807,12 +892,17 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; switch (other->state) { - case BS_IDLE: case BS_CONNECT: case BS_ACTIVE: + /* Stop outgoing connection attempts */ + bgp_conn_enter_idle_state(other); + break; + + case BS_IDLE: case BS_OPENSENT: case BS_CLOSE: break; + case BS_OPENCONFIRM: if ((p->local_id < id) == (conn == &p->incoming_conn)) { @@ -838,6 +928,7 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) p->as4_session = p->cf->enable_as4 && conn->peer_as4_support; p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX); p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX); + p->gr_ready = p->cf->gr_mode && conn->peer_gr_able; if (p->add_path_tx) p->p.accept_ra_types = RA_ANY; @@ -849,6 +940,20 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) bgp_conn_enter_openconfirm_state(conn); } + +static inline void +bgp_rx_end_mark(struct bgp_proto *p) +{ + BGP_TRACE(D_PACKETS, "Got End-of-RIB"); + + if (p->p.gr_recovery) + proto_graceful_restart_unlock(&p->p); + + if (p->gr_active) + bgp_graceful_restart_done(p); +} + + #define DECODE_PREFIX(pp, ll) do { \ if (p->add_path_rx) \ { \ @@ -983,6 +1088,13 @@ bgp_do_rx_update(struct bgp_conn *conn, u32 path_id = 0; u32 last_id = 0; + /* Check for End-of-RIB marker */ + if (!withdrawn_len && !attr_len && !nlri_len) + { + bgp_rx_end_mark(p); + return; + } + /* Withdraw 
routes */ while (withdrawn_len) { @@ -1088,6 +1200,14 @@ bgp_do_rx_update(struct bgp_conn *conn, if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */ return; + /* Check for End-of-RIB marker: no withdrawn routes, no NLRI, and only an empty IPv6 MP_UNREACH_NLRI attribute */ + if ((attr_len < 8) && !withdrawn_len && !nlri_len && + (p->mp_unreach_len == 3) && (get_u16(p->mp_unreach_start) == BGP_AF_IPV6)) + { + bgp_rx_end_mark(p); + return; + } + DO_NLRI(mp_unreach) { while (len) diff --git a/sysdep/unix/krt.Y b/sysdep/unix/krt.Y index 469c136d..630cda38 100644 --- a/sysdep/unix/krt.Y +++ b/sysdep/unix/krt.Y @@ -17,7 +17,7 @@ CF_DEFINES CF_DECLS -CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, KRT_SOURCE, KRT_METRIC) +CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, KRT_SOURCE, KRT_METRIC) CF_GRAMMAR @@ -46,6 +46,7 @@ kern_item: #endif } | DEVICE ROUTES bool { THIS_KRT->devroutes = $3; } + | GRACEFUL RESTART bool { THIS_KRT->graceful_restart = $3; } ; /* Kernel interface protocol */ diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 6fdef619..bff3001f 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -653,6 +653,13 @@ krt_got_route(struct krt_proto *p, rte *e) return; } + if (!p->ready) + { + /* We wait for the initial feed to have correct KRF_INSTALLED flag */ + verdict = KRF_IGNORE; + goto sentenced; + } + old = net->routes; if ((net->n.flags & KRF_INSTALLED) && rte_is_valid(old)) { @@ -779,7 +786,9 @@ krt_prune(struct krt_proto *p) if (KRT_CF->learn) krt_learn_prune(p); #endif - p->initialized = 1; + + if (p->ready) + p->initialized = 1; } void @@ -852,7 +861,7 @@ krt_scan_timer_start(struct krt_proto *p) krt_scan_count++; - tm_start(krt_scan_timer, 0); + tm_start(krt_scan_timer, 1); } static void @@ -867,6 +876,12 @@ krt_scan_timer_stop(struct krt_proto *p) } } +static void +krt_scan_timer_kick(struct krt_proto *p UNUSED) +{ + tm_start(krt_scan_timer, 0); +} + #else static void @@ -885,7 +900,7 @@ static void krt_scan_timer_start(struct krt_proto *p) { 
p->scan_timer = tm_new_set(p->p.pool, krt_scan, p, 0, KRT_CF->scan_time); - tm_start(p->scan_timer, 0); + tm_start(p->scan_timer, 1); } static void @@ -894,6 +909,12 @@ krt_scan_timer_stop(struct krt_proto *p) tm_stop(p->scan_timer); } +static void +krt_scan_timer_kick(struct krt_proto *p UNUSED) +{ + tm_start(p->scan_timer, 0); +} + #endif @@ -970,6 +991,16 @@ krt_notify(struct proto *P, struct rtable *table UNUSED, net *net, krt_replace_rte(p, net, new, old, eattrs); } +static void +krt_feed_done(struct proto *P) +{ + struct krt_proto *p = (struct krt_proto *) P; + + p->ready = 1; + krt_scan_timer_kick(p); +} + + static int krt_rte_same(rte *a, rte *b) { @@ -992,6 +1023,7 @@ krt_init(struct proto_config *c) p->p.accept_ra_types = RA_OPTIMAL; p->p.import_control = krt_import_control; p->p.rt_notify = krt_notify; + p->p.feed_done = krt_feed_done; p->p.make_tmp_attrs = krt_make_tmp_attrs; p->p.store_tmp_attrs = krt_store_tmp_attrs; p->p.rte_same = krt_rte_same; @@ -1015,6 +1047,9 @@ krt_start(struct proto *P) krt_scan_timer_start(p); + if (P->gr_recovery && KRT_CF->graceful_restart) + P->gr_wait = 1; + return PS_UP; } @@ -1029,6 +1064,9 @@ krt_shutdown(struct proto *P) if (p->initialized && !KRT_CF->persist) krt_flush_routes(p); + p->ready = 0; + p->initialized = 0; + krt_sys_shutdown(p); rem_node(&p->krt_node); @@ -1045,7 +1083,7 @@ krt_reconfigure(struct proto *p, struct proto_config *new) if (!krt_sys_reconfigure((struct krt_proto *) p, n, o)) return 0; - /* persist needn't be the same */ + /* persist, graceful restart need not be the same */ return o->scan_time == n->scan_time && o->learn == n->learn && o->devroutes == n->devroutes; } diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index 99983ccd..2cd23165 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -48,6 +48,7 @@ struct krt_config { int scan_time; /* How often we re-scan routes */ int learn; /* Learn routes from other sources */ int devroutes; /* Allow export of device routes */ + int 
graceful_restart; /* Regard graceful restart recovery */ }; struct krt_proto { @@ -63,7 +64,8 @@ struct krt_proto { #endif node krt_node; /* Node in krt_proto_list */ - int initialized; /* First scan has already been finished */ + byte ready; /* Initial feed has been finished */ + byte initialized; /* First scan has been finished */ }; extern pool *krt_pool; diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index e9217bc9..31094c52 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -602,7 +602,7 @@ signal_init(void) * Parsing of command-line arguments */ -static char *opt_list = "c:dD:ps:P:u:g:f"; +static char *opt_list = "c:dD:ps:P:u:g:fR"; static int parse_and_exit; char *bird_name; static char *use_user; @@ -612,7 +612,7 @@ static int run_in_foreground = 0; static void usage(void) { - fprintf(stderr, "Usage: %s [-c ] [-d] [-D ] [-p] [-s ] [-P ] [-u ] [-g ] [-f]\n", bird_name); + fprintf(stderr, "Usage: %s [-c ] [-d] [-D ] [-p] [-s ] [-P ] [-u ] [-g ] [-f] [-R]\n", bird_name); exit(1); } @@ -723,6 +723,9 @@ parse_args(int argc, char **argv) case 'f': run_in_foreground = 1; break; + case 'R': + graceful_restart_recovery(); + break; default: usage(); } @@ -805,6 +808,8 @@ main(int argc, char **argv) config_commit(conf, RECONFIG_HARD, 0); + graceful_restart_init(); + #ifdef LOCAL_DEBUG async_dump_flag = 1; #endif From 6eda3f135f5bab4db456531d25bc3e5f669ec22e Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Sun, 23 Mar 2014 01:35:33 +0100 Subject: [PATCH 2/3] Documentation (and minor fixes) for BGP graceful restart. --- doc/bird.sgml | 95 ++++++++++++++++++++++------- nest/proto.c | 154 +++++++++++++++++++++++++++++++++++------------- nest/rt-table.c | 53 ++++++++++++++--- proto/bgp/bgp.c | 42 ++++++++++++- 4 files changed, 274 insertions(+), 70 deletions(-) diff --git a/doc/bird.sgml b/doc/bird.sgml index e9c61526..3ea90920 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -157,6 +157,9 @@ options. 
The most important ones are: -f run bird in foreground. + + -R + apply graceful restart recovery after start.

BIRD writes messages about its work to log files or syslog (according to config). @@ -187,6 +190,7 @@ configuration, but it is generally easy -- BIRD needs just the standard library, privileges to read the config file and create the control socket and the CAP_NET_* capabilities. + About routing tables

BIRD has one or more routing tables which may or may not be @@ -242,6 +246,20 @@ using comparison and ordering). Minor advantage is that routes are shown sorted in Graceful restart + +

When BIRD is started after restart or crash, it repopulates routing tables in +an uncoordinated manner, like after a clean start. This may be impractical in some +cases, because if the forwarding plane (i.e. kernel routing tables) remains +intact, then its synchronization with BIRD would temporarily disrupt packet +forwarding until protocols converge. Graceful restart is a mechanism that could +help with this issue. Generally, it works by starting protocols and letting them +repopulate routing tables while deferring route propagation until protocols +acknowledge their convergence. Note that graceful restart behavior has to be +configured for all relevant protocols and requires protocol-specific support +(currently implemented for Kernel and BGP protocols), it is activated for +a particular boot by option -R. Configuration @@ -371,6 +389,12 @@ protocol rip { would accept IPv6 routes only). Such behavior was default in older versions of BIRD. + graceful restart wait + During graceful restart recovery, BIRD waits for convergence of routing + protocols. This option allows to specify a timeout for the recovery to + prevent waiting indefinitely if some protocols cannot converge. Default: + 240 seconds. + timeformat route|protocol|base|log " This option allows to specify a format of date/time used by BIRD. The first argument specifies for which purpose such @@ -1493,6 +1517,8 @@ extended communities (RFC 4360), route reflectors (RFC 4456), +graceful restart +(RFC 4724), multiprotocol extensions (RFC 4760), 4B AS numbers @@ -1502,9 +1528,7 @@ and 4B AS numbers in extended communities For IPv6, it uses the standard multiprotocol extensions defined in -RFC 2283 -including changes described in the -latest draft +RFC 4760 and applied to IPv6 according to RFC 2545. @@ -1716,6 +1740,26 @@ for each neighbor using the following configuration parameters: capability and accepts such requests. Even when disabled, BIRD can send route refresh requests. Default: on. 
+ graceful restart + When a BGP speaker restarts or crashes, neighbors will discard all + received paths from the speaker, which disrupts packet forwarding even + when the forwarding plane of the speaker remains intact. RFC 4724 + specifies an optional graceful restart mechanism to alleviate this + issue. This option controls the mechanism. It has three states: + Disabled, when no support is provided. Aware, when the graceful restart + support is announced and the support for restarting neighbors is + provided, but no local graceful restart is allowed (i.e. receiving-only + role). Enabled, when the full graceful restart support is provided + (i.e. both restarting and receiving role). Note that proper support for + local graceful restart requires also configuration of other protocols. + Default: aware. + + graceful restart time + The restart time is announced in the BGP graceful restart capability + and specifies how long the neighbor would wait for the BGP session to + re-establish after a restart before deleting stale routes. Default: + 120 seconds. + interpret communities RFC 1997 demands that BGP speaker should process well-known communities like no-export (65535, 65281) or no-advertise (65535, 65282). For @@ -2063,25 +2107,36 @@ overcome using another routing table and the pipe protocol. Configuration

- persist Tell BIRD to leave all its routes in the - routing tables when it exits (instead of cleaning them up). - scan time Time in seconds between two consecutive scans of the - kernel routing table. - learn Enable learning of routes added to the kernel - routing tables by other routing daemons or by the system administrator. - This is possible only on systems which support identification of route - authorship. + persist + Tell BIRD to leave all its routes in the routing tables when it exits + (instead of cleaning them up). - device routes Enable export of device - routes to the kernel routing table. By default, such routes - are rejected (with the exception of explicitly configured - device routes from the static protocol) regardless of the - export filter to protect device routes in kernel routing table - (managed by OS itself) from accidental overwriting or erasing. + scan time + Time in seconds between two consecutive scans of the kernel routing + table. - kernel table Select which kernel table should - this particular instance of the Kernel protocol work with. Available - only on systems supporting multiple routing tables. + learn + Enable learning of routes added to the kernel routing tables by other + routing daemons or by the system administrator. This is possible only on + systems which support identification of route authorship. + + device routes + Enable export of device routes to the kernel routing table. By default, + such routes are rejected (with the exception of explicitly configured + device routes from the static protocol) regardless of the export filter + to protect device routes in kernel routing table (managed by OS itself) + from accidental overwriting or erasing. + + kernel table + Select which kernel table should this particular instance of the Kernel + protocol work with. Available only on systems supporting multiple + routing tables. + + graceful restart + Participate in graceful restart recovery. 
If this option is enabled and + a graceful restart recovery is active, the Kernel protocol will defer + synchronization of routing tables until the end of the recovery. Note + that import of kernel routes to BIRD is not affected. Attributes diff --git a/nest/proto.c b/nest/proto.c index 2bc3e319..e990b48f 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -51,6 +51,8 @@ static char *c_states[] = { "HUNGRY", "???", "HAPPY", "FLUSHING" }; static void proto_flush_loop(void *); static void proto_shutdown_loop(struct timer *); static void proto_rethink_goal(struct proto *p); +static void proto_want_export_up(struct proto *p); +static void proto_fell_down(struct proto *p); static char *proto_state_name(struct proto *p); static void @@ -151,21 +153,20 @@ extern pool *rt_table_pool; * @t: routing table to connect to * @stats: per-table protocol statistics * - * This function creates a connection between the protocol instance @p - * and the routing table @t, making the protocol hear all changes in - * the table. + * This function creates a connection between the protocol instance @p and the + * routing table @t, making the protocol hear all changes in the table. * - * The announce hook is linked in the protocol ahook list and, if the - * protocol accepts routes, also in the table ahook list. Announce - * hooks are allocated from the routing table resource pool, they are - * unlinked from the table ahook list after the protocol went down, - * (in proto_schedule_flush()) and they are automatically freed after the - * protocol is flushed (in proto_fell_down()). + * The announce hook is linked in the protocol ahook list. Announce hooks are + * allocated from the routing table resource pool and when protocol accepts + * routes also in the table ahook list. 
They are linked to the table ahook list + * and unlinked from it depending on export_state (in proto_want_export_up() and + * proto_want_export_down()) and they are automatically freed after the protocol + * is flushed (in proto_fell_down()). * - * Unless you want to listen to multiple routing tables (as the Pipe - * protocol does), you needn't to worry about this function since the - * connection to the protocol's primary routing table is initialized - * automatically by the core code. + * Unless you want to listen to multiple routing tables (as the Pipe protocol + * does), you needn't worry about this function since the connection to the + * protocol's primary routing table is initialized automatically by the core + * code. */ struct announce_hook * proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *stats) @@ -183,7 +184,7 @@ proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *s h->next = p->ahooks; p->ahooks = h; - if (p->rt_notify && (p->export_state == ES_READY)) + if (p->rt_notify && (p->export_state != ES_DOWN)) add_tail(&t->hooks, &h->n); return h; } @@ -659,16 +660,59 @@ proto_rethink_goal(struct proto *p) } +/** + * DOC: Graceful restart recovery + * + * Graceful restart of a router is a process in which the routing plane (e.g. BIRD) + * restarts but both the forwarding plane (e.g. kernel routing table) and routing + * neighbors keep proper routes, and therefore uninterrupted packet forwarding + * is maintained. + * + * BIRD implements graceful restart recovery by deferring export of routes to + * protocols until routing tables are refilled with the expected content. After + * start, protocols generate routes as usual, but routes are not propagated to + * them, until protocols report that they generated all routes. After that, + * graceful restart recovery is finished and the export (and the initial feed) + * to protocols is enabled. 
+ * + * When graceful restart recovery need is detected during initialization, then + * enabled protocols are marked with @gr_recovery flag before start. Such + * protocols then decide how to proceed with graceful restart, participation is + * voluntary. Protocols could lock the recovery by proto_graceful_restart_lock() + * (stored in @gr_lock flag), which means that they want to postpone the end of + * the recovery until they converge and then unlock it. They also could set + * @gr_wait before advancing to %PS_UP, which means that the core should defer + * route export to that protocol until the end of the recovery. This should be + * done by protocols that expect their neighbors to keep the proper routes + * (kernel table, BGP sessions with BGP graceful restart capability). + * + * The graceful restart recovery is finished when either all graceful restart + * locks are unlocked or when graceful restart wait timer fires. + * + */ -static void graceful_restart_done(struct timer *t UNUSED); -static void proto_want_export_up(struct proto *p); +static void graceful_restart_done(struct timer *t); +/** + * graceful_restart_recovery - request initial graceful restart recovery + * + * Called by the platform initialization code if the need for recovery + * after graceful restart is detected during boot. Has to be called + * before protos_commit(). + */ void graceful_restart_recovery(void) { graceful_restart_state = GRS_INIT; } +/** + * graceful_restart_init - initialize graceful restart + * + * When graceful restart recovery was requested, the function starts an active + * phase of the recovery and initializes graceful restart wait timer. The + * function has to be called after protos_commit(). 
+ */ void graceful_restart_init(void) { @@ -689,6 +733,15 @@ graceful_restart_init(void) tm_start(gr_wait_timer, config->gr_wait); } +/** + * graceful_restart_done - finalize graceful restart + * + * When there are no locks on graceful restart, the function finalizes the + * graceful restart recovery. Protocols postponing route export until the end of + * the recovery are awakened and the export to them is enabled. All other + * related state is cleared. The function is also called when the graceful + * restart wait timer fires (but there are still some locks). + */ static void graceful_restart_done(struct timer *t UNUSED) { @@ -727,7 +780,19 @@ graceful_restart_show_status(void) cli_msg(-24, " Wait timer is %d/%d", tm_remains(gr_wait_timer), config->gr_wait); } -/* Just from start hook */ +/** + * proto_graceful_restart_lock - lock graceful restart by protocol + * @p: protocol instance + * + * This function allows a protocol to postpone the end of graceful restart + * recovery until it converges. The lock is removed when the protocol calls + * proto_graceful_restart_unlock() or when the protocol is stopped. + * + * The function has to be called during the initial phase of graceful restart + * recovery and only for protocols that are part of graceful restart (i.e. their + * @gr_recovery is set), which means it should be called from protocol start + * hooks. + */ void proto_graceful_restart_lock(struct proto *p) { @@ -741,6 +806,13 @@ proto_graceful_restart_lock(struct proto *p) graceful_restart_locks++; } +/** + * proto_graceful_restart_unlock - unlock graceful restart by protocol + * @p: protocol instance + * + * This function unlocks a lock from proto_graceful_restart_lock(). It is also + * automatically called when the lock holding protocol goes down. 
+ */ void proto_graceful_restart_unlock(struct proto *p) { @@ -867,29 +939,6 @@ protos_build(void) proto_flush_event->hook = proto_flush_loop; proto_shutdown_timer = tm_new(proto_pool); proto_shutdown_timer->hook = proto_shutdown_loop; - proto_shutdown_timer = tm_new(proto_pool); - proto_shutdown_timer->hook = proto_shutdown_loop; -} - -static void -proto_fell_down(struct proto *p) -{ - DBG("Protocol %s down\n", p->name); - - u32 all_routes = p->stats.imp_routes + p->stats.filt_routes; - if (all_routes != 0) - log(L_ERR "Protocol %s is down but still has %d routes", p->name, all_routes); - - bzero(&p->stats, sizeof(struct proto_stats)); - proto_free_ahooks(p); - - if (! p->proto->multitable) - rt_unlock_table(p->table); - - if (p->proto->cleanup) - p->proto->cleanup(p); - - proto_rethink_goal(p); } static void @@ -1066,6 +1115,10 @@ proto_request_feeding(struct proto *p) { ASSERT(p->proto_state == PS_UP); + /* Do nothing if we are still waiting for feeding */ + if (p->export_state == ES_DOWN) + return; + /* If we are already feeding, we want to restart it */ if (p->export_state == ES_FEEDING) { @@ -1220,6 +1273,27 @@ proto_falling_down(struct proto *p) proto_graceful_restart_unlock(p); } +static void +proto_fell_down(struct proto *p) +{ + DBG("Protocol %s down\n", p->name); + + u32 all_routes = p->stats.imp_routes + p->stats.filt_routes; + if (all_routes != 0) + log(L_ERR "Protocol %s is down but still has %d routes", p->name, all_routes); + + bzero(&p->stats, sizeof(struct proto_stats)); + proto_free_ahooks(p); + + if (! 
p->proto->multitable) + rt_unlock_table(p->table); + + if (p->proto->cleanup) + p->proto->cleanup(p); + + proto_rethink_goal(p); +} + /** * proto_notify_state - notify core about protocol state change diff --git a/nest/rt-table.c b/nest/rt-table.c index bc911729..4295f836 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -1110,6 +1110,21 @@ rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter return v > 0; } + +/** + * rt_refresh_begin - start a refresh cycle + * @t: related routing table + * @ah: related announce hook + * + * This function starts a refresh cycle for given routing table and announce + * hook. The refresh cycle is a sequence where the protocol sends all its valid + * routes to the routing table (by rte_update()). After that, all protocol + * routes (more precisely routes with @ah as @sender) not sent during the + * refresh cycle but still in the table from the past are pruned. This is + * implemented by marking all related routes as stale by REF_STALE flag in + * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD + * flag in rt_refresh_end() and then removing such routes in the prune loop. + */ void rt_refresh_begin(rtable *t, struct announce_hook *ah) { @@ -1126,6 +1141,14 @@ rt_refresh_begin(rtable *t, struct announce_hook *ah) FIB_WALK_END; } +/** + * rt_refresh_end - end a refresh cycle + * @t: related routing table + * @ah: related announce hook + * + * This function ends a refresh cycle for given routing table and announce + * hook. See rt_refresh_begin() for description of refresh cycles. + */ void rt_refresh_end(rtable *t, struct announce_hook *ah) { @@ -1405,6 +1428,19 @@ again: return 1; } +/** + * rt_prune_table - prune a routing table + * + * This function scans the routing table @tab and removes routes belonging to + * flushing protocols, discarded routes and also stale network entries, in a + * similar fashion to rt_prune_loop(). Returns 1 when all such routes are + * pruned. 
Contrary to rt_prune_loop(), this function is not a part of the + * protocol flushing loop, but it is called from rt_event() for just one routing + * table. + * + * Note that rt_prune_table() and rt_prune_loop() share (for each table) the + * prune state (@prune_state) and also the pruning iterator (@prune_fit). + */ static inline int rt_prune_table(rtable *tab) { @@ -1415,16 +1451,15 @@ rt_prune_table(rtable *tab) /** * rt_prune_loop - prune routing tables * - * The prune loop scans routing tables and removes routes belonging to - * flushing protocols and also stale network entries. Returns 1 when - * all such routes are pruned. It is a part of the protocol flushing - * loop. + * The prune loop scans routing tables and removes routes belonging to flushing + * protocols, discarded routes and also stale network entries. Returns 1 when + * all such routes are pruned. It is a part of the protocol flushing loop. * - * The prune loop runs in two steps. In the first step it prunes just - * the routes with flushing senders (in explicitly marked tables) so - * the route removal is propagated as usual. In the second step, all - * remaining relevant routes are removed. Ideally, there shouldn't be - * any, but it happens when pipe filters are changed. + * The prune loop runs in two steps. In the first step it prunes just the routes + * with flushing senders (in explicitly marked tables) so the route removal is + * propagated as usual. In the second step, all remaining relevant routes are + * removed. Ideally, there shouldn't be any, but it happens when pipe filters + * are changed. */ int rt_prune_loop(void) diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index ae9f6877..326883dd 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -51,6 +51,16 @@ * and bgp_encode_attrs() which does the converse. Both functions are built around a * @bgp_attr_table array describing all important characteristics of all known attributes. 
* Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams. + * + * BGP protocol implements graceful restart in both restarting (local restart) + * and receiving (neighbor restart) roles. The first is handled mostly by the + * graceful restart code in the nest, BGP protocol just handles capabilities, + * sets @gr_wait and locks graceful restart until end-of-RIB mark is received. + * The second is implemented by internal restart of the BGP state to %BS_IDLE + * and protocol state to %PS_START, but keeping the protocol up from the core + * point of view and therefore maintaining received routes. Routing table + * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing + * stale routes after reestablishment of BGP session during graceful restart. */ #undef LOCAL_DEBUG @@ -431,6 +441,17 @@ bgp_conn_enter_idle_state(struct bgp_conn *conn) bgp_conn_leave_established_state(p); } +/** + * bgp_handle_graceful_restart - handle detected BGP graceful restart + * @p: BGP instance + * + * This function is called when a BGP graceful restart of the neighbor is + * detected (when the TCP connection fails or when a new TCP connection + * appears). The function activates processing of the restart - starts routing + * table refresh cycle and activates BGP restart timer. The protocol state goes + * back to %PS_START, but changing BGP state back to %BS_IDLE is left for the + * caller. 
+ */ void bgp_handle_graceful_restart(struct bgp_proto *p) { @@ -448,6 +469,16 @@ bgp_handle_graceful_restart(struct bgp_proto *p) rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook); } +/** + * bgp_graceful_restart_done - finish active BGP graceful restart + * @p: BGP instance + * + * This function is called when the active BGP graceful restart of the neighbor + * should be finished - either successfully (the neighbor sends all paths and + * reports end-of-RIB on the new session) or unsuccessfully (the neighbor does + * not support BGP graceful restart on the new session). The function ends + * routing table refresh cycle and stops BGP restart timer. + */ void bgp_graceful_restart_done(struct bgp_proto *p) { @@ -457,6 +488,15 @@ bgp_graceful_restart_done(struct bgp_proto *p) rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); } +/** + * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer' + * @t: timer + * + * This function is a timeout hook for @gr_timer, implementing BGP restart time + * limit for reestablishment of the BGP session after the graceful restart. When + * fired, we just proceed with the usual protocol restart. + */ + static void bgp_graceful_restart_timeout(timer *t) { @@ -968,7 +1008,7 @@ bgp_start(struct proto *P) p->remote_id = 0; p->source_addr = p->cf->source_addr; - if (P->gr_recovery) + if (p->p.gr_recovery && p->cf->gr_mode) proto_graceful_restart_lock(P); /* From 227af309e55a59f14d1a5a757f17900164bffc97 Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Mon, 24 Mar 2014 12:32:12 +0100 Subject: [PATCH 3/3] Fixes some minor issues in graceful restart. 
--- nest/proto.c | 69 ++++++++++++++++++++++++++++--------------------- proto/bgp/bgp.c | 2 +- 2 files changed, 41 insertions(+), 30 deletions(-) diff --git a/nest/proto.c b/nest/proto.c index e990b48f..13a0833a 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -56,31 +56,10 @@ static void proto_fell_down(struct proto *p); static char *proto_state_name(struct proto *p); static void -proto_enqueue(list *l, struct proto *p) -{ - add_tail(l, &p->n); -} - -static void -proto_set_core_state(struct proto *p, uint state) +proto_relink(struct proto *p) { list *l = NULL; - p->core_state = state; - - if (p->debug & D_STATES) - { - char *name = proto_state_name(p); - if (name != p->last_state_name_announced) - { - p->last_state_name_announced = name; - PD(p, "State changed to %s", proto_state_name(p)); - } - } - else - p->last_state_name_announced = NULL; - - rem_node(&p->n); switch (p->core_state) { case FS_HUNGRY: @@ -95,9 +74,28 @@ proto_set_core_state(struct proto *p, uint state) default: ASSERT(0); } - proto_enqueue(l, p); + + rem_node(&p->n); + add_tail(l, &p->n); } +static void +proto_log_state_change(struct proto *p) +{ + if (p->debug & D_STATES) + { + char *name = proto_state_name(p); + if (name != p->last_state_name_announced) + { + p->last_state_name_announced = name; + PD(p, "State changed to %s", proto_state_name(p)); + } + } + else + p->last_state_name_announced = NULL; +} + + /** * proto_new - create a new protocol instance * @c: protocol configuration @@ -390,7 +388,8 @@ proto_init(struct proto_config *c) q->export_state = ES_DOWN; q->last_state_change = now; - proto_enqueue(&initial_proto_list, q); + add_tail(&initial_proto_list, &q->n); + if (p == &proto_unix_iface) initial_device_proto = q; @@ -758,7 +757,10 @@ graceful_restart_done(struct timer *t UNUSED) /* Resume postponed export of routes */ if ((p->proto_state == PS_UP) && p->gr_wait) + { proto_want_export_up(p); + proto_log_state_change(p); + } /* Cleanup */ p->gr_recovery = 0; @@ -954,6 +956,7 @@ 
proto_feed_more(void *P) { DBG("Feeding protocol %s finished\n", p->name); p->export_state = ES_READY; + proto_log_state_change(p); if (p->feed_done) p->feed_done(p); @@ -1047,7 +1050,9 @@ proto_flush_loop(void *unused UNUSED) DBG("Flushing protocol %s\n", p->name); p->flushing = 0; - proto_set_core_state(p, FS_HUNGRY); + p->core_state = FS_HUNGRY; + proto_relink(p); + proto_log_state_change(p); if (p->proto_state == PS_DOWN) proto_fell_down(p); goto again; @@ -1138,6 +1143,7 @@ proto_request_feeding(struct proto *p) p->stats.exp_routes = 0; proto_schedule_feed(p, 0); + proto_log_state_change(p); } static const char * @@ -1222,7 +1228,8 @@ proto_want_core_up(struct proto *p) proto_reset_limit(p->main_ahook->out_limit); } - proto_set_core_state(p, FS_HAPPY); + p->core_state = FS_HAPPY; + proto_relink(p); } static void @@ -1254,7 +1261,8 @@ proto_want_core_down(struct proto *p) ASSERT(p->core_state == CS_HAPPY); ASSERT(p->export_state == ES_DOWN); - proto_set_core_state(p, FS_FLUSHING); + p->core_state = FS_FLUSHING; + proto_relink(p); proto_schedule_flush_loop(); if (!p->proto->multitable) @@ -1373,6 +1381,7 @@ proto_notify_state(struct proto *p, unsigned ps) if (cs == FS_HUNGRY) /* Shutdown finished */ { + proto_log_state_change(p); proto_fell_down(p); return; /* The protocol might have ceased to exist */ } @@ -1381,6 +1390,8 @@ proto_notify_state(struct proto *p, unsigned ps) default: bug("%s: Invalid state %d", p->name, ps); } + + proto_log_state_change(p); } /* @@ -1404,8 +1415,8 @@ proto_state_name(struct proto *p) case ES_READY: return "up"; default: return "???"; } - case P(PS_STOP, FS_HUNGRY): return "stop"; - case P(PS_STOP, FS_FLUSHING): + case P(PS_STOP, FS_HUNGRY): + case P(PS_STOP, FS_FLUSHING): return "stop"; case P(PS_DOWN, FS_FLUSHING): return "flush"; default: return "???"; } diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 326883dd..ca619f31 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -927,7 +927,7 @@ static void bgp_feed_done(struct 
proto *P) { struct bgp_proto *p = (struct bgp_proto *) P; - if (!p->conn || !p->cf->gr_mode) + if (!p->conn || !p->cf->gr_mode || p->p.refeeding) return; p->send_end_mark = 1;