Nest: Improve GC strategy for rtables
Use a timer (configurable as 'gc period') to schedule routing table GC/pruning to ensure that pruning is done on time but not too often. Randomize GC timers to avoid concentration of GC events from different tables in one loop cycle. Fix a bug that caused the minimum inter-GC interval to be 5 us instead of 5 s. Make the default 'gc period' adaptive based on the number of routing tables, from 10 s for small setups to 600 s for large ones. In a large multi-table RS setup, the patch improved the time of flushing a downed peer from 20-30 min to <2 min and removed 40 s latencies.
This commit is contained in:
parent
9e60b500c7
commit
a8a3d95be5
5 changed files with 70 additions and 12 deletions
|
@ -140,6 +140,7 @@ config_parse(struct config *c)
|
||||||
protos_preconfig(c);
|
protos_preconfig(c);
|
||||||
rt_preconfig(c);
|
rt_preconfig(c);
|
||||||
cf_parse();
|
cf_parse();
|
||||||
|
rt_postconfig(c);
|
||||||
|
|
||||||
if (EMPTY_LIST(c->protos))
|
if (EMPTY_LIST(c->protos))
|
||||||
cf_error("No protocol is specified in the config file");
|
cf_error("No protocol is specified in the config file");
|
||||||
|
|
|
@ -684,6 +684,21 @@ to set options.
|
||||||
limit to the settle time from the initial ROA table change even if
|
limit to the settle time from the initial ROA table change even if
|
||||||
there are consecutive updates gradually renewing the settle time.
|
there are consecutive updates gradually renewing the settle time.
|
||||||
Default: 20 s.
|
Default: 20 s.
|
||||||
|
|
||||||
|
<tag><label id="rtable-gc-threshold">gc threshold <m/number/</tag>
|
||||||
|
Specify a minimum amount of removed networks that triggers a garbage
|
||||||
|
collection (GC) cycle. Default: 1000.
|
||||||
|
|
||||||
|
<tag><label id="rtable-gc-period">gc period <m/time/</tag>
|
||||||
|
Specify a period of time between consecutive GC cycles. When there is a
|
||||||
|
significant amount of route withdrawals, GC cycles are executed repeatedly
|
||||||
|
with the given period (with some random factor). When there is just
|
||||||
|
a small amount of changes, GC cycles are not executed. In extensive route
|
||||||
|
server setups, running GC on hundreds of full BGP routing tables can
|
||||||
|
take significant amount of time, therefore they should use higher GC
|
||||||
|
periods. Default: adaptive, based on number of routing tables in the
|
||||||
|
configuration. From 10 s (with <= 25 routing tables) up to 600 s (with
|
||||||
|
>= 1500 routing tables).
|
||||||
</descrip>
|
</descrip>
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -125,7 +125,7 @@ CF_KEYWORDS(TIMEFORMAT, ISO, SHORT, LONG, ROUTE, PROTOCOL, BASE, LOG, S, MS, US)
|
||||||
CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, FLUSH, AS)
|
CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, FLUSH, AS)
|
||||||
CF_KEYWORDS(MIN, IDLE, RX, TX, INTERVAL, MULTIPLIER, PASSIVE)
|
CF_KEYWORDS(MIN, IDLE, RX, TX, INTERVAL, MULTIPLIER, PASSIVE)
|
||||||
CF_KEYWORDS(CHECK, LINK)
|
CF_KEYWORDS(CHECK, LINK)
|
||||||
CF_KEYWORDS(SORTED, TRIE, MIN, MAX, SETTLE, TIME)
|
CF_KEYWORDS(SORTED, TRIE, MIN, MAX, SETTLE, TIME, GC, THRESHOLD, PERIOD)
|
||||||
|
|
||||||
/* For r_args_channel */
|
/* For r_args_channel */
|
||||||
CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, VPN4_MC, VPN4_MPLS, VPN6, VPN6_MC, VPN6_MPLS, ROA4, ROA6, FLOW4, FLOW6, MPLS, PRI, SEC)
|
CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, VPN4_MC, VPN4_MPLS, VPN6, VPN6_MC, VPN6_MPLS, ROA4, ROA6, FLOW4, FLOW6, MPLS, PRI, SEC)
|
||||||
|
@ -229,6 +229,8 @@ table_opt:
|
||||||
}
|
}
|
||||||
| MIN SETTLE TIME expr_us { this_table->min_settle_time = $4; }
|
| MIN SETTLE TIME expr_us { this_table->min_settle_time = $4; }
|
||||||
| MAX SETTLE TIME expr_us { this_table->max_settle_time = $4; }
|
| MAX SETTLE TIME expr_us { this_table->max_settle_time = $4; }
|
||||||
|
| GC THRESHOLD expr { this_table->gc_threshold = $3; }
|
||||||
|
| GC PERIOD expr_us { this_table->gc_period = (uint) $3; if ($3 > 3600 S_) cf_error("GC period must be at most 3600 s"); }
|
||||||
;
|
;
|
||||||
|
|
||||||
table_opts:
|
table_opts:
|
||||||
|
|
|
@ -148,8 +148,8 @@ struct rtable_config {
|
||||||
struct rtable *table;
|
struct rtable *table;
|
||||||
struct proto_config *krt_attached; /* Kernel syncer attached to this table */
|
struct proto_config *krt_attached; /* Kernel syncer attached to this table */
|
||||||
uint addr_type; /* Type of address data stored in table (NET_*) */
|
uint addr_type; /* Type of address data stored in table (NET_*) */
|
||||||
int gc_max_ops; /* Maximum number of operations before GC is run */
|
uint gc_threshold; /* Maximum number of operations before GC is run */
|
||||||
int gc_min_time; /* Minimum time between two consecutive GC runs */
|
uint gc_period; /* Approximate time between two consecutive GC runs */
|
||||||
byte sorted; /* Routes of network are sorted according to rte_better() */
|
byte sorted; /* Routes of network are sorted according to rte_better() */
|
||||||
byte internal; /* Internal table of a protocol */
|
byte internal; /* Internal table of a protocol */
|
||||||
byte trie_used; /* Rtable has attached trie */
|
byte trie_used; /* Rtable has attached trie */
|
||||||
|
@ -180,10 +180,11 @@ typedef struct rtable {
|
||||||
* obstacle from this routing table.
|
* obstacle from this routing table.
|
||||||
*/
|
*/
|
||||||
struct event *rt_event; /* Routing table event */
|
struct event *rt_event; /* Routing table event */
|
||||||
|
struct timer *prune_timer; /* Timer for periodic pruning / GC */
|
||||||
btime last_rt_change; /* Last time when route changed */
|
btime last_rt_change; /* Last time when route changed */
|
||||||
btime base_settle_time; /* Start time of rtable settling interval */
|
btime base_settle_time; /* Start time of rtable settling interval */
|
||||||
btime gc_time; /* Time of last GC */
|
btime gc_time; /* Time of last GC */
|
||||||
int gc_counter; /* Number of operations since last GC */
|
uint gc_counter; /* Number of operations since last GC */
|
||||||
byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */
|
byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */
|
||||||
byte prune_trie; /* Prune prefix trie during next table prune */
|
byte prune_trie; /* Prune prefix trie during next table prune */
|
||||||
byte hcu_scheduled; /* Hostcache update is scheduled */
|
byte hcu_scheduled; /* Hostcache update is scheduled */
|
||||||
|
@ -332,6 +333,7 @@ struct config;
|
||||||
|
|
||||||
void rt_init(void);
|
void rt_init(void);
|
||||||
void rt_preconfig(struct config *);
|
void rt_preconfig(struct config *);
|
||||||
|
void rt_postconfig(struct config *);
|
||||||
void rt_commit(struct config *new, struct config *old);
|
void rt_commit(struct config *new, struct config *old);
|
||||||
void rt_lock_table(rtable *);
|
void rt_lock_table(rtable *);
|
||||||
void rt_unlock_table(rtable *);
|
void rt_unlock_table(rtable *);
|
||||||
|
|
|
@ -124,6 +124,7 @@ static void rt_next_hop_update(rtable *tab);
|
||||||
static inline void rt_prune_table(rtable *tab);
|
static inline void rt_prune_table(rtable *tab);
|
||||||
static inline void rt_schedule_notify(rtable *tab);
|
static inline void rt_schedule_notify(rtable *tab);
|
||||||
static void rt_flowspec_notify(rtable *tab, net *net);
|
static void rt_flowspec_notify(rtable *tab, net *net);
|
||||||
|
static void rt_kick_prune_timer(rtable *tab);
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -1641,9 +1642,8 @@ rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src)
|
||||||
rte_announce(table, RA_UNDEF, net, new, old, net->routes, old_best);
|
rte_announce(table, RA_UNDEF, net, new, old, net->routes, old_best);
|
||||||
|
|
||||||
if (!net->routes &&
|
if (!net->routes &&
|
||||||
(table->gc_counter++ >= table->config->gc_max_ops) &&
|
(table->gc_counter++ >= table->config->gc_threshold))
|
||||||
(table->gc_time + table->config->gc_min_time <= current_time()))
|
rt_kick_prune_timer(table);
|
||||||
rt_schedule_prune(table);
|
|
||||||
|
|
||||||
if (old_ok && p->rte_remove)
|
if (old_ok && p->rte_remove)
|
||||||
p->rte_remove(net, old);
|
p->rte_remove(net, old);
|
||||||
|
@ -2098,6 +2098,29 @@ rt_event(void *ptr)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
rt_prune_timer(timer *t)
|
||||||
|
{
|
||||||
|
rtable *tab = t->data;
|
||||||
|
|
||||||
|
if (tab->gc_counter >= tab->config->gc_threshold)
|
||||||
|
rt_schedule_prune(tab);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
rt_kick_prune_timer(rtable *tab)
|
||||||
|
{
|
||||||
|
/* Return if prune is already scheduled */
|
||||||
|
if (tm_active(tab->prune_timer) || (tab->prune_state & 1))
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* Randomize GC period to +/- 50% */
|
||||||
|
btime gc_period = tab->config->gc_period;
|
||||||
|
gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period);
|
||||||
|
tm_start(tab->prune_timer, gc_period);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static inline btime
|
static inline btime
|
||||||
rt_settled_time(rtable *tab)
|
rt_settled_time(rtable *tab)
|
||||||
{
|
{
|
||||||
|
@ -2333,6 +2356,7 @@ rt_setup(pool *pp, struct rtable_config *cf)
|
||||||
hmap_set(&t->id_map, 0);
|
hmap_set(&t->id_map, 0);
|
||||||
|
|
||||||
t->rt_event = ev_new_init(p, rt_event, t);
|
t->rt_event = ev_new_init(p, rt_event, t);
|
||||||
|
t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0);
|
||||||
t->last_rt_change = t->gc_time = current_time();
|
t->last_rt_change = t->gc_time = current_time();
|
||||||
|
|
||||||
if (rt_is_flow(t))
|
if (rt_is_flow(t))
|
||||||
|
@ -2403,6 +2427,9 @@ rt_prune_table(rtable *tab)
|
||||||
FIB_ITERATE_INIT(fit, &tab->fib);
|
FIB_ITERATE_INIT(fit, &tab->fib);
|
||||||
tab->prune_state = 2;
|
tab->prune_state = 2;
|
||||||
|
|
||||||
|
tab->gc_counter = 0;
|
||||||
|
tab->gc_time = current_time();
|
||||||
|
|
||||||
if (tab->prune_trie)
|
if (tab->prune_trie)
|
||||||
{
|
{
|
||||||
/* Init prefix trie pruning */
|
/* Init prefix trie pruning */
|
||||||
|
@ -2462,9 +2489,6 @@ again:
|
||||||
fib_check(&tab->fib);
|
fib_check(&tab->fib);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
tab->gc_counter = 0;
|
|
||||||
tab->gc_time = current_time();
|
|
||||||
|
|
||||||
/* state change 2->0, 3->1 */
|
/* state change 2->0, 3->1 */
|
||||||
tab->prune_state &= 1;
|
tab->prune_state &= 1;
|
||||||
|
|
||||||
|
@ -2591,6 +2615,20 @@ rt_preconfig(struct config *c)
|
||||||
rt_new_table(cf_get_symbol("master6"), NET_IP6);
|
rt_new_table(cf_get_symbol("master6"), NET_IP6);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
rt_postconfig(struct config *c)
|
||||||
|
{
|
||||||
|
uint num_tables = list_length(&c->tables);
|
||||||
|
btime def_gc_period = 400 MS * num_tables;
|
||||||
|
def_gc_period = MAX(def_gc_period, 10 S);
|
||||||
|
def_gc_period = MIN(def_gc_period, 600 S);
|
||||||
|
|
||||||
|
struct rtable_config *rc;
|
||||||
|
WALK_LIST(rc, c->tables)
|
||||||
|
if (rc->gc_period == (uint) -1)
|
||||||
|
rc->gc_period = (uint) def_gc_period;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Some functions for handing internal next hop updates
|
* Some functions for handing internal next hop updates
|
||||||
|
@ -2999,8 +3037,8 @@ rt_new_table(struct symbol *s, uint addr_type)
|
||||||
cf_define_symbol(s, SYM_TABLE, table, c);
|
cf_define_symbol(s, SYM_TABLE, table, c);
|
||||||
c->name = s->name;
|
c->name = s->name;
|
||||||
c->addr_type = addr_type;
|
c->addr_type = addr_type;
|
||||||
c->gc_max_ops = 1000;
|
c->gc_threshold = 1000;
|
||||||
c->gc_min_time = 5;
|
c->gc_period = (uint) -1; /* set in rt_postconfig() */
|
||||||
c->min_settle_time = 1 S;
|
c->min_settle_time = 1 S;
|
||||||
c->max_settle_time = 20 S;
|
c->max_settle_time = 20 S;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue