Nest: Improve GC strategy for rtables

Use a timer (configurable as 'gc period') to schedule routing table
GC/pruning to ensure that pruning is done on time but not too often.

Randomize GC timers to avoid concentration of GC events from different
tables in one loop cycle.

Fix a bug that caused the minimum inter-GC interval to be 5 us instead of 5 s.

Make default 'gc period' adaptive based on number of routing tables,
from 10 s for small setups to 600 s for large ones.

In a large multi-table RS setup, the patch improved the time of flushing
a downed peer from 20-30 min to <2 min and removed 40 s latencies.
This commit is contained in:
Ondrej Zajicek 2022-06-04 17:34:57 +02:00
parent 9e60b500c7
commit a8a3d95be5
5 changed files with 70 additions and 12 deletions

View file

@ -140,6 +140,7 @@ config_parse(struct config *c)
protos_preconfig(c); protos_preconfig(c);
rt_preconfig(c); rt_preconfig(c);
cf_parse(); cf_parse();
rt_postconfig(c);
if (EMPTY_LIST(c->protos)) if (EMPTY_LIST(c->protos))
cf_error("No protocol is specified in the config file"); cf_error("No protocol is specified in the config file");

View file

@ -684,6 +684,21 @@ to set options.
limit to the settle time from the initial ROA table change even if limit to the settle time from the initial ROA table change even if
there are consecutive updates gradually renewing the settle time. there are consecutive updates gradually renewing the settle time.
Default: 20 s. Default: 20 s.
<tag><label id="rtable-gc-threshold">gc threshold <m/number/</tag>
Specify a minimum amount of removed networks that triggers a garbage
collection (GC) cycle. Default: 1000.
<tag><label id="rtable-gc-period">gc period <m/time/</tag>
Specify a period of time between consecutive GC cycles. When there is a
significant number of route withdrawals, GC cycles are executed repeatedly
with the given period (with some random factor). When there is just a
small number of changes, GC cycles are not executed. In extensive route
server setups, running GC on hundreds of full BGP routing tables can
take a significant amount of time, therefore they should use higher GC
periods. Default: adaptive, based on the number of routing tables in the
configuration. From 10 s (with <= 25 routing tables) up to 600 s (with
>= 1500 routing tables).
</descrip> </descrip>

View file

@ -125,7 +125,7 @@ CF_KEYWORDS(TIMEFORMAT, ISO, SHORT, LONG, ROUTE, PROTOCOL, BASE, LOG, S, MS, US)
CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, FLUSH, AS) CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, FLUSH, AS)
CF_KEYWORDS(MIN, IDLE, RX, TX, INTERVAL, MULTIPLIER, PASSIVE) CF_KEYWORDS(MIN, IDLE, RX, TX, INTERVAL, MULTIPLIER, PASSIVE)
CF_KEYWORDS(CHECK, LINK) CF_KEYWORDS(CHECK, LINK)
CF_KEYWORDS(SORTED, TRIE, MIN, MAX, SETTLE, TIME) CF_KEYWORDS(SORTED, TRIE, MIN, MAX, SETTLE, TIME, GC, THRESHOLD, PERIOD)
/* For r_args_channel */ /* For r_args_channel */
CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, VPN4_MC, VPN4_MPLS, VPN6, VPN6_MC, VPN6_MPLS, ROA4, ROA6, FLOW4, FLOW6, MPLS, PRI, SEC) CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, VPN4_MC, VPN4_MPLS, VPN6, VPN6_MC, VPN6_MPLS, ROA4, ROA6, FLOW4, FLOW6, MPLS, PRI, SEC)
@ -229,6 +229,8 @@ table_opt:
} }
| MIN SETTLE TIME expr_us { this_table->min_settle_time = $4; } | MIN SETTLE TIME expr_us { this_table->min_settle_time = $4; }
| MAX SETTLE TIME expr_us { this_table->max_settle_time = $4; } | MAX SETTLE TIME expr_us { this_table->max_settle_time = $4; }
| GC THRESHOLD expr { this_table->gc_threshold = $3; }
| GC PERIOD expr_us { this_table->gc_period = (uint) $3; if ($3 > 3600 S_) cf_error("GC period must be at most 3600 s"); }
; ;
table_opts: table_opts:

View file

@ -148,8 +148,8 @@ struct rtable_config {
struct rtable *table; struct rtable *table;
struct proto_config *krt_attached; /* Kernel syncer attached to this table */ struct proto_config *krt_attached; /* Kernel syncer attached to this table */
uint addr_type; /* Type of address data stored in table (NET_*) */ uint addr_type; /* Type of address data stored in table (NET_*) */
int gc_max_ops; /* Maximum number of operations before GC is run */ uint gc_threshold; /* Maximum number of operations before GC is run */
int gc_min_time; /* Minimum time between two consecutive GC runs */ uint gc_period; /* Approximate time between two consecutive GC runs */
byte sorted; /* Routes of network are sorted according to rte_better() */ byte sorted; /* Routes of network are sorted according to rte_better() */
byte internal; /* Internal table of a protocol */ byte internal; /* Internal table of a protocol */
byte trie_used; /* Rtable has attached trie */ byte trie_used; /* Rtable has attached trie */
@ -180,10 +180,11 @@ typedef struct rtable {
* obstacle from this routing table. * obstacle from this routing table.
*/ */
struct event *rt_event; /* Routing table event */ struct event *rt_event; /* Routing table event */
struct timer *prune_timer; /* Timer for periodic pruning / GC */
btime last_rt_change; /* Last time when route changed */ btime last_rt_change; /* Last time when route changed */
btime base_settle_time; /* Start time of rtable settling interval */ btime base_settle_time; /* Start time of rtable settling interval */
btime gc_time; /* Time of last GC */ btime gc_time; /* Time of last GC */
int gc_counter; /* Number of operations since last GC */ uint gc_counter; /* Number of operations since last GC */
byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */
byte prune_trie; /* Prune prefix trie during next table prune */ byte prune_trie; /* Prune prefix trie during next table prune */
byte hcu_scheduled; /* Hostcache update is scheduled */ byte hcu_scheduled; /* Hostcache update is scheduled */
@ -332,6 +333,7 @@ struct config;
void rt_init(void); void rt_init(void);
void rt_preconfig(struct config *); void rt_preconfig(struct config *);
void rt_postconfig(struct config *);
void rt_commit(struct config *new, struct config *old); void rt_commit(struct config *new, struct config *old);
void rt_lock_table(rtable *); void rt_lock_table(rtable *);
void rt_unlock_table(rtable *); void rt_unlock_table(rtable *);

View file

@ -124,6 +124,7 @@ static void rt_next_hop_update(rtable *tab);
static inline void rt_prune_table(rtable *tab); static inline void rt_prune_table(rtable *tab);
static inline void rt_schedule_notify(rtable *tab); static inline void rt_schedule_notify(rtable *tab);
static void rt_flowspec_notify(rtable *tab, net *net); static void rt_flowspec_notify(rtable *tab, net *net);
static void rt_kick_prune_timer(rtable *tab);
static void static void
@ -1641,9 +1642,8 @@ rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src)
rte_announce(table, RA_UNDEF, net, new, old, net->routes, old_best); rte_announce(table, RA_UNDEF, net, new, old, net->routes, old_best);
if (!net->routes && if (!net->routes &&
(table->gc_counter++ >= table->config->gc_max_ops) && (table->gc_counter++ >= table->config->gc_threshold))
(table->gc_time + table->config->gc_min_time <= current_time())) rt_kick_prune_timer(table);
rt_schedule_prune(table);
if (old_ok && p->rte_remove) if (old_ok && p->rte_remove)
p->rte_remove(net, old); p->rte_remove(net, old);
@ -2098,6 +2098,29 @@ rt_event(void *ptr)
} }
/*
 * rt_prune_timer - hook of the per-table prune timer
 * @t: expired timer; its data pointer is the routing table
 *
 * Schedules a table prune if the GC counter is still at or above the
 * configured 'gc threshold'. The counter is reset when a prune starts, so it
 * may have dropped below the threshold since the timer was armed.
 */
static void
rt_prune_timer(timer *t)
{
  rtable *table = t->data;

  /* Not enough removed networks accumulated, nothing to collect */
  if (table->gc_counter < table->config->gc_threshold)
    return;

  rt_schedule_prune(table);
}
/*
 * rt_kick_prune_timer - arm the GC/prune timer of a routing table
 * @tab: routing table
 *
 * Does nothing when the prune timer is already active or when a prune is
 * already scheduled (bit 0 of prune_state). Otherwise the timer is started
 * with a randomized delay between 50% and 150% of the configured 'gc period'.
 */
static void
rt_kick_prune_timer(rtable *tab)
{
  /* Return if prune is already scheduled */
  if (tm_active(tab->prune_timer) || (tab->prune_state & 1))
    return;

  btime gc_period = tab->config->gc_period;

  /*
   * Random jitter in [0, gc_period). The config grammar enforces only an
   * upper bound on 'gc period', so zero can be configured; the modulo must
   * be skipped then, as a zero divisor is undefined behavior. A zero period
   * results in a zero delay.
   */
  uint span = (uint) gc_period;
  btime jitter = span ? (random_u32() % span) : 0;

  /* Randomize GC period to +/- 50% */
  tm_start(tab->prune_timer, (gc_period / 2) + jitter);
}
static inline btime static inline btime
rt_settled_time(rtable *tab) rt_settled_time(rtable *tab)
{ {
@ -2333,6 +2356,7 @@ rt_setup(pool *pp, struct rtable_config *cf)
hmap_set(&t->id_map, 0); hmap_set(&t->id_map, 0);
t->rt_event = ev_new_init(p, rt_event, t); t->rt_event = ev_new_init(p, rt_event, t);
t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0);
t->last_rt_change = t->gc_time = current_time(); t->last_rt_change = t->gc_time = current_time();
if (rt_is_flow(t)) if (rt_is_flow(t))
@ -2403,6 +2427,9 @@ rt_prune_table(rtable *tab)
FIB_ITERATE_INIT(fit, &tab->fib); FIB_ITERATE_INIT(fit, &tab->fib);
tab->prune_state = 2; tab->prune_state = 2;
tab->gc_counter = 0;
tab->gc_time = current_time();
if (tab->prune_trie) if (tab->prune_trie)
{ {
/* Init prefix trie pruning */ /* Init prefix trie pruning */
@ -2462,9 +2489,6 @@ again:
fib_check(&tab->fib); fib_check(&tab->fib);
#endif #endif
tab->gc_counter = 0;
tab->gc_time = current_time();
/* state change 2->0, 3->1 */ /* state change 2->0, 3->1 */
tab->prune_state &= 1; tab->prune_state &= 1;
@ -2591,6 +2615,20 @@ rt_preconfig(struct config *c)
rt_new_table(cf_get_symbol("master6"), NET_IP6); rt_new_table(cf_get_symbol("master6"), NET_IP6);
} }
void
rt_postconfig(struct config *c)
{
uint num_tables = list_length(&c->tables);
btime def_gc_period = 400 MS * num_tables;
def_gc_period = MAX(def_gc_period, 10 S);
def_gc_period = MIN(def_gc_period, 600 S);
struct rtable_config *rc;
WALK_LIST(rc, c->tables)
if (rc->gc_period == (uint) -1)
rc->gc_period = (uint) def_gc_period;
}
/* /*
* Some functions for handing internal next hop updates * Some functions for handing internal next hop updates
@ -2999,8 +3037,8 @@ rt_new_table(struct symbol *s, uint addr_type)
cf_define_symbol(s, SYM_TABLE, table, c); cf_define_symbol(s, SYM_TABLE, table, c);
c->name = s->name; c->name = s->name;
c->addr_type = addr_type; c->addr_type = addr_type;
c->gc_max_ops = 1000; c->gc_threshold = 1000;
c->gc_min_time = 5; c->gc_period = (uint) -1; /* set in rt_postconfig() */
c->min_settle_time = 1 S; c->min_settle_time = 1 S;
c->max_settle_time = 20 S; c->max_settle_time = 20 S;