diff --git a/filter/filter.c b/filter/filter.c index 05652e64..88763302 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -795,7 +795,7 @@ interpret(struct f_inst *what) case SA_GW: res.val.px.ip = rta->gw; break; case SA_NET: res.val.px.ip = (*f_rte)->net->n.prefix; res.val.px.len = (*f_rte)->net->n.pxlen; break; - case SA_PROTO: res.val.s = rta->proto->name; break; + case SA_PROTO: res.val.s = rta->src->proto->name; break; case SA_SOURCE: res.val.i = rta->source; break; case SA_SCOPE: res.val.i = rta->scope; break; case SA_CAST: res.val.i = rta->cast; break; @@ -827,7 +827,7 @@ interpret(struct f_inst *what) case SA_GW: { ip_addr ip = v1.val.px.ip; - neighbor *n = neigh_find(rta->proto, &ip, 0); + neighbor *n = neigh_find(rta->src->proto, &ip, 0); if (!n || (n->scope == SCOPE_HOST)) runtime( "Invalid gw address" ); diff --git a/lib/string.h b/lib/string.h index 2c477294..528a1a19 100644 --- a/lib/string.h +++ b/lib/string.h @@ -11,6 +11,7 @@ #include #include +#include int bsprintf(char *str, const char *fmt, ...); int bvsprintf(char *str, const char *fmt, va_list args); diff --git a/nest/proto.c b/nest/proto.c index 5876e5d4..cfa6ff4b 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -218,6 +218,7 @@ proto_free_ahooks(struct proto *p) p->main_ahook = NULL; } + /** * proto_config_new - create a new protocol configuration * @pr: protocol the configuration will belong to @@ -830,6 +831,9 @@ proto_schedule_feed(struct proto *p, int initial) /* Connect protocol to routing table */ if (initial && !p->proto->multitable) { + p->main_source = rt_get_source(p, 0); + rt_lock_source(p->main_source); + p->main_ahook = proto_add_announce_hook(p, p->table, &p->stats); p->main_ahook->in_filter = p->cf->in_filter; p->main_ahook->out_filter = p->cf->out_filter; @@ -837,6 +841,7 @@ proto_schedule_feed(struct proto *p, int initial) p->main_ahook->in_limit = p->cf->in_limit; p->main_ahook->out_limit = p->cf->out_limit; p->main_ahook->in_keep_filtered = p->cf->in_keep_filtered; + 
proto_reset_limit(p->main_ahook->rx_limit); proto_reset_limit(p->main_ahook->in_limit); proto_reset_limit(p->main_ahook->out_limit); @@ -890,6 +895,8 @@ proto_flush_loop(void *unused UNUSED) return; } + rt_prune_sources(); + again: WALK_LIST(p, flush_proto_list) if (p->flushing) @@ -1088,6 +1095,12 @@ proto_notify_state(struct proto *p, unsigned ps) if ((cs == FS_FEEDING) || (cs == FS_HAPPY)) proto_schedule_flush(p); + if (!p->proto->multitable && p->main_source) /* mirror proto_schedule_feed(): only non-multitable protocols hold a main_source lock */ + { + rt_unlock_source(p->main_source); + p->main_source = NULL; + } + neigh_prune(); // FIXME convert neighbors to resource? rfree(p->pool); p->pool = NULL; diff --git a/nest/protocol.h b/nest/protocol.h index 96923447..b58f9e67 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -187,7 +187,7 @@ struct proto { int (*reload_routes)(struct proto *); /* - * Routing entry hooks (called only for rte's belonging to this protocol): + * Routing entry hooks (called only for routes belonging to this protocol): * * rte_recalculate Called at the beginning of the best route selection * rte_better Compare two rte's and decide which one is better (1=first, 0=second). 
@@ -203,6 +203,7 @@ struct proto { void (*rte_remove)(struct network *, struct rte *); struct rtable *table; /* Our primary routing table */ + struct rte_src *main_source; /* Primary route source */ struct announce_hook *main_ahook; /* Primary announcement hook */ struct announce_hook *ahooks; /* Announcement hooks for this protocol */ diff --git a/nest/route.h b/nest/route.h index e0b88551..f00f8b2b 100644 --- a/nest/route.h +++ b/nest/route.h @@ -251,10 +251,10 @@ void rt_unlock_table(rtable *); void rt_setup(pool *, rtable *, char *, struct rtable_config *); static inline net *net_find(rtable *tab, ip_addr addr, unsigned len) { return (net *) fib_find(&tab->fib, &addr, len); } static inline net *net_get(rtable *tab, ip_addr addr, unsigned len) { return (net *) fib_get(&tab->fib, &addr, len); } -rte *rte_find(net *net, struct proto *p); +rte *rte_find(net *net, struct rte_src *src); rte *rte_get_temp(struct rta *); -void rte_update2(struct announce_hook *ah, net *net, rte *new, struct proto *src); -static inline void rte_update(rtable *tab, net *net, struct proto *p, struct proto *src, rte *new) { rte_update2(p->main_ahook, net, new, src); } +void rte_update2(struct announce_hook *ah, net *net, rte *new, struct rte_src *src); +static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2(p->main_ahook, net, new, p->main_source); } void rte_discard(rtable *tab, rte *old); int rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter *filter); void rte_dump(rte *); @@ -300,9 +300,18 @@ struct mpnh { unsigned char weight; }; +struct rte_src { + struct rte_src *next; /* Hash chain */ + struct proto *proto; /* Protocol the source is based on */ + u32 private_id; /* Private ID, assigned by the protocol */ + u32 global_id; /* Globally unique ID of the source */ + unsigned uc; /* Use count */ +}; + + typedef struct rta { struct rta *next, **pprev; /* Hash chain */ - struct proto *proto; /* Protocol instance that originally created 
the route */ + struct rte_src *src; /* Route source that created the route */ unsigned uc; /* Use count */ byte source; /* Route source (RTS_...) */ byte scope; /* Route scope (SCOPE_... -- see ip.h) */ @@ -421,6 +430,13 @@ typedef struct ea_list { #define EALF_BISECT 2 /* Use interval bisection for searching */ #define EALF_CACHED 4 /* Attributes belonging to cached rta */ +struct rte_src *rt_find_source(struct proto *p, u32 id); +struct rte_src *rt_get_source(struct proto *p, u32 id); +static inline void rt_lock_source(struct rte_src *src) { src->uc++; } +static inline void rt_unlock_source(struct rte_src *src) { src->uc--; } +void rt_prune_sources(void); + + eattr *ea_find(ea_list *, unsigned ea); int ea_get_int(ea_list *, unsigned ea, int def); void ea_dump(ea_list *); @@ -437,6 +453,7 @@ static inline int mpnh_same(struct mpnh *x, struct mpnh *y) void rta_init(void); rta *rta_lookup(rta *); /* Get rta equivalent to this one, uc++ */ +static inline int rta_is_cached(rta *r) { return r->aflags & RTAF_CACHED; } static inline rta *rta_clone(rta *r) { r->uc++; return r; } void rta__free(rta *r); static inline void rta_free(rta *r) { if (r && !--r->uc) rta__free(r); } diff --git a/nest/rt-attr.c b/nest/rt-attr.c index 3f79ee59..0fb7c820 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -58,9 +58,194 @@ pool *rta_pool; static slab *rta_slab; static slab *mpnh_slab; +static slab *rte_src_slab; + +/* rte source ID bitmap */ +static u32 *src_ids; +static u32 src_id_size, src_id_used, src_id_pos; +#define SRC_ID_SIZE_DEF 4 + +/* rte source hash */ +static struct rte_src **src_table; +static u32 src_hash_order, src_hash_size, src_hash_count; +#define SRC_HASH_ORDER_DEF 6 +#define SRC_HASH_ORDER_MAX 18 +#define SRC_HASH_ORDER_MIN 10 struct protocol *attr_class_to_protocol[EAP_MAX]; + +static void +rte_src_init(void) +{ + rte_src_slab = sl_new(rta_pool, sizeof(struct rte_src)); + + src_id_pos = 0; + src_id_size = SRC_ID_SIZE_DEF; + src_ids = mb_allocz(rta_pool, 
src_id_size * sizeof(u32)); + + /* ID 0 is reserved */ + src_ids[0] = 1; + src_id_used = 1; + + src_hash_count = 0; + src_hash_order = SRC_HASH_ORDER_DEF; + src_hash_size = 1 << src_hash_order; + src_table = mb_allocz(rta_pool, src_hash_size * sizeof(struct rte_src *)); +} + +static inline int u32_cto(unsigned int x) { return ffs(~x) - 1; } + +static inline u32 +rte_src_alloc_id(void) +{ + int i, j; + for (i = src_id_pos; i < src_id_size; i++) + if (src_ids[i] != 0xffffffff) + goto found; + + /* If we are at least 7/8 full, expand */ + if (src_id_used > (src_id_size * 28)) + { + src_id_size *= 2; + src_ids = mb_realloc(src_ids, src_id_size * sizeof(u32)); + bzero(src_ids + i, (src_id_size - i) * sizeof(u32)); + goto found; + } + + for (i = 0; i < src_id_pos; i++) + if (src_ids[i] != 0xffffffff) + goto found; + + ASSERT(0); + + found: + ASSERT(i < 0x8000000); + + src_id_pos = i; + j = u32_cto(src_ids[i]); + + src_ids[i] |= (1u << j); /* unsigned shift: j may be 31 */ + src_id_used++; + return 32 * i + j; +} + +static inline void +rte_src_free_id(u32 id) +{ + int i = id / 32; + int j = id % 32; + + ASSERT((i < src_id_size) && (src_ids[i] & (1u << j))); + src_ids[i] &= ~(1u << j); /* unsigned shift: j may be 31 */ + src_id_used--; +} + +static inline u32 rte_src_hash(struct proto *p, u32 x, u32 order) +{ return (x * 2902958171u) >> (32 - order); } + +static void +rte_src_rehash(int step) +{ + struct rte_src **old_tab, *src, *src_next; + u32 old_size, hash, i; + + old_tab = src_table; + old_size = src_hash_size; + + src_hash_order += step; + src_hash_size = 1 << src_hash_order; + src_table = mb_allocz(rta_pool, src_hash_size * sizeof(struct rte_src *)); + + for (i = 0; i < old_size; i++) + for (src = old_tab[i]; src; src = src_next) + { + src_next = src->next; + hash = rte_src_hash(src->proto, src->private_id, src_hash_order); + src->next = src_table[hash]; + src_table[hash] = src; + } + + mb_free(old_tab); +} + +struct rte_src * +rt_find_source(struct proto *p, u32 id) +{ + struct rte_src *src; + u32 hash = rte_src_hash(p, id, 
src_hash_order); + + for (src = src_table[hash]; src; src = src->next) + if ((src->proto == p) && (src->private_id == id)) + return src; + + return NULL; +} + +struct rte_src * +rt_get_source(struct proto *p, u32 id) +{ + struct rte_src *src; + u32 hash = rte_src_hash(p, id, src_hash_order); + + for (src = src_table[hash]; src; src = src->next) + if ((src->proto == p) && (src->private_id == id)) + return src; + + src = sl_alloc(rte_src_slab); + src->proto = p; + src->private_id = id; + src->global_id = rte_src_alloc_id(); + src->uc = 0; + + src->next = src_table[hash]; + src_table[hash] = src; + + src_hash_count++; + if ((src_hash_count > src_hash_size) && (src_hash_order < SRC_HASH_ORDER_MAX)) + rte_src_rehash(1); + + return src; +} + +static inline void +rt_remove_source(struct rte_src **sp) +{ + struct rte_src *src = *sp; + + *sp = src->next; + rte_src_free_id(src->global_id); + sl_free(rte_src_slab, src); + src_hash_count--; +} + +void +rt_prune_sources(void) +{ + struct rte_src **sp; + int i; + + for (i = 0; i < src_hash_size; i++) + { + sp = &src_table[i]; + while (*sp) + { + if ((*sp)->uc == 0) + rt_remove_source(sp); + else + sp = &(*sp)->next; + } + } + + while ((src_hash_count < (src_hash_size / 4)) && (src_hash_order > SRC_HASH_ORDER_MIN)) + rte_src_rehash(-1); +} + + +/* + * Multipath Next Hop + */ + static inline unsigned int mpnh_hash(struct mpnh *x) { @@ -681,14 +866,14 @@ rta_alloc_hash(void) static inline unsigned int rta_hash(rta *a) { - return (a->proto->hash_key ^ ipa_hash(a->gw) ^ + return (((unsigned) a->src) ^ ipa_hash(a->gw) ^ mpnh_hash(a->nexthops) ^ ea_hash(a->eattrs)) & 0xffff; } static inline int rta_same(rta *x, rta *y) { - return (x->proto == y->proto && + return (x->src == y->src && x->source == y->source && x->scope == y->scope && x->cast == y->cast && @@ -785,6 +970,7 @@ rta_lookup(rta *o) r = rta_copy(o); r->hash_key = h; r->aflags = RTAF_CACHED; + rt_lock_source(r->src); rt_lock_hostentry(r->hostentry); rta_insert(r); @@ -804,6 
+990,7 @@ rta__free(rta *a) a->next->pprev = a->pprev; a->aflags = 0; /* Poison the entry */ rt_unlock_hostentry(a->hostentry); + rt_unlock_source(a->src); mpnh_free(a->nexthops); ea_free(a->eattrs); sl_free(rta_slab, a); @@ -826,7 +1013,7 @@ rta_dump(rta *a) static char *rtd[] = { "", " DEV", " HOLE", " UNREACH", " PROHIBIT" }; debug("p=%s uc=%d %s %s%s%s h=%04x", - a->proto->name, a->uc, rts[a->source], ip_scope_text(a->scope), rtc[a->cast], + a->src->proto->name, a->uc, rts[a->source], ip_scope_text(a->scope), rtc[a->cast], rtd[a->dest], a->hash_key); if (!(a->aflags & RTAF_CACHED)) debug(" !CACHED"); @@ -894,6 +1081,7 @@ rta_init(void) rta_slab = sl_new(rta_pool, sizeof(rta)); mpnh_slab = sl_new(rta_pool, sizeof(struct mpnh)); rta_alloc_hash(); + rte_src_init(); } /* diff --git a/nest/rt-dev.c b/nest/rt-dev.c index 4fb5bddb..1a859dac 100644 --- a/nest/rt-dev.c +++ b/nest/rt-dev.c @@ -51,29 +51,31 @@ dev_ifa_notify(struct proto *p, unsigned c, struct ifa *ad) DBG("dev_if_notify: device shutdown: prefix not found\n"); return; } - rte_update(p->table, n, p, p, NULL); + rte_update(p, n, NULL); } else if (c & IF_CHANGE_UP) { - rta *a, A; + rta *a; net *n; rte *e; DBG("dev_if_notify: %s:%I going up\n", ad->iface->name, ad->ip); - bzero(&A, sizeof(A)); - A.proto = p; - A.source = RTS_DEVICE; - A.scope = SCOPE_UNIVERSE; - A.cast = RTC_UNICAST; - A.dest = RTD_DEVICE; - A.iface = ad->iface; - A.eattrs = NULL; - a = rta_lookup(&A); + + rta a0 = { + .src = p->main_source, + .source = RTS_DEVICE, + .scope = SCOPE_UNIVERSE, + .cast = RTC_UNICAST, + .dest = RTD_DEVICE, + .iface = ad->iface + }; + + a = rta_lookup(&a0); n = net_get(p->table, ad->prefix, ad->pxlen); e = rte_get_temp(a); e->net = n; e->pflags = 0; - rte_update(p->table, n, p, p, e); + rte_update(p, n, e); } } diff --git a/nest/rt-table.c b/nest/rt-table.c index fc554081..8c91ea0a 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -58,6 +58,14 @@ static void rt_next_hop_update(rtable *tab); static inline void 
rt_schedule_gc(rtable *tab); +static inline struct ea_list * +make_tmp_attrs(struct rte *rt, struct linpool *pool) +{ + struct ea_list *(*mta)(struct rte *rt, struct linpool *pool); + mta = rt->attrs->src->proto->make_tmp_attrs; + return mta ? mta(rt, pool) : NULL; /* honour the caller-supplied pool; NULL when the protocol has no hook */ +} + /* Like fib_route(), but skips empty net entries */ static net * net_route(rtable *tab, ip_addr a, int len) @@ -88,17 +96,17 @@ rte_init(struct fib_node *N) /** * rte_find - find a route * @net: network node - * @p: protocol + * @src: route source * * The rte_find() function returns a route for destination @net - * which belongs has been defined by protocol @p. + * which is from route source @src. */ rte * -rte_find(net *net, struct proto *p) +rte_find(net *net, struct rte_src *src) { rte *e = net->routes; - while (e && e->attrs->proto != p) + while (e && e->attrs->src != src) e = e->next; return e; } @@ -119,7 +127,7 @@ rte_get_temp(rta *a) e->attrs = a; e->flags = 0; - e->pref = a->proto->preference; + e->pref = a->src->proto->preference; return e; } @@ -148,16 +156,16 @@ rte_better(rte *new, rte *old) return 1; if (new->pref < old->pref) return 0; - if (new->attrs->proto->proto != old->attrs->proto->proto) + if (new->attrs->src->proto->proto != old->attrs->src->proto->proto) { /* * If the user has configured protocol preferences, so that two different protocols * have the same preference, try to break the tie by comparing addresses. Not too * useful, but keeps the ordering of routes unambiguous. 
*/ - return new->attrs->proto->proto > old->attrs->proto->proto; + return new->attrs->src->proto->proto > old->attrs->src->proto->proto; } - if (better = new->attrs->proto->rte_better) + if (better = new->attrs->src->proto->rte_better) return better(new, old); return 0; } @@ -201,8 +209,7 @@ export_filter(struct announce_hook *ah, rte *rt0, rte **rt_free, ea_list **tmpa, /* If called does not care for eattrs, we prepare one internally */ if (!tmpa) { - struct proto *src = rt->attrs->proto; - tmpb = src->make_tmp_attrs ? src->make_tmp_attrs(rt, rte_update_pool) : NULL; + tmpb = make_tmp_attrs(rt, rte_update_pool); tmpa = &tmpb; } @@ -552,9 +559,9 @@ rte_announce(rtable *tab, unsigned type, net *net, rte *new, rte *old, rte *befo if (type == RA_OPTIMAL) { if (new) - new->attrs->proto->stats.pref_routes++; + new->attrs->src->proto->stats.pref_routes++; if (old) - old->attrs->proto->stats.pref_routes--; + old->attrs->src->proto->stats.pref_routes--; if (tab->hostcache) rt_notify_hostcache(tab, net); @@ -605,7 +612,7 @@ rte_validate(rte *e) void rte_free(rte *e) { - if (e->attrs->aflags & RTAF_CACHED) + if (rta_is_cached(e->attrs)) rta_free(e->attrs); sl_free(rte_slab, e); } @@ -625,13 +632,13 @@ rte_same(rte *x, rte *y) x->flags == y->flags && x->pflags == y->pflags && x->pref == y->pref && - (!x->attrs->proto->rte_same || x->attrs->proto->rte_same(x, y)); + (!x->attrs->src->proto->rte_same || x->attrs->src->proto->rte_same(x, y)); } static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); } static void -rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, struct proto *src) +rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, struct rte_src *src) { struct proto *p = ah->proto; struct rtable *table = ah->table; @@ -645,7 +652,7 @@ rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, str k = &net->routes; /* Find and remove original route from the same protocol */ while (old = *k) { - 
if (old->attrs->proto == src) + if (old->attrs->src == src) { /* If there is the same route in the routing table but from * a different sender, then there are two paths from the @@ -681,7 +688,7 @@ rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, str #ifdef CONFIG_RIP /* lastmod is used internally by RIP as the last time when the route was received. */ - if (src->proto == &proto_rip) + if (src->proto->proto == &proto_rip) old->lastmod = now; #endif return; @@ -796,7 +803,7 @@ rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, str /* If routes are not sorted, find the best route and move it on the first position. There are several optimized cases. */ - if (src->rte_recalculate && src->rte_recalculate(table, net, new, old, old_best)) + if (src->proto->rte_recalculate && src->proto->rte_recalculate(table, net, new, old, old_best)) goto do_recalculate; if (new && rte_better(new, old_best)) @@ -970,7 +977,7 @@ rte_unhide_dummy_routes(net *net, rte **dummy) */ void -rte_update2(struct announce_hook *ah, net *net, rte *new, struct proto *src) +rte_update2(struct announce_hook *ah, net *net, rte *new, struct rte_src *src) { struct proto *p = ah->proto; struct proto_stats *stats = ah->stats; @@ -1004,8 +1011,7 @@ rte_update2(struct announce_hook *ah, net *net, rte *new, struct proto *src) } else { - if (src->make_tmp_attrs) - tmpa = src->make_tmp_attrs(new, rte_update_pool); + tmpa = make_tmp_attrs(new, rte_update_pool); if (filter && (filter != FILTER_REJECT)) { ea_list *old_tmpa = tmpa; @@ -1020,17 +1026,25 @@ rte_update2(struct announce_hook *ah, net *net, rte *new, struct proto *src) new->flags |= REF_FILTERED; } - if (tmpa != old_tmpa && src->store_tmp_attrs) - src->store_tmp_attrs(new, tmpa); + if (tmpa != old_tmpa && src->proto->store_tmp_attrs) + src->proto->store_tmp_attrs(new, tmpa); } } - - if (!(new->attrs->aflags & RTAF_CACHED)) /* Need to copy attributes */ + if (!rta_is_cached(new->attrs)) /* Need to copy 
attributes */ new->attrs = rta_lookup(new->attrs); new->flags |= REF_COW; } else - stats->imp_withdraws_received++; + { + stats->imp_withdraws_received++; + + if (!net || !src) + { + stats->imp_withdraws_ignored++; + rte_update_unlock(); + return; + } + } recalc: rte_hide_dummy_routes(net, &dummy); @@ -1051,12 +1065,10 @@ rte_update2(struct announce_hook *ah, net *net, rte *new, struct proto *src) static inline void rte_announce_i(rtable *tab, unsigned type, net *n, rte *new, rte *old) { - struct proto *src; ea_list *tmpa; rte_update_lock(); - src = new->attrs->proto; - tmpa = src->make_tmp_attrs ? src->make_tmp_attrs(new, rte_update_pool) : NULL; + tmpa = make_tmp_attrs(new, rte_update_pool); rte_announce(tab, type, n, new, old, NULL, tmpa); rte_update_unlock(); } @@ -1065,7 +1077,7 @@ void rte_discard(rtable *t, rte *old) /* Non-filtered route deletion, used during garbage collection */ { rte_update_lock(); - rte_recalculate(old->sender, old->net, NULL, NULL, old->attrs->proto); + rte_recalculate(old->sender, old->net, NULL, NULL, old->attrs->src); rte_update_unlock(); } @@ -1082,8 +1094,7 @@ rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter rte_update_lock(); /* Rest is stripped down export_filter() */ - struct proto *src = rt->attrs->proto; - ea_list *tmpa = src->make_tmp_attrs ? src->make_tmp_attrs(rt, rte_update_pool) : NULL; + ea_list *tmpa = make_tmp_attrs(rt, rte_update_pool); int v = p->import_control ? 
p->import_control(p, &rt, &tmpa, rte_update_pool) : 0; if (v == RIC_PROCESS) v = (f_run(filter, &rt, &tmpa, rte_update_pool, FF_FORCE_TMPATTR) <= F_ACCEPT); @@ -1110,8 +1121,8 @@ rte_dump(rte *e) debug("%-1I/%2d ", n->n.prefix, n->n.pxlen); debug("KF=%02x PF=%02x pref=%d lm=%d ", n->n.flags, e->pflags, e->pref, now-e->lastmod); rta_dump(e->attrs); - if (e->attrs->proto->proto->dump_attrs) - e->attrs->proto->proto->dump_attrs(e); + if (e->attrs->src->proto->proto->dump_attrs) + e->attrs->src->proto->proto->dump_attrs(e); debug("\n"); } @@ -1232,7 +1243,10 @@ rt_event(void *ptr) rt_next_hop_update(tab); if (tab->gc_scheduled) - rt_prune_nets(tab); + { + rt_prune_nets(tab); + rt_prune_sources(); // FIXME this should be moved to independent event + } } void @@ -1298,7 +1312,7 @@ again: rescan: for (e=n->routes; e; e=e->next) if (e->sender->proto->flushing || - (step && e->attrs->proto->flushing)) + (step && e->attrs->src->proto->flushing)) { if (*max_feed <= 0) { @@ -1308,7 +1322,7 @@ again: if (step) log_rl(&rl_flush, L_WARN "Route %I/%d from %s still in %s after flush", - n->n.prefix, n->n.pxlen, e->attrs->proto->name, tab->name); + n->n.prefix, n->n.pxlen, e->attrs->src->proto->name, tab->name); rte_discard(tab, e); (*max_feed)--; @@ -1452,8 +1466,8 @@ rt_next_hop_update_net(rtable *tab, net *n) /* Call a pre-comparison hook */ /* Not really an efficient way to compute this */ - if (e->attrs->proto->rte_recalculate) - e->attrs->proto->rte_recalculate(tab, n, new, e, NULL); + if (e->attrs->src->proto->rte_recalculate) + e->attrs->src->proto->rte_recalculate(tab, n, new, e, NULL); if (e != old_best) rte_free_quick(e); @@ -1651,11 +1665,10 @@ rt_commit(struct config *new, struct config *old) static inline void do_feed_baby(struct proto *p, int type, struct announce_hook *h, net *n, rte *e) { - struct proto *src = e->attrs->proto; ea_list *tmpa; rte_update_lock(); - tmpa = src->make_tmp_attrs ? 
src->make_tmp_attrs(e, rte_update_pool) : NULL; + tmpa = make_tmp_attrs(e, rte_update_pool); if (type == RA_ACCEPTED) rt_notify_accepted(h, n, e, NULL, NULL, tmpa, p->refeeding ? 2 : 1); else @@ -2041,7 +2054,7 @@ rt_update_hostcache(rtable *tab) } static struct hostentry * -rt_find_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep) +rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep) { struct hostentry *he; @@ -2062,9 +2075,10 @@ rt_find_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep) void rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr *gw, ip_addr *ll) { - rta_apply_hostentry(a, rt_find_hostentry(tab, *gw, *ll, dep)); + rta_apply_hostentry(a, rt_get_hostentry(tab, *gw, *ll, dep)); } + /* * CLI commands */ @@ -2094,6 +2108,7 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, ea_list *tm rta *a = e->attrs; int primary = (e->net->routes == e); int sync_error = (e->net->n.flags & KRF_SYNC_ERROR); + void (*get_route_info)(struct rte *, byte *buf, struct ea_list *attrs); struct mpnh *nh; rt_format_via(e, via); @@ -2102,7 +2117,9 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, ea_list *tm bsprintf(from, " from %I", a->from); else from[0] = 0; - if (a->proto->proto->get_route_info || d->verbose) + + get_route_info = a->src->proto->proto->get_route_info; + if (get_route_info || d->verbose) { /* Need to normalize the extended attributes */ ea_list *t = tmpa; @@ -2111,11 +2128,11 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, ea_list *tm ea_merge(t, tmpa); ea_sort(tmpa); } - if (a->proto->proto->get_route_info) - a->proto->proto->get_route_info(e, info, tmpa); + if (get_route_info) + get_route_info(e, info, tmpa); else bsprintf(info, " (%d)", e->pref); - cli_printf(c, -1007, "%-18s %s [%s %s%s]%s%s", ia, via, a->proto->name, + cli_printf(c, -1007, "%-18s %s [%s %s%s]%s%s", ia, via, a->src->proto->name, tm, from, primary ? (sync_error ? " !" 
: " *") : "", info); for (nh = a->nexthops; nh; nh = nh->next) cli_printf(c, -1007, "\tvia %I on %s weight %d", nh->gw, nh->iface->name, nh->weight + 1); @@ -2139,7 +2156,7 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) continue; struct ea_list *tmpa; - struct proto *p0 = e->attrs->proto; + struct rte_src *src = e->attrs->src; struct proto *p1 = d->export_protocol; struct proto *p2 = d->show_protocol; @@ -2148,9 +2165,9 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) d->rt_counter++; ee = e; rte_update_lock(); /* We use the update buffer for filtering */ - tmpa = p0->make_tmp_attrs ? p0->make_tmp_attrs(e, rte_update_pool) : NULL; + tmpa = make_tmp_attrs(e, rte_update_pool); ok = f_run(d->filter, &e, &tmpa, rte_update_pool, FF_FORCE_TMPATTR) <= F_ACCEPT; - if (p2 && p2 != p0) ok = 0; + if (p2 && p2 != src->proto) ok = 0; if (ok && d->export_mode) { int ic; diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 8e25c4d2..d34e2ae3 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -621,12 +621,14 @@ bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains) return -1; } +/* static void bgp_init_prefix(struct fib_node *N) { struct bgp_prefix *p = (struct bgp_prefix *) N; p->bucket_node.next = NULL; } +*/ static int bgp_compare_u32(const u32 *x, const u32 *y) @@ -870,30 +872,125 @@ bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck) mb_free(buck); } + +/* Prefix hash table */ + +static inline u32 prefix_hash(ip_addr prefix, int pxlen, u32 path_id, u32 order) +{ + u32 x = ipa_hash(prefix) + pxlen + path_id; + return (x * 2902958171u) >> (32 - order); +} + +static inline u32 px_hash_size(struct bgp_proto *p) +{ return 1 << p->px_hash_order; } + +void +bgp_init_prefix_table(struct bgp_proto *p, u32 order) +{ + p->px_hash_count = 0; + p->px_hash_order = order; + p->prefix_table = mb_allocz(p->p.pool, px_hash_size(p) * sizeof(struct bgp_prefix *)); + p->prefix_slab = sl_new(p->p.pool, sizeof(struct 
bgp_prefix)); +} + +static void +bgp_rehash_prefix_table(struct bgp_proto *p, int step) +{ + struct bgp_prefix **old_tab, *px, *px_next; + u32 old_size, hash, i; + + old_tab = p->prefix_table; + old_size = px_hash_size(p); + + p->px_hash_order += step; + p->prefix_table = mb_allocz(p->p.pool, px_hash_size(p) * sizeof(struct bgp_prefix *)); + + for (i = 0; i < old_size; i++) + for (px = old_tab[i]; px; px = px_next) + { + px_next = px->next; + hash = prefix_hash(px->n.prefix, px->n.pxlen, px->path_id, p->px_hash_order); + px->next = p->prefix_table[hash]; + p->prefix_table[hash] = px; + } + + mb_free(old_tab); +} + +static struct bgp_prefix * +bgp_get_prefix(struct bgp_proto *p, ip_addr prefix, int pxlen, u32 path_id) +{ + struct bgp_prefix *bp; + u32 hash = prefix_hash(prefix, pxlen, path_id, p->px_hash_order); + + for (bp = p->prefix_table[hash]; bp; bp = bp->next) + if (bp->n.pxlen == pxlen && ipa_equal(bp->n.prefix, prefix) && bp->path_id == path_id) + return bp; + + bp = sl_alloc(p->prefix_slab); + bp->n.prefix = prefix; + bp->n.pxlen = pxlen; + bp->path_id = path_id; + bp->next = p->prefix_table[hash]; + p->prefix_table[hash] = bp; + + bp->bucket_node.next = NULL; + + p->px_hash_count++; + if ((p->px_hash_count > px_hash_size(p)) && (p->px_hash_order < 18)) + bgp_rehash_prefix_table(p, 1); + + return bp; +} + +void +bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp) +{ + struct bgp_prefix **bpp; + u32 hash = prefix_hash(bp->n.prefix, bp->n.pxlen, bp->path_id, p->px_hash_order); + + for (bpp = &p->prefix_table[hash]; *bpp; bpp = &(*bpp)->next) /* advance the cursor; do not overwrite the chain link */ + if (*bpp == bp) + break; + + *bpp = bp->next; + sl_free(p->prefix_slab, bp); + + p->px_hash_count--; + if ((p->px_hash_count < (px_hash_size(p) / 4)) && (p->px_hash_order > 10)) + bgp_rehash_prefix_table(p, -1); +} + + void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs) { struct bgp_proto *p = (struct bgp_proto *) P; struct bgp_bucket *buck; struct 
bgp_prefix *px; + rte *key; + u32 path_id; DBG("BGP: Got route %I/%d %s\n", n->n.prefix, n->n.pxlen, new ? "up" : "down"); if (new) { + key = new; buck = bgp_get_bucket(p, n, attrs, new->attrs->source != RTS_BGP); if (!buck) /* Inconsistent attribute list */ return; } else { + key = old; if (!(buck = p->withdraw_bucket)) { buck = p->withdraw_bucket = mb_alloc(P->pool, sizeof(struct bgp_bucket)); init_list(&buck->prefixes); } } - px = fib_get(&p->prefix_fib, &n->n.prefix, n->n.pxlen); + path_id = p->add_path_tx ? key->attrs->src->global_id : 0; + px = bgp_get_prefix(p, n->n.prefix, n->n.pxlen, path_id); if (px->bucket_node.next) { DBG("\tRemoving old entry.\n"); @@ -1026,7 +1123,7 @@ bgp_update_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *p if (rr) { /* Handling route reflection, RFC 4456 */ - struct bgp_proto *src = (struct bgp_proto *) e->attrs->proto; + struct bgp_proto *src = (struct bgp_proto *) e->attrs->src->proto; a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); if (!a) @@ -1076,7 +1173,8 @@ bgp_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool * { rte *e = *new; struct bgp_proto *p = (struct bgp_proto *) P; - struct bgp_proto *new_bgp = (e->attrs->proto->proto == &proto_bgp) ? (struct bgp_proto *) e->attrs->proto : NULL; + struct bgp_proto *new_bgp = (e->attrs->src->proto->proto == &proto_bgp) ? 
+ (struct bgp_proto *) e->attrs->src->proto : NULL; if (p == new_bgp) /* Poison reverse updates */ return -1; @@ -1115,7 +1213,7 @@ bgp_get_neighbor(rte *r) if (e && as_path_get_first(e->u.ptr, &as)) return as; else - return ((struct bgp_proto *) r->attrs->proto)->remote_as; + return ((struct bgp_proto *) r->attrs->src->proto)->remote_as; } static inline int @@ -1128,8 +1226,8 @@ rte_resolvable(rte *rt) int bgp_rte_better(rte *new, rte *old) { - struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->proto; - struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->proto; + struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->src->proto; + struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->src->proto; eattr *x, *y; u32 n, o; @@ -1263,7 +1361,7 @@ same_group(rte *r, u32 lpref, u32 lasn) static inline int use_deterministic_med(rte *r) { - struct proto *P = r->attrs->proto; + struct proto *P = r->attrs->src->proto; return (P->proto == &proto_bgp) && ((struct bgp_proto *) P)->cf->deterministic_med; } @@ -1548,7 +1646,6 @@ bgp_decode_attrs(struct bgp_conn *conn, byte *attr, unsigned int len, struct lin int withdraw = 0; bzero(a, sizeof(rta)); - a->proto = &bgp->p; a->source = RTS_BGP; a->scope = SCOPE_UNIVERSE; a->cast = RTC_UNICAST; @@ -1757,14 +1854,14 @@ bgp_get_attr(eattr *a, byte *buf, int buflen) } void -bgp_attr_init(struct bgp_proto *p) +bgp_init_bucket_table(struct bgp_proto *p) { p->hash_size = 256; p->hash_limit = p->hash_size * 4; p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *)); init_list(&p->bucket_queue); p->withdraw_bucket = NULL; - fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix); + // fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix); } void diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 7cd0b0ae..98c49b30 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -367,7 +367,9 @@ bgp_conn_enter_established_state(struct 
bgp_conn *conn) p->conn = conn; p->last_error_class = 0; p->last_error_code = 0; - bgp_attr_init(conn->bgp); + bgp_init_bucket_table(p); + bgp_init_prefix_table(p, 8); + bgp_conn_set_state(conn, BS_ESTABLISHED); proto_notify_state(&p->p, PS_UP); } @@ -417,8 +419,11 @@ static void bgp_send_open(struct bgp_conn *conn) { conn->start_state = conn->bgp->start_state; - conn->want_as4_support = conn->bgp->cf->enable_as4 && (conn->start_state != BSS_CONNECT_NOCAP); - conn->peer_as4_support = 0; // Default value, possibly changed by receiving capability. + + // Default values, possibly changed by receiving capabilities. + conn->peer_refresh_support = 0; + conn->peer_as4_support = 0; + conn->peer_add_path = 0; conn->advertised_as = 0; DBG("BGP: Sending open\n"); @@ -970,19 +975,17 @@ get_igp_table(struct bgp_config *cf) static struct proto * bgp_init(struct proto_config *C) { - struct bgp_config *c = (struct bgp_config *) C; struct proto *P = proto_new(C, sizeof(struct bgp_proto)); + struct bgp_config *c = (struct bgp_config *) C; struct bgp_proto *p = (struct bgp_proto *) P; P->accept_ra_types = c->secondary ? RA_ACCEPTED : RA_OPTIMAL; P->rt_notify = bgp_rt_notify; - P->rte_better = bgp_rte_better; P->import_control = bgp_import_control; P->neigh_notify = bgp_neigh_notify; P->reload_routes = bgp_reload_routes; - - if (c->deterministic_med) - P->rte_recalculate = bgp_rte_recalculate; + P->rte_better = bgp_rte_better; + P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL; p->cf = c; p->local_as = c->local_as; @@ -1238,15 +1241,19 @@ bgp_show_proto_info(struct proto *P) else if (P->proto_state == PS_UP) { cli_msg(-1006, " Neighbor ID: %R", p->remote_id); - cli_msg(-1006, " Neighbor caps: %s%s", + cli_msg(-1006, " Neighbor caps: %s%s%s%s", c->peer_refresh_support ? " refresh" : "", - c->peer_as4_support ? " AS4" : ""); - cli_msg(-1006, " Session: %s%s%s%s%s", + c->peer_as4_support ? " AS4" : "", + (c->peer_add_path & ADD_PATH_RX) ? 
" add-path-rx" : "", + (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : ""); + cli_msg(-1006, " Session: %s%s%s%s%s%s%s", p->is_internal ? "internal" : "external", p->cf->multihop ? " multihop" : "", p->rr_client ? " route-reflector" : "", p->rs_client ? " route-server" : "", - p->as4_session ? " AS4" : ""); + p->as4_session ? " AS4" : "", + p->add_path_rx ? " add-path-rx" : "", + p->add_path_tx ? " add-path-tx" : ""); cli_msg(-1006, " Source address: %I", p->source_addr); if (P->cf->in_limit) cli_msg(-1006, " Route limit: %d/%d", diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index b5e216b7..a35c362c 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -45,6 +45,7 @@ struct bgp_config { int passive; /* Do not initiate outgoing connection */ int interpret_communities; /* Hardwired handling of well-known communities */ int secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */ + int add_path; /* Use ADD-PATH extension [draft] */ int allow_local_as; /* Allow that number of local ASNs in incoming AS_PATHs */ unsigned connect_retry_time; unsigned hold_time, initial_hold_time; @@ -67,6 +68,11 @@ struct bgp_config { #define GW_DIRECT 1 #define GW_RECURSIVE 2 +#define ADD_PATH_RX 1 +#define ADD_PATH_TX 2 +#define ADD_PATH_FULL 3 + + struct bgp_conn { struct bgp_proto *bgp; struct birdsock *sk; @@ -80,9 +86,9 @@ struct bgp_conn { byte *notify_data; u32 advertised_as; /* Temporary value for AS number received */ int start_state; /* protocol start_state snapshot when connection established */ - int want_as4_support; /* Connection tries to establish AS4 session */ - int peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */ - int peer_refresh_support; /* Peer supports route refresh [RFC2918] */ + u8 peer_refresh_support; /* Peer supports route refresh [RFC2918] */ + u8 peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */ + u8 peer_add_path; /* Peer supports ADD-PATH [draft] */ unsigned hold_time, keepalive_time; /* Times calculated from my and 
neighbor's requirements */ }; @@ -91,8 +97,10 @@ struct bgp_proto { struct bgp_config *cf; /* Shortcut to BGP configuration */ u32 local_as, remote_as; int start_state; /* Substates that partitions BS_START */ - int is_internal; /* Internal BGP connection (local_as == remote_as) */ - int as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ + u8 is_internal; /* Internal BGP connection (local_as == remote_as) */ + u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ + u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ + u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ u32 local_id; /* BGP identifier of this router */ u32 remote_id; /* BGP identifier of the neighbor */ u32 rr_cluster_id; /* Route reflector cluster ID */ @@ -110,7 +118,10 @@ struct bgp_proto { struct timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */ struct bgp_bucket **bucket_hash; /* Hash table of attribute buckets */ unsigned int hash_size, hash_count, hash_limit; - struct fib prefix_fib; /* Prefixes to be sent */ + // struct fib prefix_fib; /* Prefixes to be sent */ + struct bgp_prefix **prefix_table; /* Prefixes to be sent */ + slab *prefix_slab; /* Slab holding prefix nodes */ + u32 px_hash_order, px_hash_count; list bucket_queue; /* Queue of buckets to send */ struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ unsigned startup_delay; /* Time to delay protocol startup by due to errors */ @@ -126,7 +137,12 @@ struct bgp_proto { }; struct bgp_prefix { - struct fib_node n; /* Node in prefix fib */ + struct { + ip_addr prefix; + int pxlen; + } n; + u32 path_id; + struct bgp_prefix *next; node bucket_node; /* Node in per-bucket list */ }; @@ -160,6 +176,9 @@ void bgp_conn_enter_idle_state(struct bgp_conn *conn); void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code); void bgp_stop(struct bgp_proto *p, unsigned 
subcode); +struct rte_source *bgp_find_source(struct bgp_proto *p, u32 path_id); +struct rte_source *bgp_get_source(struct bgp_proto *p, u32 path_id); + #ifdef LOCAL_DEBUG @@ -195,9 +214,11 @@ int bgp_rte_better(struct rte *, struct rte *); int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs); int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *); -void bgp_attr_init(struct bgp_proto *); -unsigned int bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains); +void bgp_init_bucket_table(struct bgp_proto *); void bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck); +void bgp_init_prefix_table(struct bgp_proto *p, u32 order); +void bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp); +unsigned int bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains); void bgp_get_route_info(struct rte *, byte *buf, struct ea_list *attrs); inline static void bgp_attach_attr_ip(struct ea_list **to, struct linpool *pool, unsigned attr, ip_addr a) diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index e93501d3..76a76470 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -26,7 +26,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC, - SECONDARY, ALLOW, BFD) + SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX) CF_GRAMMAR @@ -110,6 +110,9 @@ bgp_proto: | bgp_proto PASSIVE bool ';' { BGP_CFG->passive = $3; } | bgp_proto INTERPRET COMMUNITIES bool ';' { BGP_CFG->interpret_communities = $4; } | bgp_proto SECONDARY bool ';' { BGP_CFG->secondary = $3; } + | bgp_proto ADD PATHS RX ';' { BGP_CFG->add_path = ADD_PATH_RX; } + | bgp_proto ADD PATHS TX 
';' { BGP_CFG->add_path = ADD_PATH_TX; } + | bgp_proto ADD PATHS bool ';' { BGP_CFG->add_path = $4 ? ADD_PATH_FULL : 0; } | bgp_proto ALLOW LOCAL AS ';' { BGP_CFG->allow_local_as = -1; } | bgp_proto ALLOW LOCAL AS expr ';' { BGP_CFG->allow_local_as = $5; } | bgp_proto IGP TABLE rtable ';' { BGP_CFG->igp_table = $4; } diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index 9d85cbc9..42064332 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -165,6 +165,21 @@ bgp_put_cap_as4(struct bgp_conn *conn, byte *buf) return buf + 4; } +static byte * +bgp_put_cap_add_path(struct bgp_conn *conn, byte *buf) +{ + *buf++ = 69; /* Capability 69: Support for ADD-PATH */ + *buf++ = 4; /* Capability data length */ + + *buf++ = 0; /* Appropriate AF */ + *buf++ = BGP_AF; + *buf++ = 1; /* SAFI 1 */ + + *buf++ = conn->bgp->cf->add_path; + + return buf; +} + static byte * bgp_create_open(struct bgp_conn *conn, byte *buf) { @@ -201,9 +216,12 @@ bgp_create_open(struct bgp_conn *conn, byte *buf) if (p->cf->enable_refresh) cap = bgp_put_cap_rr(conn, cap); - if (conn->want_as4_support) + if (p->cf->enable_as4) cap = bgp_put_cap_as4(conn, cap); + if (p->cf->add_path) + cap = bgp_put_cap_add_path(conn, cap); + cap_len = cap - buf - 12; if (cap_len > 0) { @@ -230,6 +248,13 @@ bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, unsig { struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen); + + if (p->add_path_tx) + { + put_u32(w, px->path_id); + w += 4; + } + *w++ = px->n.pxlen; bytes = (px->n.pxlen + 7) / 8; a = px->n.prefix; @@ -238,7 +263,8 @@ bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, unsig w += bytes; remains -= bytes + 1; rem_node(&px->bucket_node); - fib_delete(&p->prefix_fib, px); + bgp_free_prefix(p, px); + // fib_delete(&p->prefix_fib, px); } return w - start; } @@ -251,7 +277,8 @@ bgp_flush_prefixes(struct bgp_proto *p, 
struct bgp_bucket *buck) struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); log(L_ERR "%s: - route %I/%d skipped", p->p.name, px->n.prefix, px->n.pxlen); rem_node(&px->bucket_node); - fib_delete(&p->prefix_fib, px); + bgp_free_prefix(p, px); + // fib_delete(&p->prefix_fib, px); } } @@ -633,7 +660,7 @@ void bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) { // struct bgp_proto *p = conn->bgp; - int cl; + int i, cl; while (len > 0) { @@ -650,14 +677,25 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) conn->peer_refresh_support = 1; break; - case 65: /* AS4 capability, RFC 4893 */ + case 65: /* AS4 capability, RFC 4893 */ if (cl != 4) goto err; conn->peer_as4_support = 1; - if (conn->want_as4_support) + if (conn->bgp->cf->enable_as4) conn->advertised_as = get_u32(opt + 2); break; + case 69: /* ADD-PATH capability, draft */ + if (cl % 4) + goto err; + for (i = 0; i < cl; i += 4) + if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */ + conn->peer_add_path = opt[2+i+3]; + if (conn->peer_add_path > ADD_PATH_FULL) + goto err; + + break; + /* We can safely ignore all other capabilities */ } len -= 2 + cl; @@ -796,7 +834,12 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) conn->hold_time = MIN(hold, p->cf->hold_time); conn->keepalive_time = p->cf->keepalive_time ? 
: conn->hold_time / 3; p->remote_id = id; - p->as4_session = conn->want_as4_support && conn->peer_as4_support; + p->as4_session = p->cf->enable_as4 && conn->peer_as4_support; + p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX); + p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX); + + if (p->add_path_tx) + p->p.accept_ra_types = RA_ANY; DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, p->as4_session); @@ -806,6 +849,13 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) } #define DECODE_PREFIX(pp, ll) do { \ + if (p->add_path_rx) \ + { \ + if (ll < 5) { err=1; goto done; } \ + path_id = get_u32(pp); \ + pp += 4; \ + ll -= 4; \ + } \ int b = *pp++; \ int q; \ ll--; \ @@ -820,6 +870,53 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) pxlen = b; \ } while (0) + +static inline void +bgp_rte_update(struct bgp_proto *p, ip_addr prefix, int pxlen, + u32 path_id, u32 *last_id, struct rte_src **src, + rta *a0, rta **a) +{ + if (path_id != *last_id) + { + *src = rt_get_source(&p->p, path_id); + *last_id = path_id; + + if (*a) + { + rta_free(*a); + *a = NULL; + } + } + + /* Prepare cached route attributes */ + if (!*a) + { + a0->src = *src; + *a = rta_lookup(a0); + } + + net *n = net_get(p->p.table, prefix, pxlen); + rte *e = rte_get_temp(rta_clone(*a)); + e->net = n; + e->pflags = 0; + e->u.bgp.suppressed = 0; + rte_update2(p->p.main_ahook, n, e, *src); +} + +static inline void +bgp_rte_withdraw(struct bgp_proto *p, ip_addr prefix, int pxlen, + u32 path_id, u32 *last_id, struct rte_src **src) +{ + if (path_id != *last_id) + { + *src = rt_find_source(&p->p, path_id); + *last_id = path_id; + } + + net *n = net_find(p->p.table, prefix, pxlen); + rte_update2( p->p.main_ahook, n, NULL, *src); +} + static inline int bgp_set_next_hop(struct bgp_proto *p, rta *a) { @@ -878,18 +975,20 @@ 
bgp_do_rx_update(struct bgp_conn *conn, byte *attrs, int attr_len) { struct bgp_proto *p = conn->bgp; - net *n; - rta *a0, *a = NULL; + struct rte_src *src = p->p.main_source; + rta *a0, *a; ip_addr prefix; int pxlen, err = 0; + u32 path_id = 0; + u32 last_id = 0; /* Withdraw routes */ while (withdrawn_len) { DECODE_PREFIX(withdrawn, withdrawn_len); DBG("Withdraw %I/%d\n", prefix, pxlen); - if (n = net_find(p->p.table, prefix, pxlen)) - rte_update(p->p.table, n, &p->p, &p->p, NULL); + + bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); } if (!attr_len && !nlri_len) /* shortcut */ @@ -900,28 +999,22 @@ bgp_do_rx_update(struct bgp_conn *conn, if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */ return; - if (a0 && nlri_len && bgp_set_next_hop(p, a0)) - a = rta_lookup(a0); + if (a0 && ! bgp_set_next_hop(p, a0)) + a0 = NULL; + + a = NULL; + last_id = 0; + src = p->p.main_source; while (nlri_len) { DECODE_PREFIX(nlri, nlri_len); DBG("Add %I/%d\n", prefix, pxlen); - if (a) - { - rte *e = rte_get_temp(rta_clone(a)); - e->net = net_get(p->p.table, prefix, pxlen); - e->pflags = 0; - e->u.bgp.suppressed = 0; - rte_update(p->p.table, e->net, &p->p, &p->p, e); - } - else - { - /* Forced withdraw as a result of soft error */ - if (n = net_find(p->p.table, prefix, pxlen)) - rte_update(p->p.table, n, &p->p, &p->p, NULL); - } + if (a0) + bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a); + else /* Forced withdraw as a result of soft error */ + bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); } done: @@ -977,13 +1070,15 @@ bgp_do_rx_update(struct bgp_conn *conn, byte *attrs, int attr_len) { struct bgp_proto *p = conn->bgp; + struct rte_src *src = p->p.main_source; byte *start, *x; int len, len0; unsigned af, sub; - net *n; - rta *a0, *a = NULL; + rta *a0, *a; ip_addr prefix; int pxlen, err = 0; + u32 path_id = 0; + u32 last_id = 0; p->mp_reach_len = 0; p->mp_unreach_len = 0; @@ -998,8 +1093,7 @@ bgp_do_rx_update(struct bgp_conn 
*conn, { DECODE_PREFIX(x, len); DBG("Withdraw %I/%d\n", prefix, pxlen); - if (n = net_find(p->p.table, prefix, pxlen)) - rte_update(p->p.table, n, &p->p, &p->p, NULL); + bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); } } @@ -1016,28 +1110,22 @@ bgp_do_rx_update(struct bgp_conn *conn, len -= *x + 2; x += *x + 2; - if (a0 && bgp_set_next_hop(p, a0)) - a = rta_lookup(a0); + if (a0 && ! bgp_set_next_hop(p, a0)) + a0 = NULL; + + a = NULL; + last_id = 0; + src = p->p.main_source; while (len) { DECODE_PREFIX(x, len); DBG("Add %I/%d\n", prefix, pxlen); - if (a) - { - rte *e = rte_get_temp(rta_clone(a)); - e->net = net_get(p->p.table, prefix, pxlen); - e->pflags = 0; - e->u.bgp.suppressed = 0; - rte_update(p->p.table, e->net, &p->p, &p->p, e); - } - else - { - /* Forced withdraw as a result of soft error */ - if (n = net_find(p->p.table, prefix, pxlen)) - rte_update(p->p.table, n, &p->p, &p->p, NULL); - } + if (a0) + bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a); + else /* Forced withdraw as a result of soft error */ + bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); } } diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c index 2fa87201..232f3f6c 100644 --- a/proto/ospf/ospf.c +++ b/proto/ospf/ospf.c @@ -300,14 +300,14 @@ ospf_init(struct proto_config *c) { struct proto *p = proto_new(c, sizeof(struct proto_ospf)); - p->make_tmp_attrs = ospf_make_tmp_attrs; - p->store_tmp_attrs = ospf_store_tmp_attrs; - p->import_control = ospf_import_control; - p->reload_routes = ospf_reload_routes; p->accept_ra_types = RA_OPTIMAL; p->rt_notify = ospf_rt_notify; p->if_notify = ospf_if_notify; p->ifa_notify = ospf_ifa_notify; + p->import_control = ospf_import_control; + p->reload_routes = ospf_reload_routes; + p->make_tmp_attrs = ospf_make_tmp_attrs; + p->store_tmp_attrs = ospf_store_tmp_attrs; p->rte_better = ospf_rte_better; p->rte_same = ospf_rte_same; @@ -504,7 +504,7 @@ ospf_import_control(struct proto *p, rte ** new, ea_list ** attrs, 
struct ospf_area *oa = ospf_main_area((struct proto_ospf *) p); rte *e = *new; - if (p == e->attrs->proto) + if (e->attrs->src->proto == p) return -1; /* Reject our own routes */ if (oa_is_stub(oa)) diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c index f509b896..52110aa1 100644 --- a/proto/ospf/rt.c +++ b/proto/ospf/rt.c @@ -1991,10 +1991,10 @@ again1: if (nf->n.type) /* Add the route */ { rta a0 = { - .proto = p, + .src = p->main_source, .source = nf->n.type, .scope = SCOPE_UNIVERSE, - .cast = RTC_UNICAST, + .cast = RTC_UNICAST }; if (nf->n.nhs->next) @@ -2032,7 +2032,7 @@ again1: DBG("Mod rte type %d - %I/%d via %I on iface %s, met %d\n", a0.source, nf->fn.prefix, nf->fn.pxlen, a0.gw, a0.iface ? a0.iface->name : "(none)", nf->n.metric1); - rte_update(p->table, ne, p, p, e); + rte_update(p, ne, e); } } else if (nf->old_rta) @@ -2042,7 +2042,7 @@ again1: nf->old_rta = NULL; net *ne = net_get(p->table, nf->fn.prefix, nf->fn.pxlen); - rte_update(p->table, ne, p, p, NULL); + rte_update(p, ne, NULL); } /* Remove unused rt entry. Entries with fn.x0 == 1 are persistent. */ diff --git a/proto/pipe/pipe.c b/proto/pipe/pipe.c index 51be3c7d..2e206038 100644 --- a/proto/pipe/pipe.c +++ b/proto/pipe/pipe.c @@ -49,7 +49,7 @@ pipe_rt_notify(struct proto *P, rtable *src_table, net *n, rte *new, rte *old, e struct pipe_proto *p = (struct pipe_proto *) P; struct announce_hook *ah = (src_table == P->table) ? 
p->peer_ahook : P->main_ahook; rtable *dst_table = ah->table; - struct proto *src; + struct rte_src *src; net *nn; rte *e; @@ -72,7 +72,7 @@ pipe_rt_notify(struct proto *P, rtable *src_table, net *n, rte *new, rte *old, e if (p->mode == PIPE_OPAQUE) { - a.proto = &p->p; + a.src = P->main_source; a.source = RTS_PIPE; } @@ -91,16 +91,16 @@ pipe_rt_notify(struct proto *P, rtable *src_table, net *n, rte *new, rte *old, e e->pflags = new->pflags; } - src = new->attrs->proto; + src = a.src; } else { e = NULL; - src = old->attrs->proto; + src = old->attrs->src; } src_table->pipe_busy = 1; - rte_update2(ah, nn, e, (p->mode == PIPE_OPAQUE) ? &p->p : src); + rte_update2(ah, nn, e, src); src_table->pipe_busy = 0; } @@ -173,6 +173,12 @@ pipe_start(struct proto *P) p->peer_ahook->in_limit = cf->c.out_limit; proto_reset_limit(p->peer_ahook->in_limit); + if (p->mode == PIPE_OPAQUE) + { + P->main_source = rt_get_source(P, 0); + rt_lock_source(P->main_source); + } + return PS_UP; } @@ -187,6 +193,10 @@ pipe_cleanup(struct proto *P) P->main_ahook = NULL; p->peer_ahook = NULL; + if (p->mode == PIPE_OPAQUE) + rt_unlock_source(P->main_source); + P->main_source = NULL; + rt_unlock_table(P->table); rt_unlock_table(p->peer_table); } diff --git a/proto/rip/rip.c b/proto/rip/rip.c index ad285bb3..5cc40403 100644 --- a/proto/rip/rip.c +++ b/proto/rip/rip.c @@ -263,16 +263,18 @@ find_interface(struct proto *p, struct iface *what) * This part is responsible for any updates that come from network */ +static int rip_rte_better(struct rte *new, struct rte *old); + static void rip_rte_update_if_better(rtable *tab, net *net, struct proto *p, rte *new) { rte *old; - old = rte_find(net, p); - if (!old || p->rte_better(new, old) || + old = rte_find(net, p->main_source); + if (!old || rip_rte_better(new, old) || (ipa_equal(old->attrs->from, new->attrs->from) && (old->u.rip.metric != new->u.rip.metric)) ) - rte_update(tab, net, p, p, new); + rte_update(p, net, new); else rte_free(new); } @@ -295,7 
+297,7 @@ advertise_entry( struct proto *p, struct rip_block *b, ip_addr whotoldme, struct int pxlen; bzero(&A, sizeof(A)); - A.proto = p; + A.src= p->main_source; A.source = RTS_RIP; A.scope = SCOPE_UNIVERSE; A.cast = RTC_UNICAST; @@ -614,20 +616,10 @@ rip_start(struct proto *p) add_head( &P->interfaces, NODE rif ); CHK_MAGIC; - rip_init_instance(p); - DBG( "RIP: ...done\n"); return PS_UP; } -static struct proto * -rip_init(struct proto_config *cfg) -{ - struct proto *p = proto_new(cfg, sizeof(struct rip_proto)); - - return p; -} - static void rip_dump(struct proto *p) { @@ -855,7 +847,7 @@ rip_gen_attrs(struct linpool *pool, int metric, u16 tag) static int rip_import_control(struct proto *p, struct rte **rt, struct ea_list **attrs, struct linpool *pool) { - if ((*rt)->attrs->proto == p) /* My own must not be touched */ + if ((*rt)->attrs->src->proto == p) /* My own must not be touched */ return 1; if ((*rt)->attrs->source != RTS_RIP) { @@ -907,7 +899,7 @@ rip_rt_notify(struct proto *p, struct rtable *table UNUSED, struct network *net, if (e->metric > P_CF->infinity) e->metric = P_CF->infinity; - if (new->attrs->proto == p) + if (new->attrs->src->proto == p) e->whotoldme = new->attrs->from; if (!e->metric) /* That's okay: this way user can set his own value for external @@ -929,7 +921,7 @@ rip_rte_same(struct rte *new, struct rte *old) static int rip_rte_better(struct rte *new, struct rte *old) { - struct proto *p = new->attrs->proto; + struct proto *p = new->attrs->src->proto; if (ipa_equal(old->attrs->from, new->attrs->from)) return 1; @@ -940,7 +932,7 @@ rip_rte_better(struct rte *new, struct rte *old) if (old->u.rip.metric > new->u.rip.metric) return 1; - if (old->attrs->proto == new->attrs->proto) /* This does not make much sense for different protocols */ + if (old->attrs->src->proto == new->attrs->src->proto) /* This does not make much sense for different protocols */ if ((old->u.rip.metric == new->u.rip.metric) && ((now - old->lastmod) > 
(P_CF->timeout_time / 2))) return 1; @@ -956,7 +948,7 @@ rip_rte_better(struct rte *new, struct rte *old) static void rip_rte_insert(net *net UNUSED, rte *rte) { - struct proto *p = rte->attrs->proto; + struct proto *p = rte->attrs->src->proto; CHK_MAGIC; DBG( "rip_rte_insert: %p\n", rte ); add_head( &P->garbage, &rte->u.rip.garbage ); @@ -969,16 +961,18 @@ static void rip_rte_remove(net *net UNUSED, rte *rte) { #ifdef LOCAL_DEBUG - struct proto *p = rte->attrs->proto; + struct proto *p = rte->attrs->src->proto; CHK_MAGIC; DBG( "rip_rte_remove: %p\n", rte ); #endif rem_node( &rte->u.rip.garbage ); } -void -rip_init_instance(struct proto *p) +static struct proto * +rip_init(struct proto_config *cfg) { + struct proto *p = proto_new(cfg, sizeof(struct rip_proto)); + p->accept_ra_types = RA_OPTIMAL; p->if_notify = rip_if_notify; p->rt_notify = rip_rt_notify; @@ -989,6 +983,8 @@ rip_init_instance(struct proto *p) p->rte_same = rip_rte_same; p->rte_insert = rip_rte_insert; p->rte_remove = rip_rte_remove; + + return p; } void diff --git a/proto/rip/rip.h b/proto/rip/rip.h index 6e0d5aad..2a327260 100644 --- a/proto/rip/rip.h +++ b/proto/rip/rip.h @@ -177,7 +177,6 @@ struct rip_proto { #endif -void rip_init_instance(struct proto *p); void rip_init_config(struct rip_proto_config *c); /* Authentication functions */ diff --git a/proto/static/static.c b/proto/static/static.c index 9eee820d..9b115acd 100644 --- a/proto/static/static.c +++ b/proto/static/static.c @@ -67,7 +67,7 @@ static_install(struct proto *p, struct static_route *r, struct iface *ifa) DBG("Installing static route %I/%d, rtd=%d\n", r->net, r->masklen, r->dest); bzero(&a, sizeof(a)); - a.proto = p; + a.src = p->main_source; a.source = (r->dest == RTD_DEVICE) ? 
RTS_STATIC_DEVICE : RTS_STATIC; a.scope = SCOPE_UNIVERSE; a.cast = RTC_UNICAST; @@ -113,7 +113,7 @@ static_install(struct proto *p, struct static_route *r, struct iface *ifa) e = rte_get_temp(aa); e->net = n; e->pflags = 0; - rte_update(p->table, n, p, p, e); + rte_update(p, n, e); r->installed = 1; } @@ -127,8 +127,7 @@ static_remove(struct proto *p, struct static_route *r) DBG("Removing static route %I/%d\n", r->net, r->masklen); n = net_find(p->table, r->net, r->masklen); - if (n) - rte_update(p->table, n, p, p, NULL); + rte_update(p, n, NULL); r->installed = 0; } @@ -367,6 +366,7 @@ static_init(struct proto_config *c) p->neigh_notify = static_neigh_notify; p->if_notify = static_if_notify; + return p; } diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c index 84ce9c60..176e11ed 100644 --- a/sysdep/bsd/krt-sock.c +++ b/sysdep/bsd/krt-sock.c @@ -432,7 +432,7 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan) net = net_get(p->p.table, idst, pxlen); rta a = { - .proto = &p->p, + .src = p->p.main_source, .source = RTS_INHERIT, .scope = SCOPE_UNIVERSE, .cast = RTC_UNICAST diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index df9ed622..ed8769b7 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -805,7 +805,7 @@ nl_parse_route(struct nlmsghdr *h, int scan) net *net = net_get(p->p.table, dst, i->rtm_dst_len); rta ra = { - .proto = &p->p, + .src= p->p.main_source, .source = RTS_INHERIT, .scope = SCOPE_UNIVERSE, .cast = RTC_UNICAST diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 8f24cf51..6fdef619 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -351,15 +351,14 @@ krt_learn_announce_update(struct krt_proto *p, rte *e) ee->pflags = 0; ee->pref = p->p.preference; ee->u.krt = e->u.krt; - rte_update(p->p.table, nn, &p->p, &p->p, ee); + rte_update(&p->p, nn, ee); } static void krt_learn_announce_delete(struct krt_proto *p, net *n) { n = net_find(p->p.table, n->n.prefix, n->n.pxlen); - if (n) - 
rte_update(p->p.table, n, &p->p, &p->p, NULL); + rte_update(&p->p, n, NULL); } /* Called when alien route is discovered during scan */ @@ -697,7 +696,7 @@ krt_export_rte(struct krt_proto *p, rte **new, ea_list **tmpa) if (filter == FILTER_ACCEPT) return 1; - struct proto *src = (*new)->attrs->proto; + struct proto *src = (*new)->attrs->src->proto; *tmpa = src->make_tmp_attrs ? src->make_tmp_attrs(*new, krt_filter_lp) : NULL; return f_run(filter, new, tmpa, krt_filter_lp, FF_FORCE_TMPATTR) <= F_ACCEPT; } @@ -939,7 +938,7 @@ krt_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool * struct krt_proto *p = (struct krt_proto *) P; rte *e = *new; - if (e->attrs->proto == P) + if (e->attrs->src->proto == P) return -1; if (!KRT_CF->devroutes && @@ -991,10 +990,10 @@ krt_init(struct proto_config *c) struct krt_proto *p = proto_new(c, sizeof(struct krt_proto)); p->p.accept_ra_types = RA_OPTIMAL; - p->p.make_tmp_attrs = krt_make_tmp_attrs; - p->p.store_tmp_attrs = krt_store_tmp_attrs; p->p.import_control = krt_import_control; p->p.rt_notify = krt_notify; + p->p.make_tmp_attrs = krt_make_tmp_attrs; + p->p.store_tmp_attrs = krt_store_tmp_attrs; p->p.rte_same = krt_rte_same; krt_sys_init(p);