Core multipath support.

This commit is contained in:
Ondrej Zajicek 2010-12-07 23:33:55 +01:00
parent 01427d3f2b
commit 7e95c05d88
5 changed files with 140 additions and 55 deletions

View file

@ -51,7 +51,7 @@ CF_ENUM(T_ENUM_RTS, RTS_, DUMMY, STATIC, INHERIT, DEVICE, STATIC_DEVICE, REDIREC
RIP, OSPF, OSPF_IA, OSPF_EXT1, OSPF_EXT2, BGP, PIPE) RIP, OSPF, OSPF_IA, OSPF_EXT1, OSPF_EXT2, BGP, PIPE)
CF_ENUM(T_ENUM_SCOPE, SCOPE_, HOST, LINK, SITE, ORGANIZATION, UNIVERSE) CF_ENUM(T_ENUM_SCOPE, SCOPE_, HOST, LINK, SITE, ORGANIZATION, UNIVERSE)
CF_ENUM(T_ENUM_RTC, RTC_, UNICAST, BROADCAST, MULTICAST, ANYCAST) CF_ENUM(T_ENUM_RTC, RTC_, UNICAST, BROADCAST, MULTICAST, ANYCAST)
CF_ENUM(T_ENUM_RTD, RTD_, ROUTER, DEVICE, BLACKHOLE, UNREACHABLE, PROHIBIT) CF_ENUM(T_ENUM_RTD, RTD_, ROUTER, DEVICE, BLACKHOLE, UNREACHABLE, PROHIBIT, MULTIPATH)
%type <i32> idval %type <i32> idval
%type <f> imexport %type <f> imexport

View file

@ -170,7 +170,7 @@ struct hostentry {
struct hostentry *next; /* Next in hash chain */ struct hostentry *next; /* Next in hash chain */
unsigned hash_key; /* Hash key */ unsigned hash_key; /* Hash key */
unsigned uc; /* Use count */ unsigned uc; /* Use count */
struct iface *iface; /* Chosen outgoing interface */ struct rta *src; /* Source rta entry */
ip_addr gw; /* Chosen next hop */ ip_addr gw; /* Chosen next hop */
byte dest; /* Chosen route destination type (RTD_...) */ byte dest; /* Chosen route destination type (RTD_...) */
u32 igp_metric; /* Chosen route IGP metric */ u32 igp_metric; /* Chosen route IGP metric */
@ -266,6 +266,14 @@ void rt_show(struct rt_show_data *);
* construction of BGP route attribute lists. * construction of BGP route attribute lists.
*/ */
/*
 * Multipath next-hop.
 *
 * One element of a singly linked list of next hops attached to an rta
 * through its nexthops field. The list is terminated by a NULL next
 * pointer.
 */
struct mpnh {
ip_addr gw; /* Next hop */
struct iface *iface; /* Outgoing interface */
struct mpnh *next; /* Following next hop in the list, NULL at the end */
unsigned char weight; /* Stored weight; rt_show_rte() prints it as weight + 1 */
};
typedef struct rta { typedef struct rta {
struct rta *next, **pprev; /* Hash chain */ struct rta *next, **pprev; /* Hash chain */
struct proto *proto; /* Protocol instance that originally created the route */ struct proto *proto; /* Protocol instance that originally created the route */
@ -282,6 +290,7 @@ typedef struct rta {
ip_addr from; /* Advertising router */ ip_addr from; /* Advertising router */
struct hostentry *hostentry; /* Hostentry for recursive next-hops */ struct hostentry *hostentry; /* Hostentry for recursive next-hops */
struct iface *iface; /* Outgoing interface */ struct iface *iface; /* Outgoing interface */
struct mpnh *nexthops; /* Next-hops for multipath routes */
struct ea_list *eattrs; /* Extended Attribute chain */ struct ea_list *eattrs; /* Extended Attribute chain */
} rta; } rta;
@ -309,7 +318,8 @@ typedef struct rta {
#define RTD_BLACKHOLE 2 /* Silently drop packets */ #define RTD_BLACKHOLE 2 /* Silently drop packets */
#define RTD_UNREACHABLE 3 /* Reject as unreachable */ #define RTD_UNREACHABLE 3 /* Reject as unreachable */
#define RTD_PROHIBIT 4 /* Administratively prohibited */ #define RTD_PROHIBIT 4 /* Administratively prohibited */
#define RTD_NONE 5 /* Invalid RTD */ #define RTD_MULTIPATH 5 /* Multipath route (nexthops != NULL) */
#define RTD_NONE 6 /* Invalid RTD */
#define RTAF_CACHED 1 /* This is a cached rta */ #define RTAF_CACHED 1 /* This is a cached rta */
@ -387,6 +397,10 @@ void ea_format(eattr *e, byte *buf);
#define EA_FORMAT_BUF_SIZE 256 #define EA_FORMAT_BUF_SIZE 256
ea_list *ea_append(ea_list *to, ea_list *what); ea_list *ea_append(ea_list *to, ea_list *what);
/* Out-of-line, element-by-element comparison of two multipath nexthop lists */
int mpnh__same(struct mpnh *x, struct mpnh *y);

/*
 * Compare multipath nexthops.
 *
 * Identical pointers (two NULLs included) are equal without walking
 * the lists; everything else is delegated to mpnh__same().
 * Returns 1 when the lists are the same, 0 otherwise.
 */
static inline int mpnh_same(struct mpnh *x, struct mpnh *y)
{
  if (x == y)
    return 1;

  return mpnh__same(x, y) != 0;
}
void rta_init(void); void rta_init(void);
rta *rta_lookup(rta *); /* Get rta equivalent to this one, uc++ */ rta *rta_lookup(rta *); /* Get rta equivalent to this one, uc++ */
static inline rta *rta_clone(rta *r) { r->uc++; return r; } static inline rta *rta_clone(rta *r) { r->uc++; return r; }
@ -403,12 +417,14 @@ void rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr *gw, i
* count. Cached rta locks its hostentry (increases its use count), * count. Cached rta locks its hostentry (increases its use count),
* uncached rta does not lock it. Hostentry with zero use count is * uncached rta does not lock it. Hostentry with zero use count is
* removed asynchronously during host cache update, therefore it is * removed asynchronously during host cache update, therefore it is
* safe to hold such hostentry temporarily. There is no need to hold * safe to hold such hostentry temporarily. Hostentry holds a lock for
* a lock for hostentry->dep table, because that table contains routes * a 'source' rta, mainly to share multipath nexthops. There is no
* responsible for that hostentry, and therefore is non-empty if given * need to hold a lock for hostentry->dep table, because that table
* hostentry has non-zero use count. The protocol responsible for routes * contains routes responsible for that hostentry, and therefore is
* with recursive next hops should also hold a lock for a table governing * non-empty if given hostentry has non-zero use count. The protocol
* those routes (argument tab to rta_set_recursive_next_hop()). * responsible for routes with recursive next hops should also hold a
* lock for a table governing those routes (argument tab to
* rta_set_recursive_next_hop()).
*/ */
static inline void rt_lock_hostentry(struct hostentry *he) { if (he) he->uc++; } static inline void rt_lock_hostentry(struct hostentry *he) { if (he) he->uc++; }

View file

@ -57,9 +57,65 @@
pool *rta_pool; pool *rta_pool;
static slab *rta_slab; static slab *rta_slab;
static slab *mpnh_slab;
struct protocol *attr_class_to_protocol[EAP_MAX]; struct protocol *attr_class_to_protocol[EAP_MAX];
/*
 * Hash a multipath nexthop list.
 *
 * XORs together ipa_hash() of every gateway address in the list.
 * Interfaces and weights do not contribute, and an empty list
 * hashes to 0.
 */
static inline unsigned int
mpnh_hash(struct mpnh *x)
{
  unsigned int acc = 0;

  while (x)
    {
      acc ^= ipa_hash(x->gw);
      x = x->next;
    }

  return acc;
}
/*
 * Compare two multipath nexthop lists position by position.
 *
 * Two lists are the same when they have equal length and every pair
 * of corresponding entries agrees in gateway, interface and weight.
 * Returns 1 for equal lists, 0 otherwise.
 */
int
mpnh__same(struct mpnh *x, struct mpnh *y)
{
  while (x && y)
    {
      if (!ipa_equal(x->gw, y->gw))
	return 0;

      if (x->iface != y->iface)
	return 0;

      if (x->weight != y->weight)
	return 0;

      x = x->next;
      y = y->next;
    }

  /* At least one list is exhausted here; they match only if both are */
  return !x && !y;
}
/*
 * Duplicate a multipath nexthop list.
 *
 * Every node is allocated from mpnh_slab and the original order is
 * kept. Returns the head of the new list, or NULL when @o is NULL.
 */
static struct mpnh *
mpnh_copy(struct mpnh *o)
{
  struct mpnh *head = NULL;
  struct mpnh **tail = &head;	/* Slot where the next copy gets linked */

  while (o)
    {
      struct mpnh *dup = sl_alloc(mpnh_slab);

      dup->gw = o->gw;
      dup->iface = o->iface;
      dup->weight = o->weight;
      dup->next = NULL;

      *tail = dup;
      tail = &dup->next;

      o = o->next;
    }

  return head;
}
/*
 * Release every node of a multipath nexthop list back to mpnh_slab.
 * A NULL list is accepted and nothing is done for it.
 */
static void
mpnh_free(struct mpnh *o)
{
  struct mpnh *rest;

  for (; o; o = rest)
    {
      /* Remember the tail first, the node is gone after sl_free() */
      rest = o->next;
      sl_free(mpnh_slab, o);
    }
}
/* /*
* Extended Attributes * Extended Attributes
*/ */
@ -587,7 +643,8 @@ rta_alloc_hash(void)
static inline unsigned int static inline unsigned int
rta_hash(rta *a) rta_hash(rta *a)
{ {
return (a->proto->hash_key ^ ipa_hash(a->gw) ^ ea_hash(a->eattrs)) & 0xffff; return (a->proto->hash_key ^ ipa_hash(a->gw) ^
mpnh_hash(a->nexthops) ^ ea_hash(a->eattrs)) & 0xffff;
} }
static inline int static inline int
@ -604,6 +661,7 @@ rta_same(rta *x, rta *y)
ipa_equal(x->from, y->from) && ipa_equal(x->from, y->from) &&
x->iface == y->iface && x->iface == y->iface &&
x->hostentry == y->hostentry && x->hostentry == y->hostentry &&
mpnh_same(x->nexthops, y->nexthops) &&
ea_same(x->eattrs, y->eattrs)); ea_same(x->eattrs, y->eattrs));
} }
@ -614,6 +672,7 @@ rta_copy(rta *o)
memcpy(r, o, sizeof(rta)); memcpy(r, o, sizeof(rta));
r->uc = 1; r->uc = 1;
r->nexthops = mpnh_copy(o->nexthops);
r->eattrs = ea_list_copy(o->eattrs); r->eattrs = ea_list_copy(o->eattrs);
return r; return r;
} }
@ -707,6 +766,7 @@ rta__free(rta *a)
a->next->pprev = a->pprev; a->next->pprev = a->pprev;
a->aflags = 0; /* Poison the entry */ a->aflags = 0; /* Poison the entry */
rt_unlock_hostentry(a->hostentry); rt_unlock_hostentry(a->hostentry);
mpnh_free(a->nexthops);
ea_free(a->eattrs); ea_free(a->eattrs);
sl_free(rta_slab, a); sl_free(rta_slab, a);
} }
@ -798,6 +858,7 @@ rta_init(void)
{ {
rta_pool = rp_new(&root_pool, "Attributes"); rta_pool = rp_new(&root_pool, "Attributes");
rta_slab = sl_new(rta_pool, sizeof(rta)); rta_slab = sl_new(rta_pool, sizeof(rta));
mpnh_slab = sl_new(rta_pool, sizeof(struct mpnh));
rta_alloc_hash(); rta_alloc_hash();
} }

View file

@ -962,29 +962,31 @@ rt_preconfig(struct config *c)
* triggered by rt_schedule_nhu(). * triggered by rt_schedule_nhu().
*/ */
/*
 * Tell whether the state stored in a hostentry differs from the given
 * interface, gateway, destination type or IGP metric.
 * Returns 1 if any of them differs, 0 if all four match.
 */
static inline int
hostentry_diff(struct hostentry *he, struct iface *iface, ip_addr gw,
	       byte dest, u32 igp_metric)
{
  if (he->iface != iface)
    return 1;

  if (!ipa_equal(he->gw, gw))
    return 1;

  if (he->dest != dest)
    return 1;

  return he->igp_metric != igp_metric;
}
static inline int static inline int
rta_next_hop_outdated(rta *a) rta_next_hop_outdated(rta *a)
{ {
struct hostentry *he = a->hostentry; struct hostentry *he = a->hostentry;
return he && hostentry_diff(he, a->iface, a->gw, a->dest, a->igp_metric);
if (!he)
return 0;
if (!he->src)
return a->dest != RTD_UNREACHABLE;
return (a->iface != he->src->iface) || !ipa_equal(a->gw, he->gw) ||
(a->dest != he->dest) || (a->igp_metric != he->igp_metric) ||
!mpnh_same(a->nexthops, he->src->nexthops);
} }
static inline void static inline void
rta_apply_hostentry(rta *a, struct hostentry *he) rta_apply_hostentry(rta *a, struct hostentry *he)
{ {
a->hostentry = he; a->hostentry = he;
a->iface = he->iface; a->iface = he->src ? he->src->iface : NULL;
a->gw = he->gw; a->gw = he->gw;
a->dest = he->dest; a->dest = he->dest;
a->igp_metric = he->igp_metric; a->igp_metric = he->igp_metric;
a->nexthops = he->src ? he->src->nexthops : NULL;
} }
static inline rte * static inline rte *
@ -1388,6 +1390,7 @@ hc_new_hostentry(struct hostcache *hc, ip_addr a, ip_addr ll, rtable *dep, unsig
he->tab = dep; he->tab = dep;
he->hash_key = k; he->hash_key = k;
he->uc = 0; he->uc = 0;
he->src = NULL;
add_tail(&hc->hostentries, &he->ln); add_tail(&hc->hostentries, &he->ln);
hc_insert(hc, he); hc_insert(hc, he);
@ -1402,6 +1405,8 @@ hc_new_hostentry(struct hostcache *hc, ip_addr a, ip_addr ll, rtable *dep, unsig
static void static void
hc_delete_hostentry(struct hostcache *hc, struct hostentry *he) hc_delete_hostentry(struct hostcache *hc, struct hostentry *he)
{ {
rta_free(he->src);
rem_node(&he->ln); rem_node(&he->ln);
hc_remove(hc, he); hc_remove(hc, he);
sl_free(hc->slab, he); sl_free(hc->slab, he);
@ -1436,6 +1441,8 @@ rt_free_hostcache(rtable *tab)
WALK_LIST(n, hc->hostentries) WALK_LIST(n, hc->hostentries)
{ {
struct hostentry *he = SKIP_BACK(struct hostentry, ln, n); struct hostentry *he = SKIP_BACK(struct hostentry, ln, n);
rta_free(he->src);
if (he->uc) if (he->uc)
log(L_ERR "Hostcache is not empty in table %s", tab->name); log(L_ERR "Hostcache is not empty in table %s", tab->name);
} }
@ -1488,7 +1495,7 @@ rt_get_igp_metric(rte *rt)
return rt->u.rip.metric; return rt->u.rip.metric;
/* Device routes */ /* Device routes */
if (a->dest != RTD_ROUTER) if ((a->dest != RTD_ROUTER) && (a->dest != RTD_MULTIPATH))
return 0; return 0;
return IGP_METRIC_UNKNOWN; return IGP_METRIC_UNKNOWN;
@ -1497,12 +1504,15 @@ rt_get_igp_metric(rte *rt)
static int static int
rt_update_hostentry(rtable *tab, struct hostentry *he) rt_update_hostentry(rtable *tab, struct hostentry *he)
{ {
struct iface *old_iface = he->iface; rta *old_src = he->src;
ip_addr old_gw = he->gw;
byte old_dest = he->dest;
u32 old_metric = he->igp_metric;
int pxlen = 0; int pxlen = 0;
/* Reset the hostentry */
he->src = NULL;
he->gw = IPA_NONE;
he->dest = RTD_UNREACHABLE;
he->igp_metric = 0;
net *n = net_route(tab, he->addr, MAX_PREFIX_LENGTH); net *n = net_route(tab, he->addr, MAX_PREFIX_LENGTH);
if (n) if (n)
{ {
@ -1513,53 +1523,41 @@ rt_update_hostentry(rtable *tab, struct hostentry *he)
{ {
/* Recursive route should not depend on another recursive route */ /* Recursive route should not depend on another recursive route */
log(L_WARN "Next hop address %I resolvable through recursive route for %I/%d", log(L_WARN "Next hop address %I resolvable through recursive route for %I/%d",
he->addr, n->n.prefix, n->n.pxlen); he->addr, n->n.prefix, pxlen);
he->iface = NULL; goto done;
he->gw = IPA_NONE;
he->dest = RTD_UNREACHABLE;
} }
else if (a->dest == RTD_DEVICE)
if (a->dest == RTD_DEVICE)
{ {
if (if_local_addr(he->addr, a->iface)) if (if_local_addr(he->addr, a->iface))
{ {
/* The host address is a local address, this is not valid */ /* The host address is a local address, this is not valid */
log(L_WARN "Next hop address %I is a local address of iface %s", log(L_WARN "Next hop address %I is a local address of iface %s",
he->addr, a->iface->name); he->addr, a->iface->name);
he->iface = NULL; goto done;
he->gw = IPA_NONE;
he->dest = RTD_UNREACHABLE;
} }
else
{
/* The host is directly reachable, use link as a gateway */ /* The host is directly reachable, use link as a gateway */
he->iface = a->iface;
he->gw = he->link; he->gw = he->link;
he->dest = RTD_ROUTER; he->dest = RTD_ROUTER;
} }
}
else else
{ {
/* The host is reachable through some route entry */ /* The host is reachable through some route entry */
he->iface = a->iface;
he->gw = a->gw; he->gw = a->gw;
he->dest = a->dest; he->dest = a->dest;
} }
he->igp_metric = he->iface ? rt_get_igp_metric(n->routes) : 0; he->src = rta_clone(a);
} he->igp_metric = rt_get_igp_metric(n->routes);
else
{
/* The host is unreachable */
he->iface = NULL;
he->gw = IPA_NONE;
he->dest = RTD_UNREACHABLE;
he->igp_metric = 0;
} }
done:
/* Add a prefix range to the trie */ /* Add a prefix range to the trie */
trie_add_prefix(tab->hostcache->trie, he->addr, MAX_PREFIX_LENGTH, pxlen, MAX_PREFIX_LENGTH); trie_add_prefix(tab->hostcache->trie, he->addr, MAX_PREFIX_LENGTH, pxlen, MAX_PREFIX_LENGTH);
return hostentry_diff(he, old_iface, old_gw, old_dest, old_metric); rta_free(old_src);
return old_src != he->src;
} }
static void static void
@ -1630,6 +1628,7 @@ rt_format_via(rte *e, byte *via)
case RTD_BLACKHOLE: bsprintf(via, "blackhole"); break; case RTD_BLACKHOLE: bsprintf(via, "blackhole"); break;
case RTD_UNREACHABLE: bsprintf(via, "unreachable"); break; case RTD_UNREACHABLE: bsprintf(via, "unreachable"); break;
case RTD_PROHIBIT: bsprintf(via, "prohibited"); break; case RTD_PROHIBIT: bsprintf(via, "prohibited"); break;
case RTD_MULTIPATH: bsprintf(via, "multipath"); break;
default: bsprintf(via, "???"); default: bsprintf(via, "???");
} }
} }
@ -1641,6 +1640,7 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, ea_list *tm
byte tm[TM_DATETIME_BUFFER_SIZE], info[256]; byte tm[TM_DATETIME_BUFFER_SIZE], info[256];
rta *a = e->attrs; rta *a = e->attrs;
int primary = (e->net->routes == e); int primary = (e->net->routes == e);
struct mpnh *nh;
rt_format_via(e, via); rt_format_via(e, via);
tm_format_datetime(tm, &config->tf_route, e->lastmod); tm_format_datetime(tm, &config->tf_route, e->lastmod);
@ -1663,6 +1663,8 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, ea_list *tm
bsprintf(info, " (%d)", e->pref); bsprintf(info, " (%d)", e->pref);
cli_printf(c, -1007, "%-18s %s [%s %s%s]%s%s", ia, via, a->proto->name, cli_printf(c, -1007, "%-18s %s [%s %s%s]%s%s", ia, via, a->proto->name,
tm, from, primary ? " *" : "", info); tm, from, primary ? " *" : "", info);
for (nh = a->nexthops; nh; nh = nh->next)
cli_printf(c, -1007, "\tvia %I on %s weight %d", nh->gw, nh->iface->name, nh->weight + 1);
if (d->verbose) if (d->verbose)
rta_show(c, a, tmpa); rta_show(c, a, tmpa);
} }

View file

@ -1015,6 +1015,13 @@ bgp_get_neighbor(rte *r)
return ((struct bgp_proto *) r->attrs->proto)->remote_as; return ((struct bgp_proto *) r->attrs->proto)->remote_as;
} }
/*
 * Route resolvability helper: a route counts as resolvable when its
 * destination type is RTD_ROUTER, RTD_DEVICE or RTD_MULTIPATH.
 * Returns 1 for those types and 0 for every other one.
 */
static inline int
rte_resolvable(rte *rt)
{
  switch (rt->attrs->dest)
    {
    case RTD_ROUTER:
    case RTD_DEVICE:
    case RTD_MULTIPATH:
      return 1;

    default:
      return 0;
    }
}
int int
bgp_rte_better(rte *new, rte *old) bgp_rte_better(rte *new, rte *old)
{ {
@ -1024,9 +1031,8 @@ bgp_rte_better(rte *new, rte *old)
u32 n, o; u32 n, o;
/* RFC 4271 9.1.2.1. Route resolvability test */ /* RFC 4271 9.1.2.1. Route resolvability test */
/* non-NULL iface means it is either RTD_ROUTER or RTD_DEVICE route */ n = rte_resolvable(new);
n = new->attrs->iface != NULL; o = rte_resolvable(old);
o = old->attrs->iface != NULL;
if (n > o) if (n > o)
return 1; return 1;
if (n < o) if (n < o)
@ -1502,7 +1508,7 @@ bgp_get_route_info(rte *e, byte *buf, ea_list *attrs)
buf += bsprintf(buf, " (%d", e->pref); buf += bsprintf(buf, " (%d", e->pref);
if (e->attrs->hostentry) if (e->attrs->hostentry)
{ {
if (!e->attrs->iface) if (!rte_resolvable(e))
buf += bsprintf(buf, "/-"); buf += bsprintf(buf, "/-");
else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN) else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN)
buf += bsprintf(buf, "/?"); buf += bsprintf(buf, "/?");