Commit 145368f5 authored by Ondřej Zajíček's avatar Ondřej Zajíček
Browse files

Extends multipath support for OSPF.

Fixes cases where the same network or external route are propagated by
several OSPF routes and some other corner cases in next hop construction
and ECMP. Allows to specify whether external routes should be merged.

Thanks to Peter Christensen for the original patch.
parent 4dd24f05
......@@ -2291,6 +2291,7 @@ protocol ospf <name> {
stub router <switch>;
tick <num>;
ecmp <switch> [limit <num>];
merge external <switch>;
area <id> {
stub;
nssa;
......@@ -2396,6 +2397,14 @@ protocol ospf <name> {
nexthops in one route. By default, ECMP is disabled. If enabled,
default value of the limit is 16.
<tag>merge external <M>switch</M></tag>
This option specifies whether OSPF should merge external routes from
different routers/LSAs for the same destination. When enabled together
with <cf/ecmp/, equal-cost external routes will be combined to multipath
routes in the same way as regular routes. When disabled, external routes
from different LSAs are treated as separate even if they represents the
same destination. Default value is no.
<tag>area <M>id</M></tag>
This defines an OSPF area with given area ID (an integer or an IPv4
address, similarly to a router ID). The most important area is the
......
......@@ -132,7 +132,7 @@ CF_KEYWORDS(ELIGIBLE, POLL, NETWORKS, HIDDEN, VIRTUAL, CHECK, LINK, ONLY, BFD)
CF_KEYWORDS(RX, BUFFER, LARGE, NORMAL, STUBNET, HIDDEN, SUMMARY, TAG, EXTERNAL)
CF_KEYWORDS(WAIT, DELAY, LSADB, ECMP, LIMIT, WEIGHT, NSSA, TRANSLATOR, STABILITY)
CF_KEYWORDS(GLOBAL, LSID, ROUTER, SELF, INSTANCE, REAL, NETMASK, TX, PRIORITY, LENGTH)
CF_KEYWORDS(SECONDARY)
CF_KEYWORDS(SECONDARY, MERGE)
%type <t> opttext
%type <ld> lsadb_args
......@@ -162,6 +162,7 @@ ospf_proto_item:
| STUB ROUTER bool { OSPF_CFG->stub_router = $3; }
| ECMP bool { OSPF_CFG->ecmp = $2 ? DEFAULT_ECMP_LIMIT : 0; }
| ECMP bool LIMIT expr { OSPF_CFG->ecmp = $2 ? $4 : 0; if ($4 < 0) cf_error("ECMP limit cannot be negative"); }
| MERGE EXTERNAL bool { OSPF_CFG->merge_external = $3; }
| TICK expr { OSPF_CFG->tick = $2; if($2<=0) cf_error("Tick must be greater than zero"); }
| ospf_area
;
......
......@@ -234,6 +234,7 @@ ospf_start(struct proto *p)
po->router_id = proto_get_router_id(p->cf);
po->rfc1583 = c->rfc1583;
po->stub_router = c->stub_router;
po->merge_external = c->merge_external;
po->ebit = 0;
po->ecmp = c->ecmp;
po->tick = c->tick;
......@@ -742,6 +743,7 @@ ospf_reconfigure(struct proto *p, struct proto_config *c)
return 0;
po->stub_router = new->stub_router;
po->merge_external = new->merge_external;
po->ecmp = new->ecmp;
po->tick = new->tick;
po->disp_timer->recurrent = po->tick;
......
......@@ -81,6 +81,7 @@ struct ospf_config
unsigned tick;
byte rfc1583;
byte stub_router;
byte merge_external;
byte abr;
int ecmp;
list area_list; /* list of struct ospf_area_config */
......@@ -777,6 +778,7 @@ struct proto_ospf
struct fib rtf; /* Routing table */
byte rfc1583; /* RFC1583 compatibility */
byte stub_router; /* Do not forward transit traffic */
byte merge_external; /* Should i merge external routes? */
byte ebit; /* Did I originate any ext lsa? */
byte ecmp; /* Maximal number of nexthops in ECMP route, or 0 */
struct ospf_area *backbone; /* If exists */
......
......@@ -37,9 +37,15 @@ ospf_rt_initort(struct fib_node *fn)
}
static inline int
unresolved_vlink(struct mpnh *nhs)
nh_is_vlink(struct mpnh *nhs)
{
return nhs && !nhs->iface;
return !nhs->iface;
}
static inline int
unresolved_vlink(ort *ort)
{
return ort->n.nhs && nh_is_vlink(ort->n.nhs);
}
static inline struct mpnh *
......@@ -54,7 +60,7 @@ new_nexthop(struct proto_ospf *po, ip_addr gw, struct iface *iface, unsigned cha
}
static inline struct mpnh *
copy_nexthop(struct proto_ospf *po, struct mpnh *src)
copy_nexthop(struct proto_ospf *po, const struct mpnh *src)
{
struct mpnh *nh = lp_alloc(po->nhpool, sizeof(struct mpnh));
nh->gw = src->gw;
......@@ -64,72 +70,138 @@ copy_nexthop(struct proto_ospf *po, struct mpnh *src)
return nh;
}
/* If new is better return 1 */
/* Compare nexthops during merge.
We need to maintain nhs sorted to eliminate duplicities */
static int
ri_better(struct proto_ospf *po, orta *new, orta *old)
cmp_nhs(struct mpnh *s1, struct mpnh *s2)
{
if (old->type == RTS_DUMMY)
return 1;
int r;
if (new->type < old->type)
if (!s1)
return 1;
if (new->type > old->type)
return 0;
if (!s2)
return -1;
if (new->metric1 < old->metric1)
return 1;
r = ((int) s2->weight) - ((int) s1->weight);
if (r)
return r;
if (new->metric1 > old->metric1)
return 0;
r = ipa_compare(s1->gw, s2->gw);
if (r)
return r;
return 0;
return ((int) s1->iface->index) - ((int) s2->iface->index);
}
static struct mpnh *
merge_nexthops(struct proto_ospf *po, struct mpnh *s1, struct mpnh *s2, int r1, int r2)
{
struct mpnh *root = NULL;
struct mpnh **n = &root;
int count = po->ecmp;
/* Whether the ASBR or the forward address destination is preferred
in AS external route selection according to 16.4.1. */
/*
* r1, r2 signalize whether we can reuse nexthops from s1, s2.
* New nexthops (s2, new) can be reused if they are not inherited
* from the parent (i.e. it is allocated in calc_next_hop()).
* Current nexthops (s1, en->nhs) can be reused if they weren't
* inherited in previous steps (that is stored in nhs_reuse,
* i.e. created by merging or allocalted in calc_next_hop()).
*
* Generally, a node first inherits shared nexthops from its
* parent and later possibly gets reusable copy during merging.
*/
while ((s1 || s2) && count--)
{
int cmp = cmp_nhs(s1, s2);
if (cmp < 0)
{
*n = r1 ? s1 : copy_nexthop(po, s1);
s1 = s1->next;
}
else if (cmp > 0)
{
*n = r2 ? s2 : copy_nexthop(po, s2);
s2 = s2->next;
}
else
{
*n = r1 ? s1 : (r2 ? s2 : copy_nexthop(po, s1));
s1 = s1->next;
s2 = s2->next;
}
n = &((*n)->next);
}
*n = NULL;
return root;
}
/* Returns true if there are device nexthops in n */
static inline int
epath_preferred(orta *ep)
has_device_nexthops(const struct mpnh *n)
{
return (ep->type == RTS_OSPF) && (ep->oa->areaid != 0);
for (; n; n = n->next)
if (ipa_zero(n->gw))
return 1;
return 0;
}
/* 16.4. (3), return 1 if new is better */
static int
ri_better_asbr(struct proto_ospf *po, orta *new, orta *old)
/* Replace device nexthops with nexthops to gw */
static struct mpnh *
fix_device_nexthops(struct proto_ospf *po, const struct mpnh *n, ip_addr gw)
{
if (old->type == RTS_DUMMY)
return 1;
struct mpnh *root1 = NULL;
struct mpnh *root2 = NULL;
struct mpnh **nn1 = &root1;
struct mpnh **nn2 = &root2;
if (!po->rfc1583)
{
int new_pref = epath_preferred(new);
int old_pref = epath_preferred(old);
/* This is a bit tricky. We cannot just copy the list and update n->gw,
because the list should stay sorted, so we create two lists, one with new
gateways and one with old ones, and then merge them. */
if (new_pref > old_pref)
return 1;
for (; n; n = n->next)
{
struct mpnh *nn = new_nexthop(po, ipa_zero(n->gw) ? gw : n->gw, n->iface, n->weight);
if (new_pref < old_pref)
return 0;
if (ipa_zero(n->gw))
{
*nn1 = nn;
nn1 = &(nn->next);
}
else
{
*nn2 = nn;
nn2 = &(nn->next);
}
}
if (new->metric1 < old->metric1)
return 1;
return merge_nexthops(po, root1, root2, 1, 1);
}
if (new->metric1 > old->metric1)
return 0;
/* Larger area ID is preferred */
if (new->oa->areaid > old->oa->areaid)
return 1;
/* Whether the ASBR or the forward address destination is preferred
in AS external route selection according to 16.4.1. */
static inline int
epath_preferred(const orta *ep)
{
return (ep->type == RTS_OSPF) && (ep->oa->areaid != 0);
}
return 0;
/* Whether the ext route has ASBR/next_hop marked as preferred. */
static inline int
orta_pref(const orta *nf)
{
return !!(nf->options & ORTA_PREF);
}
/* Classify orta entries according to RFC 3101 2.5 (6e) priorities:
Type-7 LSA with P-bit, Type-5 LSA, Type-7 LSA without P-bit */
static int
orta_prio(orta *nf)
orta_prio(const orta *nf)
{
/* RFC 3103 2.5 (6e) priorities */
u32 opts = nf->options & (ORTA_NSSA | ORTA_PROP);
......@@ -145,98 +217,246 @@ orta_prio(orta *nf)
return 0;
}
/* 16.4. (6), return 1 if new is better */
/* Whether the route is better according to RFC 3101 2.5 (6e):
Prioritize Type-7 LSA with P-bit, then Type-5 LSA, then higher router ID */
static int
orta_prefer_lsa(const orta *new, const orta *old)
{
int pn = orta_prio(new);
int po = orta_prio(old);
return (pn > po) || ((pn == po) && (new->en->lsa.rt > old->en->lsa.rt));
}
/*
* Compare an existing routing table entry with a new one. Applicable for
* intra-area routes, inter-area routes and router entries. Returns integer
* <, = or > than 0 if the new orta is less, equal or more preferred than
* the old orta.
*/
static int
ri_better_ext(struct proto_ospf *po, orta *new, orta *old)
orta_compare(const struct proto_ospf *po, const orta *new, const orta *old)
{
int r;
if (old->type == RTS_DUMMY)
return 1;
/* 16.4. (6a) */
if (new->type < old->type)
/* Prefer intra-area to inter-area to externals */
r = ((int) old->type) - ((int) new->type);
if (r) return r;
/* Prefer lowest type 1 metric */
r = ((int) old->metric1) - ((int) new->metric1);
if (r) return r;
/* Rest is BIRD-specific */
/* Area-wide routes should not mix next-hops from different areas.
This generally should not happen unless there is some misconfiguration. */
if (new->oa->areaid != old->oa->areaid)
return (new->oa->areaid > old->oa->areaid) ? 1 : -1;
/* Prefer routes for configured stubnets (!nhs) to regular routes to dummy
vlink nexthops. We intentionally return -1 if both are stubnets or vlinks. */
if (!old->nhs)
return -1;
if (!new->nhs)
return 1;
if (nh_is_vlink(new->nhs))
return -1;
if (nh_is_vlink(old->nhs))
return 1;
if (new->type > old->type)
if (po->ecmp)
return 0;
/* 16.4. (6b), same type */
if (new->type == RTS_OSPF_EXT2)
{
if (new->metric2 < old->metric2)
return 1;
/* Prefer routes with higher Router ID, just to be more deterministic */
if (new->rid > old->rid)
return 1;
return -1;
}
if (new->metric2 > old->metric2)
return 0;
}
/*
* Compare ASBR routing table entry with a new one, used for precompute ASBRs
* for AS external route selection (RFC 2328 16.4 (3)), Returns integer < or >
* than 0 if the new ASBR is less or more preferred than the old ASBR.
*/
static int
orta_compare_asbr(const struct proto_ospf *po, const orta *new, const orta *old)
{
int r;
if (old->type == RTS_DUMMY)
return 1;
/* 16.4. (6c) */
if (!po->rfc1583)
{
u32 new_pref = new->options & ORTA_PREF;
u32 old_pref = old->options & ORTA_PREF;
if (new_pref > old_pref)
return 1;
if (new_pref < old_pref)
return 0;
r = epath_preferred(new) - epath_preferred(old);
if (r) return r;
}
/* 16.4. (6d) */
if (new->metric1 < old->metric1)
r = ((int) old->metric1) - ((int) new->metric1);
if (r) return r;
/* Larger area ID is preferred */
if (new->oa->areaid > old->oa->areaid)
return 1;
if (new->metric1 > old->metric1)
return 0;
/* There is just one ASBR of that RID per area, so tie is not possible */
return -1;
}
/* RFC 3103, 2.5. (6e) */
int new_prio = orta_prio(new);
int old_prio = orta_prio(old);
/*
* Compare a routing table entry with a new one, for AS external routes
* (RFC 2328 16.4) and NSSA routes (RFC 3103 2.5), Returns integer <, = or >
* than 0 if the new orta is less, equal or more preferred than the old orta.
*/
static int
orta_compare_ext(const struct proto_ospf *po, const orta *new, const orta *old)
{
int r;
if (new_prio > old_prio)
if (old->type == RTS_DUMMY)
return 1;
if (old_prio > new_prio)
/* 16.4 (6a) - prefer routes with lower type */
r = ((int) old->type) - ((int) new->type);
if (r) return r;
/* 16.4 (6b) - prefer routes with lower type 2 metric */
if (new->type == RTS_OSPF_EXT2)
{
r = ((int) old->metric2) - ((int) new->metric2);
if (r) return r;
}
/* 16.4 (6c) - if not RFC1583, prefer routes with preferred ASBR/next_hop */
if (!po->rfc1583)
{
r = orta_pref(new) - orta_pref(old);
if (r) return r;
}
/* 16.4 (6d) - prefer routes with lower type 1 metric */
r = ((int) old->metric1) - ((int) new->metric1);
if (r) return r;
if (po->ecmp && po->merge_external)
return 0;
/* make it more deterministic */
if (new->rid > old->rid)
/*
* RFC 3101 2.5 (6e) - prioritize Type-7 LSA with P-bit, then Type-5 LSA, then
* LSA with higher router ID. Although this should apply just to functionally
* equivalent LSAs (i.e. ones with the same non-zero forwarding address), we
* use it also to disambiguate otherwise equally preferred nexthops.
*/
if (orta_prefer_lsa(new, old))
return 1;
return 0;
return -1;
}
static inline void
ort_replace(ort *o, const orta *new)
{
memcpy(&o->n, new, sizeof(orta));
}
static void
ort_merge(struct proto_ospf *po, ort *o, const orta *new)
{
orta *old = &o->n;
if (old->nhs != new->nhs)
{
old->nhs = merge_nexthops(po, old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse);
old->nhs_reuse = 1;
}
if (old->rid < new->rid)
old->rid = new->rid;
}
static void
ort_merge_ext(struct proto_ospf *po, ort *o, const orta *new)
{
orta *old = &o->n;
if (old->nhs != new->nhs)
{
old->nhs = merge_nexthops(po, old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse);
old->nhs_reuse = 1;
}
if (old->tag != new->tag)
old->tag = 0;
/*
* Even with multipath, we store only one LSA in orta.en for the purpose of
* NSSA/ext translation. Therefore, we apply procedures from RFC 3101 2.5 (6e)
* to all chosen LSAs for given network, not just to functionally equivalent
* ones (i.e. ones with the same non-zero forwarding address).
*/
if (orta_prefer_lsa(new, old))
{
old->options = new->options;
old->rid = new->rid;
old->oa = new->oa;
old->en = new->en;
}
}
static inline void
ri_install_net(struct proto_ospf *po, ip_addr prefix, int pxlen, orta *new)
ri_install_net(struct proto_ospf *po, ip_addr prefix, int pxlen, const orta *new)
{
ort *old = (ort *) fib_get(&po->rtf, &prefix, pxlen);
if (ri_better(po, new, &old->n))
memcpy(&old->n, new, sizeof(orta));
int cmp = orta_compare(po, new, &old->n);
if (cmp > 0)
ort_replace(old, new);
else if (cmp == 0)
ort_merge(po, old, new);
}
static inline void
ri_install_rt(struct ospf_area *oa, u32 rid, orta *new)
ri_install_rt(struct ospf_area *oa, u32 rid, const orta *new)
{
ip_addr addr = ipa_from_rid(rid);
ort *old = (ort *) fib_get(&oa->rtr, &addr, MAX_PREFIX_LENGTH);
if (ri_better(oa->po, new, &old->n))
memcpy(&old->n, new, sizeof(orta));
int cmp = orta_compare(oa->po, new, &old->n);
if (cmp > 0)
ort_replace(old, new);
else if (cmp == 0)
ort_merge(oa->po, old, new);
}
static inline void
ri_install_asbr(struct proto_ospf *po, ip_addr *addr, orta *new)
ri_install_asbr(struct proto_ospf *po, ip_addr *addr, const orta *new)
{
ort *old = (ort *) fib_get(&po->backbone->rtr, addr, MAX_PREFIX_LENGTH);
if (ri_better_asbr(po, new, &old->n))
memcpy(&old->n, new, sizeof(orta));
if (orta_compare_asbr(po, new, &old->n) > 0)
ort_replace(old, new);
}
static inline void
ri_install_ext(struct proto_ospf *po, ip_addr prefix, int pxlen, orta *new)
ri_install_ext(struct proto_ospf *po, ip_addr prefix, int pxlen, const orta *new)
{
ort *old = (ort *) fib_get(&po->rtf, &prefix, pxlen);
if (ri_better_ext(po, new, &old->n))
memcpy(&old->n, new, sizeof(orta));
int cmp = orta_compare_ext(po, new, &old->n);
if (cmp > 0)
ort_replace(old, new);
else if (cmp == 0)
ort_merge_ext(po, old, new);
}
static inline struct ospf_iface *
......@@ -842,7 +1062,7 @@ ospf_rt_sum_tr(struct ospf_area *oa)
/* 16.3. (5) */
if ((metric < re->n.metric1) ||
((metric == re->n.metric1) && unresolved_vlink(re->n.nhs)))
((metric == re->n.metric1) && unresolved_vlink(re)))
{
/* We want to replace the next-hop even if the metric is equal
to replace a virtual next-hop through vlink with a real one.
......@@ -1129,13 +1349,24 @@ ospf_rt_abr1(struct proto_ospf *po)
struct area_net *anet;
ort *nf, *default_nf;
/* RFC 2328 G.3 - incomplete resolution of virtual next hops - routers */
FIB_WALK(&po->backbone->rtr, nftmp)
{
nf = (ort *) nftmp;
if (nf->n.type && unresolved_vlink(nf))
reset_ri(nf);
}
FIB_WALK_END;
FIB_WALK(&po->rtf, nftmp)
{
nf = (ort *) nftmp;
/* RFC 2328 G.3 - incomplete resolution of virtual next hops */
if (nf->n.type && unresolved_vlink(nf->n.nhs))
/* RFC 2328 G.3 - incomplete resolution of virtual next hops - networks */
if (nf->n.type && unresolved_vlink(nf))
reset_ri(nf);
......@@ -1361,7 +1592,7 @@ static void
ospf_ext_spf(struct proto_ospf *po)
{
ort *nf1, *nf2;
orta nfa;
orta nfa = {};
struct top_hash_entry *en;
struct proto *p = &po->proto;
struct ospf_lsa_ext *le;
......@@ -1369,7 +1600,6 @@ ospf_ext_spf(struct proto_ospf *po)
ip_addr ip, rtid, rt_fwaddr;
u32 br_metric, rt_metric, rt_tag;
struct ospf_area *atmp;
struct mpnh* nhs = NULL;
OSPF_TRACE(D_EVENTS, "Starting routing table calculation for ext routes");
......@@ -1465,7 +1695,7 @@ ospf_ext_spf(struct proto_ospf *po)
if (!rt_fwaddr_valid)
{