Commit 1e37e35c authored by Ondřej Zajíček
Browse files

BGP: Support for MPLS labels and VPN SAFI

Basic support for SAFI 4 and 128 (MPLS labeled IP and VPN) for IPv4 and
IPv6. Should work for route reflector, but does not properly handle
originating routes with next hop self.

Based on patches from Jan Matejka.
parent ead7b8f4
......@@ -35,6 +35,8 @@
#define NB_MPLS (1 << NET_MPLS)
/* Combined masks: each one ORs together the 4 and 6 variants of one network type */
#define NB_IP (NB_IP4 | NB_IP6)
#define NB_VPN (NB_VPN4 | NB_VPN6)
#define NB_FLOW (NB_FLOW4 | NB_FLOW6)
/* Mask with all 32 bits set */
#define NB_ANY 0xffffffff
......@@ -481,6 +483,12 @@ static inline void net_normalize_ip4(net_addr_ip4 *n)
/*
 * Normalize an IPv6 network address in place: the prefix is ANDed with the
 * mask built from its own prefix length.
 */
static inline void
net_normalize_ip6(net_addr_ip6 *n)
{
  n->prefix = ip6_and(n->prefix, ip6_mkmask(n->pxlen));
}
/*
 * Normalize a VPNv4 network address in place by running the plain IPv4
 * normalizer on it.
 * NOTE(review): relies on net_addr_vpn4 starting with the same fields as
 * net_addr_ip4 -- confirm against the struct definitions.
 */
static inline void
net_normalize_vpn4(net_addr_vpn4 *n)
{
  net_addr_ip4 *ip4_view = (net_addr_ip4 *) n;

  net_normalize_ip4(ip4_view);
}
/*
 * Normalize a VPNv6 network address in place by running the plain IPv6
 * normalizer on it.
 * NOTE(review): relies on net_addr_vpn6 starting with the same fields as
 * net_addr_ip6 -- confirm against the struct definitions.
 */
static inline void
net_normalize_vpn6(net_addr_vpn6 *n)
{
  net_addr_ip6 *ip6_view = (net_addr_ip6 *) n;

  net_normalize_ip6(ip6_view);
}
void net_normalize(net_addr *N);
......
......@@ -28,6 +28,13 @@ get_u16(const void *p)
return ntohs(x);
}
/*
 * Read a 24-bit unsigned value from three consecutive bytes at P,
 * most significant byte first. No alignment of P is required.
 */
static inline u32
get_u24(const void *P)
{
  const byte *b = P;
  u32 hi = b[0];
  u32 mid = b[1];
  u32 lo = b[2];

  return (hi << 16) | (mid << 8) | lo;
}
static inline u32
get_u32(const void *p)
{
......@@ -52,6 +59,13 @@ put_u16(void *p, u16 x)
memcpy(p, &x, 2);
}
/*
 * Store the low 24 bits of x into three consecutive bytes at p,
 * most significant byte first. The top 8 bits of x are ignored and
 * no alignment of p is required.
 */
static inline void
put_u24(void *p, u32 x)
{
  unsigned char *out = p;

  out[0] = (x >> 16) & 0xff;
  out[1] = (x >> 8) & 0xff;
  out[2] = x & 0xff;
}
static inline void
put_u32(void *p, u32 x)
{
......
......@@ -551,7 +551,15 @@ static inline rta * rta_cow(rta *r, linpool *lp) { return rta_is_cached(r) ? rta
void rta_dump(rta *);
void rta_dump_all(void);
void rta_show(struct cli *, rta *, ea_list *);
void rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr gw, ip_addr ll, mpls_label_stack *mls);
struct hostentry * rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep);
void rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls);
/*
 * Convenience wrapper: obtain the hostentry for (gw, ll) in table tab on
 * behalf of dependent table dep, then apply it, together with the optional
 * MPLS label stack mls, to route attributes a.
 */
static inline void
rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr gw, ip_addr ll, mpls_label_stack *mls)
{
  struct hostentry *he = rt_get_hostentry(tab, gw, ll, dep);

  rta_apply_hostentry(a, he, mls);
}
/*
* rta_set_recursive_next_hop() acquires hostentry from hostcache and fills
......
......@@ -1766,7 +1766,7 @@ rta_next_hop_outdated(rta *a)
(!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh));
}
static inline void
void
rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls)
{
a->hostentry = he;
......@@ -1794,7 +1794,7 @@ no_nexthop:
struct nexthop *nhp = NULL, *nhr = NULL;
int skip_nexthop = 0;
for (struct nexthop *nh = &(he->src->nh); nh; nh = nh->next)
{
if (skip_nexthop)
......@@ -2475,7 +2475,7 @@ rt_update_hostcache(rtable *tab)
tab->hcu_scheduled = 0;
}
static struct hostentry *
struct hostentry *
rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep)
{
struct hostentry *he;
......@@ -2489,17 +2489,11 @@ rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep)
if (ipa_equal(he->addr, a) && (he->tab == dep))
return he;
he = hc_new_hostentry(hc, a, ll, dep, k);
he = hc_new_hostentry(hc, a, ipa_zero(ll) ? a : ll, dep, k);
rt_update_hostentry(tab, he);
return he;
}
/*
 * Out-of-line form of rta_set_recursive_next_hop(): looks up the hostentry
 * for gw in tab (for dependent table dep) and applies it with label stack
 * mls to a. When ll tests as zero via ipa_zero(), gw is passed in its place.
 */
void
rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr gw, ip_addr ll, mpls_label_stack *mls)
{
rta_apply_hostentry(a, rt_get_hostentry(tab, gw, ipa_zero(ll) ? gw : ll, dep), mls);
}
/*
* CLI commands
......
......@@ -629,6 +629,75 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla
bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad);
}
/*
 * Export hook for the internal MPLS label stack attribute.
 *
 * The attribute payload is read as an array of u32 labels (length / 4
 * entries). The route is withdrawn (via WITHDRAW) when the export state is
 * not MPLS-enabled, the stack is empty, the labels would not fit into the
 * NLRI length field, or any label value exceeds 20 bits.
 */
static void
bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a)
{
  net_addr *net = s->route->net->n.addr;
  u32 *stack = (u32 *) a->u.ptr->data;
  uint count = a->u.ptr->length / 4;

  /* Perhaps we should just ignore it? */
  if (!s->mpls)
    WITHDRAW("Unexpected MPLS stack");

  /* Empty MPLS stack is not allowed */
  if (count == 0)
    WITHDRAW("Malformed MPLS stack - empty");

  /*
   * This is ugly, but we must ensure that labels fit into NLRI field:
   * 24 bits per label, 64 extra bits for VPN networks, plus the prefix
   * length itself must not exceed 255.
   */
  uint vpn_bits = net_is_vpn(net) ? 64 : 0;
  if ((24*count + vpn_bits + net_pxlen(net)) > 255)
    WITHDRAW("Malformed MPLS stack - too many labels (%u)", count);

  for (uint idx = 0; idx < count; idx++)
  {
    /* Label values are limited to 20 bits */
    if (stack[idx] > 0xfffff)
      WITHDRAW("Malformed MPLS stack - invalid label (%u)", stack[idx]);

    /* TODO: Check for special-purpose label values? */
  }
}
/*
 * Encode hook for the internal MPLS label stack attribute.
 *
 * MPLS labels are encoded as a part of the NLRI in MP_REACH_NLRI attribute,
 * so nothing is written to buf here; the attribute data is only remembered
 * in the write state and encoded later by AFI-specific hooks.
 *
 * Always returns 0.
 */
static int
bgp_encode_mpls_label_stack(struct bgp_write_state *s, eattr *a, byte *buf UNUSED, uint size UNUSED)
{
  s->mpls_labels = a->u.ptr;

  return 0;
}
/*
 * Decode hook for the internal MPLS label stack attribute. All of code,
 * flags, data, len and to are unused: the received attribute is never
 * stored, the hook only invokes DISCARD with a fixed message.
 * NOTE(review): the message says "#0" although this hook is registered for
 * BA_MPLS_LABEL_STACK -- confirm whether the attribute code should be shown.
 */
static void
bgp_decode_mpls_label_stack(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data UNUSED, uint len UNUSED, ea_list **to UNUSED)
{
DISCARD("Discarding received attribute #0");
}
/*
 * Format hook for the internal MPLS label stack attribute.
 *
 * Writes the labels into buf as decimal numbers separated by '/'
 * (e.g. "100/200"). When fewer than 20 bytes of space remain before a
 * label is written, "..." is written instead and formatting stops.
 * An empty stack yields an empty string.
 */
static void
bgp_format_mpls_label_stack(eattr *a, byte *buf, uint size)
{
  u32 *stack = (u32 *) a->u.ptr->data;
  uint count = a->u.ptr->length / 4;
  char *out = buf;
  uint idx = 0;

  while (idx < count)
  {
    /* Not enough room left for another label: mark truncation and stop */
    if (size < 20)
    {
      bsprintf(out, "...");
      return;
    }

    uint written = bsprintf(out, "%d/", stack[idx]);
    ADVANCE(out, size, written);
    idx++;
  }

  /* Clear last slash or terminate empty string */
  if (count)
    out[-1] = 0;
  else
    out[0] = 0;
}
static inline void
bgp_decode_unknown(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to)
{
......@@ -763,6 +832,14 @@ static const struct bgp_attr_desc bgp_attr_table[] = {
.encode = bgp_encode_u32s,
.decode = bgp_decode_large_community,
},
[BA_MPLS_LABEL_STACK] = {
.name = "mpls_label_stack",
.type = EAF_TYPE_INT_SET,
.export = bgp_export_mpls_label_stack,
.encode = bgp_encode_mpls_label_stack,
.decode = bgp_decode_mpls_label_stack,
.format = bgp_format_mpls_label_stack,
},
};
static inline int
......@@ -849,7 +926,6 @@ bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs)
return NULL;
return new;
}
......@@ -1340,7 +1416,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at
{
struct proto *SRC = e->attrs->src->proto;
struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (void *) SRC : NULL;
struct bgp_export_state s = { .proto = p, .channel =c, .pool = pool, .src = src, .route = e };
struct bgp_export_state s = { .proto = p, .channel = c, .pool = pool, .src = src, .route = e, .mpls = c->desc->mpls };
ea_list *attrs = attrs0;
eattr *a;
adata *ad;
......@@ -1453,13 +1529,13 @@ bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old, ea
if (new)
{
attrs = bgp_update_attrs(p, c, new, attrs, bgp_linpool);
attrs = bgp_update_attrs(p, c, new, attrs, bgp_linpool2);
/* If attributes are invalid, we fail back to withdraw */
buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c);
path = new->attrs->src->global_id;
lp_flush(bgp_linpool);
lp_flush(bgp_linpool2);
}
else
{
......
......@@ -86,6 +86,7 @@
struct linpool *bgp_linpool; /* Global temporary pool */
struct linpool *bgp_linpool2; /* Global temporary pool for bgp_rt_notify() */
static list bgp_sockets; /* Global list of listening sockets */
......@@ -151,7 +152,10 @@ bgp_open(struct bgp_proto *p)
add_tail(&bgp_sockets, &bs->n);
if (!bgp_linpool)
bgp_linpool = lp_new(proto_pool, 4080);
{
bgp_linpool = lp_new(proto_pool, 4080);
bgp_linpool2 = lp_new(proto_pool, 4080);
}
return 0;
......@@ -187,6 +191,9 @@ bgp_close(struct bgp_proto *p)
rfree(bgp_linpool);
bgp_linpool = NULL;
rfree(bgp_linpool2);
bgp_linpool2 = NULL;
}
static inline int
......@@ -1970,7 +1977,7 @@ struct protocol proto_bgp = {
.template = "bgp%d",
.attr_class = EAP_BGP,
.preference = DEF_PREF_BGP,
.channel_mask = NB_IP | NB_FLOW4 | NB_FLOW6,
.channel_mask = NB_IP | NB_VPN | NB_FLOW,
.proto_size = sizeof(struct bgp_proto),
.config_size = sizeof(struct bgp_config),
.postconfig = bgp_postconfig,
......
......@@ -31,6 +31,8 @@ struct eattr;
#define BGP_SAFI_UNICAST 1
#define BGP_SAFI_MULTICAST 2
/* SAFI 4 is MPLS labeled IP, SAFI 128 is MPLS labeled VPN */
#define BGP_SAFI_MPLS 4
#define BGP_SAFI_MPLS_VPN 128
#define BGP_SAFI_FLOW 133
/* Internal AF codes */
......@@ -43,6 +45,10 @@ struct eattr;
#define BGP_AF_IPV6 BGP_AF( BGP_AFI_IPV6, BGP_SAFI_UNICAST )
#define BGP_AF_IPV4_MC BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MULTICAST )
#define BGP_AF_IPV6_MC BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MULTICAST )
/* MPLS-labeled families: plain labeled IP (SAFI_MPLS) and labeled VPN (SAFI_MPLS_VPN) */
#define BGP_AF_IPV4_MPLS BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MPLS )
#define BGP_AF_IPV6_MPLS BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MPLS )
/* The VPN families reuse the IPv4/IPv6 AFI and differ only in the SAFI */
#define BGP_AF_VPN4_MPLS BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MPLS_VPN )
#define BGP_AF_VPN6_MPLS BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MPLS_VPN )
#define BGP_AF_FLOW4 BGP_AF( BGP_AFI_IPV4, BGP_SAFI_FLOW )
#define BGP_AF_FLOW6 BGP_AF( BGP_AFI_IPV6, BGP_SAFI_FLOW )
......@@ -55,6 +61,7 @@ struct bgp_bucket;
struct bgp_af_desc {
u32 afi;
u32 net;
int mpls;
const char *name;
uint (*encode_nlri)(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size);
void (*decode_nlri)(struct bgp_parse_state *s, byte *pos, uint len, rta *a);
......@@ -308,6 +315,7 @@ struct bgp_export_state {
struct bgp_proto *src;
rte *route;
int mpls;
u32 attrs_seen[1];
uint err_withdraw;
......@@ -320,8 +328,10 @@ struct bgp_write_state {
int as4_session;
int add_path;
int mpls;
eattr *mp_next_hop;
adata *mpls_labels;
};
struct bgp_parse_state {
......@@ -331,14 +341,13 @@ struct bgp_parse_state {
int as4_session;
int add_path;
int mpls;
u32 attrs_seen[256/32];
u32 mp_reach_af;
u32 mp_unreach_af;
mpls_label_stack mls;
uint attr_len;
uint ip_reach_len;
uint ip_unreach_len;
......@@ -359,6 +368,9 @@ struct bgp_parse_state {
uint err_subcode;
jmp_buf err_jmpbuf;
struct hostentry *hostentry;
adata *mpls_labels;
/* Cached state for bgp_rte_update() */
u32 last_id;
struct rte_src *last_src;
......@@ -392,6 +404,7 @@ bgp_parse_error(struct bgp_parse_state *s, uint subcode)
}
extern struct linpool *bgp_linpool;
extern struct linpool *bgp_linpool2;
void bgp_start_timer(struct timer *t, int value);
......@@ -528,6 +541,9 @@ void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to);
#define BA_AS4_AGGREGATOR 0x12 /* RFC 6793 */
#define BA_LARGE_COMMUNITY 0x20 /* RFC 8092 */
/* Bird's private internal BGP attributes */
#define BA_MPLS_LABEL_STACK 0xfe /* MPLS label stack transfer attribute */
/* BGP connection states */
#define BS_IDLE 0
......
......@@ -139,6 +139,10 @@ bgp_afi:
| IPV6 { $$ = BGP_AF_IPV6; }
| IPV4 MULTICAST { $$ = BGP_AF_IPV4_MC; }
| IPV6 MULTICAST { $$ = BGP_AF_IPV6_MC; }
| IPV4 MPLS { $$ = BGP_AF_IPV4_MPLS; }
| IPV6 MPLS { $$ = BGP_AF_IPV6_MPLS; }
| VPN4 MPLS { $$ = BGP_AF_VPN4_MPLS; }
| VPN6 MPLS { $$ = BGP_AF_VPN6_MPLS; }
| FLOW4 { $$ = BGP_AF_FLOW4; }
| FLOW6 { $$ = BGP_AF_FLOW6; }
;
......
......@@ -32,6 +32,13 @@
#define BGP_RR_BEGIN 1
#define BGP_RR_END 2
/*
 * Buffer space an NLRI encoder requires before it emits another prefix.
 * NOTE(review): presumably 4 bytes path ID + 1 byte prefix length + up to
 * 32 bytes of labels and prefix data (255 bits rounded up) -- confirm.
 */
#define BGP_NLRI_MAX (4 + 1 + 32)
#define BGP_MPLS_BOS 1 /* Bottom-of-stack bit (lowest bit of the 24-bit label field) */
#define BGP_MPLS_MAX 10 /* Max number of labels n such that 24*n <= 255 */
#define BGP_MPLS_NULL 3 /* Implicit NULL label */
#define BGP_MPLS_MAGIC 0x800000 /* Magic withdraw label value, RFC 3107 3 */
static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
......@@ -282,8 +289,8 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
/* Create capability list in buffer */
/*
* Note that max length is ~ 20+14*af_count. With max 6 channels that is
* 104. Option limit is 253 and buffer size is 4096, so we cannot overflow
* Note that max length is ~ 20+14*af_count. With max 10 channels that is
* 160. Option limit is 253 and buffer size is 4096, so we cannot overflow
* unless we add new capabilities or more AFs.
*/
......@@ -722,6 +729,7 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
#define BAD_AFI "Unexpected AF <%u/%u> in UPDATE"
#define BAD_NEXT_HOP "Invalid NEXT_HOP attribute"
#define NO_NEXT_HOP "Missing NEXT_HOP attribute"
#define NO_LABEL_STACK "Missing MPLS stack"
static void
......@@ -744,19 +752,56 @@ bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
WITHDRAW(BAD_NEXT_HOP);
a->dest = RTD_UNICAST;
a->nh = (struct nexthop){ .gw = nbr->addr, .iface = nbr->iface };
a->hostentry = NULL;
a->igp_metric = 0;
a->nh.gw = nbr->addr;
a->nh.iface = nbr->iface;
}
else /* GW_RECURSIVE */
{
if (ipa_zero(gw))
WITHDRAW(BAD_NEXT_HOP);
rta_set_recursive_next_hop(c->c.table, a, c->igp_table, gw, ll, &(s->mls));
s->hostentry = rt_get_hostentry(c->igp_table, gw, ll, c->c.table);
if (!s->mpls)
rta_apply_hostentry(a, s->hostentry, NULL);
/* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */
}
}
/*
 * Apply a received MPLS label stack to the next hop of route attributes a.
 *
 * @s:      parse state; supplies the channel configuration and, in recursive
 *          gateway mode, the hostentry stored in s->hostentry
 * @a:      route attributes to update
 * @labels: array of lnum label values
 * @lnum:   number of labels
 *
 * If lnum exceeds MPLS_MAX_LABEL_STACK, the problem is reported and the
 * route is made unreachable (destination, hostentry and next hop are reset).
 * A stack consisting of a single implicit NULL label is handled as an empty
 * stack. In direct gateway mode the labels are copied straight into the
 * next hop; otherwise they are applied together with the hostentry.
 */
static void
bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
{
  if (lnum > MPLS_MAX_LABEL_STACK)
  {
    /* Use %u so the label count is actually printed in the message */
    REPORT("Too many MPLS labels (%u)", lnum);

    a->dest = RTD_UNREACHABLE;
    a->hostentry = NULL;
    a->nh = (struct nexthop) { };
    return;
  }

  /* Handle implicit NULL as empty MPLS stack */
  if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL))
    lnum = 0;

  if (s->channel->cf->gw_mode == GW_DIRECT)
  {
    /* Next hop is already set directly; only attach the labels to it */
    a->nh.labels = lnum;
    memcpy(a->nh.label, labels, 4*lnum);
  }
  else /* GW_RECURSIVE */
  {
    /* Labels are 4 bytes each; lnum is within MPLS_MAX_LABEL_STACK here */
    mpls_label_stack ms;

    ms.len = lnum;
    memcpy(ms.stack, labels, 4*lnum);
    rta_apply_hostentry(a, s->hostentry, &ms);
  }
}
static inline int
bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
{
......@@ -810,13 +855,26 @@ bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
{
if (bgp_use_gateway(s))
{
ip_addr nh[1] = { s->route->attrs->nh.gw };
rta *ra = s->route->attrs;
ip_addr nh[1] = { ra->nh.gw };
bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
if (s->mpls)
{
u32 implicit_null = BGP_MPLS_NULL;
u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null;
uint lnum = ra->nh.labels ? ra->nh.labels : 1;
bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4);
}
}
else
{
ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
/* TODO: Use local MPLS assigned label */
if (s->mpls)
bgp_unset_attr(to, s->pool, BA_MPLS_LABEL_STACK);
}
}
......@@ -834,6 +892,10 @@ bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
WITHDRAW(BAD_NEXT_HOP);
/* Just check that the MPLS label stack attribute is present */
if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK))
WITHDRAW(NO_LABEL_STACK);
}
static uint
......@@ -905,14 +967,76 @@ bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
rte_update2(&s->channel->c, n, e, s->last_src);
}
/*
 * Write an MPLS label stack into an NLRI being encoded.
 *
 * @mpls:  attribute data holding u32 labels, or NULL; with NULL a single
 *         label of value 0 is written instead
 * @pos:   write position, advanced by 3 bytes per label
 * @size:  remaining buffer space, reduced by 3 bytes per label
 * @pxlen: prefix length byte of the NLRI, increased by 24 bits per label
 *
 * Every label is stored as a 24-bit field with the label value shifted left
 * by 4 bits; the last field gets the bottom-of-stack bit set.
 */
static void
bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen)
{
  u32 zero_label = 0;
  u32 *stack = &zero_label;
  uint count = 1;

  if (mpls)
  {
    stack = (u32 *) mpls->data;
    count = mpls->length / 4;
  }

  for (uint idx = 0; idx < count; idx++)
  {
    put_u24(*pos, stack[idx] << 4);
    ADVANCE(*pos, *size, 3);
  }

  /* Add bottom-of-stack flag */
  (*pos)[-1] |= BGP_MPLS_BOS;

  *pxlen += 24 * count;
}
/*
 * Decode the MPLS label stack at the start of a labeled NLRI.
 *
 * @s:     parse state; s->mpls_labels caches the attribute data between calls
 * @pos:   read position, advanced by 3 bytes per label
 * @len:   remaining input length, reduced by 3 bytes per label
 * @pxlen: remaining prefix length in bits, reduced by 24 per label
 * @a:     route attributes to update, or NULL (the withdraw case, in which
 *         the labels are only skipped)
 *
 * Labels are read until one with the bottom-of-stack bit set is found. When
 * a is given, the labels are stored in the BA_MPLS_LABEL_STACK attribute and
 * applied to the next hop, and the cached rta is dropped.
 */
static void
bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a)
{
u32 labels[BGP_MPLS_MAX], label;
uint lnum = 0;
/*
 * NOTE(review): lnum is not checked against BGP_MPLS_MAX inside the loop; the
 * bound holds only if *pxlen <= 255 on entry (24 bits are consumed per label)
 * and bgp_parse_error() does not return -- confirm both for all callers.
 */
do {
/* Every label takes 24 bits of the announced prefix length */
if (*pxlen < 24)
bgp_parse_error(s, 1);
label = get_u24(*pos);
/* Store the field without its low 4 bits; bit 0 is tested as BOS below */
labels[lnum++] = label >> 4;
ADVANCE(*pos, *len, 3);
*pxlen -= 24;
/* Withdraw: Magic label stack value 0x800000 according to RFC 3107, section 3, last paragraph */
if (!a && !s->err_withdraw && (lnum == 1) && (label == BGP_MPLS_MAGIC))
break;
}
while (!(label & BGP_MPLS_BOS));
/* Without route attributes there is nothing to update; labels were only skipped */
if (!a)
return;
/* Attach MPLS attribute unless we already have one */
if (!s->mpls_labels)
{
/* Allocated once with room for BGP_MPLS_MAX labels, then reused below */
s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX);
bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels);
}
/* Overwrite data in the attribute */
s->mpls_labels->length = 4*lnum;
memcpy(s->mpls_labels->data, labels, 4*lnum);
/* Update next hop entry in rta */
bgp_apply_mpls_labels(s, a, labels, lnum);
/* Attributes were changed, invalidate cached entry */
rta_free(s->cached_rta);
s->cached_rta = NULL;
return;
}
static uint
bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
{
byte *pos = buf;
while (!EMPTY_LIST(buck->prefixes) && (size >= (5 + sizeof(ip4_addr))))
while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
{
struct bgp_prefix *px = HEAD(buck->prefixes);
struct net_addr_ip4 *net = (void *) px->net;
......@@ -924,14 +1048,17 @@ bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu
ADVANCE(pos, size, 4);
}
ip4_addr a = ip4_hton(net->prefix);
uint b = (net->pxlen + 7) / 8;
/* Encode prefix length */
*pos = net->pxlen;
ADVANCE(pos, size, 1);
/* Encode MPLS labels */
if (s->mpls)
bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
/* Encode prefix body */
ip4_addr a = ip4_hton(net->prefix);
uint b = (net->pxlen + 7) / 8;
memcpy(pos, &a, b);
ADVANCE(pos, size, b);
......@@ -961,17 +1088,21 @@ bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
/* Decode prefix length */
uint l = *pos;
uint b = (l + 7) / 8;
ADVANCE(pos, len, 1);
if (len < ((l + 7) / 8))
bgp_parse_error(s, 1);
/* Decode MPLS labels */
if (s->mpls)
bgp_decode_mpls_labels(s, &pos, &len, &l, a);
if (l > IP4_MAX_PREFIX_LENGTH)
bgp_parse_error(s, 10);
if (len < b)
bgp_parse_error(s, 1);
/* Decode prefix body */
ip4_addr addr = IP4_NONE;
uint b = (l + 7) / 8;
memcpy(&addr, pos, b);
ADVANCE(pos, len, b);
......@@ -1016,7 +1147,7 @@ bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu
{
byte