bgp.c 33 KB
Newer Older
Martin Mareš's avatar
Martin Mareš committed
1 2 3 4 5 6 7 8
/*
 *	BIRD -- The Border Gateway Protocol
 *
 *	(c) 2000 Martin Mares <mj@ucw.cz>
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

Martin Mareš's avatar
Martin Mareš committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
/**
 * DOC: Border Gateway Protocol
 *
 * The BGP protocol is implemented in three parts: |bgp.c| which takes care of the
 * connection and most of the interface with BIRD core, |packets.c| handling
 * both incoming and outgoing BGP packets and |attrs.c| containing functions for
 * manipulation with BGP attribute lists.
 *
 * As opposed to the other existing routing daemons, BIRD has a sophisticated core
 * architecture which is able to keep all the information needed by BGP in the
 * primary routing table, therefore no complex data structures like a central
 * BGP table are needed. This increases memory footprint of a BGP router with
 * many connections, but not too much and, which is more important, it makes
 * BGP much easier to implement.
 *
Martin Mareš's avatar
Martin Mareš committed
24
 * Each instance of BGP (corresponding to a single BGP peer) is described by a &bgp_proto
Martin Mareš's avatar
Martin Mareš committed
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 * structure to which are attached individual connections represented by &bgp_conn
 * (usually, there exists only one connection, but during BGP session setup, there
 * can be more of them). The connections are handled according to the BGP state machine
 * defined in the RFC with all the timers and all the parameters configurable.
 *
 * In incoming direction, we listen on the connection's socket and each time we receive
 * some input, we pass it to bgp_rx(). It decodes packet headers and the markers and
 * passes complete packets to bgp_rx_packet() which distributes the packet according
 * to its type.
 *
 * In outgoing direction, we gather all the routing updates and sort them to buckets
 * (&bgp_bucket) according to their attributes (we keep a hash table for fast comparison
 * of &rta's and a &fib which helps us to find if we already have another route for
 * the same destination queued for sending, so that we can replace it with the new one
 * immediately instead of sending both updates). There also exists a special bucket holding
 * all the route withdrawals which cannot be queued anywhere else as they don't have any
 * attributes. If we have any packet to send (due to either new routes or the connection
Martin Mareš's avatar
Martin Mareš committed
42
 * tracking code wanting to send an Open, Keepalive or Notification message), we call
Martin Mareš's avatar
Martin Mareš committed
43 44 45 46 47 48 49 50 51 52 53 54 55
 * bgp_schedule_packet() which sets the corresponding bit in a @packets_to_send
 * bit field in &bgp_conn and as soon as the transmit socket buffer becomes empty,
 * we call bgp_fire_tx(). It inspects state of all the packet type bits and calls
 * the corresponding bgp_create_xx() functions, eventually rescheduling the same packet
 * type if we have more data of the same type to send.
 *
 * The processing of attributes consists of two functions: bgp_decode_attrs() for checking
 * of the attribute blocks and translating them to the language of BIRD's extended attributes
 * and bgp_encode_attrs() which does the converse. Both functions are built around a
 * @bgp_attr_table array describing all important characteristics of all known attributes.
 * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
 */

56
#undef LOCAL_DEBUG
Martin Mareš's avatar
Martin Mareš committed
57 58 59 60 61

#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
#include "nest/route.h"
62
#include "nest/locks.h"
63
#include "nest/cli.h"
Martin Mareš's avatar
Martin Mareš committed
64
#include "conf/conf.h"
65
#include "lib/socket.h"
66
#include "lib/resource.h"
Martin Mareš's avatar
Martin Mareš committed
67
#include "lib/string.h"
Martin Mareš's avatar
Martin Mareš committed
68 69 70

#include "bgp.h"

71
/*
 * Resources shared by all BGP instances. bgp_open() creates the linpool and
 * the listening socket on demand and bumps bgp_counter; bgp_close() drops the
 * counter and frees both once it reaches zero.
 */
struct linpool *bgp_linpool;		/* Global temporary pool */
72 73 74
static sock *bgp_listen_sk;		/* Global listening socket */
static int bgp_counter;			/* Number of protocol instances using the listening socket */

Ondřej Zajíček's avatar
Ondřej Zajíček committed
75
/* Forward declarations of static functions that are called before their definitions below */
static void bgp_close(struct bgp_proto *p, int apply_md5);
76
static void bgp_connect(struct bgp_proto *p);
77
static void bgp_active(struct bgp_proto *p);
78
static sock *bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags);
Martin Mareš's avatar
Martin Mareš committed
79

80

Ondřej Zajíček's avatar
Ondřej Zajíček committed
81 82 83 84 85 86 87 88 89 90 91 92 93
/**
 * bgp_open - open a BGP instance
 * @p: BGP instance
 *
 * This function allocates and configures shared BGP resources.
 * Should be called as the last step during initialization
 * (when lock is acquired and neighbor is ready).
 * When error, state changed to PS_DOWN, -1 is returned and caller
 * should return immediately.
 */
static int
bgp_open(struct bgp_proto *p)
{
94
  struct config *cfg = p->cf->c.global;
95 96
  int errcode;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
97 98 99
  bgp_counter++;

  if (!bgp_listen_sk)
100
    bgp_listen_sk = bgp_setup_listen_sk(cfg->listen_bgp_addr, cfg->listen_bgp_port, cfg->listen_bgp_flags);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
101

102 103 104
  if (!bgp_listen_sk)
    {
      bgp_counter--;
105 106
      errcode = BEM_NO_SOCKET;
      goto err;
107 108
    }

Ondřej Zajíček's avatar
Ondřej Zajíček committed
109 110 111 112 113
  if (!bgp_linpool)
    bgp_linpool = lp_new(&root_pool, 4080);

  if (p->cf->password)
    {
114
      int rv = sk_set_md5_auth(bgp_listen_sk, p->cf->remote_ip, p->cf->iface, p->cf->password);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
115 116 117
      if (rv < 0)
	{
	  bgp_close(p, 0);
118 119
	  errcode = BEM_INVALID_MD5;
	  goto err;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
120 121 122 123
	}
    }

  return 0;
124 125 126 127 128 129

err:
  p->p.disabled = 1;
  bgp_store_error(p, NULL, BE_MISC, errcode);
  proto_notify_state(&p->p, PS_DOWN);
  return -1;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
130 131
}

132 133 134 135 136
/*
 * Really start the instance: choose the initial start state according to
 * the configured capability support and, unless the instance is passive,
 * begin with the outgoing connection.
 */
static void
bgp_startup(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "Started");

  if (p->cf->capabilities)
    p->start_state = BSS_CONNECT;
  else
    p->start_state = BSS_CONNECT_NOCAP;

  /* Passive instances only wait for incoming connections */
  if (p->cf->passive)
    return;

  bgp_active(p);
}

/* Hook of the startup timer: the startup delay has elapsed, start the instance. */
static void
bgp_startup_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  bgp_startup(p);
}


static void
bgp_initiate(struct bgp_proto *p)
{
152 153 154 155
  int rv = bgp_open(p);
  if (rv < 0)
    return;

156 157 158 159 160 161 162 163 164
  if (p->startup_delay)
    {
      BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds", p->startup_delay);
      bgp_start_timer(p->startup_timer, p->startup_delay);
    }
  else
    bgp_startup(p);
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
165 166 167 168 169 170 171 172 173
/**
 * bgp_close - close a BGP instance
 * @p: BGP instance
 * @apply_md5: 0 to disable unsetting MD5 auth
 *
 * This function frees and deconfigures shared BGP resources.
 * @apply_md5 is set to 0 when bgp_close is called as a cleanup
 * from failed bgp_open().
 */
174
static void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
175
bgp_close(struct bgp_proto *p, int apply_md5)
176 177 178
{
  ASSERT(bgp_counter);
  bgp_counter--;
179

Ondřej Zajíček's avatar
Ondřej Zajíček committed
180
  if (p->cf->password && apply_md5)
181
    sk_set_md5_auth(bgp_listen_sk, p->cf->remote_ip, p->cf->iface, NULL);
182

183 184 185 186
  if (!bgp_counter)
    {
      rfree(bgp_listen_sk);
      bgp_listen_sk = NULL;
187 188
      rfree(bgp_linpool);
      bgp_linpool = NULL;
189 190 191
    }
}

Martin Mareš's avatar
Martin Mareš committed
192 193 194 195 196 197 198 199 200
/**
 * bgp_start_timer - start a BGP timer
 * @t: timer
 * @value: time to fire (0 to disable the timer)
 *
 * This functions calls tm_start() on @t with time @value and the
 * amount of randomization suggested by the BGP standard. Please use
 * it for all BGP timers.
 */
201
void
202 203
bgp_start_timer(timer *t, int value)
{
204
  if (value)
Martin Mareš's avatar
Martin Mareš committed
205 206 207 208 209
    {
      /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */
      t->randomize = value / 4;
      tm_start(t, value - t->randomize);
    }
210 211 212 213
  else
    tm_stop(t);
}

Martin Mareš's avatar
Martin Mareš committed
214 215 216 217 218 219 220
/**
 * bgp_close_conn - close a BGP connection
 * @conn: connection to close
 *
 * This function takes a connection described by the &bgp_conn structure,
 * closes its socket and frees all resources associated with it.
 */
221 222 223
void
bgp_close_conn(struct bgp_conn *conn)
{
224
  // struct bgp_proto *p = conn->bgp;
225 226 227 228 229 230 231 232 233

  DBG("BGP: Closing connection\n");
  conn->packets_to_send = 0;
  rfree(conn->connect_retry_timer);
  conn->connect_retry_timer = NULL;
  rfree(conn->keepalive_timer);
  conn->keepalive_timer = NULL;
  rfree(conn->hold_timer);
  conn->hold_timer = NULL;
234
  rfree(conn->sk);
235
  conn->sk = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
  rfree(conn->tx_ev);
  conn->tx_ev = NULL;
}


/**
 * bgp_update_startup_delay - update a startup delay
 * @p: BGP instance
 *
 * This function updates a startup delay that is used to postpone next BGP connect.
 * It also handles disable_after_error and might stop BGP instance when error
 * happened and disable_after_error is on.
 *
 * It should be called when BGP protocol error happened.
 */
void
252
bgp_update_startup_delay(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
253 254 255
{
  struct bgp_config *cf = p->cf;

256
  DBG("BGP: Updating startup delay\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
257

258
  if (p->last_proto_error && ((now - p->last_proto_error) >= (int) cf->error_amnesia_time))
259 260
    p->startup_delay = 0;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
261 262 263 264 265 266 267
  p->last_proto_error = now;

  if (cf->disable_after_error)
    {
      p->startup_delay = 0;
      p->p.disabled = 1;
      return;
268
    }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
269 270 271 272

  if (!p->startup_delay)
    p->startup_delay = cf->error_delay_time_min;
  else
273
    p->startup_delay = MIN(2 * p->startup_delay, cf->error_delay_time_max);
274 275
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
276
static void
277
bgp_graceful_close_conn(struct bgp_conn *conn, unsigned subcode)
278
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
279
  switch (conn->state)
280 281
    {
    case BS_IDLE:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
282 283
    case BS_CLOSE:
      return;
284 285
    case BS_CONNECT:
    case BS_ACTIVE:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
286 287
      bgp_conn_enter_idle_state(conn);
      return;
288 289 290
    case BS_OPENSENT:
    case BS_OPENCONFIRM:
    case BS_ESTABLISHED:
291
      bgp_error(conn, 6, subcode, NULL, 0);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
292
      return;
293
    default:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
294
      bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
295 296 297
    }
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
298 299 300 301 302 303
static void
bgp_down(struct bgp_proto *p)
{
  if (p->start_state > BSS_PREPARE)
    bgp_close(p, 1);

304
  BGP_TRACE(D_EVENTS, "Down");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
305 306 307 308 309 310 311 312 313 314
  proto_notify_state(&p->p, PS_DOWN);
}

static void
bgp_decision(void *vp)
{
  struct bgp_proto *p = vp;

  DBG("BGP: Decision start\n");
  if ((p->p.proto_state == PS_START)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
315 316
      && (p->outgoing_conn.state == BS_IDLE)
      && (!p->cf->passive))
317
    bgp_active(p);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
318 319 320 321 322 323 324

  if ((p->p.proto_state == PS_STOP)
      && (p->outgoing_conn.state == BS_IDLE)
      && (p->incoming_conn.state == BS_IDLE))
    bgp_down(p);
}

325 326
void
bgp_stop(struct bgp_proto *p, unsigned subcode)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
327 328
{
  proto_notify_state(&p->p, PS_STOP);
329 330
  bgp_graceful_close_conn(&p->outgoing_conn, subcode);
  bgp_graceful_close_conn(&p->incoming_conn, subcode);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
331 332 333
  ev_schedule(p->event);
}

334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
/*
 * Change the state of a connection. The transition is dumped by
 * mrt_dump_bgp_state_change() first when MD_STATES is set in the
 * protocol's mrtdump mask.
 */
static inline void
bgp_conn_set_state(struct bgp_conn *conn, unsigned new_state)
{
  int dump_wanted = (conn->bgp->p.mrtdump & MD_STATES) != 0;

  if (dump_wanted)
    mrt_dump_bgp_state_change(conn, conn->state, new_state);

  conn->state = new_state;
}

/* Move the connection to the OpenConfirm state. */
void
bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
{
  /* Only the state changes here; bgp_rx_open() does most of the real work. */
  bgp_conn_set_state(conn, BS_OPENCONFIRM);
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
350 351 352 353 354 355 356 357
void
bgp_conn_enter_established_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
 
  BGP_TRACE(D_EVENTS, "BGP session established");
  DBG("BGP: UP!!!\n");

358 359 360 361
  /* For multi-hop BGP sessions */
  if (ipa_zero(p->source_addr))
    p->source_addr = conn->sk->saddr; 

Ondřej Zajíček's avatar
Ondřej Zajíček committed
362 363 364 365
  p->conn = conn;
  p->last_error_class = 0;
  p->last_error_code = 0;
  bgp_attr_init(conn->bgp);
366
  bgp_conn_set_state(conn, BS_ESTABLISHED);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
367 368 369 370 371 372 373 374 375 376
  proto_notify_state(&p->p, PS_UP);
}

/*
 * The established session is gone: detach the connection from the instance
 * and, if the protocol is still up, start stopping it.
 */
static void
bgp_conn_leave_established_state(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "BGP session closed");
  p->conn = NULL;

  if (p->p.proto_state != PS_UP)
    return;

  bgp_stop(p, 0);
}

void
bgp_conn_enter_close_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

386
  bgp_conn_set_state(conn, BS_CLOSE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
387 388 389
  tm_stop(conn->keepalive_timer);
  conn->sk->rx_hook = NULL;

390 391 392
  /* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */
  bgp_start_timer(conn->hold_timer, 10);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
393 394 395 396 397 398 399 400 401 402 403
  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

void
bgp_conn_enter_idle_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

  bgp_close_conn(conn);
404
  bgp_conn_set_state(conn, BS_IDLE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
405 406 407 408 409 410
  ev_schedule(p->event);

  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

411 412 413
static void
bgp_send_open(struct bgp_conn *conn)
{
414 415 416
  conn->start_state = conn->bgp->start_state;
  conn->want_as4_support = conn->bgp->cf->enable_as4 && (conn->start_state != BSS_CONNECT_NOCAP);
  conn->peer_as4_support = 0;	// Default value, possibly changed by receiving capability.
417
  conn->advertised_as = 0;
418

419 420
  DBG("BGP: Sending open\n");
  conn->sk->rx_hook = bgp_rx;
421
  conn->sk->tx_hook = bgp_tx;
422
  tm_stop(conn->connect_retry_timer);
Martin Mareš's avatar
Martin Mareš committed
423
  bgp_schedule_packet(conn, PKT_OPEN);
424
  bgp_conn_set_state(conn, BS_OPENSENT);
425
  bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time);
426 427
}

428 429
/*
 * Socket hook installed as tx_hook by bgp_connect(): log the connect and
 * send our Open message.
 */
static void
bgp_connected(sock *sk)
{
  struct bgp_conn *conn = sk->data;
  struct bgp_proto *p = conn->bgp;	/* Used by BGP_TRACE */

  BGP_TRACE(D_EVENTS, "Connected");

  bgp_send_open(conn);
}

/*
 * Hook of the connect-retry timer. While the protocol is starting, the
 * connection is torn down and a new connect attempt is made; in any other
 * protocol state the connection just becomes idle.
 */
static void
bgp_connect_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: connect_timeout\n");

  if (p->p.proto_state != PS_START)
    {
      bgp_conn_enter_idle_state(conn);
      return;
    }

  bgp_close_conn(conn);
  bgp_connect(p);
}

static void
455
bgp_sock_err(sock *sk, int err)
456 457
{
  struct bgp_conn *conn = sk->data;
Martin Mareš's avatar
Martin Mareš committed
458
  struct bgp_proto *p = conn->bgp;
459

460 461 462 463 464 465 466 467 468
  /*
   * This error hook may be called either asynchronously from main
   * loop, or synchronously from sk_send().  But sk_send() is called
   * only from bgp_tx() and bgp_kick_tx(), which are both called
   * asynchronously from main loop. Moreover, they end if err hook is
   * called. Therefore, we could suppose that it is always called
   * asynchronously.
   */

Ondřej Zajíček's avatar
Ondřej Zajíček committed
469 470
  bgp_store_error(p, conn, BE_SOCKET, err);

471 472 473 474
  if (err)
    BGP_TRACE(D_EVENTS, "Connection lost (%M)", err);
  else
    BGP_TRACE(D_EVENTS, "Connection closed");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
475 476

  bgp_conn_enter_idle_state(conn);
477 478
}

479 480 481 482
/*
 * Hook of the hold timer. In the Close state it serves as a hangup
 * timeout; otherwise it reports error code 4 unless there is still
 * unprocessed input waiting on the socket.
 */
static void
bgp_hold_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: Hold timeout\n");

  /* We are already closing the connection - just do hangup */
  if (conn->state == BS_CLOSE)
    {
      BGP_TRACE(D_EVENTS, "Connection stalled");
      bgp_conn_enter_idle_state(conn);
      return;
    }

  /* Nothing is waiting in the input queue, so the timeout is real */
  if (sk_rx_ready(conn->sk) <= 0)
    {
      bgp_error(conn, 4, 0, NULL, 0);
      return;
    }

  /* We are probably congested and have just not processed the received
     BGP packets in time, so give the connection some more time. */
  bgp_start_timer(conn->hold_timer, 10);
}

/* Hook of the keepalive timer: schedule a Keepalive message for sending. */
static void
bgp_keepalive_timeout(timer *t)
{
  struct bgp_conn *keepalive_conn = t->data;

  DBG("BGP: Keepalive timer\n");

  bgp_schedule_packet(keepalive_conn, PKT_KEEPALIVE);
}

513
static void
514
bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
515 516 517
{
  timer *t;

518
  conn->sk = NULL;
519
  conn->bgp = p;
Martin Mareš's avatar
Martin Mareš committed
520
  conn->packets_to_send = 0;
521 522 523

  t = conn->connect_retry_timer = tm_new(p->p.pool);
  t->hook = bgp_connect_timeout;
524 525
  t->data = conn;
  t = conn->hold_timer = tm_new(p->p.pool);
526
  t->hook = bgp_hold_timeout;
527 528
  t->data = conn;
  t = conn->keepalive_timer = tm_new(p->p.pool);
529
  t->hook = bgp_keepalive_timeout;
530
  t->data = conn;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
531 532 533
  conn->tx_ev = ev_new(p->p.pool);
  conn->tx_ev->hook = bgp_kick_tx;
  conn->tx_ev->data = conn;
534 535
}

536
static void
537
bgp_setup_sk(struct bgp_conn *conn, sock *s)
538 539 540 541 542 543
{
  s->data = conn;
  s->err_hook = bgp_sock_err;
  conn->sk = s;
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
544
static void
545
bgp_active(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
546
{
547
  int delay = MAX(1, p->cf->start_delay_time);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
548 549 550 551
  struct bgp_conn *conn = &p->outgoing_conn;

  BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
  bgp_setup_conn(p, conn);
552
  bgp_conn_set_state(conn, BS_ACTIVE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
553 554 555
  bgp_start_timer(conn->connect_retry_timer, delay);
}

Martin Mareš's avatar
Martin Mareš committed
556 557 558 559 560 561 562 563
/**
 * bgp_connect - initiate an outgoing connection
 * @p: BGP instance
 *
 * The bgp_connect() function creates a new &bgp_conn and initiates
 * a TCP connection to the peer. The rest of connection setup is governed
 * by the BGP state machine as described in the standard.
 */
564 565 566 567
static void
bgp_connect(struct bgp_proto *p)	/* Enter Connect state and start establishing connection */
{
  sock *s;
568
  struct bgp_conn *conn = &p->outgoing_conn;
569
  int hops = p->cf->multihop ? : 1;
570 571 572 573

  DBG("BGP: Connecting\n");
  s = sk_new(p->p.pool);
  s->type = SK_TCP_ACTIVE;
574
  s->saddr = p->source_addr;
575
  s->daddr = p->cf->remote_ip;
576
  s->iface = p->neigh ? p->neigh->iface : NULL;
577
  s->dport = BGP_PORT;
578
  s->ttl = p->cf->ttl_security ? 255 : hops;
579 580 581 582 583
  s->rbsize = BGP_RX_BUFFER_SIZE;
  s->tbsize = BGP_TX_BUFFER_SIZE;
  s->tos = IP_PREC_INTERNET_CONTROL;
  s->password = p->cf->password;
  s->tx_hook = bgp_connected;
584 585
  BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J", s->daddr, p->cf->iface,
	    s->saddr, ipa_has_link_scope(s->saddr) ? s->iface : NULL);
586
  bgp_setup_conn(p, conn);
587
  bgp_setup_sk(conn, s);
588
  bgp_conn_set_state(conn, BS_CONNECT);
589 590 591 592 593 594 595 596 597 598 599 600

  if (sk_open(s) < 0)
    {
      bgp_sock_err(s, 0);
      return;
    }

  /* Set minimal receive TTL if needed */
  if (p->cf->ttl_security)
  {
    DBG("Setting minimum received TTL to %d", 256 - hops);
    if (sk_set_min_ttl(s, 256 - hops) < 0)
601
    {
602
      log(L_ERR "TTL security configuration failed, closing session");
603
      bgp_sock_err(s, 0);
604 605
      return;
    }
606 607
  }

608 609 610 611
  DBG("BGP: Waiting for connect success\n");
  bgp_start_timer(conn->connect_retry_timer, p->cf->connect_retry_time);
}

Martin Mareš's avatar
Martin Mareš committed
612 613 614 615 616 617 618 619 620 621 622 623
/**
 * bgp_incoming_connection - handle an incoming connection
 * @sk: TCP socket
 * @dummy: unused
 *
 * This function serves as a socket hook for accepting of new BGP
 * connections. It searches a BGP instance corresponding to the peer
 * which has connected and if such an instance exists, it creates a
 * &bgp_conn structure, attaches it to the instance and either sends
 * an Open message or (if there already is an active connection) it
 * closes the new connection by sending a Notification message.
 */
624
static int
Martin Mareš's avatar
Martin Mareš committed
625
bgp_incoming_connection(sock *sk, int dummy UNUSED)
626
{
627
  struct proto_config *pc;
628

629
  DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);
630 631 632 633
  WALK_LIST(pc, config->protos)
    if (pc->protocol == &proto_bgp && pc->proto)
      {
	struct bgp_proto *p = (struct bgp_proto *) pc->proto;
634 635
	if (ipa_equal(p->cf->remote_ip, sk->daddr) &&
	    (!ipa_has_link_scope(sk->daddr) || (p->cf->iface == sk->iface)))
636
	  {
637 638 639 640
	    /* We are in proper state and there is no other incoming connection */
	    int acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
	      (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);

641 642 643
	    BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
		      sk->daddr, ipa_has_link_scope(sk->daddr) ? sk->iface : NULL,
		      sk->dport, acc ? "accepted" : "rejected");
644 645 646 647

	    if (!acc)
	      goto err;

648 649 650 651 652 653 654 655 656 657 658 659 660 661
	    int hops = p->cf->multihop ? : 1;
	    if (p->cf->ttl_security)
	    {
	      /* TTL security support */
	      if ((sk_set_ttl(sk, 255) < 0) ||
		  (sk_set_min_ttl(sk, 256 - hops) < 0))
	      {
		log(L_ERR "TTL security configuration failed, closing session");
		goto err;
	      }
	    }
	    else
	      sk_set_ttl(sk, hops);

662
	    bgp_setup_conn(p, &p->incoming_conn);
663
	    bgp_setup_sk(&p->incoming_conn, sk);
664 665
	    bgp_send_open(&p->incoming_conn);
	    return 0;
666 667
	  }
      }
668

669 670
  log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
      sk->daddr, ipa_has_link_scope(sk->daddr) ? sk->iface : NULL, sk->dport);
671
 err:
672 673 674 675
  rfree(sk);
  return 0;
}

676
static void
677
bgp_listen_sock_err(sock *sk UNUSED, int err)
678 679 680 681
{
  if (err == ECONNABORTED)
    log(L_WARN "BGP: Incoming connection aborted");
  else
682
    log(L_ERR "BGP: Error on listening socket: %M", err);
683 684
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
685
static sock *
686
bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags)
687
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
688
  sock *s = sk_new(&root_pool);
689
  DBG("BGP: Creating listening socket\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
690
  s->type = SK_TCP_PASSIVE;
691
  s->ttl = 255;
692 693
  s->saddr = addr;
  s->sport = port ? port : BGP_PORT;
694
  s->flags = flags ? 0 : SKF_V6ONLY;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
695 696 697 698
  s->tos = IP_PREC_INTERNET_CONTROL;
  s->rbsize = BGP_RX_BUFFER_SIZE;
  s->tbsize = BGP_TX_BUFFER_SIZE;
  s->rx_hook = bgp_incoming_connection;
699
  s->err_hook = bgp_listen_sock_err;
700 701

  if (sk_open(s) < 0)
702
    {
703
      log(L_ERR "BGP: Unable to open listening socket");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
704 705
      rfree(s);
      return NULL;
706
    }
707 708

  return s;
709 710 711 712 713
}

static void
bgp_start_neighbor(struct bgp_proto *p)
{
714 715
  /* Called only for single-hop BGP sessions */

716
  /* Remove this ? */
717 718
  if (ipa_zero(p->source_addr))
    p->source_addr = p->neigh->iface->addr->ip; 
719

720 721 722
#ifdef IPV6
  {
    struct ifa *a;
723
    p->local_link = IPA_NONE;
724 725 726 727 728 729
    WALK_LIST(a, p->neigh->iface->addrs)
      if (a->scope == SCOPE_LINK)
        {
	  p->local_link = a->ip;
	  break;
	}
730 731 732 733

    if (! ipa_nonzero(p->local_link))
      log(L_WARN "%s: Missing link local address on interface %s", p->p.name,  p->neigh->iface->name);

734 735 736
    DBG("BGP: Selected link-level address %I\n", p->local_link);
  }
#endif
Ondřej Zajíček's avatar
Ondřej Zajíček committed
737

738
  bgp_initiate(p);
739 740 741 742 743 744 745
}

/*
 * Neighbor notification hook. Only sticky neighbor entries are of
 * interest: when the neighbor appears while we are still preparing,
 * the instance is started; when it disappears while the protocol is
 * starting or up, the instance is stopped.
 */
static void
bgp_neigh_notify(neighbor *n)
{
  struct bgp_proto *p = (struct bgp_proto *) n->proto;

  if (! (n->flags & NEF_STICKY))
    return;

  if (n->scope > 0)
    {
      /* Neighbor is reachable */
      if ((p->p.proto_state == PS_START) && (p->start_state == BSS_PREPARE))
        {
          BGP_TRACE(D_EVENTS, "Neighbor found");
          bgp_start_neighbor(p);
        }

      return;
    }

  /* Neighbor is gone */
  if ((p->p.proto_state != PS_START) && (p->p.proto_state != PS_UP))
    return;

  BGP_TRACE(D_EVENTS, "Neighbor lost");
  bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
  bgp_stop(p, 0);
}

768 769 770 771 772 773 774 775 776 777 778
/*
 * Reload hook: ask the peer to resend its routes by scheduling a
 * Route Refresh message. Returns 1 when the request was scheduled and 0
 * when there is no established connection or the peer does not support
 * route refresh.
 */
static int
bgp_reload_routes(struct proto *P)
{
  struct bgp_proto *p = (struct bgp_proto *) P;

  if (p->conn && p->conn->peer_refresh_support)
    {
      bgp_schedule_packet(p->conn, PKT_ROUTE_REFRESH);
      return 1;
    }

  return 0;
}

779 780 781 782 783 784
static void
bgp_start_locked(struct object_lock *lock)
{
  struct bgp_proto *p = lock->data;
  struct bgp_config *cf = p->cf;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
785 786 787
  if (p->p.proto_state != PS_START)
    {
      DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
788
      return;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
789 790
    }

791
  DBG("BGP: Got lock\n");
792

793
  if (cf->multihop)
794
    {
795 796 797
      /* Multi-hop sessions do not use neighbor entries */
      bgp_initiate(p);
      return;
798 799
    }

800
  p->neigh = neigh_find2(&p->p, &cf->remote_ip, cf->iface, NEF_STICKY);
801
  if (!p->neigh || (p->neigh->scope == SCOPE_HOST))
802
    {
803
      log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
804
      /* As we do not start yet, we can just disable protocol */
805
      p->p.disabled = 1;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
806
      bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
807
      proto_notify_state(&p->p, PS_DOWN);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
808
      return;
809
    }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
810
  
811
  if (p->neigh->scope > 0)
812 813
    bgp_start_neighbor(p);
  else
814
    BGP_TRACE(D_EVENTS, "Waiting for %I%J to become my neighbor", cf->remote_ip, cf->iface);
815 816
}

Martin Mareš's avatar
Martin Mareš committed
817 818 819
static int
bgp_start(struct proto *P)
{
820 821 822
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct object_lock *lock;

823
  DBG("BGP: Startup.\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
824
  p->start_state = BSS_PREPARE;
825 826
  p->outgoing_conn.state = BS_IDLE;
  p->incoming_conn.state = BS_IDLE;
827
  p->neigh = NULL;
828

829 830
  rt_lock_table(p->igp_table);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
831 832 833
  p->event = ev_new(p->p.pool);
  p->event->hook = bgp_decision;
  p->event->data = p;
834

835 836 837 838
  p->startup_timer = tm_new(p->p.pool);
  p->startup_timer->hook = bgp_startup_timeout;
  p->startup_timer->data = p;

839 840 841 842
  p->local_id = proto_get_router_id(P->cf);
  if (p->rr_client)
    p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;

843 844 845
  p->remote_id = 0;
  p->source_addr = p->cf->source_addr;

846 847 848 849 850 851 852 853
  /*
   *  Before attempting to create the connection, we need to lock the
   *  port, so that are sure we're the only instance attempting to talk
   *  with that neighbor.
   */

  lock = p->lock = olock_new(P->pool);
  lock->addr = p->cf->remote_ip;
854
  lock->iface = p->cf->iface;
855 856 857 858 859
  lock->type = OBJLOCK_TCP;
  lock->port = BGP_PORT;
  lock->hook = bgp_start_locked;
  lock->data = p;
  olock_acquire(lock);
860

861
  return PS_START;
Martin Mareš's avatar
Martin Mareš committed
862 863
}

864 865
/* Defined outside of this file; bgp_shutdown() reads it when a route limit was hit */
extern int proto_restart;

Martin Mareš's avatar
Martin Mareš committed
866 867 868
static int
bgp_shutdown(struct proto *P)
{
869
  struct bgp_proto *p = (struct bgp_proto *) P;
870
  unsigned subcode = 0;
871

Martin Mareš's avatar
Martin Mareš committed
872
  BGP_TRACE(D_EVENTS, "Shutdown requested");
873

874
  switch (P->down_code)
875
    {
876 877 878 879 880 881 882 883 884 885
    case PDC_CF_REMOVE:
    case PDC_CF_DISABLE:
      subcode = 3; // Errcode 6, 3 - peer de-configured
      break;

    case PDC_CF_RESTART:
      subcode = 6; // Errcode 6, 6 - other configuration change
      break;

    case PDC_CMD_DISABLE:
886
    case PDC_CMD_SHUTDOWN:
887 888 889 890 891 892 893
      subcode = 2; // Errcode 6, 2 - administrative shutdown
      break;

    case PDC_CMD_RESTART:
      subcode = 4; // Errcode 6, 4 - administrative reset
      break;

894
    case PDC_RX_LIMIT_HIT:
895 896
    case PDC_IN_LIMIT_HIT:
      subcode = 1; // Errcode 6, 1 - max number of prefixes reached
897
      /* log message for compatibility */
898
      log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name);
899 900 901 902
      goto limit;

    case PDC_OUT_LIMIT_HIT:
      subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown
903

904
    limit:
905
      bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED);
906
      if (proto_restart)
907
	bgp_update_startup_delay(p);
908
      else
909 910
	p->startup_delay = 0;
      goto done;
911 912
    }

913
  bgp_store_error(p, NULL, BE_MAN_DOWN, 0);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
914
  p->startup_delay = 0;
915

916 917
 done:
  bgp_stop(p, subcode);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
918
  return p->p.proto_state;
Martin Mareš's avatar
Martin Mareš committed
919 920
}

921 922 923 924 925 926 927 928 929 930 931 932 933
/*
 * bgp_cleanup - cleanup hook of the BGP protocol instance
 * @P: generic protocol instance (really a &bgp_proto)
 *
 * Drops the reference this instance holds on its IGP table.
 */
static void
bgp_cleanup(struct proto *P)
{
  rt_unlock_table(((struct bgp_proto *) P)->igp_table);
}

/*
 * get_igp_table - pick the IGP table for a BGP configuration
 * @cf: BGP protocol configuration
 *
 * Returns the table of the explicitly configured IGP table if there is one,
 * otherwise the table the protocol itself is attached to.
 */
static rtable *
get_igp_table(struct bgp_config *cf)
{
  if (cf->igp_table)
    return cf->igp_table->table;

  return cf->c.table->table;
}

934 935 936 937 938 939 940
static struct proto *
bgp_init(struct proto_config *C)
{
  struct bgp_config *c = (struct bgp_config *) C;
  struct proto *P = proto_new(C, sizeof(struct bgp_proto));
  struct bgp_proto *p = (struct bgp_proto *) P;

941
  P->accept_ra_types = c->secondary ? RA_ACCEPTED : RA_OPTIMAL;
942 943 944 945
  P->rt_notify = bgp_rt_notify;
  P->rte_better = bgp_rte_better;
  P->import_control = bgp_import_control;
  P->neigh_notify = bgp_neigh_notify;
946
  P->reload_routes = bgp_reload_routes;
947 948 949 950

  if (c->deterministic_med)
    P->rte_recalculate = bgp_rte_recalculate;

951 952 953 954
  p->cf = c;
  p->local_as = c->local_as;
  p->remote_as = c->remote_as;
  p->is_internal = (c->local_as == c->remote_as);
955 956
  p->rs_client = c->rs_client;
  p->rr_client = c->rr_client;
957
  p->igp_table = get_igp_table(c);
958

959 960 961
  return P;
}

962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986

void
bgp_check_config(struct bgp_config *c)
{
  int internal = (c->local_as == c->remote_as);

  /* Do not check templates at all */
  if (c->c.class == SYM_TEMPLATE)
    return;

  if (!c->local_as)
    cf_error("Local AS number must be set");

  if (!c->remote_as)
    cf_error("Neighbor must be configured");

  if (!(c->capabilities && c->enable_as4) && (c->remote_as > 0xFFFF))
    cf_error("Neighbor AS number out of range (AS4 not available)");

  if (!internal && c->rr_client)
    cf_error("Only internal neighbor can be RR client");

  if (internal && c->rs_client)
    cf_error("Only external neighbor can be RS client");

987

988 989 990
  if (c->multihop && (c->gw_mode == GW_DIRECT))
    cf_error("Multihop BGP cannot use direct gateway mode");

991 992 993 994 995
  if (c->multihop && (ipa_has_link_scope(c->remote_ip) || 
		      ipa_has_link_scope(c->source_addr)))
    cf_error("Multihop BGP cannot be used with link-local addresses");


996 997 998 999 1000 1001 1002
  /* Different default based on rs_client */
  if (!c->missing_lladdr)
    c->missing_lladdr = c->rs_client ? MLL_IGNORE : MLL_SELF;

  /* Different default for gw_mode */
  if (!c->gw_mode)
    c->gw_mode = (c->multihop || internal) ? GW_RECURSIVE : GW_DIRECT;
1003 1004 1005 1006

  /* Disable after error incompatible with restart limit action */
  if (c->c.in_limit && (c->c.in_limit->action == PLA_RESTART) && c->disable_after_error)
    c->c.in_limit->action = PLA_DISABLE;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1007

1008 1009 1010 1011 1012 1013 1014 1015 1016

  if ((c->gw_mode == GW_RECURSIVE) && c->c.table->sorted)
    cf_error("BGP in recursive mode prohibits sorted table");

  if (c->deterministic_med && c->c.table->sorted)
    cf_error("BGP with deterministic MED prohibits sorted table");

  if (c->secondary && !c->c.table->sorted)
    cf_error("BGP with secondary option requires sorted table");
1017 1018 1019 1020 1021 1022 1023 1024 1025
}

static int
bgp_reconfigure(struct proto *P, struct proto_config *C)
{
  struct bgp_config *new = (struct bgp_config *) C;
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct bgp_config *old = p->cf;

1026 1027 1028
  if (proto_get_router_id(C) != p->local_id)
    return 0;

1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051
  int same = !memcmp(((byte *) old) + sizeof(struct proto_config),
		     ((byte *) new) + sizeof(struct proto_config),
		     // password item is last and must be checked separately
		     OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config))
    && ((!old->password && !new->password)
	|| (old->password && new->password && !strcmp(old->password, new->password)))
    && (get_igp_table(old) == get_igp_table(new));

  /* We should update our copy of configuration ptr as old configuration will be freed */
  if (same)
    p->cf = new;

  return same;
}

/*
 * bgp_copy_config - copy hook for BGP configurations
 * @dest: configuration to copy into
 * @src: configuration to copy from
 *
 * Hands the whole &bgp_config size to proto_copy_rest(); nothing BGP-specific
 * is duplicated here, so this is just a shallow copy.
 */
static void
bgp_copy_config(struct proto_config *dest, struct proto_config *src)
{
  unsigned cf_size = sizeof(struct bgp_config);

  proto_copy_rest(dest, src, cf_size);
}


Martin Mareš's avatar
Martin Mareš committed
1052 1053 1054 1055
/**
 * bgp_error - report a protocol error
 * @c: connection
 * @code: error code (according to the RFC)
Martin Mareš's avatar
Martin Mareš committed
1056
 * @subcode: error sub-code
Martin Mareš's avatar
Martin Mareš committed
1057 1058 1059 1060
 * @data: data to be passed in the Notification message
 * @len: length of the data
 *
 * bgp_error() sends a notification packet to tell the other side that a protocol
Martin Mareš's avatar
Martin Mareš committed
1061
 * error has occurred (including the data considered erroneous if possible) and
Martin Mareš's avatar
Martin Mareš committed
1062 1063
 * closes the connection.
 */
1064
void
1065
bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int len)
1066
{
1067 1068
  struct bgp_proto *p = c->bgp;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1069
  if (c->state == BS_CLOSE)
1070
    return;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1071

1072 1073
  bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, (len > 0) ? len : -len);
  bgp_store_error(p, c, BE_BGP_TX, (code << 16) | subcode);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1074 1075
  bgp_conn_enter_close_state(c);

1076 1077
  c->notify_code = code;
  c->notify_subcode = subcode;
1078 1079
  c->notify_data = data;
  c->notify_size = (len > 0) ? len : 0;
1080
  bgp_schedule_packet(c, PKT_NOTIFICATION);
1081 1082 1083 1084 1085 1086

  if (code != 6)
    {
      bgp_update_startup_delay(p);
      bgp_stop(p, 0);
    }
1087 1088
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1089