]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/bgp.c
Merge commit 'origin/master' into socket
[thirdparty/bird.git] / proto / bgp / bgp.c
1 /*
2 * BIRD -- The Border Gateway Protocol
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 *
6 * Can be freely distributed and used under the terms of the GNU GPL.
7 */
8
9 /**
10 * DOC: Border Gateway Protocol
11 *
12 * The BGP protocol is implemented in three parts: |bgp.c| which takes care of the
13 * connection and most of the interface with BIRD core, |packets.c| handling
14 * both incoming and outgoing BGP packets and |attrs.c| containing functions for
15 * manipulation with BGP attribute lists.
16 *
17 * As opposed to the other existing routing daemons, BIRD has a sophisticated core
18 * architecture which is able to keep all the information needed by BGP in the
19 * primary routing table, therefore no complex data structures like a central
20 * BGP table are needed. This increases memory footprint of a BGP router with
21 * many connections, but not too much and, which is more important, it makes
22 * BGP much easier to implement.
23 *
24 * Each instance of BGP (corresponding to a single BGP peer) is described by a &bgp_proto
25 * structure to which are attached individual connections represented by &bgp_connection
26 * (usually, there exists only one connection, but during BGP session setup, there
27 * can be more of them). The connections are handled according to the BGP state machine
28 * defined in the RFC with all the timers and all the parameters configurable.
29 *
30 * In incoming direction, we listen on the connection's socket and each time we receive
31 * some input, we pass it to bgp_rx(). It decodes packet headers and the markers and
32 * passes complete packets to bgp_rx_packet() which distributes the packet according
33 * to its type.
34 *
35 * In outgoing direction, we gather all the routing updates and sort them to buckets
36 * (&bgp_bucket) according to their attributes (we keep a hash table for fast comparison
37 * of &rta's and a &fib which helps us to find if we already have another route for
38 * the same destination queued for sending, so that we can replace it with the new one
39 * immediately instead of sending both updates). There also exists a special bucket holding
40 * all the route withdrawals which cannot be queued anywhere else as they don't have any
41 * attributes. If we have any packet to send (due to either new routes or the connection
42 * tracking code wanting to send an Open, Keepalive or Notification message), we call
43 * bgp_schedule_packet() which sets the corresponding bit in a @packets_to_send
44 * bit field in &bgp_conn and as soon as the transmit socket buffer becomes empty,
45 * we call bgp_fire_tx(). It inspects state of all the packet type bits and calls
46 * the corresponding bgp_create_xx() functions, eventually rescheduling the same packet
47 * type if we have more data of the same type to send.
48 *
49 * The processing of attributes consists of two functions: bgp_decode_attrs() for checking
50 * of the attribute blocks and translating them to the language of BIRD's extended attributes
51 * and bgp_encode_attrs() which does the converse. Both functions are built around a
52 * @bgp_attr_table array describing all important characteristics of all known attributes.
53 * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
54 */
55
56 #undef LOCAL_DEBUG
57
58 #include "nest/bird.h"
59 #include "nest/iface.h"
60 #include "nest/protocol.h"
61 #include "nest/route.h"
62 #include "nest/cli.h"
63 #include "nest/locks.h"
64 #include "conf/conf.h"
65 #include "lib/socket.h"
66 #include "lib/resource.h"
67 #include "lib/string.h"
68
69 #include "bgp.h"
70
71 struct linpool *bgp_linpool; /* Global temporary pool */
72 static sock *bgp_listen_sk; /* Global listening socket */
73 static int bgp_counter; /* Number of protocol instances using the listening socket */
74
75 static void bgp_close(struct bgp_proto *p, int apply_md5);
76 static void bgp_connect(struct bgp_proto *p);
77 static void bgp_active(struct bgp_proto *p);
78 static sock *bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags);
79 static void bgp_update_bfd(struct bgp_proto *p, int use_bfd);
80
81
82 /**
83 * bgp_open - open a BGP instance
84 * @p: BGP instance
85 *
86 * This function allocates and configures shared BGP resources.
87 * Should be called as the last step during initialization
88 * (when lock is acquired and neighbor is ready).
89 * When error, state changed to PS_DOWN, -1 is returned and caller
90 * should return immediately.
91 */
92 static int
93 bgp_open(struct bgp_proto *p)
94 {
95 struct config *cfg = p->cf->c.global;
96 int errcode;
97
98 bgp_counter++;
99
100 if (!bgp_listen_sk)
101 bgp_listen_sk = bgp_setup_listen_sk(cfg->listen_bgp_addr, cfg->listen_bgp_port, cfg->listen_bgp_flags);
102
103 if (!bgp_listen_sk)
104 {
105 bgp_counter--;
106 errcode = BEM_NO_SOCKET;
107 goto err;
108 }
109
110 if (!bgp_linpool)
111 bgp_linpool = lp_new(&root_pool, 4080);
112
113 if (p->cf->password)
114 {
115 int rv = sk_set_md5_auth(bgp_listen_sk, p->cf->remote_ip, p->cf->iface, p->cf->password);
116 if (rv < 0)
117 {
118 bgp_close(p, 0);
119 errcode = BEM_INVALID_MD5;
120 goto err;
121 }
122 }
123
124 return 0;
125
126 err:
127 p->p.disabled = 1;
128 bgp_store_error(p, NULL, BE_MISC, errcode);
129 proto_notify_state(&p->p, PS_DOWN);
130 return -1;
131 }
132
133 static void
134 bgp_startup(struct bgp_proto *p)
135 {
136 BGP_TRACE(D_EVENTS, "Started");
137 p->start_state = p->cf->capabilities ? BSS_CONNECT : BSS_CONNECT_NOCAP;
138
139 if (!p->cf->passive)
140 bgp_active(p);
141 }
142
143 static void
144 bgp_startup_timeout(timer *t)
145 {
146 bgp_startup(t->data);
147 }
148
149
150 static void
151 bgp_initiate(struct bgp_proto *p)
152 {
153 int rv = bgp_open(p);
154 if (rv < 0)
155 return;
156
157 if (p->cf->bfd)
158 bgp_update_bfd(p, p->cf->bfd);
159
160 if (p->startup_delay)
161 {
162 p->start_state = BSS_DELAY;
163 BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds", p->startup_delay);
164 bgp_start_timer(p->startup_timer, p->startup_delay);
165 }
166 else
167 bgp_startup(p);
168 }
169
170 /**
171 * bgp_close - close a BGP instance
172 * @p: BGP instance
173 * @apply_md5: 0 to disable unsetting MD5 auth
174 *
175 * This function frees and deconfigures shared BGP resources.
176 * @apply_md5 is set to 0 when bgp_close is called as a cleanup
177 * from failed bgp_open().
178 */
179 static void
180 bgp_close(struct bgp_proto *p, int apply_md5)
181 {
182 ASSERT(bgp_counter);
183 bgp_counter--;
184
185 if (p->cf->password && apply_md5)
186 sk_set_md5_auth(bgp_listen_sk, p->cf->remote_ip, p->cf->iface, NULL);
187
188 if (!bgp_counter)
189 {
190 rfree(bgp_listen_sk);
191 bgp_listen_sk = NULL;
192 rfree(bgp_linpool);
193 bgp_linpool = NULL;
194 }
195 }
196
197 /**
198 * bgp_start_timer - start a BGP timer
199 * @t: timer
200 * @value: time to fire (0 to disable the timer)
201 *
202 * This functions calls tm_start() on @t with time @value and the
203 * amount of randomization suggested by the BGP standard. Please use
204 * it for all BGP timers.
205 */
206 void
207 bgp_start_timer(timer *t, int value)
208 {
209 if (value)
210 {
211 /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */
212 t->randomize = value / 4;
213 tm_start(t, value - t->randomize);
214 }
215 else
216 tm_stop(t);
217 }
218
219 /**
220 * bgp_close_conn - close a BGP connection
221 * @conn: connection to close
222 *
223 * This function takes a connection described by the &bgp_conn structure,
224 * closes its socket and frees all resources associated with it.
225 */
226 void
227 bgp_close_conn(struct bgp_conn *conn)
228 {
229 // struct bgp_proto *p = conn->bgp;
230
231 DBG("BGP: Closing connection\n");
232 conn->packets_to_send = 0;
233 rfree(conn->connect_retry_timer);
234 conn->connect_retry_timer = NULL;
235 rfree(conn->keepalive_timer);
236 conn->keepalive_timer = NULL;
237 rfree(conn->hold_timer);
238 conn->hold_timer = NULL;
239 rfree(conn->sk);
240 conn->sk = NULL;
241 rfree(conn->tx_ev);
242 conn->tx_ev = NULL;
243 }
244
245
246 /**
247 * bgp_update_startup_delay - update a startup delay
248 * @p: BGP instance
249 *
250 * This function updates a startup delay that is used to postpone next BGP connect.
251 * It also handles disable_after_error and might stop BGP instance when error
252 * happened and disable_after_error is on.
253 *
254 * It should be called when BGP protocol error happened.
255 */
256 void
257 bgp_update_startup_delay(struct bgp_proto *p)
258 {
259 struct bgp_config *cf = p->cf;
260
261 DBG("BGP: Updating startup delay\n");
262
263 if (p->last_proto_error && ((now - p->last_proto_error) >= (int) cf->error_amnesia_time))
264 p->startup_delay = 0;
265
266 p->last_proto_error = now;
267
268 if (cf->disable_after_error)
269 {
270 p->startup_delay = 0;
271 p->p.disabled = 1;
272 return;
273 }
274
275 if (!p->startup_delay)
276 p->startup_delay = cf->error_delay_time_min;
277 else
278 p->startup_delay = MIN(2 * p->startup_delay, cf->error_delay_time_max);
279 }
280
281 static void
282 bgp_graceful_close_conn(struct bgp_conn *conn, unsigned subcode)
283 {
284 switch (conn->state)
285 {
286 case BS_IDLE:
287 case BS_CLOSE:
288 return;
289 case BS_CONNECT:
290 case BS_ACTIVE:
291 bgp_conn_enter_idle_state(conn);
292 return;
293 case BS_OPENSENT:
294 case BS_OPENCONFIRM:
295 case BS_ESTABLISHED:
296 bgp_error(conn, 6, subcode, NULL, 0);
297 return;
298 default:
299 bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
300 }
301 }
302
303 static void
304 bgp_down(struct bgp_proto *p)
305 {
306 if (p->start_state > BSS_PREPARE)
307 bgp_close(p, 1);
308
309 BGP_TRACE(D_EVENTS, "Down");
310 proto_notify_state(&p->p, PS_DOWN);
311 }
312
313 static void
314 bgp_decision(void *vp)
315 {
316 struct bgp_proto *p = vp;
317
318 DBG("BGP: Decision start\n");
319 if ((p->p.proto_state == PS_START)
320 && (p->outgoing_conn.state == BS_IDLE)
321 && (!p->cf->passive))
322 bgp_active(p);
323
324 if ((p->p.proto_state == PS_STOP)
325 && (p->outgoing_conn.state == BS_IDLE)
326 && (p->incoming_conn.state == BS_IDLE))
327 bgp_down(p);
328 }
329
330 void
331 bgp_stop(struct bgp_proto *p, unsigned subcode)
332 {
333 proto_notify_state(&p->p, PS_STOP);
334 bgp_graceful_close_conn(&p->outgoing_conn, subcode);
335 bgp_graceful_close_conn(&p->incoming_conn, subcode);
336 ev_schedule(p->event);
337 }
338
339 static inline void
340 bgp_conn_set_state(struct bgp_conn *conn, unsigned new_state)
341 {
342 if (conn->bgp->p.mrtdump & MD_STATES)
343 mrt_dump_bgp_state_change(conn, conn->state, new_state);
344
345 conn->state = new_state;
346 }
347
348 void
349 bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
350 {
351 /* Really, most of the work is done in bgp_rx_open(). */
352 bgp_conn_set_state(conn, BS_OPENCONFIRM);
353 }
354
355 void
356 bgp_conn_enter_established_state(struct bgp_conn *conn)
357 {
358 struct bgp_proto *p = conn->bgp;
359
360 BGP_TRACE(D_EVENTS, "BGP session established");
361 DBG("BGP: UP!!!\n");
362
363 /* For multi-hop BGP sessions */
364 if (ipa_zero(p->source_addr))
365 p->source_addr = conn->sk->saddr;
366
367 p->conn = conn;
368 p->last_error_class = 0;
369 p->last_error_code = 0;
370 bgp_init_bucket_table(p);
371 bgp_init_prefix_table(p, 8);
372
373 bgp_conn_set_state(conn, BS_ESTABLISHED);
374 proto_notify_state(&p->p, PS_UP);
375 }
376
377 static void
378 bgp_conn_leave_established_state(struct bgp_proto *p)
379 {
380 BGP_TRACE(D_EVENTS, "BGP session closed");
381 p->conn = NULL;
382
383 if (p->p.proto_state == PS_UP)
384 bgp_stop(p, 0);
385 }
386
387 void
388 bgp_conn_enter_close_state(struct bgp_conn *conn)
389 {
390 struct bgp_proto *p = conn->bgp;
391 int os = conn->state;
392
393 bgp_conn_set_state(conn, BS_CLOSE);
394 tm_stop(conn->keepalive_timer);
395 conn->sk->rx_hook = NULL;
396
397 /* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */
398 bgp_start_timer(conn->hold_timer, 10);
399
400 if (os == BS_ESTABLISHED)
401 bgp_conn_leave_established_state(p);
402 }
403
404 void
405 bgp_conn_enter_idle_state(struct bgp_conn *conn)
406 {
407 struct bgp_proto *p = conn->bgp;
408 int os = conn->state;
409
410 bgp_close_conn(conn);
411 bgp_conn_set_state(conn, BS_IDLE);
412 ev_schedule(p->event);
413
414 if (os == BS_ESTABLISHED)
415 bgp_conn_leave_established_state(p);
416 }
417
418 static void
419 bgp_send_open(struct bgp_conn *conn)
420 {
421 conn->start_state = conn->bgp->start_state;
422
423 // Default values, possibly changed by receiving capabilities.
424 conn->peer_refresh_support = 0;
425 conn->peer_as4_support = 0;
426 conn->peer_add_path = 0;
427 conn->advertised_as = 0;
428
429 DBG("BGP: Sending open\n");
430 conn->sk->rx_hook = bgp_rx;
431 conn->sk->tx_hook = bgp_tx;
432 tm_stop(conn->connect_retry_timer);
433 bgp_schedule_packet(conn, PKT_OPEN);
434 bgp_conn_set_state(conn, BS_OPENSENT);
435 bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time);
436 }
437
438 static void
439 bgp_connected(sock *sk)
440 {
441 struct bgp_conn *conn = sk->data;
442 struct bgp_proto *p = conn->bgp;
443
444 BGP_TRACE(D_EVENTS, "Connected");
445 bgp_send_open(conn);
446 }
447
448 static void
449 bgp_connect_timeout(timer *t)
450 {
451 struct bgp_conn *conn = t->data;
452 struct bgp_proto *p = conn->bgp;
453
454 DBG("BGP: connect_timeout\n");
455 if (p->p.proto_state == PS_START)
456 {
457 bgp_close_conn(conn);
458 bgp_connect(p);
459 }
460 else
461 bgp_conn_enter_idle_state(conn);
462 }
463
464 static void
465 bgp_sock_err(sock *sk, int err)
466 {
467 struct bgp_conn *conn = sk->data;
468 struct bgp_proto *p = conn->bgp;
469
470 /*
471 * This error hook may be called either asynchronously from main
472 * loop, or synchronously from sk_send(). But sk_send() is called
473 * only from bgp_tx() and bgp_kick_tx(), which are both called
474 * asynchronously from main loop. Moreover, they end if err hook is
475 * called. Therefore, we could suppose that it is always called
476 * asynchronously.
477 */
478
479 bgp_store_error(p, conn, BE_SOCKET, err);
480
481 if (err)
482 BGP_TRACE(D_EVENTS, "Connection lost (%M)", err);
483 else
484 BGP_TRACE(D_EVENTS, "Connection closed");
485
486 bgp_conn_enter_idle_state(conn);
487 }
488
489 static void
490 bgp_hold_timeout(timer *t)
491 {
492 struct bgp_conn *conn = t->data;
493 struct bgp_proto *p = conn->bgp;
494
495 DBG("BGP: Hold timeout\n");
496
497 /* We are already closing the connection - just do hangup */
498 if (conn->state == BS_CLOSE)
499 {
500 BGP_TRACE(D_EVENTS, "Connection stalled");
501 bgp_conn_enter_idle_state(conn);
502 return;
503 }
504
505 /* If there is something in input queue, we are probably congested
506 and perhaps just not processed BGP packets in time. */
507
508 if (sk_rx_ready(conn->sk) > 0)
509 bgp_start_timer(conn->hold_timer, 10);
510 else
511 bgp_error(conn, 4, 0, NULL, 0);
512 }
513
514 static void
515 bgp_keepalive_timeout(timer *t)
516 {
517 struct bgp_conn *conn = t->data;
518
519 DBG("BGP: Keepalive timer\n");
520 bgp_schedule_packet(conn, PKT_KEEPALIVE);
521 }
522
523 static void
524 bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
525 {
526 timer *t;
527
528 conn->sk = NULL;
529 conn->bgp = p;
530 conn->packets_to_send = 0;
531
532 t = conn->connect_retry_timer = tm_new(p->p.pool);
533 t->hook = bgp_connect_timeout;
534 t->data = conn;
535 t = conn->hold_timer = tm_new(p->p.pool);
536 t->hook = bgp_hold_timeout;
537 t->data = conn;
538 t = conn->keepalive_timer = tm_new(p->p.pool);
539 t->hook = bgp_keepalive_timeout;
540 t->data = conn;
541 conn->tx_ev = ev_new(p->p.pool);
542 conn->tx_ev->hook = bgp_kick_tx;
543 conn->tx_ev->data = conn;
544 }
545
546 static void
547 bgp_setup_sk(struct bgp_conn *conn, sock *s)
548 {
549 s->data = conn;
550 s->err_hook = bgp_sock_err;
551 conn->sk = s;
552 }
553
554 static void
555 bgp_active(struct bgp_proto *p)
556 {
557 int delay = MAX(1, p->cf->start_delay_time);
558 struct bgp_conn *conn = &p->outgoing_conn;
559
560 BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
561 bgp_setup_conn(p, conn);
562 bgp_conn_set_state(conn, BS_ACTIVE);
563 bgp_start_timer(conn->connect_retry_timer, delay);
564 }
565
566 /**
567 * bgp_connect - initiate an outgoing connection
568 * @p: BGP instance
569 *
570 * The bgp_connect() function creates a new &bgp_conn and initiates
571 * a TCP connection to the peer. The rest of connection setup is governed
572 * by the BGP state machine as described in the standard.
573 */
574 static void
575 bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing connection */
576 {
577 sock *s;
578 struct bgp_conn *conn = &p->outgoing_conn;
579 int hops = p->cf->multihop ? : 1;
580
581 DBG("BGP: Connecting\n");
582 s = sk_new(p->p.pool);
583 s->type = SK_TCP_ACTIVE;
584 s->saddr = p->source_addr;
585 s->daddr = p->cf->remote_ip;
586 s->iface = p->neigh ? p->neigh->iface : NULL;
587 s->dport = BGP_PORT;
588 s->ttl = p->cf->ttl_security ? 255 : hops;
589 s->rbsize = BGP_RX_BUFFER_SIZE;
590 s->tbsize = BGP_TX_BUFFER_SIZE;
591 s->tos = IP_PREC_INTERNET_CONTROL;
592 s->password = p->cf->password;
593 s->tx_hook = bgp_connected;
594 BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J", s->daddr, p->cf->iface,
595 s->saddr, ipa_has_link_scope(s->saddr) ? s->iface : NULL);
596 bgp_setup_conn(p, conn);
597 bgp_setup_sk(conn, s);
598 bgp_conn_set_state(conn, BS_CONNECT);
599
600 if (sk_open(s) < 0)
601 {
602 bgp_sock_err(s, 0);
603 return;
604 }
605
606 /* Set minimal receive TTL if needed */
607 if (p->cf->ttl_security)
608 {
609 DBG("Setting minimum received TTL to %d", 256 - hops);
610 if (sk_set_min_ttl(s, 256 - hops) < 0)
611 {
612 log(L_ERR "TTL security configuration failed, closing session");
613 bgp_sock_err(s, 0);
614 return;
615 }
616 }
617
618 DBG("BGP: Waiting for connect success\n");
619 bgp_start_timer(conn->connect_retry_timer, p->cf->connect_retry_time);
620 }
621
622 /**
623 * bgp_incoming_connection - handle an incoming connection
624 * @sk: TCP socket
625 * @dummy: unused
626 *
627 * This function serves as a socket hook for accepting of new BGP
628 * connections. It searches a BGP instance corresponding to the peer
629 * which has connected and if such an instance exists, it creates a
630 * &bgp_conn structure, attaches it to the instance and either sends
631 * an Open message or (if there already is an active connection) it
632 * closes the new connection by sending a Notification message.
633 */
634 static int
635 bgp_incoming_connection(sock *sk, int dummy UNUSED)
636 {
637 struct proto_config *pc;
638
639 DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);
640 WALK_LIST(pc, config->protos)
641 if (pc->protocol == &proto_bgp && pc->proto)
642 {
643 struct bgp_proto *p = (struct bgp_proto *) pc->proto;
644 if (ipa_equal(p->cf->remote_ip, sk->daddr) &&
645 (!ipa_has_link_scope(sk->daddr) || (p->cf->iface == sk->iface)))
646 {
647 /* We are in proper state and there is no other incoming connection */
648 int acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
649 (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
650
651 BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
652 sk->daddr, ipa_has_link_scope(sk->daddr) ? sk->iface : NULL,
653 sk->dport, acc ? "accepted" : "rejected");
654
655 if (!acc)
656 goto err;
657
658 int hops = p->cf->multihop ? : 1;
659 if (p->cf->ttl_security)
660 {
661 /* TTL security support */
662 if ((sk_set_ttl(sk, 255) < 0) ||
663 (sk_set_min_ttl(sk, 256 - hops) < 0))
664 {
665 log(L_ERR "TTL security configuration failed, closing session");
666 goto err;
667 }
668 }
669 else
670 sk_set_ttl(sk, hops);
671
672 bgp_setup_conn(p, &p->incoming_conn);
673 bgp_setup_sk(&p->incoming_conn, sk);
674 bgp_send_open(&p->incoming_conn);
675 return 0;
676 }
677 }
678
679 log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
680 sk->daddr, ipa_has_link_scope(sk->daddr) ? sk->iface : NULL, sk->dport);
681 err:
682 rfree(sk);
683 return 0;
684 }
685
686 static void
687 bgp_listen_sock_err(sock *sk UNUSED, int err)
688 {
689 if (err == ECONNABORTED)
690 log(L_WARN "BGP: Incoming connection aborted");
691 else
692 log(L_ERR "BGP: Error on listening socket: %M", err);
693 }
694
695 static sock *
696 bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags)
697 {
698 sock *s = sk_new(&root_pool);
699 DBG("BGP: Creating listening socket\n");
700 s->type = SK_TCP_PASSIVE;
701 s->ttl = 255;
702 s->saddr = addr;
703 s->sport = port ? port : BGP_PORT;
704 s->flags = flags ? 0 : SKF_V6ONLY;
705 s->tos = IP_PREC_INTERNET_CONTROL;
706 s->rbsize = BGP_RX_BUFFER_SIZE;
707 s->tbsize = BGP_TX_BUFFER_SIZE;
708 s->rx_hook = bgp_incoming_connection;
709 s->err_hook = bgp_listen_sock_err;
710
711 if (sk_open(s) < 0)
712 {
713 log(L_ERR "BGP: Unable to open listening socket");
714 rfree(s);
715 return NULL;
716 }
717
718 return s;
719 }
720
721 static void
722 bgp_start_neighbor(struct bgp_proto *p)
723 {
724 /* Called only for single-hop BGP sessions */
725
726 if (ipa_zero(p->source_addr))
727 p->source_addr = p->neigh->ifa->ip;
728
729 #ifdef IPV6
730 {
731 struct ifa *a;
732 p->local_link = IPA_NONE;
733 WALK_LIST(a, p->neigh->iface->addrs)
734 if (a->scope == SCOPE_LINK)
735 {
736 p->local_link = a->ip;
737 break;
738 }
739
740 if (! ipa_nonzero(p->local_link))
741 log(L_WARN "%s: Missing link local address on interface %s", p->p.name, p->neigh->iface->name);
742
743 DBG("BGP: Selected link-level address %I\n", p->local_link);
744 }
745 #endif
746
747 bgp_initiate(p);
748 }
749
750 static void
751 bgp_neigh_notify(neighbor *n)
752 {
753 struct bgp_proto *p = (struct bgp_proto *) n->proto;
754
755 if (! (n->flags & NEF_STICKY))
756 return;
757
758 if (n->scope > 0)
759 {
760 if ((p->p.proto_state == PS_START) && (p->start_state == BSS_PREPARE))
761 {
762 BGP_TRACE(D_EVENTS, "Neighbor found");
763 bgp_start_neighbor(p);
764 }
765 }
766 else
767 {
768 if ((p->p.proto_state == PS_START) || (p->p.proto_state == PS_UP))
769 {
770 BGP_TRACE(D_EVENTS, "Neighbor lost");
771 bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
772 bgp_stop(p, 0);
773 }
774 }
775 }
776
777 static void
778 bgp_bfd_notify(struct bfd_request *req)
779 {
780 struct bgp_proto *p = req->data;
781 int ps = p->p.proto_state;
782
783 if (req->down && ((ps == PS_START) || (ps == PS_UP)))
784 {
785 BGP_TRACE(D_EVENTS, "BFD session down");
786 bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
787 if (ps == PS_UP)
788 bgp_update_startup_delay(p);
789 bgp_stop(p, 0);
790 }
791 }
792
793 static void
794 bgp_update_bfd(struct bgp_proto *p, int use_bfd)
795 {
796 if (use_bfd && !p->bfd_req)
797 p->bfd_req = bfd_request_session(p->p.pool, p->cf->remote_ip, p->source_addr,
798 p->cf->multihop ? NULL : p->neigh->iface,
799 bgp_bfd_notify, p);
800
801 if (!use_bfd && p->bfd_req)
802 {
803 rfree(p->bfd_req);
804 p->bfd_req = NULL;
805 }
806 }
807
808 static int
809 bgp_reload_routes(struct proto *P)
810 {
811 struct bgp_proto *p = (struct bgp_proto *) P;
812 if (!p->conn || !p->conn->peer_refresh_support)
813 return 0;
814
815 bgp_schedule_packet(p->conn, PKT_ROUTE_REFRESH);
816 return 1;
817 }
818
819 static void
820 bgp_start_locked(struct object_lock *lock)
821 {
822 struct bgp_proto *p = lock->data;
823 struct bgp_config *cf = p->cf;
824
825 if (p->p.proto_state != PS_START)
826 {
827 DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
828 return;
829 }
830
831 DBG("BGP: Got lock\n");
832
833 if (cf->multihop)
834 {
835 /* Multi-hop sessions do not use neighbor entries */
836 bgp_initiate(p);
837 return;
838 }
839
840 p->neigh = neigh_find2(&p->p, &cf->remote_ip, cf->iface, NEF_STICKY);
841 if (!p->neigh || (p->neigh->scope == SCOPE_HOST))
842 {
843 log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface);
844 /* As we do not start yet, we can just disable protocol */
845 p->p.disabled = 1;
846 bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
847 proto_notify_state(&p->p, PS_DOWN);
848 return;
849 }
850
851 if (p->neigh->scope > 0)
852 bgp_start_neighbor(p);
853 else
854 BGP_TRACE(D_EVENTS, "Waiting for %I%J to become my neighbor", cf->remote_ip, cf->iface);
855 }
856
857 static int
858 bgp_start(struct proto *P)
859 {
860 struct bgp_proto *p = (struct bgp_proto *) P;
861 struct object_lock *lock;
862
863 DBG("BGP: Startup.\n");
864 p->start_state = BSS_PREPARE;
865 p->outgoing_conn.state = BS_IDLE;
866 p->incoming_conn.state = BS_IDLE;
867 p->neigh = NULL;
868 p->bfd_req = NULL;
869
870 rt_lock_table(p->igp_table);
871
872 p->event = ev_new(p->p.pool);
873 p->event->hook = bgp_decision;
874 p->event->data = p;
875
876 p->startup_timer = tm_new(p->p.pool);
877 p->startup_timer->hook = bgp_startup_timeout;
878 p->startup_timer->data = p;
879
880 p->local_id = proto_get_router_id(P->cf);
881 if (p->rr_client)
882 p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;
883
884 p->remote_id = 0;
885 p->source_addr = p->cf->source_addr;
886
887 /*
888 * Before attempting to create the connection, we need to lock the
889 * port, so that are sure we're the only instance attempting to talk
890 * with that neighbor.
891 */
892
893 lock = p->lock = olock_new(P->pool);
894 lock->addr = p->cf->remote_ip;
895 lock->iface = p->cf->iface;
896 lock->type = OBJLOCK_TCP;
897 lock->port = BGP_PORT;
898 lock->hook = bgp_start_locked;
899 lock->data = p;
900 olock_acquire(lock);
901
902 return PS_START;
903 }
904
905 extern int proto_restart;
906
907 static int
908 bgp_shutdown(struct proto *P)
909 {
910 struct bgp_proto *p = (struct bgp_proto *) P;
911 unsigned subcode = 0;
912
913 BGP_TRACE(D_EVENTS, "Shutdown requested");
914
915 switch (P->down_code)
916 {
917 case PDC_CF_REMOVE:
918 case PDC_CF_DISABLE:
919 subcode = 3; // Errcode 6, 3 - peer de-configured
920 break;
921
922 case PDC_CF_RESTART:
923 subcode = 6; // Errcode 6, 6 - other configuration change
924 break;
925
926 case PDC_CMD_DISABLE:
927 case PDC_CMD_SHUTDOWN:
928 subcode = 2; // Errcode 6, 2 - administrative shutdown
929 break;
930
931 case PDC_CMD_RESTART:
932 subcode = 4; // Errcode 6, 4 - administrative reset
933 break;
934
935 case PDC_RX_LIMIT_HIT:
936 case PDC_IN_LIMIT_HIT:
937 subcode = 1; // Errcode 6, 1 - max number of prefixes reached
938 /* log message for compatibility */
939 log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name);
940 goto limit;
941
942 case PDC_OUT_LIMIT_HIT:
943 subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown
944
945 limit:
946 bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED);
947 if (proto_restart)
948 bgp_update_startup_delay(p);
949 else
950 p->startup_delay = 0;
951 goto done;
952 }
953
954 bgp_store_error(p, NULL, BE_MAN_DOWN, 0);
955 p->startup_delay = 0;
956
957 done:
958 bgp_stop(p, subcode);
959 return p->p.proto_state;
960 }
961
962 static void
963 bgp_cleanup(struct proto *P)
964 {
965 struct bgp_proto *p = (struct bgp_proto *) P;
966 rt_unlock_table(p->igp_table);
967 }
968
969 static rtable *
970 get_igp_table(struct bgp_config *cf)
971 {
972 return cf->igp_table ? cf->igp_table->table : cf->c.table->table;
973 }
974
975 static struct proto *
976 bgp_init(struct proto_config *C)
977 {
978 struct proto *P = proto_new(C, sizeof(struct bgp_proto));
979 struct bgp_config *c = (struct bgp_config *) C;
980 struct bgp_proto *p = (struct bgp_proto *) P;
981
982 P->accept_ra_types = c->secondary ? RA_ACCEPTED : RA_OPTIMAL;
983 P->rt_notify = bgp_rt_notify;
984 P->import_control = bgp_import_control;
985 P->neigh_notify = bgp_neigh_notify;
986 P->reload_routes = bgp_reload_routes;
987 P->rte_better = bgp_rte_better;
988 P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL;
989
990 p->cf = c;
991 p->local_as = c->local_as;
992 p->remote_as = c->remote_as;
993 p->is_internal = (c->local_as == c->remote_as);
994 p->rs_client = c->rs_client;
995 p->rr_client = c->rr_client;
996 p->igp_table = get_igp_table(c);
997
998 return P;
999 }
1000
1001
1002 void
1003 bgp_check_config(struct bgp_config *c)
1004 {
1005 int internal = (c->local_as == c->remote_as);
1006
1007 /* Do not check templates at all */
1008 if (c->c.class == SYM_TEMPLATE)
1009 return;
1010
1011
1012 /* EBGP direct by default, IBGP multihop by default */
1013 if (c->multihop < 0)
1014 c->multihop = internal ? 64 : 0;
1015
1016 /* Different default for gw_mode */
1017 if (!c->gw_mode)
1018 c->gw_mode = c->multihop ? GW_RECURSIVE : GW_DIRECT;
1019
1020 /* Different default based on rs_client */
1021 if (!c->missing_lladdr)
1022 c->missing_lladdr = c->rs_client ? MLL_IGNORE : MLL_SELF;
1023
1024 /* Disable after error incompatible with restart limit action */
1025 if (c->c.in_limit && (c->c.in_limit->action == PLA_RESTART) && c->disable_after_error)
1026 c->c.in_limit->action = PLA_DISABLE;
1027
1028
1029 if (!c->local_as)
1030 cf_error("Local AS number must be set");
1031
1032 if (!c->remote_as)
1033 cf_error("Neighbor must be configured");
1034
1035 if (!(c->capabilities && c->enable_as4) && (c->remote_as > 0xFFFF))
1036 cf_error("Neighbor AS number out of range (AS4 not available)");
1037
1038 if (!internal && c->rr_client)
1039 cf_error("Only internal neighbor can be RR client");
1040
1041 if (internal && c->rs_client)
1042 cf_error("Only external neighbor can be RS client");
1043
1044 if (c->multihop && (c->gw_mode == GW_DIRECT))
1045 cf_error("Multihop BGP cannot use direct gateway mode");
1046
1047 if (c->multihop && (ipa_has_link_scope(c->remote_ip) ||
1048 ipa_has_link_scope(c->source_addr)))
1049 cf_error("Multihop BGP cannot be used with link-local addresses");
1050
1051 if (c->multihop && c->bfd && ipa_zero(c->source_addr))
1052 cf_error("Multihop BGP with BFD requires specified source address");
1053
1054 if ((c->gw_mode == GW_RECURSIVE) && c->c.table->sorted)
1055 cf_error("BGP in recursive mode prohibits sorted table");
1056
1057 if (c->deterministic_med && c->c.table->sorted)
1058 cf_error("BGP with deterministic MED prohibits sorted table");
1059
1060 if (c->secondary && !c->c.table->sorted)
1061 cf_error("BGP with secondary option requires sorted table");
1062 }
1063
1064 static int
1065 bgp_reconfigure(struct proto *P, struct proto_config *C)
1066 {
1067 struct bgp_config *new = (struct bgp_config *) C;
1068 struct bgp_proto *p = (struct bgp_proto *) P;
1069 struct bgp_config *old = p->cf;
1070
1071 if (proto_get_router_id(C) != p->local_id)
1072 return 0;
1073
1074 int same = !memcmp(((byte *) old) + sizeof(struct proto_config),
1075 ((byte *) new) + sizeof(struct proto_config),
1076 // password item is last and must be checked separately
1077 OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config))
1078 && ((!old->password && !new->password)
1079 || (old->password && new->password && !strcmp(old->password, new->password)))
1080 && (get_igp_table(old) == get_igp_table(new));
1081
1082 if (same && (p->start_state > BSS_PREPARE))
1083 bgp_update_bfd(p, new->bfd);
1084
1085 /* We should update our copy of configuration ptr as old configuration will be freed */
1086 if (same)
1087 p->cf = new;
1088
1089 return same;
1090 }
1091
1092 static void
1093 bgp_copy_config(struct proto_config *dest, struct proto_config *src)
1094 {
1095 /* Just a shallow copy */
1096 proto_copy_rest(dest, src, sizeof(struct bgp_config));
1097 }
1098
1099
1100 /**
1101 * bgp_error - report a protocol error
1102 * @c: connection
1103 * @code: error code (according to the RFC)
1104 * @subcode: error sub-code
1105 * @data: data to be passed in the Notification message
1106 * @len: length of the data
1107 *
1108 * bgp_error() sends a notification packet to tell the other side that a protocol
1109 * error has occurred (including the data considered erroneous if possible) and
1110 * closes the connection.
1111 */
1112 void
1113 bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int len)
1114 {
1115 struct bgp_proto *p = c->bgp;
1116
1117 if (c->state == BS_CLOSE)
1118 return;
1119
1120 bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, (len > 0) ? len : -len);
1121 bgp_store_error(p, c, BE_BGP_TX, (code << 16) | subcode);
1122 bgp_conn_enter_close_state(c);
1123
1124 c->notify_code = code;
1125 c->notify_subcode = subcode;
1126 c->notify_data = data;
1127 c->notify_size = (len > 0) ? len : 0;
1128 bgp_schedule_packet(c, PKT_NOTIFICATION);
1129
1130 if (code != 6)
1131 {
1132 bgp_update_startup_delay(p);
1133 bgp_stop(p, 0);
1134 }
1135 }
1136
1137 /**
1138 * bgp_store_error - store last error for status report
1139 * @p: BGP instance
1140 * @c: connection
1141 * @class: error class (BE_xxx constants)
1142 * @code: error code (class specific)
1143 *
1144 * bgp_store_error() decides whether given error is interesting enough
1145 * and store that error to last_error variables of @p
1146 */
1147 void
1148 bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code)
1149 {
1150 /* During PS_UP, we ignore errors on secondary connection */
1151 if ((p->p.proto_state == PS_UP) && c && (c != p->conn))
1152 return;
1153
1154 /* During PS_STOP, we ignore any errors, as we want to report
1155 * the error that caused transition to PS_STOP
1156 */
1157 if (p->p.proto_state == PS_STOP)
1158 return;
1159
1160 p->last_error_class = class;
1161 p->last_error_code = code;
1162 }
1163
/* Names of connection states; bgp_state_dsc() indexes this by the state value */
static char *bgp_state_names[] = {
  "Idle",		/* 0 */
  "Connect",		/* 1 */
  "Active",		/* 2 */
  "OpenSent",		/* 3 */
  "OpenConfirm",	/* 4 */
  "Established",	/* 5 */
  "Close"		/* 6 */
};

/* Prefixes for the stored last error, indexed by last_error_class */
static char *bgp_err_classes[] = {
  "",			/* 0 */
  "Error: ",		/* 1 */
  "Socket: ",		/* 2 */
  "Received: ",		/* 3 */
  "BGP Error: ",	/* 4 */
  "Automatic shutdown: ",	/* 5 */
  ""			/* 6 */
};

/* Messages for errors of class BE_MISC, indexed by last_error_code */
static char *bgp_misc_errors[] = {
  "",			/* 0 */
  "Neighbor lost",	/* 1 */
  "Invalid next hop",	/* 2 */
  "Kernel MD5 auth failed",	/* 3 */
  "No listening socket",	/* 4 */
  "BFD session down"	/* 5 */
};

/* Messages for errors of class BE_AUTO_DOWN, indexed by last_error_code */
static char *bgp_auto_errors[] = {
  "",			/* 0 */
  "Route limit exceeded"	/* 1 */
};
1168
1169 static const char *
1170 bgp_last_errmsg(struct bgp_proto *p)
1171 {
1172 switch (p->last_error_class)
1173 {
1174 case BE_MISC:
1175 return bgp_misc_errors[p->last_error_code];
1176 case BE_SOCKET:
1177 return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code);
1178 case BE_BGP_RX:
1179 case BE_BGP_TX:
1180 return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF);
1181 case BE_AUTO_DOWN:
1182 return bgp_auto_errors[p->last_error_code];
1183 default:
1184 return "";
1185 }
1186 }
1187
1188 static const char *
1189 bgp_state_dsc(struct bgp_proto *p)
1190 {
1191 if (p->p.proto_state == PS_DOWN)
1192 return "Down";
1193
1194 int state = MAX(p->incoming_conn.state, p->outgoing_conn.state);
1195 if ((state == BS_IDLE) && (p->start_state >= BSS_CONNECT) && p->cf->passive)
1196 return "Passive";
1197
1198 return bgp_state_names[state];
1199 }
1200
1201 static void
1202 bgp_get_status(struct proto *P, byte *buf)
1203 {
1204 struct bgp_proto *p = (struct bgp_proto *) P;
1205
1206 const char *err1 = bgp_err_classes[p->last_error_class];
1207 const char *err2 = bgp_last_errmsg(p);
1208
1209 if (P->proto_state == PS_DOWN)
1210 bsprintf(buf, "%s%s", err1, err2);
1211 else
1212 bsprintf(buf, "%-14s%s%s", bgp_state_dsc(p), err1, err2);
1213 }
1214
1215 static void
1216 bgp_show_proto_info(struct proto *P)
1217 {
1218 struct bgp_proto *p = (struct bgp_proto *) P;
1219 struct bgp_conn *c = p->conn;
1220
1221 proto_show_basic_info(P);
1222
1223 cli_msg(-1006, " BGP state: %s", bgp_state_dsc(p));
1224 cli_msg(-1006, " Neighbor address: %I%J", p->cf->remote_ip, p->cf->iface);
1225 cli_msg(-1006, " Neighbor AS: %u", p->remote_as);
1226
1227 if (P->proto_state == PS_START)
1228 {
1229 struct bgp_conn *oc = &p->outgoing_conn;
1230
1231 if ((p->start_state < BSS_CONNECT) &&
1232 (p->startup_timer->expires))
1233 cli_msg(-1006, " Error wait: %d/%d",
1234 p->startup_timer->expires - now, p->startup_delay);
1235
1236 if ((oc->state == BS_ACTIVE) &&
1237 (oc->connect_retry_timer->expires))
1238 cli_msg(-1006, " Start delay: %d/%d",
1239 oc->connect_retry_timer->expires - now, p->cf->start_delay_time);
1240 }
1241 else if (P->proto_state == PS_UP)
1242 {
1243 cli_msg(-1006, " Neighbor ID: %R", p->remote_id);
1244 cli_msg(-1006, " Neighbor caps: %s%s%s%s",
1245 c->peer_refresh_support ? " refresh" : "",
1246 c->peer_as4_support ? " AS4" : "",
1247 (c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "",
1248 (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : "");
1249 cli_msg(-1006, " Session: %s%s%s%s%s%s%s",
1250 p->is_internal ? "internal" : "external",
1251 p->cf->multihop ? " multihop" : "",
1252 p->rr_client ? " route-reflector" : "",
1253 p->rs_client ? " route-server" : "",
1254 p->as4_session ? " AS4" : "",
1255 p->add_path_rx ? " add-path-rx" : "",
1256 p->add_path_tx ? " add-path-tx" : "");
1257 cli_msg(-1006, " Source address: %I", p->source_addr);
1258 if (P->cf->in_limit)
1259 cli_msg(-1006, " Route limit: %d/%d",
1260 p->p.stats.imp_routes + p->p.stats.filt_routes, P->cf->in_limit->limit);
1261 cli_msg(-1006, " Hold timer: %d/%d",
1262 tm_remains(c->hold_timer), c->hold_time);
1263 cli_msg(-1006, " Keepalive timer: %d/%d",
1264 tm_remains(c->keepalive_timer), c->keepalive_time);
1265 }
1266
1267 if ((p->last_error_class != BE_NONE) &&
1268 (p->last_error_class != BE_MAN_DOWN))
1269 {
1270 const char *err1 = bgp_err_classes[p->last_error_class];
1271 const char *err2 = bgp_last_errmsg(p);
1272 cli_msg(-1006, " Last error: %s%s", err1, err2);
1273 }
1274 }
1275
1276 struct protocol proto_bgp = {
1277 name: "BGP",
1278 template: "bgp%d",
1279 attr_class: EAP_BGP,
1280 preference: DEF_PREF_BGP,
1281 init: bgp_init,
1282 start: bgp_start,
1283 shutdown: bgp_shutdown,
1284 cleanup: bgp_cleanup,
1285 reconfigure: bgp_reconfigure,
1286 copy_config: bgp_copy_config,
1287 get_status: bgp_get_status,
1288 get_attr: bgp_get_attr,
1289 get_route_info: bgp_get_route_info,
1290 show_proto_info: bgp_show_proto_info
1291 };