]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/bgp.c
Merge branch 'master' into mq-filter-stack
[thirdparty/bird.git] / proto / bgp / bgp.c
1 /*
2 * BIRD -- The Border Gateway Protocol
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6 * (c) 2008--2016 CZ.NIC z.s.p.o.
7 *
8 * Can be freely distributed and used under the terms of the GNU GPL.
9 */
10
11 /**
12 * DOC: Border Gateway Protocol
13 *
14 * The BGP protocol is implemented in three parts: |bgp.c| which takes care of
15 * the connection and most of the interface with BIRD core, |packets.c| handling
16 * both incoming and outgoing BGP packets and |attrs.c| containing functions for
17 * manipulation with BGP attribute lists.
18 *
19 * As opposed to the other existing routing daemons, BIRD has a sophisticated
20 * core architecture which is able to keep all the information needed by BGP in
21 * the primary routing table, therefore no complex data structures like a
22 * central BGP table are needed. This increases memory footprint of a BGP router
23 * with many connections, but not too much and, which is more important, it
24 * makes BGP much easier to implement.
25 *
26 * Each instance of BGP (corresponding to a single BGP peer) is described by a
27 * &bgp_proto structure to which are attached individual connections represented
28 * by &bgp_connection (usually, there exists only one connection, but during BGP
29 * session setup, there can be more of them). The connections are handled
30 * according to the BGP state machine defined in the RFC with all the timers and
31 * all the parameters configurable.
32 *
33 * In incoming direction, we listen on the connection's socket and each time we
34 * receive some input, we pass it to bgp_rx(). It decodes packet headers and the
35 * markers and passes complete packets to bgp_rx_packet() which distributes the
36 * packet according to its type.
37 *
38 * In outgoing direction, we gather all the routing updates and sort them to
39 * buckets (&bgp_bucket) according to their attributes (we keep a hash table for
40 * fast comparison of &rta's and a &fib which helps us to find if we already
41 * have another route for the same destination queued for sending, so that we
42 * can replace it with the new one immediately instead of sending both
43 * updates). There also exists a special bucket holding all the route
44 * withdrawals which cannot be queued anywhere else as they don't have any
45 * attributes. If we have any packet to send (due to either new routes or the
46 * connection tracking code wanting to send an Open, Keepalive or Notification
47 * message), we call bgp_schedule_packet() which sets the corresponding bit in a
48 * @packets_to_send bit field in &bgp_conn and as soon as the transmit socket
49 * buffer becomes empty, we call bgp_fire_tx(). It inspects state of all the
50 * packet type bits and calls the corresponding bgp_create_xx() functions,
51 * eventually rescheduling the same packet type if we have more data of the same
52 * type to send.
53 *
54 * The processing of attributes consists of two functions: bgp_decode_attrs()
55 * for checking of the attribute blocks and translating them to the language of
56 * BIRD's extended attributes and bgp_encode_attrs() which does the
57 * converse. Both functions are built around a @bgp_attr_table array describing
58 * all important characteristics of all known attributes. Unknown transitive
59 * attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
60 *
61 * BGP protocol implements graceful restart in both restarting (local restart)
62 * and receiving (neighbor restart) roles. The first is handled mostly by the
63 * graceful restart code in the nest, BGP protocol just handles capabilities,
64 * sets @gr_wait and locks graceful restart until end-of-RIB mark is received.
65 * The second is implemented by internal restart of the BGP state to %BS_IDLE
66 * and protocol state to %PS_START, but keeping the protocol up from the core
67 * point of view and therefore maintaining received routes. Routing table
68 * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
69 * stale routes after reestablishment of BGP session during graceful restart.
70 *
71 * Supported standards:
72 * RFC 4271 - Border Gateway Protocol 4 (BGP)
73 * RFC 1997 - BGP Communities Attribute
74 * RFC 2385 - Protection of BGP Sessions via TCP MD5 Signature
75 * RFC 2545 - Use of BGP Multiprotocol Extensions for IPv6
76 * RFC 2918 - Route Refresh Capability
77 * RFC 3107 - Carrying Label Information in BGP
78 * RFC 4360 - BGP Extended Communities Attribute
79 * RFC 4364 - BGP/MPLS IPv4 Virtual Private Networks
80 * RFC 4456 - BGP Route Reflection
81 * RFC 4486 - Subcodes for BGP Cease Notification Message
82 * RFC 4659 - BGP/MPLS IPv6 Virtual Private Networks
83 * RFC 4724 - Graceful Restart Mechanism for BGP
84 * RFC 4760 - Multiprotocol extensions for BGP
85 * RFC 4798 - Connecting IPv6 Islands over IPv4 MPLS
86 * RFC 5065 - AS confederations for BGP
87 * RFC 5082 - Generalized TTL Security Mechanism
88 * RFC 5492 - Capabilities Advertisement with BGP
89 * RFC 5549 - Advertising IPv4 NLRI with an IPv6 Next Hop
90 * RFC 5575 - Dissemination of Flow Specification Rules
91 * RFC 5668 - 4-Octet AS Specific BGP Extended Community
92 * RFC 6286 - AS-Wide Unique BGP Identifier
93 * RFC 6608 - Subcodes for BGP Finite State Machine Error
94 * RFC 6793 - BGP Support for 4-Octet AS Numbers
95 * RFC 7313 - Enhanced Route Refresh Capability for BGP
96 * RFC 7606 - Revised Error Handling for BGP UPDATE Messages
97 * RFC 7911 - Advertisement of Multiple Paths in BGP
98 * RFC 7947 - Internet Exchange BGP Route Server
99 * RFC 8092 - BGP Large Communities Attribute
100 * RFC 8203 - BGP Administrative Shutdown Communication
101 * RFC 8212 - Default EBGP Route Propagation Behavior without Policies
102 * draft-ietf-idr-bgp-extended-messages-27
103 * draft-uttaro-idr-bgp-persistence-04
104 */
105
106 #undef LOCAL_DEBUG
107
108 #include <stdlib.h>
109
110 #include "nest/bird.h"
111 #include "nest/iface.h"
112 #include "nest/protocol.h"
113 #include "nest/route.h"
114 #include "nest/cli.h"
115 #include "nest/locks.h"
116 #include "conf/conf.h"
117 #include "filter/filter.h"
118 #include "lib/socket.h"
119 #include "lib/resource.h"
120 #include "lib/string.h"
121
122 #include "bgp.h"
123
124
125 struct linpool *bgp_linpool; /* Global temporary pool */
126 struct linpool *bgp_linpool2; /* Global temporary pool for bgp_rt_notify() */
127 static list bgp_sockets; /* Global list of listening sockets */
128
129
130 static void bgp_connect(struct bgp_proto *p);
131 static void bgp_active(struct bgp_proto *p);
132 static void bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn);
133 static void bgp_setup_sk(struct bgp_conn *conn, sock *s);
134 static void bgp_send_open(struct bgp_conn *conn);
135 static void bgp_update_bfd(struct bgp_proto *p, int use_bfd);
136
137 static int bgp_incoming_connection(sock *sk, uint dummy UNUSED);
138 static void bgp_listen_sock_err(sock *sk UNUSED, int err);
139
140 /**
141 * bgp_open - open a BGP instance
142 * @p: BGP instance
143 *
144 * This function allocates and configures shared BGP resources, mainly listening
145 * sockets. Should be called as the last step during initialization (when lock
146 * is acquired and neighbor is ready). When error, caller should change state to
147 * PS_DOWN and return immediately.
148 */
149 static int
150 bgp_open(struct bgp_proto *p)
151 {
152 struct bgp_socket *bs = NULL;
153 struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL;
154 ip_addr addr = p->cf->strict_bind ? p->cf->local_ip :
155 (p->ipv4 ? IPA_NONE4 : IPA_NONE6);
156 uint port = p->cf->local_port;
157
158 /* FIXME: Add some global init? */
159 if (!bgp_linpool)
160 init_list(&bgp_sockets);
161
162 /* We assume that cf->iface is defined iff cf->local_ip is link-local */
163
164 WALK_LIST(bs, bgp_sockets)
165 if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->sport == port) &&
166 (bs->sk->iface == ifa) && (bs->sk->vrf == p->p.vrf))
167 {
168 bs->uc++;
169 p->sock = bs;
170 return 0;
171 }
172
173 sock *sk = sk_new(proto_pool);
174 sk->type = SK_TCP_PASSIVE;
175 sk->ttl = 255;
176 sk->saddr = addr;
177 sk->sport = port;
178 sk->iface = ifa;
179 sk->vrf = p->p.vrf;
180 sk->flags = 0;
181 sk->tos = IP_PREC_INTERNET_CONTROL;
182 sk->rbsize = BGP_RX_BUFFER_SIZE;
183 sk->tbsize = BGP_TX_BUFFER_SIZE;
184 sk->rx_hook = bgp_incoming_connection;
185 sk->err_hook = bgp_listen_sock_err;
186
187 if (sk_open(sk) < 0)
188 goto err;
189
190 bs = mb_allocz(proto_pool, sizeof(struct bgp_socket));
191 bs->sk = sk;
192 bs->uc = 1;
193 p->sock = bs;
194 sk->data = bs;
195
196 add_tail(&bgp_sockets, &bs->n);
197
198 if (!bgp_linpool)
199 {
200 bgp_linpool = lp_new_default(proto_pool);
201 bgp_linpool2 = lp_new_default(proto_pool);
202 }
203
204 return 0;
205
206 err:
207 sk_log_error(sk, p->p.name);
208 log(L_ERR "%s: Cannot open listening socket", p->p.name);
209 rfree(sk);
210 return -1;
211 }
212
213 /**
214 * bgp_close - close a BGP instance
215 * @p: BGP instance
216 *
217 * This function frees and deconfigures shared BGP resources.
218 */
219 static void
220 bgp_close(struct bgp_proto *p)
221 {
222 struct bgp_socket *bs = p->sock;
223
224 ASSERT(bs && bs->uc);
225
226 if (--bs->uc)
227 return;
228
229 rfree(bs->sk);
230 rem_node(&bs->n);
231 mb_free(bs);
232
233 if (!EMPTY_LIST(bgp_sockets))
234 return;
235
236 rfree(bgp_linpool);
237 bgp_linpool = NULL;
238
239 rfree(bgp_linpool2);
240 bgp_linpool2 = NULL;
241 }
242
243 static inline int
244 bgp_setup_auth(struct bgp_proto *p, int enable)
245 {
246 if (p->cf->password)
247 {
248 int rv = sk_set_md5_auth(p->sock->sk,
249 p->cf->local_ip, p->cf->remote_ip, p->cf->iface,
250 enable ? p->cf->password : NULL, p->cf->setkey);
251
252 if (rv < 0)
253 sk_log_error(p->sock->sk, p->p.name);
254
255 return rv;
256 }
257 else
258 return 0;
259 }
260
261 static inline struct bgp_channel *
262 bgp_find_channel(struct bgp_proto *p, u32 afi)
263 {
264 struct bgp_channel *c;
265 WALK_LIST(c, p->p.channels)
266 if (c->afi == afi)
267 return c;
268
269 return NULL;
270 }
271
272 static void
273 bgp_startup(struct bgp_proto *p)
274 {
275 BGP_TRACE(D_EVENTS, "Started");
276 p->start_state = BSS_CONNECT;
277
278 if (!p->passive)
279 bgp_active(p);
280
281 if (p->postponed_sk)
282 {
283 /* Apply postponed incoming connection */
284 bgp_setup_conn(p, &p->incoming_conn);
285 bgp_setup_sk(&p->incoming_conn, p->postponed_sk);
286 bgp_send_open(&p->incoming_conn);
287 p->postponed_sk = NULL;
288 }
289 }
290
291 static void
292 bgp_startup_timeout(timer *t)
293 {
294 bgp_startup(t->data);
295 }
296
297
298 static void
299 bgp_initiate(struct bgp_proto *p)
300 {
301 int err_val;
302
303 if (bgp_open(p) < 0)
304 { err_val = BEM_NO_SOCKET; goto err1; }
305
306 if (bgp_setup_auth(p, 1) < 0)
307 { err_val = BEM_INVALID_MD5; goto err2; }
308
309 if (p->cf->bfd)
310 bgp_update_bfd(p, p->cf->bfd);
311
312 if (p->startup_delay)
313 {
314 p->start_state = BSS_DELAY;
315 BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay);
316 bgp_start_timer(p->startup_timer, p->startup_delay);
317 }
318 else
319 bgp_startup(p);
320
321 return;
322
323 err2:
324 bgp_close(p);
325 err1:
326 p->p.disabled = 1;
327 bgp_store_error(p, NULL, BE_MISC, err_val);
328 proto_notify_state(&p->p, PS_DOWN);
329
330 return;
331 }
332
333 /**
334 * bgp_start_timer - start a BGP timer
335 * @t: timer
336 * @value: time (in seconds) to fire (0 to disable the timer)
337 *
338 * This functions calls tm_start() on @t with time @value and the amount of
339 * randomization suggested by the BGP standard. Please use it for all BGP
340 * timers.
341 */
342 void
343 bgp_start_timer(timer *t, uint value)
344 {
345 if (value)
346 {
347 /* The randomization procedure is specified in RFC 4271 section 10 */
348 btime time = value S;
349 btime randomize = random() % ((time / 4) + 1);
350 tm_start(t, time - randomize);
351 }
352 else
353 tm_stop(t);
354 }
355
356 /**
357 * bgp_close_conn - close a BGP connection
358 * @conn: connection to close
359 *
360 * This function takes a connection described by the &bgp_conn structure, closes
361 * its socket and frees all resources associated with it.
362 */
363 void
364 bgp_close_conn(struct bgp_conn *conn)
365 {
366 // struct bgp_proto *p = conn->bgp;
367
368 DBG("BGP: Closing connection\n");
369 conn->packets_to_send = 0;
370 conn->channels_to_send = 0;
371 rfree(conn->connect_timer);
372 conn->connect_timer = NULL;
373 rfree(conn->keepalive_timer);
374 conn->keepalive_timer = NULL;
375 rfree(conn->hold_timer);
376 conn->hold_timer = NULL;
377 rfree(conn->tx_ev);
378 conn->tx_ev = NULL;
379 rfree(conn->sk);
380 conn->sk = NULL;
381
382 mb_free(conn->local_caps);
383 conn->local_caps = NULL;
384 mb_free(conn->remote_caps);
385 conn->remote_caps = NULL;
386 }
387
388
389 /**
390 * bgp_update_startup_delay - update a startup delay
391 * @p: BGP instance
392 *
393 * This function updates a startup delay that is used to postpone next BGP
394 * connect. It also handles disable_after_error and might stop BGP instance
395 * when error happened and disable_after_error is on.
396 *
397 * It should be called when BGP protocol error happened.
398 */
399 void
400 bgp_update_startup_delay(struct bgp_proto *p)
401 {
402 const struct bgp_config *cf = p->cf;
403
404 DBG("BGP: Updating startup delay\n");
405
406 if (p->last_proto_error && ((current_time() - p->last_proto_error) >= cf->error_amnesia_time S))
407 p->startup_delay = 0;
408
409 p->last_proto_error = current_time();
410
411 if (cf->disable_after_error)
412 {
413 p->startup_delay = 0;
414 p->p.disabled = 1;
415 return;
416 }
417
418 if (!p->startup_delay)
419 p->startup_delay = cf->error_delay_time_min;
420 else
421 p->startup_delay = MIN(2 * p->startup_delay, cf->error_delay_time_max);
422 }
423
424 static void
425 bgp_graceful_close_conn(struct bgp_conn *conn, int subcode, byte *data, uint len)
426 {
427 switch (conn->state)
428 {
429 case BS_IDLE:
430 case BS_CLOSE:
431 return;
432
433 case BS_CONNECT:
434 case BS_ACTIVE:
435 bgp_conn_enter_idle_state(conn);
436 return;
437
438 case BS_OPENSENT:
439 case BS_OPENCONFIRM:
440 case BS_ESTABLISHED:
441 if (subcode < 0)
442 {
443 bgp_conn_enter_close_state(conn);
444 bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
445 }
446 else
447 bgp_error(conn, 6, subcode, data, len);
448 return;
449
450 default:
451 bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
452 }
453 }
454
455 static void
456 bgp_down(struct bgp_proto *p)
457 {
458 if (p->start_state > BSS_PREPARE)
459 {
460 bgp_setup_auth(p, 0);
461 bgp_close(p);
462 }
463
464 BGP_TRACE(D_EVENTS, "Down");
465 proto_notify_state(&p->p, PS_DOWN);
466 }
467
468 static void
469 bgp_decision(void *vp)
470 {
471 struct bgp_proto *p = vp;
472
473 DBG("BGP: Decision start\n");
474 if ((p->p.proto_state == PS_START) &&
475 (p->outgoing_conn.state == BS_IDLE) &&
476 (p->incoming_conn.state != BS_OPENCONFIRM) &&
477 !p->passive)
478 bgp_active(p);
479
480 if ((p->p.proto_state == PS_STOP) &&
481 (p->outgoing_conn.state == BS_IDLE) &&
482 (p->incoming_conn.state == BS_IDLE))
483 bgp_down(p);
484 }
485
486 static struct bgp_proto *
487 bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip)
488 {
489 struct symbol *sym;
490 char fmt[SYM_MAX_LEN];
491
492 bsprintf(fmt, "%s%%0%dd", pp->cf->dynamic_name, pp->cf->dynamic_name_digits);
493
494 /* This is hack, we would like to share config, but we need to copy it now */
495 new_config = config;
496 cfg_mem = config->mem;
497 conf_this_scope = config->root_scope;
498 sym = cf_default_name(fmt, &(pp->dynamic_name_counter));
499 proto_clone_config(sym, pp->p.cf);
500 new_config = NULL;
501 cfg_mem = NULL;
502
503 /* Just pass remote_ip to bgp_init() */
504 ((struct bgp_config *) sym->proto)->remote_ip = remote_ip;
505
506 return (void *) proto_spawn(sym->proto, 0);
507 }
508
509 void
510 bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len)
511 {
512 proto_notify_state(&p->p, PS_STOP);
513 bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len);
514 bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len);
515 ev_schedule(p->event);
516 }
517
518 static inline void
519 bgp_conn_set_state(struct bgp_conn *conn, uint new_state)
520 {
521 if (conn->bgp->p.mrtdump & MD_STATES)
522 bgp_dump_state_change(conn, conn->state, new_state);
523
524 conn->state = new_state;
525 }
526
527 void
528 bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
529 {
530 /* Really, most of the work is done in bgp_rx_open(). */
531 bgp_conn_set_state(conn, BS_OPENCONFIRM);
532 }
533
534 static const struct bgp_af_caps dummy_af_caps = { };
535 static const struct bgp_af_caps basic_af_caps = { .ready = 1 };
536
537 void
538 bgp_conn_enter_established_state(struct bgp_conn *conn)
539 {
540 struct bgp_proto *p = conn->bgp;
541 struct bgp_caps *local = conn->local_caps;
542 struct bgp_caps *peer = conn->remote_caps;
543 struct bgp_channel *c;
544
545 BGP_TRACE(D_EVENTS, "BGP session established");
546
547 /* For multi-hop BGP sessions */
548 if (ipa_zero(p->local_ip))
549 p->local_ip = conn->sk->saddr;
550
551 /* For promiscuous sessions */
552 if (!p->remote_as)
553 p->remote_as = conn->received_as;
554
555 /* In case of LLv6 is not valid during BGP start */
556 if (ipa_zero(p->link_addr) && p->neigh && p->neigh->iface && p->neigh->iface->llv6)
557 p->link_addr = p->neigh->iface->llv6->ip;
558
559 conn->sk->fast_rx = 0;
560
561 p->conn = conn;
562 p->last_error_class = 0;
563 p->last_error_code = 0;
564
565 p->as4_session = conn->as4_session;
566
567 p->route_refresh = peer->route_refresh;
568 p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh;
569
570 /* Whether we may handle possible GR/LLGR of peer (it has some AF GR-able) */
571 p->gr_ready = p->llgr_ready = 0; /* Updated later */
572
573 /* Whether peer is ready to handle our GR recovery */
574 int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART);
575
576 if (p->gr_active_num)
577 tm_stop(p->gr_timer);
578
579 /* Number of active channels */
580 int num = 0;
581
582 /* Summary state of ADD_PATH RX for active channels */
583 uint summary_add_path_rx = 0;
584
585 WALK_LIST(c, p->p.channels)
586 {
587 const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi);
588 const struct bgp_af_caps *rem = bgp_find_af_caps(peer, c->afi);
589
590 /* Use default if capabilities were not announced */
591 if (!local->length && (c->afi == BGP_AF_IPV4))
592 loc = &basic_af_caps;
593
594 if (!peer->length && (c->afi == BGP_AF_IPV4))
595 rem = &basic_af_caps;
596
597 /* Ignore AFIs that were not announced in multiprotocol capability */
598 if (!loc || !loc->ready)
599 loc = &dummy_af_caps;
600
601 if (!rem || !rem->ready)
602 rem = &dummy_af_caps;
603
604 int active = loc->ready && rem->ready;
605 c->c.disabled = !active;
606 c->c.reloadable = p->route_refresh || c->cf->import_table;
607
608 c->index = active ? num++ : 0;
609
610 c->feed_state = BFS_NONE;
611 c->load_state = BFS_NONE;
612
613 /* Channels where peer may do GR */
614 uint gr_ready = active && local->gr_aware && rem->gr_able;
615 uint llgr_ready = active && local->llgr_aware && rem->llgr_able;
616
617 c->gr_ready = gr_ready || llgr_ready;
618 p->gr_ready = p->gr_ready || c->gr_ready;
619 p->llgr_ready = p->llgr_ready || llgr_ready;
620
621 /* Remember last LLGR stale time */
622 c->stale_time = local->llgr_aware ? rem->llgr_time : 0;
623
624 /* Channels not able to recover gracefully */
625 if (p->p.gr_recovery && (!active || !peer_gr_ready))
626 channel_graceful_restart_unlock(&c->c);
627
628 /* Channels waiting for local convergence */
629 if (p->p.gr_recovery && loc->gr_able && peer_gr_ready)
630 c->c.gr_wait = 1;
631
632 /* Channels where regular graceful restart failed */
633 if ((c->gr_active == BGP_GRS_ACTIVE) &&
634 !(active && rem->gr_able && (rem->gr_af_flags & BGP_GRF_FORWARDING)))
635 bgp_graceful_restart_done(c);
636
637 /* Channels where regular long-lived restart failed */
638 if ((c->gr_active == BGP_GRS_LLGR) &&
639 !(active && rem->llgr_able && (rem->gr_af_flags & BGP_LLGRF_FORWARDING)))
640 bgp_graceful_restart_done(c);
641
642 /* GR capability implies that neighbor will send End-of-RIB */
643 if (peer->gr_aware)
644 c->load_state = BFS_LOADING;
645
646 c->ext_next_hop = c->cf->ext_next_hop && (bgp_channel_is_ipv6(c) || rem->ext_next_hop);
647 c->add_path_rx = (loc->add_path & BGP_ADD_PATH_RX) && (rem->add_path & BGP_ADD_PATH_TX);
648 c->add_path_tx = (loc->add_path & BGP_ADD_PATH_TX) && (rem->add_path & BGP_ADD_PATH_RX);
649
650 if (active)
651 summary_add_path_rx |= !c->add_path_rx ? 1 : 2;
652
653 /* Update RA mode */
654 if (c->add_path_tx)
655 c->c.ra_mode = RA_ANY;
656 else if (c->cf->secondary)
657 c->c.ra_mode = RA_ACCEPTED;
658 else
659 c->c.ra_mode = RA_OPTIMAL;
660 }
661
662 p->afi_map = mb_alloc(p->p.pool, num * sizeof(u32));
663 p->channel_map = mb_alloc(p->p.pool, num * sizeof(void *));
664 p->channel_count = num;
665 p->summary_add_path_rx = summary_add_path_rx;
666
667 WALK_LIST(c, p->p.channels)
668 {
669 if (c->c.disabled)
670 continue;
671
672 p->afi_map[c->index] = c->afi;
673 p->channel_map[c->index] = c;
674 }
675
676 /* proto_notify_state() will likely call bgp_feed_begin(), setting c->feed_state */
677
678 bgp_conn_set_state(conn, BS_ESTABLISHED);
679 proto_notify_state(&p->p, PS_UP);
680 }
681
682 static void
683 bgp_conn_leave_established_state(struct bgp_proto *p)
684 {
685 BGP_TRACE(D_EVENTS, "BGP session closed");
686 p->conn = NULL;
687
688 if (p->p.proto_state == PS_UP)
689 bgp_stop(p, 0, NULL, 0);
690 }
691
692 void
693 bgp_conn_enter_close_state(struct bgp_conn *conn)
694 {
695 struct bgp_proto *p = conn->bgp;
696 int os = conn->state;
697
698 bgp_conn_set_state(conn, BS_CLOSE);
699 tm_stop(conn->keepalive_timer);
700 conn->sk->rx_hook = NULL;
701
702 /* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */
703 bgp_start_timer(conn->hold_timer, 10);
704
705 if (os == BS_ESTABLISHED)
706 bgp_conn_leave_established_state(p);
707 }
708
709 void
710 bgp_conn_enter_idle_state(struct bgp_conn *conn)
711 {
712 struct bgp_proto *p = conn->bgp;
713 int os = conn->state;
714
715 bgp_close_conn(conn);
716 bgp_conn_set_state(conn, BS_IDLE);
717 ev_schedule(p->event);
718
719 if (os == BS_ESTABLISHED)
720 bgp_conn_leave_established_state(p);
721 }
722
723 /**
724 * bgp_handle_graceful_restart - handle detected BGP graceful restart
725 * @p: BGP instance
726 *
727 * This function is called when a BGP graceful restart of the neighbor is
728 * detected (when the TCP connection fails or when a new TCP connection
729 * appears). The function activates processing of the restart - starts routing
730 * table refresh cycle and activates BGP restart timer. The protocol state goes
731 * back to %PS_START, but changing BGP state back to %BS_IDLE is left for the
732 * caller.
733 */
734 void
735 bgp_handle_graceful_restart(struct bgp_proto *p)
736 {
737 ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);
738
739 BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s",
740 p->gr_active_num ? " - already pending" : "");
741
742 p->gr_active_num = 0;
743
744 struct bgp_channel *c;
745 WALK_LIST(c, p->p.channels)
746 {
747 /* FIXME: perhaps check for channel state instead of disabled flag? */
748 if (c->c.disabled)
749 continue;
750
751 if (c->gr_ready)
752 {
753 p->gr_active_num++;
754
755 switch (c->gr_active)
756 {
757 case BGP_GRS_NONE:
758 c->gr_active = BGP_GRS_ACTIVE;
759 rt_refresh_begin(c->c.table, &c->c);
760 break;
761
762 case BGP_GRS_ACTIVE:
763 rt_refresh_end(c->c.table, &c->c);
764 rt_refresh_begin(c->c.table, &c->c);
765 break;
766
767 case BGP_GRS_LLGR:
768 rt_refresh_begin(c->c.table, &c->c);
769 rt_modify_stale(c->c.table, &c->c);
770 break;
771 }
772 }
773 else
774 {
775 /* Just flush the routes */
776 rt_refresh_begin(c->c.table, &c->c);
777 rt_refresh_end(c->c.table, &c->c);
778 }
779
780 /* Reset bucket and prefix tables */
781 bgp_free_bucket_table(c);
782 bgp_free_prefix_table(c);
783 bgp_init_bucket_table(c);
784 bgp_init_prefix_table(c);
785 c->packets_to_send = 0;
786 }
787
788 /* p->gr_ready -> at least one active channel is c->gr_ready */
789 ASSERT(p->gr_active_num > 0);
790
791 proto_notify_state(&p->p, PS_START);
792 tm_start(p->gr_timer, p->conn->remote_caps->gr_time S);
793 }
794
795 /**
796 * bgp_graceful_restart_done - finish active BGP graceful restart
797 * @c: BGP channel
798 *
799 * This function is called when the active BGP graceful restart of the neighbor
800 * should be finished for channel @c - either successfully (the neighbor sends
801 * all paths and reports end-of-RIB for given AFI/SAFI on the new session) or
802 * unsuccessfully (the neighbor does not support BGP graceful restart on the new
803 * session). The function ends the routing table refresh cycle.
804 */
805 void
806 bgp_graceful_restart_done(struct bgp_channel *c)
807 {
808 struct bgp_proto *p = (void *) c->c.proto;
809
810 ASSERT(c->gr_active);
811 c->gr_active = 0;
812 p->gr_active_num--;
813
814 if (!p->gr_active_num)
815 BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");
816
817 tm_stop(c->stale_timer);
818 rt_refresh_end(c->c.table, &c->c);
819 }
820
821 /**
822 * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer'
823 * @t: timer
824 *
825 * This function is a timeout hook for @gr_timer, implementing BGP restart time
826 * limit for reestablisment of the BGP session after the graceful restart. When
827 * fired, we just proceed with the usual protocol restart.
828 */
829
830 static void
831 bgp_graceful_restart_timeout(timer *t)
832 {
833 struct bgp_proto *p = t->data;
834
835 BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
836
837 if (p->llgr_ready)
838 {
839 struct bgp_channel *c;
840 WALK_LIST(c, p->p.channels)
841 {
842 /* Channel is not in GR and is already flushed */
843 if (!c->gr_active)
844 continue;
845
846 /* Channel is already in LLGR from past restart */
847 if (c->gr_active == BGP_GRS_LLGR)
848 continue;
849
850 /* Channel is in GR, but does not support LLGR -> stop GR */
851 if (!c->stale_time)
852 {
853 bgp_graceful_restart_done(c);
854 continue;
855 }
856
857 /* Channel is in GR, and supports LLGR -> start LLGR */
858 c->gr_active = BGP_GRS_LLGR;
859 tm_start(c->stale_timer, c->stale_time S);
860 rt_modify_stale(c->c.table, &c->c);
861 }
862 }
863 else
864 bgp_stop(p, 0, NULL, 0);
865 }
866
867 static void
868 bgp_long_lived_stale_timeout(timer *t)
869 {
870 struct bgp_channel *c = t->data;
871 struct bgp_proto *p = (void *) c->c.proto;
872
873 BGP_TRACE(D_EVENTS, "Long-lived stale timeout");
874
875 bgp_graceful_restart_done(c);
876 }
877
878
879 /**
880 * bgp_refresh_begin - start incoming enhanced route refresh sequence
881 * @c: BGP channel
882 *
883 * This function is called when an incoming enhanced route refresh sequence is
884 * started by the neighbor, demarcated by the BoRR packet. The function updates
885 * the load state and starts the routing table refresh cycle. Note that graceful
886 * restart also uses routing table refresh cycle, but RFC 7313 and load states
887 * ensure that these two sequences do not overlap.
888 */
889 void
890 bgp_refresh_begin(struct bgp_channel *c)
891 {
892 struct bgp_proto *p = (void *) c->c.proto;
893
894 if (c->load_state == BFS_LOADING)
895 { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; }
896
897 c->load_state = BFS_REFRESHING;
898 rt_refresh_begin(c->c.table, &c->c);
899
900 if (c->c.in_table)
901 rt_refresh_begin(c->c.in_table, &c->c);
902 }
903
904 /**
905 * bgp_refresh_end - finish incoming enhanced route refresh sequence
906 * @c: BGP channel
907 *
908 * This function is called when an incoming enhanced route refresh sequence is
909 * finished by the neighbor, demarcated by the EoRR packet. The function updates
910 * the load state and ends the routing table refresh cycle. Routes not received
911 * during the sequence are removed by the nest.
912 */
913 void
914 bgp_refresh_end(struct bgp_channel *c)
915 {
916 struct bgp_proto *p = (void *) c->c.proto;
917
918 if (c->load_state != BFS_REFRESHING)
919 { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; }
920
921 c->load_state = BFS_NONE;
922 rt_refresh_end(c->c.table, &c->c);
923
924 if (c->c.in_table)
925 rt_prune_sync(c->c.in_table, 0);
926 }
927
928
929 static void
930 bgp_send_open(struct bgp_conn *conn)
931 {
932 DBG("BGP: Sending open\n");
933 conn->sk->rx_hook = bgp_rx;
934 conn->sk->tx_hook = bgp_tx;
935 tm_stop(conn->connect_timer);
936 bgp_prepare_capabilities(conn);
937 bgp_schedule_packet(conn, NULL, PKT_OPEN);
938 bgp_conn_set_state(conn, BS_OPENSENT);
939 bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time);
940 }
941
942 static void
943 bgp_connected(sock *sk)
944 {
945 struct bgp_conn *conn = sk->data;
946 struct bgp_proto *p = conn->bgp;
947
948 BGP_TRACE(D_EVENTS, "Connected");
949 bgp_send_open(conn);
950 }
951
952 static void
953 bgp_connect_timeout(timer *t)
954 {
955 struct bgp_conn *conn = t->data;
956 struct bgp_proto *p = conn->bgp;
957
958 DBG("BGP: connect_timeout\n");
959 if (p->p.proto_state == PS_START)
960 {
961 bgp_close_conn(conn);
962 bgp_connect(p);
963 }
964 else
965 bgp_conn_enter_idle_state(conn);
966 }
967
968 static void
969 bgp_sock_err(sock *sk, int err)
970 {
971 struct bgp_conn *conn = sk->data;
972 struct bgp_proto *p = conn->bgp;
973
974 /*
975 * This error hook may be called either asynchronously from main
976 * loop, or synchronously from sk_send(). But sk_send() is called
977 * only from bgp_tx() and bgp_kick_tx(), which are both called
978 * asynchronously from main loop. Moreover, they end if err hook is
979 * called. Therefore, we could suppose that it is always called
980 * asynchronously.
981 */
982
983 bgp_store_error(p, conn, BE_SOCKET, err);
984
985 if (err)
986 BGP_TRACE(D_EVENTS, "Connection lost (%M)", err);
987 else
988 BGP_TRACE(D_EVENTS, "Connection closed");
989
990 if ((conn->state == BS_ESTABLISHED) && p->gr_ready)
991 bgp_handle_graceful_restart(p);
992
993 bgp_conn_enter_idle_state(conn);
994 }
995
996 static void
997 bgp_hold_timeout(timer *t)
998 {
999 struct bgp_conn *conn = t->data;
1000 struct bgp_proto *p = conn->bgp;
1001
1002 DBG("BGP: Hold timeout\n");
1003
1004 /* We are already closing the connection - just do hangup */
1005 if (conn->state == BS_CLOSE)
1006 {
1007 BGP_TRACE(D_EVENTS, "Connection stalled");
1008 bgp_conn_enter_idle_state(conn);
1009 return;
1010 }
1011
1012 /* If there is something in input queue, we are probably congested
1013 and perhaps just not processed BGP packets in time. */
1014
1015 if (sk_rx_ready(conn->sk) > 0)
1016 bgp_start_timer(conn->hold_timer, 10);
1017 else if ((conn->state == BS_ESTABLISHED) && p->llgr_ready)
1018 {
1019 BGP_TRACE(D_EVENTS, "Hold timer expired");
1020 bgp_handle_graceful_restart(p);
1021 bgp_conn_enter_idle_state(conn);
1022 }
1023 else
1024 bgp_error(conn, 4, 0, NULL, 0);
1025 }
1026
1027 static void
1028 bgp_keepalive_timeout(timer *t)
1029 {
1030 struct bgp_conn *conn = t->data;
1031
1032 DBG("BGP: Keepalive timer\n");
1033 bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
1034
1035 /* Kick TX a bit faster */
1036 if (ev_active(conn->tx_ev))
1037 ev_run(conn->tx_ev);
1038 }
1039
1040 static void
1041 bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
1042 {
1043 conn->sk = NULL;
1044 conn->bgp = p;
1045
1046 conn->packets_to_send = 0;
1047 conn->channels_to_send = 0;
1048 conn->last_channel = 0;
1049 conn->last_channel_count = 0;
1050
1051 conn->connect_timer = tm_new_init(p->p.pool, bgp_connect_timeout, conn, 0, 0);
1052 conn->hold_timer = tm_new_init(p->p.pool, bgp_hold_timeout, conn, 0, 0);
1053 conn->keepalive_timer = tm_new_init(p->p.pool, bgp_keepalive_timeout, conn, 0, 0);
1054
1055 conn->tx_ev = ev_new_init(p->p.pool, bgp_kick_tx, conn);
1056 }
1057
1058 static void
1059 bgp_setup_sk(struct bgp_conn *conn, sock *s)
1060 {
1061 s->data = conn;
1062 s->err_hook = bgp_sock_err;
1063 s->fast_rx = 1;
1064 conn->sk = s;
1065 }
1066
1067 static void
1068 bgp_active(struct bgp_proto *p)
1069 {
1070 int delay = MAX(1, p->cf->connect_delay_time);
1071 struct bgp_conn *conn = &p->outgoing_conn;
1072
1073 BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
1074 bgp_setup_conn(p, conn);
1075 bgp_conn_set_state(conn, BS_ACTIVE);
1076 bgp_start_timer(conn->connect_timer, delay);
1077 }
1078
1079 /**
1080 * bgp_connect - initiate an outgoing connection
1081 * @p: BGP instance
1082 *
1083 * The bgp_connect() function creates a new &bgp_conn and initiates
1084 * a TCP connection to the peer. The rest of connection setup is governed
1085 * by the BGP state machine as described in the standard.
1086 */
1087 static void
1088 bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing connection */
1089 {
1090 struct bgp_conn *conn = &p->outgoing_conn;
1091 int hops = p->cf->multihop ? : 1;
1092
1093 DBG("BGP: Connecting\n");
1094 sock *s = sk_new(p->p.pool);
1095 s->type = SK_TCP_ACTIVE;
1096 s->saddr = p->local_ip;
1097 s->daddr = p->remote_ip;
1098 s->dport = p->cf->remote_port;
1099 s->iface = p->neigh ? p->neigh->iface : NULL;
1100 s->vrf = p->p.vrf;
1101 s->ttl = p->cf->ttl_security ? 255 : hops;
1102 s->rbsize = p->cf->enable_extended_messages ? BGP_RX_BUFFER_EXT_SIZE : BGP_RX_BUFFER_SIZE;
1103 s->tbsize = p->cf->enable_extended_messages ? BGP_TX_BUFFER_EXT_SIZE : BGP_TX_BUFFER_SIZE;
1104 s->tos = IP_PREC_INTERNET_CONTROL;
1105 s->password = p->cf->password;
1106 s->tx_hook = bgp_connected;
1107 BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J",
1108 s->daddr, ipa_is_link_local(s->daddr) ? p->cf->iface : NULL,
1109 s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL);
1110 bgp_setup_conn(p, conn);
1111 bgp_setup_sk(conn, s);
1112 bgp_conn_set_state(conn, BS_CONNECT);
1113
1114 if (sk_open(s) < 0)
1115 goto err;
1116
1117 /* Set minimal receive TTL if needed */
1118 if (p->cf->ttl_security)
1119 if (sk_set_min_ttl(s, 256 - hops) < 0)
1120 goto err;
1121
1122 DBG("BGP: Waiting for connect success\n");
1123 bgp_start_timer(conn->connect_timer, p->cf->connect_retry_time);
1124 return;
1125
1126 err:
1127 sk_log_error(s, p->p.name);
1128 bgp_sock_err(s, 0);
1129 return;
1130 }
1131
1132 static inline int bgp_is_dynamic(struct bgp_proto *p)
1133 { return ipa_zero(p->remote_ip); }
1134
1135 /**
1136 * bgp_find_proto - find existing proto for incoming connection
1137 * @sk: TCP socket
1138 *
1139 */
1140 static struct bgp_proto *
1141 bgp_find_proto(sock *sk)
1142 {
1143 struct bgp_proto *best = NULL;
1144 struct bgp_proto *p;
1145
1146 /* sk->iface is valid only if src or dst address is link-local */
1147 int link = ipa_is_link_local(sk->saddr) || ipa_is_link_local(sk->daddr);
1148
1149 WALK_LIST(p, proto_list)
1150 if ((p->p.proto == &proto_bgp) &&
1151 (ipa_equal(p->remote_ip, sk->daddr) || bgp_is_dynamic(p)) &&
1152 (!p->cf->remote_range || ipa_in_netX(sk->daddr, p->cf->remote_range)) &&
1153 (p->p.vrf == sk->vrf) &&
1154 (p->cf->local_port == sk->sport) &&
1155 (!link || (p->cf->iface == sk->iface)) &&
1156 (ipa_zero(p->cf->local_ip) || ipa_equal(p->cf->local_ip, sk->saddr)))
1157 {
1158 best = p;
1159
1160 if (!bgp_is_dynamic(p))
1161 break;
1162 }
1163
1164 return best;
1165 }
1166
1167 /**
1168 * bgp_incoming_connection - handle an incoming connection
1169 * @sk: TCP socket
1170 * @dummy: unused
1171 *
1172 * This function serves as a socket hook for accepting of new BGP
1173 * connections. It searches a BGP instance corresponding to the peer
1174 * which has connected and if such an instance exists, it creates a
1175 * &bgp_conn structure, attaches it to the instance and either sends
1176 * an Open message or (if there already is an active connection) it
1177 * closes the new connection by sending a Notification message.
1178 */
1179 static int
1180 bgp_incoming_connection(sock *sk, uint dummy UNUSED)
1181 {
1182 struct bgp_proto *p;
1183 int acc, hops;
1184
1185 DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);
1186 p = bgp_find_proto(sk);
1187 if (!p)
1188 {
1189 log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
1190 sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport);
1191 rfree(sk);
1192 return 0;
1193 }
1194
1195 /*
1196 * BIRD should keep multiple incoming connections in OpenSent state (for
1197 * details RFC 4271 8.2.1 par 3), but it keeps just one. Duplicate incoming
1198 * connections are rejected istead. The exception is the case where an
1199 * incoming connection triggers a graceful restart.
1200 */
1201
1202 acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
1203 (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
1204
1205 if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
1206 {
1207 bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
1208 bgp_handle_graceful_restart(p);
1209 bgp_conn_enter_idle_state(p->conn);
1210 acc = 1;
1211
1212 /* There might be separate incoming connection in OpenSent state */
1213 if (p->incoming_conn.state > BS_ACTIVE)
1214 bgp_close_conn(&p->incoming_conn);
1215 }
1216
1217 BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
1218 sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL,
1219 sk->dport, acc ? "accepted" : "rejected");
1220
1221 if (!acc)
1222 {
1223 rfree(sk);
1224 return 0;
1225 }
1226
1227 hops = p->cf->multihop ? : 1;
1228
1229 if (sk_set_ttl(sk, p->cf->ttl_security ? 255 : hops) < 0)
1230 goto err;
1231
1232 if (p->cf->ttl_security)
1233 if (sk_set_min_ttl(sk, 256 - hops) < 0)
1234 goto err;
1235
1236 if (p->cf->enable_extended_messages)
1237 {
1238 sk->rbsize = BGP_RX_BUFFER_EXT_SIZE;
1239 sk->tbsize = BGP_TX_BUFFER_EXT_SIZE;
1240 sk_reallocate(sk);
1241 }
1242
1243 /* For dynamic BGP, spawn new instance and postpone the socket */
1244 if (bgp_is_dynamic(p))
1245 {
1246 p = bgp_spawn(p, sk->daddr);
1247 p->postponed_sk = sk;
1248 rmove(sk, p->p.pool);
1249 return 0;
1250 }
1251
1252 rmove(sk, p->p.pool);
1253 bgp_setup_conn(p, &p->incoming_conn);
1254 bgp_setup_sk(&p->incoming_conn, sk);
1255 bgp_send_open(&p->incoming_conn);
1256 return 0;
1257
1258 err:
1259 sk_log_error(sk, p->p.name);
1260 log(L_ERR "%s: Incoming connection aborted", p->p.name);
1261 rfree(sk);
1262 return 0;
1263 }
1264
1265 static void
1266 bgp_listen_sock_err(sock *sk UNUSED, int err)
1267 {
1268 if (err == ECONNABORTED)
1269 log(L_WARN "BGP: Incoming connection aborted");
1270 else
1271 log(L_ERR "BGP: Error on listening socket: %M", err);
1272 }
1273
1274 static void
1275 bgp_start_neighbor(struct bgp_proto *p)
1276 {
1277 /* Called only for single-hop BGP sessions */
1278
1279 if (ipa_zero(p->local_ip))
1280 p->local_ip = p->neigh->ifa->ip;
1281
1282 if (ipa_is_link_local(p->local_ip))
1283 p->link_addr = p->local_ip;
1284 else if (p->neigh->iface->llv6)
1285 p->link_addr = p->neigh->iface->llv6->ip;
1286
1287 bgp_initiate(p);
1288 }
1289
1290 static void
1291 bgp_neigh_notify(neighbor *n)
1292 {
1293 struct bgp_proto *p = (struct bgp_proto *) n->proto;
1294 int ps = p->p.proto_state;
1295
1296 if (n != p->neigh)
1297 return;
1298
1299 if ((ps == PS_DOWN) || (ps == PS_STOP))
1300 return;
1301
1302 int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE);
1303
1304 if (n->scope <= 0)
1305 {
1306 if (!prepare)
1307 {
1308 BGP_TRACE(D_EVENTS, "Neighbor lost");
1309 bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
1310 /* Perhaps also run bgp_update_startup_delay(p)? */
1311 bgp_stop(p, 0, NULL, 0);
1312 }
1313 }
1314 else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
1315 {
1316 if (!prepare)
1317 {
1318 BGP_TRACE(D_EVENTS, "Link down");
1319 bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN);
1320 if (ps == PS_UP)
1321 bgp_update_startup_delay(p);
1322 bgp_stop(p, 0, NULL, 0);
1323 }
1324 }
1325 else
1326 {
1327 if (prepare)
1328 {
1329 BGP_TRACE(D_EVENTS, "Neighbor ready");
1330 bgp_start_neighbor(p);
1331 }
1332 }
1333 }
1334
1335 static void
1336 bgp_bfd_notify(struct bfd_request *req)
1337 {
1338 struct bgp_proto *p = req->data;
1339 int ps = p->p.proto_state;
1340
1341 if (req->down && ((ps == PS_START) || (ps == PS_UP)))
1342 {
1343 BGP_TRACE(D_EVENTS, "BFD session down");
1344 bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
1345
1346 if (p->cf->bfd == BGP_BFD_GRACEFUL)
1347 {
1348 /* Trigger graceful restart */
1349 if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
1350 bgp_handle_graceful_restart(p);
1351
1352 if (p->incoming_conn.state > BS_IDLE)
1353 bgp_conn_enter_idle_state(&p->incoming_conn);
1354
1355 if (p->outgoing_conn.state > BS_IDLE)
1356 bgp_conn_enter_idle_state(&p->outgoing_conn);
1357 }
1358 else
1359 {
1360 /* Trigger session down */
1361 if (ps == PS_UP)
1362 bgp_update_startup_delay(p);
1363 bgp_stop(p, 0, NULL, 0);
1364 }
1365 }
1366 }
1367
1368 static void
1369 bgp_update_bfd(struct bgp_proto *p, int use_bfd)
1370 {
1371 if (use_bfd && !p->bfd_req && !bgp_is_dynamic(p))
1372 p->bfd_req = bfd_request_session(p->p.pool, p->remote_ip, p->local_ip,
1373 p->cf->multihop ? NULL : p->neigh->iface,
1374 bgp_bfd_notify, p);
1375
1376 if (!use_bfd && p->bfd_req)
1377 {
1378 rfree(p->bfd_req);
1379 p->bfd_req = NULL;
1380 }
1381 }
1382
1383 static void
1384 bgp_reload_routes(struct channel *C)
1385 {
1386 struct bgp_proto *p = (void *) C->proto;
1387 struct bgp_channel *c = (void *) C;
1388
1389 ASSERT(p->conn && (p->route_refresh || c->c.in_table));
1390
1391 if (c->c.in_table)
1392 channel_schedule_reload(C);
1393 else
1394 bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH);
1395 }
1396
1397 static void
1398 bgp_feed_begin(struct channel *C, int initial)
1399 {
1400 struct bgp_proto *p = (void *) C->proto;
1401 struct bgp_channel *c = (void *) C;
1402
1403 /* This should not happen */
1404 if (!p->conn)
1405 return;
1406
1407 if (initial && p->cf->gr_mode)
1408 c->feed_state = BFS_LOADING;
1409
1410 /* It is refeed and both sides support enhanced route refresh */
1411 if (!initial && p->enhanced_refresh)
1412 {
1413 /* BoRR must not be sent before End-of-RIB */
1414 if (c->feed_state == BFS_LOADING || c->feed_state == BFS_LOADED)
1415 return;
1416
1417 c->feed_state = BFS_REFRESHING;
1418 bgp_schedule_packet(p->conn, c, PKT_BEGIN_REFRESH);
1419 }
1420 }
1421
1422 static void
1423 bgp_feed_end(struct channel *C)
1424 {
1425 struct bgp_proto *p = (void *) C->proto;
1426 struct bgp_channel *c = (void *) C;
1427
1428 /* This should not happen */
1429 if (!p->conn)
1430 return;
1431
1432 /* Non-demarcated feed ended, nothing to do */
1433 if (c->feed_state == BFS_NONE)
1434 return;
1435
1436 /* Schedule End-of-RIB packet */
1437 if (c->feed_state == BFS_LOADING)
1438 c->feed_state = BFS_LOADED;
1439
1440 /* Schedule EoRR packet */
1441 if (c->feed_state == BFS_REFRESHING)
1442 c->feed_state = BFS_REFRESHED;
1443
1444 /* Kick TX hook */
1445 bgp_schedule_packet(p->conn, c, PKT_UPDATE);
1446 }
1447
1448
1449 static void
1450 bgp_start_locked(struct object_lock *lock)
1451 {
1452 struct bgp_proto *p = lock->data;
1453 const struct bgp_config *cf = p->cf;
1454
1455 if (p->p.proto_state != PS_START)
1456 {
1457 DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
1458 return;
1459 }
1460
1461 DBG("BGP: Got lock\n");
1462
1463 if (cf->multihop || bgp_is_dynamic(p))
1464 {
1465 /* Multi-hop sessions do not use neighbor entries */
1466 bgp_initiate(p);
1467 return;
1468 }
1469
1470 neighbor *n = neigh_find(&p->p, p->remote_ip, cf->iface, NEF_STICKY);
1471 if (!n)
1472 {
1473 log(L_ERR "%s: Invalid remote address %I%J", p->p.name, p->remote_ip, cf->iface);
1474 /* As we do not start yet, we can just disable protocol */
1475 p->p.disabled = 1;
1476 bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
1477 proto_notify_state(&p->p, PS_DOWN);
1478 return;
1479 }
1480
1481 p->neigh = n;
1482
1483 if (n->scope <= 0)
1484 BGP_TRACE(D_EVENTS, "Waiting for %I%J to become my neighbor", p->remote_ip, cf->iface);
1485 else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
1486 BGP_TRACE(D_EVENTS, "Waiting for link on %s", n->iface->name);
1487 else
1488 bgp_start_neighbor(p);
1489 }
1490
1491 static int
1492 bgp_start(struct proto *P)
1493 {
1494 struct bgp_proto *p = (struct bgp_proto *) P;
1495 const struct bgp_config *cf = p->cf;
1496
1497 p->local_ip = cf->local_ip;
1498 p->local_as = cf->local_as;
1499 p->remote_as = cf->remote_as;
1500 p->public_as = cf->local_as;
1501
1502 /* For dynamic BGP childs, remote_ip is already set */
1503 if (ipa_nonzero(cf->remote_ip))
1504 p->remote_ip = cf->remote_ip;
1505
1506 /* Confederation ID is used for truly external peers */
1507 if (p->cf->confederation && !p->is_interior)
1508 p->public_as = cf->confederation;
1509
1510 p->passive = cf->passive || bgp_is_dynamic(p);
1511
1512 p->start_state = BSS_PREPARE;
1513 p->outgoing_conn.state = BS_IDLE;
1514 p->incoming_conn.state = BS_IDLE;
1515 p->neigh = NULL;
1516 p->bfd_req = NULL;
1517 p->postponed_sk = NULL;
1518 p->gr_ready = 0;
1519 p->gr_active_num = 0;
1520
1521 p->event = ev_new_init(p->p.pool, bgp_decision, p);
1522 p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0);
1523 p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0);
1524
1525 p->local_id = proto_get_router_id(P->cf);
1526 if (p->rr_client)
1527 p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;
1528
1529 p->remote_id = 0;
1530 p->link_addr = IPA_NONE;
1531
1532 /* Lock all channels when in GR recovery mode */
1533 if (p->p.gr_recovery && p->cf->gr_mode)
1534 {
1535 struct bgp_channel *c;
1536 WALK_LIST(c, p->p.channels)
1537 channel_graceful_restart_lock(&c->c);
1538 }
1539
1540 /*
1541 * Before attempting to create the connection, we need to lock the port,
1542 * so that we are the only instance attempting to talk with that neighbor.
1543 */
1544 struct object_lock *lock;
1545 lock = p->lock = olock_new(P->pool);
1546 lock->addr = p->remote_ip;
1547 lock->port = p->cf->remote_port;
1548 lock->iface = p->cf->iface;
1549 lock->vrf = p->cf->iface ? NULL : p->p.vrf;
1550 lock->type = OBJLOCK_TCP;
1551 lock->hook = bgp_start_locked;
1552 lock->data = p;
1553 olock_acquire(lock);
1554
1555 return PS_START;
1556 }
1557
1558 extern int proto_restart;
1559
1560 static int
1561 bgp_shutdown(struct proto *P)
1562 {
1563 struct bgp_proto *p = (struct bgp_proto *) P;
1564 int subcode = 0;
1565
1566 char *message = NULL;
1567 byte *data = NULL;
1568 uint len = 0;
1569
1570 BGP_TRACE(D_EVENTS, "Shutdown requested");
1571
1572 switch (P->down_code)
1573 {
1574 case PDC_CF_REMOVE:
1575 case PDC_CF_DISABLE:
1576 subcode = 3; // Errcode 6, 3 - peer de-configured
1577 break;
1578
1579 case PDC_CF_RESTART:
1580 subcode = 6; // Errcode 6, 6 - other configuration change
1581 break;
1582
1583 case PDC_CMD_DISABLE:
1584 case PDC_CMD_SHUTDOWN:
1585 shutdown:
1586 subcode = 2; // Errcode 6, 2 - administrative shutdown
1587 message = P->message;
1588 break;
1589
1590 case PDC_CMD_RESTART:
1591 subcode = 4; // Errcode 6, 4 - administrative reset
1592 message = P->message;
1593 break;
1594
1595 case PDC_CMD_GR_DOWN:
1596 if ((p->cf->gr_mode != BGP_GR_ABLE) &&
1597 (p->cf->llgr_mode != BGP_LLGR_ABLE))
1598 goto shutdown;
1599
1600 subcode = -1; // Do not send NOTIFICATION, just close the connection
1601 break;
1602
1603 case PDC_RX_LIMIT_HIT:
1604 case PDC_IN_LIMIT_HIT:
1605 subcode = 1; // Errcode 6, 1 - max number of prefixes reached
1606 /* log message for compatibility */
1607 log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name);
1608 goto limit;
1609
1610 case PDC_OUT_LIMIT_HIT:
1611 subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown
1612
1613 limit:
1614 bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED);
1615 if (proto_restart)
1616 bgp_update_startup_delay(p);
1617 else
1618 p->startup_delay = 0;
1619 goto done;
1620 }
1621
1622 bgp_store_error(p, NULL, BE_MAN_DOWN, 0);
1623 p->startup_delay = 0;
1624
1625 /* RFC 8203 - shutdown communication */
1626 if (message)
1627 {
1628 uint msg_len = strlen(message);
1629 msg_len = MIN(msg_len, 255);
1630
1631 /* Buffer will be freed automatically by protocol shutdown */
1632 data = mb_alloc(p->p.pool, msg_len + 1);
1633 len = msg_len + 1;
1634
1635 data[0] = msg_len;
1636 memcpy(data+1, message, msg_len);
1637 }
1638
1639 done:
1640 bgp_stop(p, subcode, data, len);
1641 return p->p.proto_state;
1642 }
1643
1644 static struct proto *
1645 bgp_init(struct proto_config *CF)
1646 {
1647 struct proto *P = proto_new(CF);
1648 struct bgp_proto *p = (struct bgp_proto *) P;
1649 struct bgp_config *cf = (struct bgp_config *) CF;
1650
1651 P->rt_notify = bgp_rt_notify;
1652 P->preexport = bgp_preexport;
1653 P->neigh_notify = bgp_neigh_notify;
1654 P->reload_routes = bgp_reload_routes;
1655 P->feed_begin = bgp_feed_begin;
1656 P->feed_end = bgp_feed_end;
1657 P->rte_better = bgp_rte_better;
1658 P->rte_mergable = bgp_rte_mergable;
1659 P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL;
1660 P->rte_modify = bgp_rte_modify_stale;
1661
1662 p->cf = cf;
1663 p->is_internal = (cf->local_as == cf->remote_as);
1664 p->is_interior = p->is_internal || cf->confederation_member;
1665 p->rs_client = cf->rs_client;
1666 p->rr_client = cf->rr_client;
1667
1668 p->ipv4 = ipa_nonzero(cf->remote_ip) ?
1669 ipa_is_ip4(cf->remote_ip) :
1670 (cf->remote_range && (cf->remote_range->type == NET_IP4));
1671
1672 p->remote_ip = cf->remote_ip;
1673 p->remote_as = cf->remote_as;
1674
1675 /* Hack: We use cf->remote_ip just to pass remote_ip from bgp_spawn() */
1676 if (cf->c.parent)
1677 cf->remote_ip = IPA_NONE;
1678
1679 /* Add all channels */
1680 struct bgp_channel_config *cc;
1681 WALK_LIST(cc, CF->channels)
1682 proto_add_channel(P, &cc->c);
1683
1684 return P;
1685 }
1686
1687 static void
1688 bgp_channel_init(struct channel *C, struct channel_config *CF)
1689 {
1690 struct bgp_channel *c = (void *) C;
1691 struct bgp_channel_config *cf = (void *) CF;
1692
1693 c->cf = cf;
1694 c->afi = cf->afi;
1695 c->desc = cf->desc;
1696
1697 if (cf->igp_table_ip4)
1698 c->igp_table_ip4 = cf->igp_table_ip4->table;
1699
1700 if (cf->igp_table_ip6)
1701 c->igp_table_ip6 = cf->igp_table_ip6->table;
1702 }
1703
1704 static int
1705 bgp_channel_start(struct channel *C)
1706 {
1707 struct bgp_proto *p = (void *) C->proto;
1708 struct bgp_channel *c = (void *) C;
1709 ip_addr src = p->local_ip;
1710
1711 if (c->igp_table_ip4)
1712 rt_lock_table(c->igp_table_ip4);
1713
1714 if (c->igp_table_ip6)
1715 rt_lock_table(c->igp_table_ip6);
1716
1717 c->pool = p->p.pool; // XXXX
1718 bgp_init_bucket_table(c);
1719 bgp_init_prefix_table(c);
1720
1721 if (c->cf->import_table)
1722 channel_setup_in_table(C);
1723
1724 c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0);
1725
1726 c->next_hop_addr = c->cf->next_hop_addr;
1727 c->link_addr = IPA_NONE;
1728 c->packets_to_send = 0;
1729
1730 /* Try to use source address as next hop address */
1731 if (ipa_zero(c->next_hop_addr))
1732 {
1733 if (bgp_channel_is_ipv4(c) && (ipa_is_ip4(src) || c->ext_next_hop))
1734 c->next_hop_addr = src;
1735
1736 if (bgp_channel_is_ipv6(c) && (ipa_is_ip6(src) || c->ext_next_hop))
1737 c->next_hop_addr = src;
1738 }
1739
1740 /* Use preferred addresses associated with interface / source address */
1741 if (ipa_zero(c->next_hop_addr))
1742 {
1743 /* We know the iface for single-hop, we make lookup for multihop */
1744 struct neighbor *nbr = p->neigh ?: neigh_find(&p->p, src, NULL, 0);
1745 struct iface *iface = nbr ? nbr->iface : NULL;
1746
1747 if (bgp_channel_is_ipv4(c) && iface && iface->addr4)
1748 c->next_hop_addr = iface->addr4->ip;
1749
1750 if (bgp_channel_is_ipv6(c) && iface && iface->addr6)
1751 c->next_hop_addr = iface->addr6->ip;
1752 }
1753
1754 /* Exit if no feasible next hop address is found */
1755 if (ipa_zero(c->next_hop_addr))
1756 {
1757 log(L_WARN "%s: Missing next hop address", p->p.name);
1758 return 0;
1759 }
1760
1761 /* Set link-local address for IPv6 single-hop BGP */
1762 if (ipa_is_ip6(c->next_hop_addr) && p->neigh)
1763 {
1764 c->link_addr = p->link_addr;
1765
1766 if (ipa_zero(c->link_addr))
1767 log(L_WARN "%s: Missing link-local address", p->p.name);
1768 }
1769
1770 /* Link local address is already in c->link_addr */
1771 if (ipa_is_link_local(c->next_hop_addr))
1772 c->next_hop_addr = IPA_NONE;
1773
1774 return 0; /* XXXX: Currently undefined */
1775 }
1776
1777 static void
1778 bgp_channel_shutdown(struct channel *C)
1779 {
1780 struct bgp_channel *c = (void *) C;
1781
1782 c->next_hop_addr = IPA_NONE;
1783 c->link_addr = IPA_NONE;
1784 c->packets_to_send = 0;
1785 }
1786
1787 static void
1788 bgp_channel_cleanup(struct channel *C)
1789 {
1790 struct bgp_channel *c = (void *) C;
1791
1792 if (c->igp_table_ip4)
1793 rt_unlock_table(c->igp_table_ip4);
1794
1795 if (c->igp_table_ip6)
1796 rt_unlock_table(c->igp_table_ip6);
1797
1798 c->index = 0;
1799
1800 /* Cleanup rest of bgp_channel starting at pool field */
1801 memset(&(c->pool), 0, sizeof(struct bgp_channel) - OFFSETOF(struct bgp_channel, pool));
1802 }
1803
1804 static inline struct bgp_channel_config *
1805 bgp_find_channel_config(struct bgp_config *cf, u32 afi)
1806 {
1807 struct bgp_channel_config *cc;
1808
1809 WALK_LIST(cc, cf->c.channels)
1810 if (cc->afi == afi)
1811 return cc;
1812
1813 return NULL;
1814 }
1815
1816 struct rtable_config *
1817 bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 type)
1818 {
1819 struct bgp_channel_config *cc2;
1820 struct rtable_config *tab;
1821
1822 /* First, try table connected by the channel */
1823 if (cc->c.table->addr_type == type)
1824 return cc->c.table;
1825
1826 /* Find paired channel with the same SAFI but the other AFI */
1827 u32 afi2 = cc->afi ^ 0x30000;
1828 cc2 = bgp_find_channel_config(cf, afi2);
1829
1830 /* Second, try IGP table configured in the paired channel */
1831 if (cc2 && (tab = (type == NET_IP4) ? cc2->igp_table_ip4 : cc2->igp_table_ip6))
1832 return tab;
1833
1834 /* Third, try table connected by the paired channel */
1835 if (cc2 && (cc2->c.table->addr_type == type))
1836 return cc2->c.table;
1837
1838 /* Last, try default table of given type */
1839 if (tab = cf->c.global->def_tables[type])
1840 return tab;
1841
1842 cf_error("Undefined IGP table");
1843 }
1844
1845
1846 void
1847 bgp_postconfig(struct proto_config *CF)
1848 {
1849 struct bgp_config *cf = (void *) CF;
1850
1851 /* Do not check templates at all */
1852 if (cf->c.class == SYM_TEMPLATE)
1853 return;
1854
1855
1856 /* Handle undefined remote_as, zero should mean unspecified external */
1857 if (!cf->remote_as && (cf->peer_type == BGP_PT_INTERNAL))
1858 cf->remote_as = cf->local_as;
1859
1860 int internal = (cf->local_as == cf->remote_as);
1861 int interior = internal || cf->confederation_member;
1862
1863 /* EBGP direct by default, IBGP multihop by default */
1864 if (cf->multihop < 0)
1865 cf->multihop = internal ? 64 : 0;
1866
1867 /* LLGR mode default based on GR mode */
1868 if (cf->llgr_mode < 0)
1869 cf->llgr_mode = cf->gr_mode ? BGP_LLGR_AWARE : 0;
1870
1871 /* Link check for single-hop BGP by default */
1872 if (cf->check_link < 0)
1873 cf->check_link = !cf->multihop;
1874
1875
1876 if (!cf->local_as)
1877 cf_error("Local AS number must be set");
1878
1879 if (ipa_zero(cf->remote_ip) && !cf->remote_range)
1880 cf_error("Neighbor must be configured");
1881
1882 if (ipa_zero(cf->local_ip) && cf->strict_bind)
1883 cf_error("Local address must be configured for strict bind");
1884
1885 if (!cf->remote_as && !cf->peer_type)
1886 cf_error("Remote AS number (or peer type) must be set");
1887
1888 if ((cf->peer_type == BGP_PT_INTERNAL) && !internal)
1889 cf_error("IBGP cannot have different ASNs");
1890
1891 if ((cf->peer_type == BGP_PT_EXTERNAL) && internal)
1892 cf_error("EBGP cannot have the same ASNs");
1893
1894 if (!cf->iface && (ipa_is_link_local(cf->local_ip) ||
1895 ipa_is_link_local(cf->remote_ip)))
1896 cf_error("Link-local addresses require defined interface");
1897
1898 if (!(cf->capabilities && cf->enable_as4) && (cf->remote_as > 0xFFFF))
1899 cf_error("Neighbor AS number out of range (AS4 not available)");
1900
1901 if (!internal && cf->rr_client)
1902 cf_error("Only internal neighbor can be RR client");
1903
1904 if (internal && cf->rs_client)
1905 cf_error("Only external neighbor can be RS client");
1906
1907 if (!cf->confederation && cf->confederation_member)
1908 cf_error("Confederation ID must be set for member sessions");
1909
1910 if (cf->multihop && (ipa_is_link_local(cf->local_ip) ||
1911 ipa_is_link_local(cf->remote_ip)))
1912 cf_error("Multihop BGP cannot be used with link-local addresses");
1913
1914 if (cf->multihop && cf->iface)
1915 cf_error("Multihop BGP cannot be bound to interface");
1916
1917 if (cf->multihop && cf->check_link)
1918 cf_error("Multihop BGP cannot depend on link state");
1919
1920 if (cf->multihop && cf->bfd && ipa_zero(cf->local_ip))
1921 cf_error("Multihop BGP with BFD requires specified local address");
1922
1923 if (!cf->gr_mode && cf->llgr_mode)
1924 cf_error("Long-lived graceful restart requires basic graceful restart");
1925
1926
1927 struct bgp_channel_config *cc;
1928 WALK_LIST(cc, CF->channels)
1929 {
1930 /* Handle undefined import filter */
1931 if (cc->c.in_filter == FILTER_UNDEF)
1932 if (interior)
1933 cc->c.in_filter = FILTER_ACCEPT;
1934 else
1935 cf_error("EBGP requires explicit import policy");
1936
1937 /* Handle undefined export filter */
1938 if (cc->c.out_filter == FILTER_UNDEF)
1939 if (interior)
1940 cc->c.out_filter = FILTER_REJECT;
1941 else
1942 cf_error("EBGP requires explicit export policy");
1943
1944 /* Disable after error incompatible with restart limit action */
1945 if ((cc->c.in_limit.action == PLA_RESTART) && cf->disable_after_error)
1946 cc->c.in_limit.action = PLA_DISABLE;
1947
1948 /* Different default based on rr_client, rs_client */
1949 if (cc->next_hop_keep == 0xff)
1950 cc->next_hop_keep = cf->rr_client ? NH_IBGP : (cf->rs_client ? NH_ALL : NH_NO);
1951
1952 /* Different default based on rs_client */
1953 if (!cc->missing_lladdr)
1954 cc->missing_lladdr = cf->rs_client ? MLL_IGNORE : MLL_SELF;
1955
1956 /* Different default for gw_mode */
1957 if (!cc->gw_mode)
1958 cc->gw_mode = cf->multihop ? GW_RECURSIVE : GW_DIRECT;
1959
1960 /* Defaults based on proto config */
1961 if (cc->gr_able == 0xff)
1962 cc->gr_able = (cf->gr_mode == BGP_GR_ABLE);
1963
1964 if (cc->llgr_able == 0xff)
1965 cc->llgr_able = (cf->llgr_mode == BGP_LLGR_ABLE);
1966
1967 if (cc->llgr_time == ~0U)
1968 cc->llgr_time = cf->llgr_time;
1969
1970 /* Default values of IGP tables */
1971 if ((cc->gw_mode == GW_RECURSIVE) && !cc->desc->no_igp)
1972 {
1973 if (!cc->igp_table_ip4 && (bgp_cc_is_ipv4(cc) || cc->ext_next_hop))
1974 cc->igp_table_ip4 = bgp_default_igp_table(cf, cc, NET_IP4);
1975
1976 if (!cc->igp_table_ip6 && (bgp_cc_is_ipv6(cc) || cc->ext_next_hop))
1977 cc->igp_table_ip6 = bgp_default_igp_table(cf, cc, NET_IP6);
1978
1979 if (cc->igp_table_ip4 && bgp_cc_is_ipv6(cc) && !cc->ext_next_hop)
1980 cf_error("Mismatched IGP table type");
1981
1982 if (cc->igp_table_ip6 && bgp_cc_is_ipv4(cc) && !cc->ext_next_hop)
1983 cf_error("Mismatched IGP table type");
1984 }
1985
1986 if (cf->multihop && (cc->gw_mode == GW_DIRECT))
1987 cf_error("Multihop BGP cannot use direct gateway mode");
1988
1989 if ((cc->gw_mode == GW_RECURSIVE) && cc->c.table->sorted)
1990 cf_error("BGP in recursive mode prohibits sorted table");
1991
1992 if (cf->deterministic_med && cc->c.table->sorted)
1993 cf_error("BGP with deterministic MED prohibits sorted table");
1994
1995 if (cc->secondary && !cc->c.table->sorted)
1996 cf_error("BGP with secondary option requires sorted table");
1997 }
1998 }
1999
2000 static int
2001 bgp_reconfigure(struct proto *P, struct proto_config *CF)
2002 {
2003 struct bgp_proto *p = (void *) P;
2004 const struct bgp_config *new = (void *) CF;
2005 const struct bgp_config *old = p->cf;
2006
2007 if (proto_get_router_id(CF) != p->local_id)
2008 return 0;
2009
2010 int same = !memcmp(((byte *) old) + sizeof(struct proto_config),
2011 ((byte *) new) + sizeof(struct proto_config),
2012 // password item is last and must be checked separately
2013 OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config))
2014 && ((!old->password && !new->password)
2015 || (old->password && new->password && !strcmp(old->password, new->password)))
2016 && ((!old->remote_range && !new->remote_range)
2017 || (old->remote_range && new->remote_range && net_equal(old->remote_range, new->remote_range)))
2018 && ((!old->dynamic_name && !new->dynamic_name)
2019 || (old->dynamic_name && new->dynamic_name && !strcmp(old->dynamic_name, new->dynamic_name)))
2020 && (old->dynamic_name_digits == new->dynamic_name_digits);
2021
2022 /* FIXME: Move channel reconfiguration to generic protocol code ? */
2023 struct channel *C, *C2;
2024 struct bgp_channel_config *cc;
2025
2026 WALK_LIST(C, p->p.channels)
2027 C->stale = 1;
2028
2029 WALK_LIST(cc, new->c.channels)
2030 {
2031 C = (struct channel *) bgp_find_channel(p, cc->afi);
2032 same = proto_configure_channel(P, &C, &cc->c) && same;
2033
2034 if (C)
2035 C->stale = 0;
2036 }
2037
2038 WALK_LIST_DELSAFE(C, C2, p->p.channels)
2039 if (C->stale)
2040 same = proto_configure_channel(P, &C, NULL) && same;
2041
2042
2043 if (same && (p->start_state > BSS_PREPARE))
2044 bgp_update_bfd(p, new->bfd);
2045
2046 /* We should update our copy of configuration ptr as old configuration will be freed */
2047 if (same)
2048 p->cf = new;
2049
2050 /* Reset name counter */
2051 p->dynamic_name_counter = 0;
2052
2053 return same;
2054 }
2055
2056 #define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? (cf)->igp_table_##sym ->table : NULL )
2057
2058 static int
2059 bgp_channel_reconfigure(struct channel *C, struct channel_config *CC)
2060 {
2061 struct bgp_channel *c = (void *) C;
2062 struct bgp_channel_config *new = (void *) CC;
2063 struct bgp_channel_config *old = c->cf;
2064
2065 if (memcmp(((byte *) old) + sizeof(struct channel_config),
2066 ((byte *) new) + sizeof(struct channel_config),
2067 /* Remaining items must be checked separately */
2068 OFFSETOF(struct bgp_channel_config, rest) - sizeof(struct channel_config)))
2069 return 0;
2070
2071 /* Check change in IGP tables */
2072 if ((IGP_TABLE(old, ip4) != IGP_TABLE(new, ip4)) ||
2073 (IGP_TABLE(old, ip6) != IGP_TABLE(new, ip6)))
2074 return 0;
2075
2076 c->cf = new;
2077 return 1;
2078 }
2079
2080 static void
2081 bgp_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSED)
2082 {
2083 /* Just a shallow copy */
2084 }
2085
2086
2087 /**
2088 * bgp_error - report a protocol error
2089 * @c: connection
2090 * @code: error code (according to the RFC)
2091 * @subcode: error sub-code
2092 * @data: data to be passed in the Notification message
2093 * @len: length of the data
2094 *
2095 * bgp_error() sends a notification packet to tell the other side that a protocol
2096 * error has occurred (including the data considered erroneous if possible) and
2097 * closes the connection.
2098 */
2099 void
2100 bgp_error(struct bgp_conn *c, uint code, uint subcode, byte *data, int len)
2101 {
2102 struct bgp_proto *p = c->bgp;
2103
2104 if (c->state == BS_CLOSE)
2105 return;
2106
2107 bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, ABS(len));
2108 bgp_store_error(p, c, BE_BGP_TX, (code << 16) | subcode);
2109 bgp_conn_enter_close_state(c);
2110
2111 c->notify_code = code;
2112 c->notify_subcode = subcode;
2113 c->notify_data = data;
2114 c->notify_size = (len > 0) ? len : 0;
2115 bgp_schedule_packet(c, NULL, PKT_NOTIFICATION);
2116
2117 if (code != 6)
2118 {
2119 bgp_update_startup_delay(p);
2120 bgp_stop(p, 0, NULL, 0);
2121 }
2122 }
2123
2124 /**
2125 * bgp_store_error - store last error for status report
2126 * @p: BGP instance
2127 * @c: connection
2128 * @class: error class (BE_xxx constants)
2129 * @code: error code (class specific)
2130 *
2131 * bgp_store_error() decides whether given error is interesting enough
2132 * and store that error to last_error variables of @p
2133 */
2134 void
2135 bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code)
2136 {
2137 /* During PS_UP, we ignore errors on secondary connection */
2138 if ((p->p.proto_state == PS_UP) && c && (c != p->conn))
2139 return;
2140
2141 /* During PS_STOP, we ignore any errors, as we want to report
2142 * the error that caused transition to PS_STOP
2143 */
2144 if (p->p.proto_state == PS_STOP)
2145 return;
2146
2147 p->last_error_class = class;
2148 p->last_error_code = code;
2149 }
2150
/* Connection state names, indexed by connection state in bgp_state_dsc() */
static char *bgp_state_names[] = {
  "Idle",
  "Connect",
  "Active",
  "OpenSent",
  "OpenConfirm",
  "Established",
  "Close"
};

/* Prefixes of status messages, indexed by last_error_class */
static char *bgp_err_classes[] = {
  "",
  "Error: ",
  "Socket: ",
  "Received: ",
  "BGP Error: ",
  "Automatic shutdown: ",
  ""
};

/* Messages for the BE_MISC class, indexed by last_error_code */
static char *bgp_misc_errors[] = {
  "",
  "Neighbor lost",
  "Invalid next hop",
  "Kernel MD5 auth failed",
  "No listening socket",
  "Link down",
  "BFD session down",
  "Graceful restart"
};

/* Messages for the BE_AUTO_DOWN class, indexed by last_error_code */
static char *bgp_auto_errors[] = {
  "",
  "Route limit exceeded"
};

/* Per-channel graceful restart state names, indexed by gr_active */
static char *bgp_gr_states[] = {
  "None",
  "Regular",
  "Long-lived"
};
2156
2157 static const char *
2158 bgp_last_errmsg(struct bgp_proto *p)
2159 {
2160 switch (p->last_error_class)
2161 {
2162 case BE_MISC:
2163 return bgp_misc_errors[p->last_error_code];
2164 case BE_SOCKET:
2165 return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code);
2166 case BE_BGP_RX:
2167 case BE_BGP_TX:
2168 return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF);
2169 case BE_AUTO_DOWN:
2170 return bgp_auto_errors[p->last_error_code];
2171 default:
2172 return "";
2173 }
2174 }
2175
2176 static const char *
2177 bgp_state_dsc(struct bgp_proto *p)
2178 {
2179 if (p->p.proto_state == PS_DOWN)
2180 return "Down";
2181
2182 int state = MAX(p->incoming_conn.state, p->outgoing_conn.state);
2183 if ((state == BS_IDLE) && (p->start_state >= BSS_CONNECT) && p->passive)
2184 return "Passive";
2185
2186 return bgp_state_names[state];
2187 }
2188
2189 static void
2190 bgp_get_status(struct proto *P, byte *buf)
2191 {
2192 struct bgp_proto *p = (struct bgp_proto *) P;
2193
2194 const char *err1 = bgp_err_classes[p->last_error_class];
2195 const char *err2 = bgp_last_errmsg(p);
2196
2197 if (P->proto_state == PS_DOWN)
2198 bsprintf(buf, "%s%s", err1, err2);
2199 else
2200 bsprintf(buf, "%-14s%s%s", bgp_state_dsc(p), err1, err2);
2201 }
2202
2203 static void
2204 bgp_show_afis(int code, char *s, u32 *afis, uint count)
2205 {
2206 buffer b;
2207 LOG_BUFFER_INIT(b);
2208
2209 buffer_puts(&b, s);
2210
2211 for (u32 *af = afis; af < (afis + count); af++)
2212 {
2213 const struct bgp_af_desc *desc = bgp_get_af_desc(*af);
2214 if (desc)
2215 buffer_print(&b, " %s", desc->name);
2216 else
2217 buffer_print(&b, " <%u/%u>", BGP_AFI(*af), BGP_SAFI(*af));
2218 }
2219
2220 if (b.pos == b.end)
2221 strcpy(b.end - 32, " ... <too long>");
2222
2223 cli_msg(code, b.start);
2224 }
2225
2226 static void
2227 bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
2228 {
2229 struct bgp_af_caps *ac;
2230 uint any_mp_bgp = 0;
2231 uint any_gr_able = 0;
2232 uint any_add_path = 0;
2233 uint any_ext_next_hop = 0;
2234 uint any_llgr_able = 0;
2235 u32 *afl1 = alloca(caps->af_count * sizeof(u32));
2236 u32 *afl2 = alloca(caps->af_count * sizeof(u32));
2237 uint afn1, afn2;
2238
2239 WALK_AF_CAPS(caps, ac)
2240 {
2241 any_mp_bgp |= ac->ready;
2242 any_gr_able |= ac->gr_able;
2243 any_add_path |= ac->add_path;
2244 any_ext_next_hop |= ac->ext_next_hop;
2245 any_llgr_able |= ac->llgr_able;
2246 }
2247
2248 if (any_mp_bgp)
2249 {
2250 cli_msg(-1006, " Multiprotocol");
2251
2252 afn1 = 0;
2253 WALK_AF_CAPS(caps, ac)
2254 if (ac->ready)
2255 afl1[afn1++] = ac->afi;
2256
2257 bgp_show_afis(-1006, " AF announced:", afl1, afn1);
2258 }
2259
2260 if (caps->route_refresh)
2261 cli_msg(-1006, " Route refresh");
2262
2263 if (any_ext_next_hop)
2264 {
2265 cli_msg(-1006, " Extended next hop");
2266
2267 afn1 = 0;
2268 WALK_AF_CAPS(caps, ac)
2269 if (ac->ext_next_hop)
2270 afl1[afn1++] = ac->afi;
2271
2272 bgp_show_afis(-1006, " IPv6 nexthop:", afl1, afn1);
2273 }
2274
2275 if (caps->ext_messages)
2276 cli_msg(-1006, " Extended message");
2277
2278 if (caps->gr_aware)
2279 cli_msg(-1006, " Graceful restart");
2280
2281 if (any_gr_able)
2282 {
2283 /* Continues from gr_aware */
2284 cli_msg(-1006, " Restart time: %u", caps->gr_time);
2285 if (caps->gr_flags & BGP_GRF_RESTART)
2286 cli_msg(-1006, " Restart recovery");
2287
2288 afn1 = afn2 = 0;
2289 WALK_AF_CAPS(caps, ac)
2290 {
2291 if (ac->gr_able)
2292 afl1[afn1++] = ac->afi;
2293
2294 if (ac->gr_af_flags & BGP_GRF_FORWARDING)
2295 afl2[afn2++] = ac->afi;
2296 }
2297
2298 bgp_show_afis(-1006, " AF supported:", afl1, afn1);
2299 bgp_show_afis(-1006, " AF preserved:", afl2, afn2);
2300 }
2301
2302 if (caps->as4_support)
2303 cli_msg(-1006, " 4-octet AS numbers");
2304
2305 if (any_add_path)
2306 {
2307 cli_msg(-1006, " ADD-PATH");
2308
2309 afn1 = afn2 = 0;
2310 WALK_AF_CAPS(caps, ac)
2311 {
2312 if (ac->add_path & BGP_ADD_PATH_RX)
2313 afl1[afn1++] = ac->afi;
2314
2315 if (ac->add_path & BGP_ADD_PATH_TX)
2316 afl2[afn2++] = ac->afi;
2317 }
2318
2319 bgp_show_afis(-1006, " RX:", afl1, afn1);
2320 bgp_show_afis(-1006, " TX:", afl2, afn2);
2321 }
2322
2323 if (caps->enhanced_refresh)
2324 cli_msg(-1006, " Enhanced refresh");
2325
2326 if (caps->llgr_aware)
2327 cli_msg(-1006, " Long-lived graceful restart");
2328
2329 if (any_llgr_able)
2330 {
2331 u32 stale_time = 0;
2332
2333 afn1 = afn2 = 0;
2334 WALK_AF_CAPS(caps, ac)
2335 {
2336 stale_time = MAX(stale_time, ac->llgr_time);
2337
2338 if (ac->llgr_able && ac->llgr_time)
2339 afl1[afn1++] = ac->afi;
2340
2341 if (ac->llgr_flags & BGP_GRF_FORWARDING)
2342 afl2[afn2++] = ac->afi;
2343 }
2344
2345 /* Continues from llgr_aware */
2346 cli_msg(-1006, " LL stale time: %u", stale_time);
2347
2348 bgp_show_afis(-1006, " AF supported:", afl1, afn1);
2349 bgp_show_afis(-1006, " AF preserved:", afl2, afn2);
2350 }
2351 }
2352
2353 static void
2354 bgp_show_proto_info(struct proto *P)
2355 {
2356 struct bgp_proto *p = (struct bgp_proto *) P;
2357
2358 cli_msg(-1006, " BGP state: %s", bgp_state_dsc(p));
2359
2360 if (bgp_is_dynamic(p) && p->cf->remote_range)
2361 cli_msg(-1006, " Neighbor range: %N", p->cf->remote_range);
2362 else
2363 cli_msg(-1006, " Neighbor address: %I%J", p->remote_ip, p->cf->iface);
2364
2365 cli_msg(-1006, " Neighbor AS: %u", p->remote_as);
2366 cli_msg(-1006, " Local AS: %u", p->cf->local_as);
2367
2368 if (p->gr_active_num)
2369 cli_msg(-1006, " Neighbor graceful restart active");
2370
2371 if (P->proto_state == PS_START)
2372 {
2373 struct bgp_conn *oc = &p->outgoing_conn;
2374
2375 if ((p->start_state < BSS_CONNECT) &&
2376 (tm_active(p->startup_timer)))
2377 cli_msg(-1006, " Error wait: %t/%u",
2378 tm_remains(p->startup_timer), p->startup_delay);
2379
2380 if ((oc->state == BS_ACTIVE) &&
2381 (tm_active(oc->connect_timer)))
2382 cli_msg(-1006, " Connect delay: %t/%u",
2383 tm_remains(oc->connect_timer), p->cf->connect_delay_time);
2384
2385 if (p->gr_active_num && tm_active(p->gr_timer))
2386 cli_msg(-1006, " Restart timer: %t/-",
2387 tm_remains(p->gr_timer));
2388 }
2389 else if (P->proto_state == PS_UP)
2390 {
2391 cli_msg(-1006, " Neighbor ID: %R", p->remote_id);
2392 cli_msg(-1006, " Local capabilities");
2393 bgp_show_capabilities(p, p->conn->local_caps);
2394 cli_msg(-1006, " Neighbor capabilities");
2395 bgp_show_capabilities(p, p->conn->remote_caps);
2396 cli_msg(-1006, " Session: %s%s%s%s%s",
2397 p->is_internal ? "internal" : "external",
2398 p->cf->multihop ? " multihop" : "",
2399 p->rr_client ? " route-reflector" : "",
2400 p->rs_client ? " route-server" : "",
2401 p->as4_session ? " AS4" : "");
2402 cli_msg(-1006, " Source address: %I", p->local_ip);
2403 cli_msg(-1006, " Hold timer: %t/%u",
2404 tm_remains(p->conn->hold_timer), p->conn->hold_time);
2405 cli_msg(-1006, " Keepalive timer: %t/%u",
2406 tm_remains(p->conn->keepalive_timer), p->conn->keepalive_time);
2407 }
2408
2409 if ((p->last_error_class != BE_NONE) &&
2410 (p->last_error_class != BE_MAN_DOWN))
2411 {
2412 const char *err1 = bgp_err_classes[p->last_error_class];
2413 const char *err2 = bgp_last_errmsg(p);
2414 cli_msg(-1006, " Last error: %s%s", err1, err2);
2415 }
2416
2417 {
2418 struct bgp_channel *c;
2419 WALK_LIST(c, p->p.channels)
2420 {
2421 channel_show_info(&c->c);
2422
2423 if (p->gr_active_num)
2424 cli_msg(-1006, " Neighbor GR: %s", bgp_gr_states[c->gr_active]);
2425
2426 if (c->stale_timer && tm_active(c->stale_timer))
2427 cli_msg(-1006, " LL stale timer: %t/-", tm_remains(c->stale_timer));
2428
2429 if (c->c.channel_state == CS_UP)
2430 {
2431 if (ipa_zero(c->link_addr))
2432 cli_msg(-1006, " BGP Next hop: %I", c->next_hop_addr);
2433 else
2434 cli_msg(-1006, " BGP Next hop: %I %I", c->next_hop_addr, c->link_addr);
2435 }
2436
2437 if (c->igp_table_ip4)
2438 cli_msg(-1006, " IGP IPv4 table: %s", c->igp_table_ip4->name);
2439
2440 if (c->igp_table_ip6)
2441 cli_msg(-1006, " IGP IPv6 table: %s", c->igp_table_ip6->name);
2442 }
2443 }
2444 }
2445
2446 struct channel_class channel_bgp = {
2447 .channel_size = sizeof(struct bgp_channel),
2448 .config_size = sizeof(struct bgp_channel_config),
2449 .init = bgp_channel_init,
2450 .start = bgp_channel_start,
2451 .shutdown = bgp_channel_shutdown,
2452 .cleanup = bgp_channel_cleanup,
2453 .reconfigure = bgp_channel_reconfigure,
2454 };
2455
2456 struct protocol proto_bgp = {
2457 .name = "BGP",
2458 .template = "bgp%d",
2459 .class = PROTOCOL_BGP,
2460 .preference = DEF_PREF_BGP,
2461 .channel_mask = NB_IP | NB_VPN | NB_FLOW,
2462 .proto_size = sizeof(struct bgp_proto),
2463 .config_size = sizeof(struct bgp_config),
2464 .postconfig = bgp_postconfig,
2465 .init = bgp_init,
2466 .start = bgp_start,
2467 .shutdown = bgp_shutdown,
2468 .reconfigure = bgp_reconfigure,
2469 .copy_config = bgp_copy_config,
2470 .get_status = bgp_get_status,
2471 .get_attr = bgp_get_attr,
2472 .get_route_info = bgp_get_route_info,
2473 .show_proto_info = bgp_show_proto_info
2474 };