]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/bgp.c
053016dd2a73683abac65ad32d30a488983dea0b
[thirdparty/bird.git] / proto / bgp / bgp.c
1 /*
2 * BIRD -- The Border Gateway Protocol
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6 * (c) 2008--2016 CZ.NIC z.s.p.o.
7 *
8 * Can be freely distributed and used under the terms of the GNU GPL.
9 */
10
11 /**
12 * DOC: Border Gateway Protocol
13 *
14 * The BGP protocol is implemented in three parts: |bgp.c| which takes care of
15 * the connection and most of the interface with BIRD core, |packets.c| handling
16 * both incoming and outgoing BGP packets and |attrs.c| containing functions for
17 * manipulation with BGP attribute lists.
18 *
19 * As opposed to the other existing routing daemons, BIRD has a sophisticated
20 * core architecture which is able to keep all the information needed by BGP in
21 * the primary routing table, therefore no complex data structures like a
22 * central BGP table are needed. This increases memory footprint of a BGP router
23 * with many connections, but not too much and, which is more important, it
24 * makes BGP much easier to implement.
25 *
26 * Each instance of BGP (corresponding to a single BGP peer) is described by a
27 * &bgp_proto structure to which are attached individual connections represented
28 * by &bgp_connection (usually, there exists only one connection, but during BGP
29 * session setup, there can be more of them). The connections are handled
30 * according to the BGP state machine defined in the RFC with all the timers and
31 * all the parameters configurable.
32 *
33 * In incoming direction, we listen on the connection's socket and each time we
34 * receive some input, we pass it to bgp_rx(). It decodes packet headers and the
35 * markers and passes complete packets to bgp_rx_packet() which distributes the
36 * packet according to its type.
37 *
38 * In outgoing direction, we gather all the routing updates and sort them to
39 * buckets (&bgp_bucket) according to their attributes (we keep a hash table for
40 * fast comparison of &rta's and a &fib which helps us to find if we already
41 * have another route for the same destination queued for sending, so that we
42 * can replace it with the new one immediately instead of sending both
43 * updates). There also exists a special bucket holding all the route
44 * withdrawals which cannot be queued anywhere else as they don't have any
45 * attributes. If we have any packet to send (due to either new routes or the
46 * connection tracking code wanting to send an Open, Keepalive or Notification
47 * message), we call bgp_schedule_packet() which sets the corresponding bit in a
48 * @packets_to_send bit field in &bgp_conn and as soon as the transmit socket
49 * buffer becomes empty, we call bgp_fire_tx(). It inspects state of all the
50 * packet type bits and calls the corresponding bgp_create_xx() functions,
51 * eventually rescheduling the same packet type if we have more data of the same
52 * type to send.
53 *
54 * The processing of attributes consists of two functions: bgp_decode_attrs()
55 * for checking of the attribute blocks and translating them to the language of
56 * BIRD's extended attributes and bgp_encode_attrs() which does the
57 * converse. Both functions are built around a @bgp_attr_table array describing
58 * all important characteristics of all known attributes. Unknown transitive
59 * attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
60 *
61 * BGP protocol implements graceful restart in both restarting (local restart)
62 * and receiving (neighbor restart) roles. The first is handled mostly by the
63 * graceful restart code in the nest, BGP protocol just handles capabilities,
64 * sets @gr_wait and locks graceful restart until end-of-RIB mark is received.
65 * The second is implemented by internal restart of the BGP state to %BS_IDLE
66 * and protocol state to %PS_START, but keeping the protocol up from the core
67 * point of view and therefore maintaining received routes. Routing table
68 * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
69 * stale routes after reestablishment of BGP session during graceful restart.
70 *
71 * Supported standards:
72 * RFC 4271 - Border Gateway Protocol 4 (BGP)
73 * RFC 1997 - BGP Communities Attribute
74 * RFC 2385 - Protection of BGP Sessions via TCP MD5 Signature
75 * RFC 2545 - Use of BGP Multiprotocol Extensions for IPv6
76 * RFC 2918 - Route Refresh Capability
77 * RFC 3107 - Carrying Label Information in BGP
78 * RFC 4360 - BGP Extended Communities Attribute
79 * RFC 4364 - BGP/MPLS IPv4 Virtual Private Networks
80 * RFC 4456 - BGP Route Reflection
81 * RFC 4486 - Subcodes for BGP Cease Notification Message
82 * RFC 4659 - BGP/MPLS IPv6 Virtual Private Networks
83 * RFC 4724 - Graceful Restart Mechanism for BGP
84 * RFC 4760 - Multiprotocol extensions for BGP
85 * RFC 4798 - Connecting IPv6 Islands over IPv4 MPLS
86 * RFC 5065 - AS confederations for BGP
87 * RFC 5082 - Generalized TTL Security Mechanism
88 * RFC 5492 - Capabilities Advertisement with BGP
89 * RFC 5549 - Advertising IPv4 NLRI with an IPv6 Next Hop
90 * RFC 5575 - Dissemination of Flow Specification Rules
91 * RFC 5668 - 4-Octet AS Specific BGP Extended Community
92 * RFC 6286 - AS-Wide Unique BGP Identifier
93 * RFC 6608 - Subcodes for BGP Finite State Machine Error
94 * RFC 6793 - BGP Support for 4-Octet AS Numbers
95 * RFC 7311 - Accumulated IGP Metric Attribute for BGP
96 * RFC 7313 - Enhanced Route Refresh Capability for BGP
97 * RFC 7606 - Revised Error Handling for BGP UPDATE Messages
98 * RFC 7911 - Advertisement of Multiple Paths in BGP
99 * RFC 7947 - Internet Exchange BGP Route Server
100 * RFC 8092 - BGP Large Communities Attribute
101 * RFC 8203 - BGP Administrative Shutdown Communication
102 * RFC 8212 - Default EBGP Route Propagation Behavior without Policies
103 * RFC 8654 - Extended Message Support for BGP
104 * draft-ietf-idr-ext-opt-param-07
105 * draft-uttaro-idr-bgp-persistence-04
106 */
107
108 #undef LOCAL_DEBUG
109
110 #include <stdlib.h>
111
112 #include "nest/bird.h"
113 #include "nest/iface.h"
114 #include "nest/protocol.h"
115 #include "nest/route.h"
116 #include "nest/cli.h"
117 #include "nest/locks.h"
118 #include "conf/conf.h"
119 #include "filter/filter.h"
120 #include "lib/socket.h"
121 #include "lib/resource.h"
122 #include "lib/string.h"
123
124 #include "bgp.h"
125
126
127 struct linpool *bgp_linpool; /* Global temporary pool */
128 struct linpool *bgp_linpool2; /* Global temporary pool for bgp_rt_notify() */
129 static list bgp_sockets; /* Global list of listening sockets */
130
131
132 static void bgp_connect(struct bgp_proto *p);
133 static void bgp_active(struct bgp_proto *p);
134 static void bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn);
135 static void bgp_setup_sk(struct bgp_conn *conn, sock *s);
136 static void bgp_send_open(struct bgp_conn *conn);
137 static void bgp_update_bfd(struct bgp_proto *p, int use_bfd);
138
139 static int bgp_incoming_connection(sock *sk, uint dummy UNUSED);
140 static void bgp_listen_sock_err(sock *sk UNUSED, int err);
141
142 /**
143 * bgp_open - open a BGP instance
144 * @p: BGP instance
145 *
146 * This function allocates and configures shared BGP resources, mainly listening
147 * sockets. Should be called as the last step during initialization (when lock
148 * is acquired and neighbor is ready). When error, caller should change state to
149 * PS_DOWN and return immediately.
150 */
151 static int
152 bgp_open(struct bgp_proto *p)
153 {
154 struct bgp_socket *bs = NULL;
155 struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL;
156 ip_addr addr = p->cf->strict_bind ? p->cf->local_ip :
157 (p->ipv4 ? IPA_NONE4 : IPA_NONE6);
158 uint port = p->cf->local_port;
159
160 /* FIXME: Add some global init? */
161 if (!bgp_linpool)
162 init_list(&bgp_sockets);
163
164 /* We assume that cf->iface is defined iff cf->local_ip is link-local */
165
166 WALK_LIST(bs, bgp_sockets)
167 if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->sport == port) &&
168 (bs->sk->iface == ifa) && (bs->sk->vrf == p->p.vrf))
169 {
170 bs->uc++;
171 p->sock = bs;
172 return 0;
173 }
174
175 sock *sk = sk_new(proto_pool);
176 sk->type = SK_TCP_PASSIVE;
177 sk->ttl = 255;
178 sk->saddr = addr;
179 sk->sport = port;
180 sk->iface = ifa;
181 sk->vrf = p->p.vrf;
182 sk->flags = 0;
183 sk->tos = IP_PREC_INTERNET_CONTROL;
184 sk->rbsize = BGP_RX_BUFFER_SIZE;
185 sk->tbsize = BGP_TX_BUFFER_SIZE;
186 sk->rx_hook = bgp_incoming_connection;
187 sk->err_hook = bgp_listen_sock_err;
188
189 if (sk_open(sk) < 0)
190 goto err;
191
192 bs = mb_allocz(proto_pool, sizeof(struct bgp_socket));
193 bs->sk = sk;
194 bs->uc = 1;
195 p->sock = bs;
196 sk->data = bs;
197
198 add_tail(&bgp_sockets, &bs->n);
199
200 if (!bgp_linpool)
201 {
202 bgp_linpool = lp_new_default(proto_pool);
203 bgp_linpool2 = lp_new_default(proto_pool);
204 }
205
206 return 0;
207
208 err:
209 sk_log_error(sk, p->p.name);
210 log(L_ERR "%s: Cannot open listening socket", p->p.name);
211 rfree(sk);
212 return -1;
213 }
214
215 /**
216 * bgp_close - close a BGP instance
217 * @p: BGP instance
218 *
219 * This function frees and deconfigures shared BGP resources.
220 */
221 static void
222 bgp_close(struct bgp_proto *p)
223 {
224 struct bgp_socket *bs = p->sock;
225
226 ASSERT(bs && bs->uc);
227
228 if (--bs->uc)
229 return;
230
231 rfree(bs->sk);
232 rem_node(&bs->n);
233 mb_free(bs);
234
235 if (!EMPTY_LIST(bgp_sockets))
236 return;
237
238 rfree(bgp_linpool);
239 bgp_linpool = NULL;
240
241 rfree(bgp_linpool2);
242 bgp_linpool2 = NULL;
243 }
244
245 static inline int
246 bgp_setup_auth(struct bgp_proto *p, int enable)
247 {
248 if (p->cf->password)
249 {
250 int rv = sk_set_md5_auth(p->sock->sk,
251 p->cf->local_ip, p->cf->remote_ip, p->cf->iface,
252 enable ? p->cf->password : NULL, p->cf->setkey);
253
254 if (rv < 0)
255 sk_log_error(p->sock->sk, p->p.name);
256
257 return rv;
258 }
259 else
260 return 0;
261 }
262
263 static inline struct bgp_channel *
264 bgp_find_channel(struct bgp_proto *p, u32 afi)
265 {
266 struct bgp_channel *c;
267 WALK_LIST(c, p->p.channels)
268 if (c->afi == afi)
269 return c;
270
271 return NULL;
272 }
273
274 static void
275 bgp_startup(struct bgp_proto *p)
276 {
277 BGP_TRACE(D_EVENTS, "Started");
278 p->start_state = BSS_CONNECT;
279
280 if (!p->passive)
281 bgp_active(p);
282
283 if (p->postponed_sk)
284 {
285 /* Apply postponed incoming connection */
286 bgp_setup_conn(p, &p->incoming_conn);
287 bgp_setup_sk(&p->incoming_conn, p->postponed_sk);
288 bgp_send_open(&p->incoming_conn);
289 p->postponed_sk = NULL;
290 }
291 }
292
293 static void
294 bgp_startup_timeout(timer *t)
295 {
296 bgp_startup(t->data);
297 }
298
299
300 static void
301 bgp_initiate(struct bgp_proto *p)
302 {
303 int err_val;
304
305 if (bgp_open(p) < 0)
306 { err_val = BEM_NO_SOCKET; goto err1; }
307
308 if (bgp_setup_auth(p, 1) < 0)
309 { err_val = BEM_INVALID_MD5; goto err2; }
310
311 if (p->cf->bfd)
312 bgp_update_bfd(p, p->cf->bfd);
313
314 if (p->startup_delay)
315 {
316 p->start_state = BSS_DELAY;
317 BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay);
318 bgp_start_timer(p->startup_timer, p->startup_delay);
319 }
320 else
321 bgp_startup(p);
322
323 return;
324
325 err2:
326 bgp_close(p);
327 err1:
328 p->p.disabled = 1;
329 bgp_store_error(p, NULL, BE_MISC, err_val);
330 proto_notify_state(&p->p, PS_DOWN);
331
332 return;
333 }
334
335 /**
336 * bgp_start_timer - start a BGP timer
337 * @t: timer
338 * @value: time (in seconds) to fire (0 to disable the timer)
339 *
340 * This function calls tm_start() on @t with time @value and the amount of
341 * randomization suggested by the BGP standard. Please use it for all BGP
342 * timers.
343 */
344 void
345 bgp_start_timer(timer *t, uint value)
346 {
347 if (value)
348 {
349 /* The randomization procedure is specified in RFC 4271 section 10 */
350 btime time = value S;
351 btime randomize = random() % ((time / 4) + 1);
352 tm_start(t, time - randomize);
353 }
354 else
355 tm_stop(t);
356 }
357
358 /**
359 * bgp_close_conn - close a BGP connection
360 * @conn: connection to close
361 *
362 * This function takes a connection described by the &bgp_conn structure, closes
363 * its socket and frees all resources associated with it.
364 */
365 void
366 bgp_close_conn(struct bgp_conn *conn)
367 {
368 // struct bgp_proto *p = conn->bgp;
369
370 DBG("BGP: Closing connection\n");
371 conn->packets_to_send = 0;
372 conn->channels_to_send = 0;
373 rfree(conn->connect_timer);
374 conn->connect_timer = NULL;
375 rfree(conn->keepalive_timer);
376 conn->keepalive_timer = NULL;
377 rfree(conn->hold_timer);
378 conn->hold_timer = NULL;
379 rfree(conn->tx_ev);
380 conn->tx_ev = NULL;
381 rfree(conn->sk);
382 conn->sk = NULL;
383
384 mb_free(conn->local_caps);
385 conn->local_caps = NULL;
386 mb_free(conn->remote_caps);
387 conn->remote_caps = NULL;
388 }
389
390
391 /**
392 * bgp_update_startup_delay - update a startup delay
393 * @p: BGP instance
394 *
395 * This function updates a startup delay that is used to postpone next BGP
396 * connect. It also handles disable_after_error and might stop BGP instance
397 * when error happened and disable_after_error is on.
398 *
399 * It should be called when BGP protocol error happened.
400 */
401 void
402 bgp_update_startup_delay(struct bgp_proto *p)
403 {
404 const struct bgp_config *cf = p->cf;
405
406 DBG("BGP: Updating startup delay\n");
407
408 if (p->last_proto_error && ((current_time() - p->last_proto_error) >= cf->error_amnesia_time S))
409 p->startup_delay = 0;
410
411 p->last_proto_error = current_time();
412
413 if (cf->disable_after_error)
414 {
415 p->startup_delay = 0;
416 p->p.disabled = 1;
417 return;
418 }
419
420 if (!p->startup_delay)
421 p->startup_delay = cf->error_delay_time_min;
422 else
423 p->startup_delay = MIN(2 * p->startup_delay, cf->error_delay_time_max);
424 }
425
426 static void
427 bgp_graceful_close_conn(struct bgp_conn *conn, int subcode, byte *data, uint len)
428 {
429 switch (conn->state)
430 {
431 case BS_IDLE:
432 case BS_CLOSE:
433 return;
434
435 case BS_CONNECT:
436 case BS_ACTIVE:
437 bgp_conn_enter_idle_state(conn);
438 return;
439
440 case BS_OPENSENT:
441 case BS_OPENCONFIRM:
442 case BS_ESTABLISHED:
443 if (subcode < 0)
444 {
445 bgp_conn_enter_close_state(conn);
446 bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
447 }
448 else
449 bgp_error(conn, 6, subcode, data, len);
450 return;
451
452 default:
453 bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
454 }
455 }
456
457 static void
458 bgp_down(struct bgp_proto *p)
459 {
460 if (p->start_state > BSS_PREPARE)
461 {
462 bgp_setup_auth(p, 0);
463 bgp_close(p);
464 }
465
466 BGP_TRACE(D_EVENTS, "Down");
467 proto_notify_state(&p->p, PS_DOWN);
468 }
469
470 static void
471 bgp_decision(void *vp)
472 {
473 struct bgp_proto *p = vp;
474
475 DBG("BGP: Decision start\n");
476 if ((p->p.proto_state == PS_START) &&
477 (p->outgoing_conn.state == BS_IDLE) &&
478 (p->incoming_conn.state != BS_OPENCONFIRM) &&
479 !p->passive)
480 bgp_active(p);
481
482 if ((p->p.proto_state == PS_STOP) &&
483 (p->outgoing_conn.state == BS_IDLE) &&
484 (p->incoming_conn.state == BS_IDLE))
485 bgp_down(p);
486 }
487
488 static struct bgp_proto *
489 bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip)
490 {
491 struct symbol *sym;
492 char fmt[SYM_MAX_LEN];
493
494 bsprintf(fmt, "%s%%0%dd", pp->cf->dynamic_name, pp->cf->dynamic_name_digits);
495
496 /* This is hack, we would like to share config, but we need to copy it now */
497 new_config = config;
498 cfg_mem = config->mem;
499 conf_this_scope = config->root_scope;
500 sym = cf_default_name(fmt, &(pp->dynamic_name_counter));
501 proto_clone_config(sym, pp->p.cf);
502 new_config = NULL;
503 cfg_mem = NULL;
504
505 /* Just pass remote_ip to bgp_init() */
506 ((struct bgp_config *) sym->proto)->remote_ip = remote_ip;
507
508 return (void *) proto_spawn(sym->proto, 0);
509 }
510
511 void
512 bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len)
513 {
514 proto_notify_state(&p->p, PS_STOP);
515 bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len);
516 bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len);
517 ev_schedule(p->event);
518 }
519
520 static inline void
521 bgp_conn_set_state(struct bgp_conn *conn, uint new_state)
522 {
523 if (conn->bgp->p.mrtdump & MD_STATES)
524 bgp_dump_state_change(conn, conn->state, new_state);
525
526 conn->state = new_state;
527 }
528
529 void
530 bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
531 {
532 /* Really, most of the work is done in bgp_rx_open(). */
533 bgp_conn_set_state(conn, BS_OPENCONFIRM);
534 }
535
536 static const struct bgp_af_caps dummy_af_caps = { };
537 static const struct bgp_af_caps basic_af_caps = { .ready = 1 };
538
539 void
540 bgp_conn_enter_established_state(struct bgp_conn *conn)
541 {
542 struct bgp_proto *p = conn->bgp;
543 struct bgp_caps *local = conn->local_caps;
544 struct bgp_caps *peer = conn->remote_caps;
545 struct bgp_channel *c;
546
547 BGP_TRACE(D_EVENTS, "BGP session established");
548
549 /* For multi-hop BGP sessions */
550 if (ipa_zero(p->local_ip))
551 p->local_ip = conn->sk->saddr;
552
553 /* For promiscuous sessions */
554 if (!p->remote_as)
555 p->remote_as = conn->received_as;
556
557 /* In case of LLv6 is not valid during BGP start */
558 if (ipa_zero(p->link_addr) && p->neigh && p->neigh->iface && p->neigh->iface->llv6)
559 p->link_addr = p->neigh->iface->llv6->ip;
560
561 conn->sk->fast_rx = 0;
562
563 p->conn = conn;
564 p->last_error_class = 0;
565 p->last_error_code = 0;
566
567 p->as4_session = conn->as4_session;
568
569 p->route_refresh = peer->route_refresh;
570 p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh;
571
572 /* Whether we may handle possible GR/LLGR of peer (it has some AF GR-able) */
573 p->gr_ready = p->llgr_ready = 0; /* Updated later */
574
575 /* Whether peer is ready to handle our GR recovery */
576 int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART);
577
578 if (p->gr_active_num)
579 tm_stop(p->gr_timer);
580
581 /* Number of active channels */
582 int num = 0;
583
584 /* Summary state of ADD_PATH RX for active channels */
585 uint summary_add_path_rx = 0;
586
587 WALK_LIST(c, p->p.channels)
588 {
589 const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi);
590 const struct bgp_af_caps *rem = bgp_find_af_caps(peer, c->afi);
591
592 /* Use default if capabilities were not announced */
593 if (!local->length && (c->afi == BGP_AF_IPV4))
594 loc = &basic_af_caps;
595
596 if (!peer->length && (c->afi == BGP_AF_IPV4))
597 rem = &basic_af_caps;
598
599 /* Ignore AFIs that were not announced in multiprotocol capability */
600 if (!loc || !loc->ready)
601 loc = &dummy_af_caps;
602
603 if (!rem || !rem->ready)
604 rem = &dummy_af_caps;
605
606 int active = loc->ready && rem->ready;
607 c->c.disabled = !active;
608 c->c.reloadable = p->route_refresh || c->cf->import_table;
609
610 c->index = active ? num++ : 0;
611
612 c->feed_state = BFS_NONE;
613 c->load_state = BFS_NONE;
614
615 /* Channels where peer may do GR */
616 uint gr_ready = active && local->gr_aware && rem->gr_able;
617 uint llgr_ready = active && local->llgr_aware && rem->llgr_able;
618
619 c->gr_ready = gr_ready || llgr_ready;
620 p->gr_ready = p->gr_ready || c->gr_ready;
621 p->llgr_ready = p->llgr_ready || llgr_ready;
622
623 /* Remember last LLGR stale time */
624 c->stale_time = local->llgr_aware ? rem->llgr_time : 0;
625
626 /* Channels not able to recover gracefully */
627 if (p->p.gr_recovery && (!active || !peer_gr_ready))
628 channel_graceful_restart_unlock(&c->c);
629
630 /* Channels waiting for local convergence */
631 if (p->p.gr_recovery && loc->gr_able && peer_gr_ready)
632 c->c.gr_wait = 1;
633
634 /* Channels where regular graceful restart failed */
635 if ((c->gr_active == BGP_GRS_ACTIVE) &&
636 !(active && rem->gr_able && (rem->gr_af_flags & BGP_GRF_FORWARDING)))
637 bgp_graceful_restart_done(c);
638
639 /* Channels where regular long-lived restart failed */
640 if ((c->gr_active == BGP_GRS_LLGR) &&
641 !(active && rem->llgr_able && (rem->gr_af_flags & BGP_LLGRF_FORWARDING)))
642 bgp_graceful_restart_done(c);
643
644 /* GR capability implies that neighbor will send End-of-RIB */
645 if (peer->gr_aware)
646 c->load_state = BFS_LOADING;
647
648 c->ext_next_hop = c->cf->ext_next_hop && (bgp_channel_is_ipv6(c) || rem->ext_next_hop);
649 c->add_path_rx = (loc->add_path & BGP_ADD_PATH_RX) && (rem->add_path & BGP_ADD_PATH_TX);
650 c->add_path_tx = (loc->add_path & BGP_ADD_PATH_TX) && (rem->add_path & BGP_ADD_PATH_RX);
651
652 if (active)
653 summary_add_path_rx |= !c->add_path_rx ? 1 : 2;
654
655 /* Update RA mode */
656 if (c->add_path_tx)
657 c->c.ra_mode = RA_ANY;
658 else if (c->cf->secondary)
659 c->c.ra_mode = RA_ACCEPTED;
660 else
661 c->c.ra_mode = RA_OPTIMAL;
662 }
663
664 p->afi_map = mb_alloc(p->p.pool, num * sizeof(u32));
665 p->channel_map = mb_alloc(p->p.pool, num * sizeof(void *));
666 p->channel_count = num;
667 p->summary_add_path_rx = summary_add_path_rx;
668
669 WALK_LIST(c, p->p.channels)
670 {
671 if (c->c.disabled)
672 continue;
673
674 p->afi_map[c->index] = c->afi;
675 p->channel_map[c->index] = c;
676 }
677
678 /* proto_notify_state() will likely call bgp_feed_begin(), setting c->feed_state */
679
680 bgp_conn_set_state(conn, BS_ESTABLISHED);
681 proto_notify_state(&p->p, PS_UP);
682 }
683
684 static void
685 bgp_conn_leave_established_state(struct bgp_proto *p)
686 {
687 BGP_TRACE(D_EVENTS, "BGP session closed");
688 p->conn = NULL;
689
690 if (p->p.proto_state == PS_UP)
691 bgp_stop(p, 0, NULL, 0);
692 }
693
694 void
695 bgp_conn_enter_close_state(struct bgp_conn *conn)
696 {
697 struct bgp_proto *p = conn->bgp;
698 int os = conn->state;
699
700 bgp_conn_set_state(conn, BS_CLOSE);
701 tm_stop(conn->keepalive_timer);
702 conn->sk->rx_hook = NULL;
703
704 /* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */
705 bgp_start_timer(conn->hold_timer, 10);
706
707 if (os == BS_ESTABLISHED)
708 bgp_conn_leave_established_state(p);
709 }
710
711 void
712 bgp_conn_enter_idle_state(struct bgp_conn *conn)
713 {
714 struct bgp_proto *p = conn->bgp;
715 int os = conn->state;
716
717 bgp_close_conn(conn);
718 bgp_conn_set_state(conn, BS_IDLE);
719 ev_schedule(p->event);
720
721 if (os == BS_ESTABLISHED)
722 bgp_conn_leave_established_state(p);
723 }
724
725 /**
726 * bgp_handle_graceful_restart - handle detected BGP graceful restart
727 * @p: BGP instance
728 *
729 * This function is called when a BGP graceful restart of the neighbor is
730 * detected (when the TCP connection fails or when a new TCP connection
731 * appears). The function activates processing of the restart - starts routing
732 * table refresh cycle and activates BGP restart timer. The protocol state goes
733 * back to %PS_START, but changing BGP state back to %BS_IDLE is left for the
734 * caller.
735 */
736 void
737 bgp_handle_graceful_restart(struct bgp_proto *p)
738 {
739 ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);
740
741 BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s",
742 p->gr_active_num ? " - already pending" : "");
743
744 p->gr_active_num = 0;
745
746 struct bgp_channel *c;
747 WALK_LIST(c, p->p.channels)
748 {
749 /* FIXME: perhaps check for channel state instead of disabled flag? */
750 if (c->c.disabled)
751 continue;
752
753 if (c->gr_ready)
754 {
755 p->gr_active_num++;
756
757 switch (c->gr_active)
758 {
759 case BGP_GRS_NONE:
760 c->gr_active = BGP_GRS_ACTIVE;
761 rt_refresh_begin(c->c.table, &c->c);
762 break;
763
764 case BGP_GRS_ACTIVE:
765 rt_refresh_end(c->c.table, &c->c);
766 rt_refresh_begin(c->c.table, &c->c);
767 break;
768
769 case BGP_GRS_LLGR:
770 rt_refresh_begin(c->c.table, &c->c);
771 rt_modify_stale(c->c.table, &c->c);
772 break;
773 }
774 }
775 else
776 {
777 /* Just flush the routes */
778 rt_refresh_begin(c->c.table, &c->c);
779 rt_refresh_end(c->c.table, &c->c);
780 }
781
782 /* Reset bucket and prefix tables */
783 bgp_free_bucket_table(c);
784 bgp_free_prefix_table(c);
785 bgp_init_bucket_table(c);
786 bgp_init_prefix_table(c);
787 c->packets_to_send = 0;
788 }
789
790 /* p->gr_ready -> at least one active channel is c->gr_ready */
791 ASSERT(p->gr_active_num > 0);
792
793 proto_notify_state(&p->p, PS_START);
794 tm_start(p->gr_timer, p->conn->remote_caps->gr_time S);
795 }
796
797 /**
798 * bgp_graceful_restart_done - finish active BGP graceful restart
799 * @c: BGP channel
800 *
801 * This function is called when the active BGP graceful restart of the neighbor
802 * should be finished for channel @c - either successfully (the neighbor sends
803 * all paths and reports end-of-RIB for given AFI/SAFI on the new session) or
804 * unsuccessfully (the neighbor does not support BGP graceful restart on the new
805 * session). The function ends the routing table refresh cycle.
806 */
807 void
808 bgp_graceful_restart_done(struct bgp_channel *c)
809 {
810 struct bgp_proto *p = (void *) c->c.proto;
811
812 ASSERT(c->gr_active);
813 c->gr_active = 0;
814 p->gr_active_num--;
815
816 if (!p->gr_active_num)
817 BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");
818
819 tm_stop(c->stale_timer);
820 rt_refresh_end(c->c.table, &c->c);
821 }
822
823 /**
824 * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer'
825 * @t: timer
826 *
827 * This function is a timeout hook for @gr_timer, implementing BGP restart time
828 * limit for reestablishment of the BGP session after the graceful restart. When
829 * fired, we just proceed with the usual protocol restart.
830 */
831
832 static void
833 bgp_graceful_restart_timeout(timer *t)
834 {
835 struct bgp_proto *p = t->data;
836
837 BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
838
839 if (p->llgr_ready)
840 {
841 struct bgp_channel *c;
842 WALK_LIST(c, p->p.channels)
843 {
844 /* Channel is not in GR and is already flushed */
845 if (!c->gr_active)
846 continue;
847
848 /* Channel is already in LLGR from past restart */
849 if (c->gr_active == BGP_GRS_LLGR)
850 continue;
851
852 /* Channel is in GR, but does not support LLGR -> stop GR */
853 if (!c->stale_time)
854 {
855 bgp_graceful_restart_done(c);
856 continue;
857 }
858
859 /* Channel is in GR, and supports LLGR -> start LLGR */
860 c->gr_active = BGP_GRS_LLGR;
861 tm_start(c->stale_timer, c->stale_time S);
862 rt_modify_stale(c->c.table, &c->c);
863 }
864 }
865 else
866 bgp_stop(p, 0, NULL, 0);
867 }
868
869 static void
870 bgp_long_lived_stale_timeout(timer *t)
871 {
872 struct bgp_channel *c = t->data;
873 struct bgp_proto *p = (void *) c->c.proto;
874
875 BGP_TRACE(D_EVENTS, "Long-lived stale timeout");
876
877 bgp_graceful_restart_done(c);
878 }
879
880
881 /**
882 * bgp_refresh_begin - start incoming enhanced route refresh sequence
883 * @c: BGP channel
884 *
885 * This function is called when an incoming enhanced route refresh sequence is
886 * started by the neighbor, demarcated by the BoRR packet. The function updates
887 * the load state and starts the routing table refresh cycle. Note that graceful
888 * restart also uses routing table refresh cycle, but RFC 7313 and load states
889 * ensure that these two sequences do not overlap.
890 */
891 void
892 bgp_refresh_begin(struct bgp_channel *c)
893 {
894 struct bgp_proto *p = (void *) c->c.proto;
895
896 if (c->load_state == BFS_LOADING)
897 { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; }
898
899 c->load_state = BFS_REFRESHING;
900 rt_refresh_begin(c->c.table, &c->c);
901
902 if (c->c.in_table)
903 rt_refresh_begin(c->c.in_table, &c->c);
904 }
905
906 /**
907 * bgp_refresh_end - finish incoming enhanced route refresh sequence
908 * @c: BGP channel
909 *
910 * This function is called when an incoming enhanced route refresh sequence is
911 * finished by the neighbor, demarcated by the EoRR packet. The function updates
912 * the load state and ends the routing table refresh cycle. Routes not received
913 * during the sequence are removed by the nest.
914 */
915 void
916 bgp_refresh_end(struct bgp_channel *c)
917 {
918 struct bgp_proto *p = (void *) c->c.proto;
919
920 if (c->load_state != BFS_REFRESHING)
921 { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; }
922
923 c->load_state = BFS_NONE;
924 rt_refresh_end(c->c.table, &c->c);
925
926 if (c->c.in_table)
927 rt_prune_sync(c->c.in_table, 0);
928 }
929
930
931 static void
932 bgp_send_open(struct bgp_conn *conn)
933 {
934 DBG("BGP: Sending open\n");
935 conn->sk->rx_hook = bgp_rx;
936 conn->sk->tx_hook = bgp_tx;
937 tm_stop(conn->connect_timer);
938 bgp_prepare_capabilities(conn);
939 bgp_schedule_packet(conn, NULL, PKT_OPEN);
940 bgp_conn_set_state(conn, BS_OPENSENT);
941 bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time);
942 }
943
944 static void
945 bgp_connected(sock *sk)
946 {
947 struct bgp_conn *conn = sk->data;
948 struct bgp_proto *p = conn->bgp;
949
950 BGP_TRACE(D_EVENTS, "Connected");
951 bgp_send_open(conn);
952 }
953
954 static void
955 bgp_connect_timeout(timer *t)
956 {
957 struct bgp_conn *conn = t->data;
958 struct bgp_proto *p = conn->bgp;
959
960 DBG("BGP: connect_timeout\n");
961 if (p->p.proto_state == PS_START)
962 {
963 bgp_close_conn(conn);
964 bgp_connect(p);
965 }
966 else
967 bgp_conn_enter_idle_state(conn);
968 }
969
970 static void
971 bgp_sock_err(sock *sk, int err)
972 {
973 struct bgp_conn *conn = sk->data;
974 struct bgp_proto *p = conn->bgp;
975
976 /*
977 * This error hook may be called either asynchronously from main
978 * loop, or synchronously from sk_send(). But sk_send() is called
979 * only from bgp_tx() and bgp_kick_tx(), which are both called
980 * asynchronously from main loop. Moreover, they end if err hook is
981 * called. Therefore, we could suppose that it is always called
982 * asynchronously.
983 */
984
985 bgp_store_error(p, conn, BE_SOCKET, err);
986
987 if (err)
988 BGP_TRACE(D_EVENTS, "Connection lost (%M)", err);
989 else
990 BGP_TRACE(D_EVENTS, "Connection closed");
991
992 if ((conn->state == BS_ESTABLISHED) && p->gr_ready)
993 bgp_handle_graceful_restart(p);
994
995 bgp_conn_enter_idle_state(conn);
996 }
997
998 static void
999 bgp_hold_timeout(timer *t)
1000 {
1001 struct bgp_conn *conn = t->data;
1002 struct bgp_proto *p = conn->bgp;
1003
1004 DBG("BGP: Hold timeout\n");
1005
1006 /* We are already closing the connection - just do hangup */
1007 if (conn->state == BS_CLOSE)
1008 {
1009 BGP_TRACE(D_EVENTS, "Connection stalled");
1010 bgp_conn_enter_idle_state(conn);
1011 return;
1012 }
1013
1014 /* If there is something in input queue, we are probably congested
1015 and perhaps just not processed BGP packets in time. */
1016
1017 if (sk_rx_ready(conn->sk) > 0)
1018 bgp_start_timer(conn->hold_timer, 10);
1019 else if ((conn->state == BS_ESTABLISHED) && p->llgr_ready)
1020 {
1021 BGP_TRACE(D_EVENTS, "Hold timer expired");
1022 bgp_handle_graceful_restart(p);
1023 bgp_conn_enter_idle_state(conn);
1024 }
1025 else
1026 bgp_error(conn, 4, 0, NULL, 0);
1027 }
1028
1029 static void
1030 bgp_keepalive_timeout(timer *t)
1031 {
1032 struct bgp_conn *conn = t->data;
1033
1034 DBG("BGP: Keepalive timer\n");
1035 bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
1036
1037 /* Kick TX a bit faster */
1038 if (ev_active(conn->tx_ev))
1039 ev_run(conn->tx_ev);
1040 }
1041
1042 static void
1043 bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
1044 {
1045 conn->sk = NULL;
1046 conn->bgp = p;
1047
1048 conn->packets_to_send = 0;
1049 conn->channels_to_send = 0;
1050 conn->last_channel = 0;
1051 conn->last_channel_count = 0;
1052
1053 conn->connect_timer = tm_new_init(p->p.pool, bgp_connect_timeout, conn, 0, 0);
1054 conn->hold_timer = tm_new_init(p->p.pool, bgp_hold_timeout, conn, 0, 0);
1055 conn->keepalive_timer = tm_new_init(p->p.pool, bgp_keepalive_timeout, conn, 0, 0);
1056
1057 conn->tx_ev = ev_new_init(p->p.pool, bgp_kick_tx, conn);
1058 }
1059
1060 static void
1061 bgp_setup_sk(struct bgp_conn *conn, sock *s)
1062 {
1063 s->data = conn;
1064 s->err_hook = bgp_sock_err;
1065 s->fast_rx = 1;
1066 conn->sk = s;
1067 }
1068
1069 static void
1070 bgp_active(struct bgp_proto *p)
1071 {
1072 int delay = MAX(1, p->cf->connect_delay_time);
1073 struct bgp_conn *conn = &p->outgoing_conn;
1074
1075 BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
1076 bgp_setup_conn(p, conn);
1077 bgp_conn_set_state(conn, BS_ACTIVE);
1078 bgp_start_timer(conn->connect_timer, delay);
1079 }
1080
1081 /**
1082 * bgp_connect - initiate an outgoing connection
1083 * @p: BGP instance
1084 *
1085 * The bgp_connect() function creates a new &bgp_conn and initiates
1086 * a TCP connection to the peer. The rest of connection setup is governed
1087 * by the BGP state machine as described in the standard.
1088 */
1089 static void
1090 bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing connection */
1091 {
1092 struct bgp_conn *conn = &p->outgoing_conn;
1093 int hops = p->cf->multihop ? : 1;
1094
1095 DBG("BGP: Connecting\n");
1096 sock *s = sk_new(p->p.pool);
1097 s->type = SK_TCP_ACTIVE;
1098 s->saddr = p->local_ip;
1099 s->daddr = p->remote_ip;
1100 s->dport = p->cf->remote_port;
1101 s->iface = p->neigh ? p->neigh->iface : NULL;
1102 s->vrf = p->p.vrf;
1103 s->ttl = p->cf->ttl_security ? 255 : hops;
1104 s->rbsize = p->cf->enable_extended_messages ? BGP_RX_BUFFER_EXT_SIZE : BGP_RX_BUFFER_SIZE;
1105 s->tbsize = p->cf->enable_extended_messages ? BGP_TX_BUFFER_EXT_SIZE : BGP_TX_BUFFER_SIZE;
1106 s->tos = IP_PREC_INTERNET_CONTROL;
1107 s->password = p->cf->password;
1108 s->tx_hook = bgp_connected;
1109 BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J",
1110 s->daddr, ipa_is_link_local(s->daddr) ? p->cf->iface : NULL,
1111 s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL);
1112 bgp_setup_conn(p, conn);
1113 bgp_setup_sk(conn, s);
1114 bgp_conn_set_state(conn, BS_CONNECT);
1115
1116 if (sk_open(s) < 0)
1117 goto err;
1118
1119 /* Set minimal receive TTL if needed */
1120 if (p->cf->ttl_security)
1121 if (sk_set_min_ttl(s, 256 - hops) < 0)
1122 goto err;
1123
1124 DBG("BGP: Waiting for connect success\n");
1125 bgp_start_timer(conn->connect_timer, p->cf->connect_retry_time);
1126 return;
1127
1128 err:
1129 sk_log_error(s, p->p.name);
1130 bgp_sock_err(s, 0);
1131 return;
1132 }
1133
1134 static inline int bgp_is_dynamic(struct bgp_proto *p)
1135 { return ipa_zero(p->remote_ip); }
1136
1137 /**
1138 * bgp_find_proto - find existing proto for incoming connection
1139 * @sk: TCP socket
1140 *
1141 */
1142 static struct bgp_proto *
1143 bgp_find_proto(sock *sk)
1144 {
1145 struct bgp_proto *best = NULL;
1146 struct bgp_proto *p;
1147
1148 /* sk->iface is valid only if src or dst address is link-local */
1149 int link = ipa_is_link_local(sk->saddr) || ipa_is_link_local(sk->daddr);
1150
1151 WALK_LIST(p, proto_list)
1152 if ((p->p.proto == &proto_bgp) &&
1153 (ipa_equal(p->remote_ip, sk->daddr) || bgp_is_dynamic(p)) &&
1154 (!p->cf->remote_range || ipa_in_netX(sk->daddr, p->cf->remote_range)) &&
1155 (p->p.vrf == sk->vrf) &&
1156 (p->cf->local_port == sk->sport) &&
1157 (!link || (p->cf->iface == sk->iface)) &&
1158 (ipa_zero(p->cf->local_ip) || ipa_equal(p->cf->local_ip, sk->saddr)))
1159 {
1160 best = p;
1161
1162 if (!bgp_is_dynamic(p))
1163 break;
1164 }
1165
1166 return best;
1167 }
1168
1169 /**
1170 * bgp_incoming_connection - handle an incoming connection
1171 * @sk: TCP socket
1172 * @dummy: unused
1173 *
1174 * This function serves as a socket hook for accepting of new BGP
1175 * connections. It searches a BGP instance corresponding to the peer
1176 * which has connected and if such an instance exists, it creates a
1177 * &bgp_conn structure, attaches it to the instance and either sends
1178 * an Open message or (if there already is an active connection) it
1179 * closes the new connection by sending a Notification message.
1180 */
1181 static int
1182 bgp_incoming_connection(sock *sk, uint dummy UNUSED)
1183 {
1184 struct bgp_proto *p;
1185 int acc, hops;
1186
1187 DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);
1188 p = bgp_find_proto(sk);
1189 if (!p)
1190 {
1191 log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
1192 sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport);
1193 rfree(sk);
1194 return 0;
1195 }
1196
1197 /*
1198 * BIRD should keep multiple incoming connections in OpenSent state (for
1199 * details RFC 4271 8.2.1 par 3), but it keeps just one. Duplicate incoming
1200 * connections are rejected istead. The exception is the case where an
1201 * incoming connection triggers a graceful restart.
1202 */
1203
1204 acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
1205 (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
1206
1207 if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
1208 {
1209 bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
1210 bgp_handle_graceful_restart(p);
1211 bgp_conn_enter_idle_state(p->conn);
1212 acc = 1;
1213
1214 /* There might be separate incoming connection in OpenSent state */
1215 if (p->incoming_conn.state > BS_ACTIVE)
1216 bgp_close_conn(&p->incoming_conn);
1217 }
1218
1219 BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
1220 sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL,
1221 sk->dport, acc ? "accepted" : "rejected");
1222
1223 if (!acc)
1224 {
1225 rfree(sk);
1226 return 0;
1227 }
1228
1229 hops = p->cf->multihop ? : 1;
1230
1231 if (sk_set_ttl(sk, p->cf->ttl_security ? 255 : hops) < 0)
1232 goto err;
1233
1234 if (p->cf->ttl_security)
1235 if (sk_set_min_ttl(sk, 256 - hops) < 0)
1236 goto err;
1237
1238 if (p->cf->enable_extended_messages)
1239 {
1240 sk->rbsize = BGP_RX_BUFFER_EXT_SIZE;
1241 sk->tbsize = BGP_TX_BUFFER_EXT_SIZE;
1242 sk_reallocate(sk);
1243 }
1244
1245 /* For dynamic BGP, spawn new instance and postpone the socket */
1246 if (bgp_is_dynamic(p))
1247 {
1248 p = bgp_spawn(p, sk->daddr);
1249 p->postponed_sk = sk;
1250 rmove(sk, p->p.pool);
1251 return 0;
1252 }
1253
1254 rmove(sk, p->p.pool);
1255 bgp_setup_conn(p, &p->incoming_conn);
1256 bgp_setup_sk(&p->incoming_conn, sk);
1257 bgp_send_open(&p->incoming_conn);
1258 return 0;
1259
1260 err:
1261 sk_log_error(sk, p->p.name);
1262 log(L_ERR "%s: Incoming connection aborted", p->p.name);
1263 rfree(sk);
1264 return 0;
1265 }
1266
1267 static void
1268 bgp_listen_sock_err(sock *sk UNUSED, int err)
1269 {
1270 if (err == ECONNABORTED)
1271 log(L_WARN "BGP: Incoming connection aborted");
1272 else
1273 log(L_ERR "BGP: Error on listening socket: %M", err);
1274 }
1275
1276 static void
1277 bgp_start_neighbor(struct bgp_proto *p)
1278 {
1279 /* Called only for single-hop BGP sessions */
1280
1281 if (ipa_zero(p->local_ip))
1282 p->local_ip = p->neigh->ifa->ip;
1283
1284 if (ipa_is_link_local(p->local_ip))
1285 p->link_addr = p->local_ip;
1286 else if (p->neigh->iface->llv6)
1287 p->link_addr = p->neigh->iface->llv6->ip;
1288
1289 bgp_initiate(p);
1290 }
1291
1292 static void
1293 bgp_neigh_notify(neighbor *n)
1294 {
1295 struct bgp_proto *p = (struct bgp_proto *) n->proto;
1296 int ps = p->p.proto_state;
1297
1298 if (n != p->neigh)
1299 return;
1300
1301 if ((ps == PS_DOWN) || (ps == PS_STOP))
1302 return;
1303
1304 int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE);
1305
1306 if (n->scope <= 0)
1307 {
1308 if (!prepare)
1309 {
1310 BGP_TRACE(D_EVENTS, "Neighbor lost");
1311 bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
1312 /* Perhaps also run bgp_update_startup_delay(p)? */
1313 bgp_stop(p, 0, NULL, 0);
1314 }
1315 }
1316 else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
1317 {
1318 if (!prepare)
1319 {
1320 BGP_TRACE(D_EVENTS, "Link down");
1321 bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN);
1322 if (ps == PS_UP)
1323 bgp_update_startup_delay(p);
1324 bgp_stop(p, 0, NULL, 0);
1325 }
1326 }
1327 else
1328 {
1329 if (prepare)
1330 {
1331 BGP_TRACE(D_EVENTS, "Neighbor ready");
1332 bgp_start_neighbor(p);
1333 }
1334 }
1335 }
1336
1337 static void
1338 bgp_bfd_notify(struct bfd_request *req)
1339 {
1340 struct bgp_proto *p = req->data;
1341 int ps = p->p.proto_state;
1342
1343 if (req->down && ((ps == PS_START) || (ps == PS_UP)))
1344 {
1345 BGP_TRACE(D_EVENTS, "BFD session down");
1346 bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
1347
1348 if (p->cf->bfd == BGP_BFD_GRACEFUL)
1349 {
1350 /* Trigger graceful restart */
1351 if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
1352 bgp_handle_graceful_restart(p);
1353
1354 if (p->incoming_conn.state > BS_IDLE)
1355 bgp_conn_enter_idle_state(&p->incoming_conn);
1356
1357 if (p->outgoing_conn.state > BS_IDLE)
1358 bgp_conn_enter_idle_state(&p->outgoing_conn);
1359 }
1360 else
1361 {
1362 /* Trigger session down */
1363 if (ps == PS_UP)
1364 bgp_update_startup_delay(p);
1365 bgp_stop(p, 0, NULL, 0);
1366 }
1367 }
1368 }
1369
1370 static void
1371 bgp_update_bfd(struct bgp_proto *p, int use_bfd)
1372 {
1373 if (use_bfd && !p->bfd_req && !bgp_is_dynamic(p))
1374 p->bfd_req = bfd_request_session(p->p.pool, p->remote_ip, p->local_ip,
1375 p->cf->multihop ? NULL : p->neigh->iface,
1376 p->p.vrf, bgp_bfd_notify, p);
1377
1378 if (!use_bfd && p->bfd_req)
1379 {
1380 rfree(p->bfd_req);
1381 p->bfd_req = NULL;
1382 }
1383 }
1384
1385 static void
1386 bgp_reload_routes(struct channel *C)
1387 {
1388 struct bgp_proto *p = (void *) C->proto;
1389 struct bgp_channel *c = (void *) C;
1390
1391 ASSERT(p->conn && (p->route_refresh || c->c.in_table));
1392
1393 if (c->c.in_table)
1394 channel_schedule_reload(C);
1395 else
1396 bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH);
1397 }
1398
1399 static void
1400 bgp_feed_begin(struct channel *C, int initial)
1401 {
1402 struct bgp_proto *p = (void *) C->proto;
1403 struct bgp_channel *c = (void *) C;
1404
1405 /* This should not happen */
1406 if (!p->conn)
1407 return;
1408
1409 if (initial && p->cf->gr_mode)
1410 c->feed_state = BFS_LOADING;
1411
1412 /* It is refeed and both sides support enhanced route refresh */
1413 if (!initial && p->enhanced_refresh)
1414 {
1415 /* BoRR must not be sent before End-of-RIB */
1416 if (c->feed_state == BFS_LOADING || c->feed_state == BFS_LOADED)
1417 return;
1418
1419 c->feed_state = BFS_REFRESHING;
1420 bgp_schedule_packet(p->conn, c, PKT_BEGIN_REFRESH);
1421 }
1422 }
1423
1424 static void
1425 bgp_feed_end(struct channel *C)
1426 {
1427 struct bgp_proto *p = (void *) C->proto;
1428 struct bgp_channel *c = (void *) C;
1429
1430 /* This should not happen */
1431 if (!p->conn)
1432 return;
1433
1434 /* Non-demarcated feed ended, nothing to do */
1435 if (c->feed_state == BFS_NONE)
1436 return;
1437
1438 /* Schedule End-of-RIB packet */
1439 if (c->feed_state == BFS_LOADING)
1440 c->feed_state = BFS_LOADED;
1441
1442 /* Schedule EoRR packet */
1443 if (c->feed_state == BFS_REFRESHING)
1444 c->feed_state = BFS_REFRESHED;
1445
1446 /* Kick TX hook */
1447 bgp_schedule_packet(p->conn, c, PKT_UPDATE);
1448 }
1449
1450
1451 static void
1452 bgp_start_locked(struct object_lock *lock)
1453 {
1454 struct bgp_proto *p = lock->data;
1455 const struct bgp_config *cf = p->cf;
1456
1457 if (p->p.proto_state != PS_START)
1458 {
1459 DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
1460 return;
1461 }
1462
1463 DBG("BGP: Got lock\n");
1464
1465 if (cf->multihop || bgp_is_dynamic(p))
1466 {
1467 /* Multi-hop sessions do not use neighbor entries */
1468 bgp_initiate(p);
1469 return;
1470 }
1471
1472 neighbor *n = neigh_find(&p->p, p->remote_ip, cf->iface, NEF_STICKY);
1473 if (!n)
1474 {
1475 log(L_ERR "%s: Invalid remote address %I%J", p->p.name, p->remote_ip, cf->iface);
1476 /* As we do not start yet, we can just disable protocol */
1477 p->p.disabled = 1;
1478 bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
1479 proto_notify_state(&p->p, PS_DOWN);
1480 return;
1481 }
1482
1483 p->neigh = n;
1484
1485 if (n->scope <= 0)
1486 BGP_TRACE(D_EVENTS, "Waiting for %I%J to become my neighbor", p->remote_ip, cf->iface);
1487 else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
1488 BGP_TRACE(D_EVENTS, "Waiting for link on %s", n->iface->name);
1489 else
1490 bgp_start_neighbor(p);
1491 }
1492
1493 static int
1494 bgp_start(struct proto *P)
1495 {
1496 struct bgp_proto *p = (struct bgp_proto *) P;
1497 const struct bgp_config *cf = p->cf;
1498
1499 p->local_ip = cf->local_ip;
1500 p->local_as = cf->local_as;
1501 p->remote_as = cf->remote_as;
1502 p->public_as = cf->local_as;
1503
1504 /* For dynamic BGP childs, remote_ip is already set */
1505 if (ipa_nonzero(cf->remote_ip))
1506 p->remote_ip = cf->remote_ip;
1507
1508 /* Confederation ID is used for truly external peers */
1509 if (p->cf->confederation && !p->is_interior)
1510 p->public_as = cf->confederation;
1511
1512 p->passive = cf->passive || bgp_is_dynamic(p);
1513
1514 p->start_state = BSS_PREPARE;
1515 p->outgoing_conn.state = BS_IDLE;
1516 p->incoming_conn.state = BS_IDLE;
1517 p->neigh = NULL;
1518 p->bfd_req = NULL;
1519 p->postponed_sk = NULL;
1520 p->gr_ready = 0;
1521 p->gr_active_num = 0;
1522
1523 p->event = ev_new_init(p->p.pool, bgp_decision, p);
1524 p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0);
1525 p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0);
1526
1527 p->local_id = proto_get_router_id(P->cf);
1528 if (p->rr_client)
1529 p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;
1530
1531 p->remote_id = 0;
1532 p->link_addr = IPA_NONE;
1533
1534 /* Lock all channels when in GR recovery mode */
1535 if (p->p.gr_recovery && p->cf->gr_mode)
1536 {
1537 struct bgp_channel *c;
1538 WALK_LIST(c, p->p.channels)
1539 channel_graceful_restart_lock(&c->c);
1540 }
1541
1542 /*
1543 * Before attempting to create the connection, we need to lock the port,
1544 * so that we are the only instance attempting to talk with that neighbor.
1545 */
1546 struct object_lock *lock;
1547 lock = p->lock = olock_new(P->pool);
1548 lock->addr = p->remote_ip;
1549 lock->port = p->cf->remote_port;
1550 lock->iface = p->cf->iface;
1551 lock->vrf = p->cf->iface ? NULL : p->p.vrf;
1552 lock->type = OBJLOCK_TCP;
1553 lock->hook = bgp_start_locked;
1554 lock->data = p;
1555
1556 /* For dynamic BGP, we use inst 1 to avoid collisions with regular BGP */
1557 if (bgp_is_dynamic(p))
1558 {
1559 lock->addr = net_prefix(p->cf->remote_range);
1560 lock->inst = 1;
1561 }
1562
1563 olock_acquire(lock);
1564
1565 return PS_START;
1566 }
1567
1568 extern int proto_restart;
1569
1570 static int
1571 bgp_shutdown(struct proto *P)
1572 {
1573 struct bgp_proto *p = (struct bgp_proto *) P;
1574 int subcode = 0;
1575
1576 char *message = NULL;
1577 byte *data = NULL;
1578 uint len = 0;
1579
1580 BGP_TRACE(D_EVENTS, "Shutdown requested");
1581
1582 switch (P->down_code)
1583 {
1584 case PDC_CF_REMOVE:
1585 case PDC_CF_DISABLE:
1586 subcode = 3; // Errcode 6, 3 - peer de-configured
1587 break;
1588
1589 case PDC_CF_RESTART:
1590 subcode = 6; // Errcode 6, 6 - other configuration change
1591 break;
1592
1593 case PDC_CMD_DISABLE:
1594 case PDC_CMD_SHUTDOWN:
1595 shutdown:
1596 subcode = 2; // Errcode 6, 2 - administrative shutdown
1597 message = P->message;
1598 break;
1599
1600 case PDC_CMD_RESTART:
1601 subcode = 4; // Errcode 6, 4 - administrative reset
1602 message = P->message;
1603 break;
1604
1605 case PDC_CMD_GR_DOWN:
1606 if ((p->cf->gr_mode != BGP_GR_ABLE) &&
1607 (p->cf->llgr_mode != BGP_LLGR_ABLE))
1608 goto shutdown;
1609
1610 subcode = -1; // Do not send NOTIFICATION, just close the connection
1611 break;
1612
1613 case PDC_RX_LIMIT_HIT:
1614 case PDC_IN_LIMIT_HIT:
1615 subcode = 1; // Errcode 6, 1 - max number of prefixes reached
1616 /* log message for compatibility */
1617 log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name);
1618 goto limit;
1619
1620 case PDC_OUT_LIMIT_HIT:
1621 subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown
1622
1623 limit:
1624 bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED);
1625 if (proto_restart)
1626 bgp_update_startup_delay(p);
1627 else
1628 p->startup_delay = 0;
1629 goto done;
1630 }
1631
1632 bgp_store_error(p, NULL, BE_MAN_DOWN, 0);
1633 p->startup_delay = 0;
1634
1635 /* RFC 8203 - shutdown communication */
1636 if (message)
1637 {
1638 uint msg_len = strlen(message);
1639 msg_len = MIN(msg_len, 255);
1640
1641 /* Buffer will be freed automatically by protocol shutdown */
1642 data = mb_alloc(p->p.pool, msg_len + 1);
1643 len = msg_len + 1;
1644
1645 data[0] = msg_len;
1646 memcpy(data+1, message, msg_len);
1647 }
1648
1649 done:
1650 bgp_stop(p, subcode, data, len);
1651 return p->p.proto_state;
1652 }
1653
1654 static struct proto *
1655 bgp_init(struct proto_config *CF)
1656 {
1657 struct proto *P = proto_new(CF);
1658 struct bgp_proto *p = (struct bgp_proto *) P;
1659 struct bgp_config *cf = (struct bgp_config *) CF;
1660
1661 P->rt_notify = bgp_rt_notify;
1662 P->preexport = bgp_preexport;
1663 P->neigh_notify = bgp_neigh_notify;
1664 P->reload_routes = bgp_reload_routes;
1665 P->feed_begin = bgp_feed_begin;
1666 P->feed_end = bgp_feed_end;
1667 P->rte_better = bgp_rte_better;
1668 P->rte_mergable = bgp_rte_mergable;
1669 P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL;
1670 P->rte_modify = bgp_rte_modify_stale;
1671
1672 p->cf = cf;
1673 p->is_internal = (cf->local_as == cf->remote_as);
1674 p->is_interior = p->is_internal || cf->confederation_member;
1675 p->rs_client = cf->rs_client;
1676 p->rr_client = cf->rr_client;
1677
1678 p->ipv4 = ipa_nonzero(cf->remote_ip) ?
1679 ipa_is_ip4(cf->remote_ip) :
1680 (cf->remote_range && (cf->remote_range->type == NET_IP4));
1681
1682 p->remote_ip = cf->remote_ip;
1683 p->remote_as = cf->remote_as;
1684
1685 /* Hack: We use cf->remote_ip just to pass remote_ip from bgp_spawn() */
1686 if (cf->c.parent)
1687 cf->remote_ip = IPA_NONE;
1688
1689 /* Add all channels */
1690 struct bgp_channel_config *cc;
1691 WALK_LIST(cc, CF->channels)
1692 proto_add_channel(P, &cc->c);
1693
1694 return P;
1695 }
1696
1697 static void
1698 bgp_channel_init(struct channel *C, struct channel_config *CF)
1699 {
1700 struct bgp_channel *c = (void *) C;
1701 struct bgp_channel_config *cf = (void *) CF;
1702
1703 c->cf = cf;
1704 c->afi = cf->afi;
1705 c->desc = cf->desc;
1706
1707 if (cf->igp_table_ip4)
1708 c->igp_table_ip4 = cf->igp_table_ip4->table;
1709
1710 if (cf->igp_table_ip6)
1711 c->igp_table_ip6 = cf->igp_table_ip6->table;
1712 }
1713
1714 static int
1715 bgp_channel_start(struct channel *C)
1716 {
1717 struct bgp_proto *p = (void *) C->proto;
1718 struct bgp_channel *c = (void *) C;
1719 ip_addr src = p->local_ip;
1720
1721 if (c->igp_table_ip4)
1722 rt_lock_table(c->igp_table_ip4);
1723
1724 if (c->igp_table_ip6)
1725 rt_lock_table(c->igp_table_ip6);
1726
1727 c->pool = p->p.pool; // XXXX
1728 bgp_init_bucket_table(c);
1729 bgp_init_prefix_table(c);
1730
1731 if (c->cf->import_table)
1732 channel_setup_in_table(C);
1733
1734 if (c->cf->export_table)
1735 channel_setup_out_table(C);
1736
1737 c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0);
1738
1739 c->next_hop_addr = c->cf->next_hop_addr;
1740 c->link_addr = IPA_NONE;
1741 c->packets_to_send = 0;
1742
1743 /* Try to use source address as next hop address */
1744 if (ipa_zero(c->next_hop_addr))
1745 {
1746 if (bgp_channel_is_ipv4(c) && (ipa_is_ip4(src) || c->ext_next_hop))
1747 c->next_hop_addr = src;
1748
1749 if (bgp_channel_is_ipv6(c) && (ipa_is_ip6(src) || c->ext_next_hop))
1750 c->next_hop_addr = src;
1751 }
1752
1753 /* Use preferred addresses associated with interface / source address */
1754 if (ipa_zero(c->next_hop_addr))
1755 {
1756 /* We know the iface for single-hop, we make lookup for multihop */
1757 struct neighbor *nbr = p->neigh ?: neigh_find(&p->p, src, NULL, 0);
1758 struct iface *iface = nbr ? nbr->iface : NULL;
1759
1760 if (bgp_channel_is_ipv4(c) && iface && iface->addr4)
1761 c->next_hop_addr = iface->addr4->ip;
1762
1763 if (bgp_channel_is_ipv6(c) && iface && iface->addr6)
1764 c->next_hop_addr = iface->addr6->ip;
1765 }
1766
1767 /* Exit if no feasible next hop address is found */
1768 if (ipa_zero(c->next_hop_addr))
1769 {
1770 log(L_WARN "%s: Missing next hop address", p->p.name);
1771 return 0;
1772 }
1773
1774 /* Set link-local address for IPv6 single-hop BGP */
1775 if (ipa_is_ip6(c->next_hop_addr) && p->neigh)
1776 {
1777 c->link_addr = p->link_addr;
1778
1779 if (ipa_zero(c->link_addr))
1780 log(L_WARN "%s: Missing link-local address", p->p.name);
1781 }
1782
1783 /* Link local address is already in c->link_addr */
1784 if (ipa_is_link_local(c->next_hop_addr))
1785 c->next_hop_addr = IPA_NONE;
1786
1787 return 0; /* XXXX: Currently undefined */
1788 }
1789
1790 static void
1791 bgp_channel_shutdown(struct channel *C)
1792 {
1793 struct bgp_channel *c = (void *) C;
1794
1795 c->next_hop_addr = IPA_NONE;
1796 c->link_addr = IPA_NONE;
1797 c->packets_to_send = 0;
1798 }
1799
1800 static void
1801 bgp_channel_cleanup(struct channel *C)
1802 {
1803 struct bgp_channel *c = (void *) C;
1804
1805 if (c->igp_table_ip4)
1806 rt_unlock_table(c->igp_table_ip4);
1807
1808 if (c->igp_table_ip6)
1809 rt_unlock_table(c->igp_table_ip6);
1810
1811 c->index = 0;
1812
1813 /* Cleanup rest of bgp_channel starting at pool field */
1814 memset(&(c->pool), 0, sizeof(struct bgp_channel) - OFFSETOF(struct bgp_channel, pool));
1815 }
1816
1817 static inline struct bgp_channel_config *
1818 bgp_find_channel_config(struct bgp_config *cf, u32 afi)
1819 {
1820 struct bgp_channel_config *cc;
1821
1822 WALK_LIST(cc, cf->c.channels)
1823 if (cc->afi == afi)
1824 return cc;
1825
1826 return NULL;
1827 }
1828
1829 struct rtable_config *
1830 bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 type)
1831 {
1832 struct bgp_channel_config *cc2;
1833 struct rtable_config *tab;
1834
1835 /* First, try table connected by the channel */
1836 if (cc->c.table->addr_type == type)
1837 return cc->c.table;
1838
1839 /* Find paired channel with the same SAFI but the other AFI */
1840 u32 afi2 = cc->afi ^ 0x30000;
1841 cc2 = bgp_find_channel_config(cf, afi2);
1842
1843 /* Second, try IGP table configured in the paired channel */
1844 if (cc2 && (tab = (type == NET_IP4) ? cc2->igp_table_ip4 : cc2->igp_table_ip6))
1845 return tab;
1846
1847 /* Third, try table connected by the paired channel */
1848 if (cc2 && (cc2->c.table->addr_type == type))
1849 return cc2->c.table;
1850
1851 /* Last, try default table of given type */
1852 if (tab = cf->c.global->def_tables[type])
1853 return tab;
1854
1855 cf_error("Undefined IGP table");
1856 }
1857
1858
1859 void
1860 bgp_postconfig(struct proto_config *CF)
1861 {
1862 struct bgp_config *cf = (void *) CF;
1863
1864 /* Do not check templates at all */
1865 if (cf->c.class == SYM_TEMPLATE)
1866 return;
1867
1868
1869 /* Handle undefined remote_as, zero should mean unspecified external */
1870 if (!cf->remote_as && (cf->peer_type == BGP_PT_INTERNAL))
1871 cf->remote_as = cf->local_as;
1872
1873 int internal = (cf->local_as == cf->remote_as);
1874 int interior = internal || cf->confederation_member;
1875
1876 /* EBGP direct by default, IBGP multihop by default */
1877 if (cf->multihop < 0)
1878 cf->multihop = internal ? 64 : 0;
1879
1880 /* LLGR mode default based on GR mode */
1881 if (cf->llgr_mode < 0)
1882 cf->llgr_mode = cf->gr_mode ? BGP_LLGR_AWARE : 0;
1883
1884 /* Link check for single-hop BGP by default */
1885 if (cf->check_link < 0)
1886 cf->check_link = !cf->multihop;
1887
1888
1889 if (!cf->local_as)
1890 cf_error("Local AS number must be set");
1891
1892 if (ipa_zero(cf->remote_ip) && !cf->remote_range)
1893 cf_error("Neighbor must be configured");
1894
1895 if (ipa_zero(cf->local_ip) && cf->strict_bind)
1896 cf_error("Local address must be configured for strict bind");
1897
1898 if (!cf->remote_as && !cf->peer_type)
1899 cf_error("Remote AS number (or peer type) must be set");
1900
1901 if ((cf->peer_type == BGP_PT_INTERNAL) && !internal)
1902 cf_error("IBGP cannot have different ASNs");
1903
1904 if ((cf->peer_type == BGP_PT_EXTERNAL) && internal)
1905 cf_error("EBGP cannot have the same ASNs");
1906
1907 if (!cf->iface && (ipa_is_link_local(cf->local_ip) ||
1908 ipa_is_link_local(cf->remote_ip)))
1909 cf_error("Link-local addresses require defined interface");
1910
1911 if (!(cf->capabilities && cf->enable_as4) && (cf->remote_as > 0xFFFF))
1912 cf_error("Neighbor AS number out of range (AS4 not available)");
1913
1914 if (!internal && cf->rr_client)
1915 cf_error("Only internal neighbor can be RR client");
1916
1917 if (internal && cf->rs_client)
1918 cf_error("Only external neighbor can be RS client");
1919
1920 if (!cf->confederation && cf->confederation_member)
1921 cf_error("Confederation ID must be set for member sessions");
1922
1923 if (cf->multihop && (ipa_is_link_local(cf->local_ip) ||
1924 ipa_is_link_local(cf->remote_ip)))
1925 cf_error("Multihop BGP cannot be used with link-local addresses");
1926
1927 if (cf->multihop && cf->iface)
1928 cf_error("Multihop BGP cannot be bound to interface");
1929
1930 if (cf->multihop && cf->check_link)
1931 cf_error("Multihop BGP cannot depend on link state");
1932
1933 if (cf->multihop && cf->bfd && ipa_zero(cf->local_ip))
1934 cf_error("Multihop BGP with BFD requires specified local address");
1935
1936 if (!cf->gr_mode && cf->llgr_mode)
1937 cf_error("Long-lived graceful restart requires basic graceful restart");
1938
1939
1940 struct bgp_channel_config *cc;
1941 WALK_LIST(cc, CF->channels)
1942 {
1943 /* Handle undefined import filter */
1944 if (cc->c.in_filter == FILTER_UNDEF)
1945 if (interior)
1946 cc->c.in_filter = FILTER_ACCEPT;
1947 else
1948 cf_error("EBGP requires explicit import policy");
1949
1950 /* Handle undefined export filter */
1951 if (cc->c.out_filter == FILTER_UNDEF)
1952 if (interior)
1953 cc->c.out_filter = FILTER_REJECT;
1954 else
1955 cf_error("EBGP requires explicit export policy");
1956
1957 /* Disable after error incompatible with restart limit action */
1958 if ((cc->c.in_limit.action == PLA_RESTART) && cf->disable_after_error)
1959 cc->c.in_limit.action = PLA_DISABLE;
1960
1961 /* Different default based on rr_client, rs_client */
1962 if (cc->next_hop_keep == 0xff)
1963 cc->next_hop_keep = cf->rr_client ? NH_IBGP : (cf->rs_client ? NH_ALL : NH_NO);
1964
1965 /* Different default based on rs_client */
1966 if (!cc->missing_lladdr)
1967 cc->missing_lladdr = cf->rs_client ? MLL_IGNORE : MLL_SELF;
1968
1969 /* Different default for gw_mode */
1970 if (!cc->gw_mode)
1971 cc->gw_mode = cf->multihop ? GW_RECURSIVE : GW_DIRECT;
1972
1973 /* Defaults based on proto config */
1974 if (cc->gr_able == 0xff)
1975 cc->gr_able = (cf->gr_mode == BGP_GR_ABLE);
1976
1977 if (cc->llgr_able == 0xff)
1978 cc->llgr_able = (cf->llgr_mode == BGP_LLGR_ABLE);
1979
1980 if (cc->llgr_time == ~0U)
1981 cc->llgr_time = cf->llgr_time;
1982
1983 /* AIGP enabled by default on interior sessions */
1984 if (cc->aigp == 0xff)
1985 cc->aigp = interior;
1986
1987 /* Default values of IGP tables */
1988 if ((cc->gw_mode == GW_RECURSIVE) && !cc->desc->no_igp)
1989 {
1990 if (!cc->igp_table_ip4 && (bgp_cc_is_ipv4(cc) || cc->ext_next_hop))
1991 cc->igp_table_ip4 = bgp_default_igp_table(cf, cc, NET_IP4);
1992
1993 if (!cc->igp_table_ip6 && (bgp_cc_is_ipv6(cc) || cc->ext_next_hop))
1994 cc->igp_table_ip6 = bgp_default_igp_table(cf, cc, NET_IP6);
1995
1996 if (cc->igp_table_ip4 && bgp_cc_is_ipv6(cc) && !cc->ext_next_hop)
1997 cf_error("Mismatched IGP table type");
1998
1999 if (cc->igp_table_ip6 && bgp_cc_is_ipv4(cc) && !cc->ext_next_hop)
2000 cf_error("Mismatched IGP table type");
2001 }
2002
2003 if (cf->multihop && (cc->gw_mode == GW_DIRECT))
2004 cf_error("Multihop BGP cannot use direct gateway mode");
2005
2006 if ((cc->gw_mode == GW_RECURSIVE) && cc->c.table->sorted)
2007 cf_error("BGP in recursive mode prohibits sorted table");
2008
2009 if (cf->deterministic_med && cc->c.table->sorted)
2010 cf_error("BGP with deterministic MED prohibits sorted table");
2011
2012 if (cc->secondary && !cc->c.table->sorted)
2013 cf_error("BGP with secondary option requires sorted table");
2014 }
2015 }
2016
2017 static int
2018 bgp_reconfigure(struct proto *P, struct proto_config *CF)
2019 {
2020 struct bgp_proto *p = (void *) P;
2021 const struct bgp_config *new = (void *) CF;
2022 const struct bgp_config *old = p->cf;
2023
2024 if (proto_get_router_id(CF) != p->local_id)
2025 return 0;
2026
2027 int same = !memcmp(((byte *) old) + sizeof(struct proto_config),
2028 ((byte *) new) + sizeof(struct proto_config),
2029 // password item is last and must be checked separately
2030 OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config))
2031 && !bstrcmp(old->password, new->password)
2032 && ((!old->remote_range && !new->remote_range)
2033 || (old->remote_range && new->remote_range && net_equal(old->remote_range, new->remote_range)))
2034 && !bstrcmp(old->dynamic_name, new->dynamic_name)
2035 && (old->dynamic_name_digits == new->dynamic_name_digits);
2036
2037 /* FIXME: Move channel reconfiguration to generic protocol code ? */
2038 struct channel *C, *C2;
2039 struct bgp_channel_config *cc;
2040
2041 WALK_LIST(C, p->p.channels)
2042 C->stale = 1;
2043
2044 WALK_LIST(cc, new->c.channels)
2045 {
2046 C = (struct channel *) bgp_find_channel(p, cc->afi);
2047 same = proto_configure_channel(P, &C, &cc->c) && same;
2048
2049 if (C)
2050 C->stale = 0;
2051 }
2052
2053 WALK_LIST_DELSAFE(C, C2, p->p.channels)
2054 if (C->stale)
2055 same = proto_configure_channel(P, &C, NULL) && same;
2056
2057
2058 if (same && (p->start_state > BSS_PREPARE))
2059 bgp_update_bfd(p, new->bfd);
2060
2061 /* We should update our copy of configuration ptr as old configuration will be freed */
2062 if (same)
2063 p->cf = new;
2064
2065 /* Reset name counter */
2066 p->dynamic_name_counter = 0;
2067
2068 return same;
2069 }
2070
2071 #define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? (cf)->igp_table_##sym ->table : NULL )
2072
2073 static int
2074 bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *import_changed, int *export_changed)
2075 {
2076 struct bgp_proto *p = (void *) C->proto;
2077 struct bgp_channel *c = (void *) C;
2078 struct bgp_channel_config *new = (void *) CC;
2079 struct bgp_channel_config *old = c->cf;
2080
2081 if ((new->secondary != old->secondary) ||
2082 (new->gr_able != old->gr_able) ||
2083 (new->llgr_able != old->llgr_able) ||
2084 (new->llgr_time != old->llgr_time) ||
2085 (new->ext_next_hop != old->ext_next_hop) ||
2086 (new->add_path != old->add_path) ||
2087 (new->import_table != old->import_table) ||
2088 (new->export_table != old->export_table) ||
2089 (IGP_TABLE(new, ip4) != IGP_TABLE(old, ip4)) ||
2090 (IGP_TABLE(new, ip6) != IGP_TABLE(old, ip6)))
2091 return 0;
2092
2093 if (new->mandatory && !old->mandatory && (C->channel_state != CS_UP))
2094 return 0;
2095
2096 if ((new->gw_mode != old->gw_mode) ||
2097 (new->aigp != old->aigp) ||
2098 (new->cost != old->cost))
2099 {
2100 /* import_changed itself does not force ROUTE_REFRESH when import_table is active */
2101 if (c->c.in_table && (c->c.channel_state == CS_UP))
2102 bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH);
2103
2104 *import_changed = 1;
2105 }
2106
2107 if (!ipa_equal(new->next_hop_addr, old->next_hop_addr) ||
2108 (new->next_hop_self != old->next_hop_self) ||
2109 (new->next_hop_keep != old->next_hop_keep) ||
2110 (new->missing_lladdr != old->missing_lladdr) ||
2111 (new->aigp != old->aigp) ||
2112 (new->aigp_originate != old->aigp_originate))
2113 *export_changed = 1;
2114
2115 c->cf = new;
2116 return 1;
2117 }
2118
2119 static void
2120 bgp_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSED)
2121 {
2122 /* Just a shallow copy */
2123 }
2124
2125
2126 /**
2127 * bgp_error - report a protocol error
2128 * @c: connection
2129 * @code: error code (according to the RFC)
2130 * @subcode: error sub-code
2131 * @data: data to be passed in the Notification message
2132 * @len: length of the data
2133 *
2134 * bgp_error() sends a notification packet to tell the other side that a protocol
2135 * error has occurred (including the data considered erroneous if possible) and
2136 * closes the connection.
2137 */
2138 void
2139 bgp_error(struct bgp_conn *c, uint code, uint subcode, byte *data, int len)
2140 {
2141 struct bgp_proto *p = c->bgp;
2142
2143 if (c->state == BS_CLOSE)
2144 return;
2145
2146 bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, ABS(len));
2147 bgp_store_error(p, c, BE_BGP_TX, (code << 16) | subcode);
2148 bgp_conn_enter_close_state(c);
2149
2150 c->notify_code = code;
2151 c->notify_subcode = subcode;
2152 c->notify_data = data;
2153 c->notify_size = (len > 0) ? len : 0;
2154 bgp_schedule_packet(c, NULL, PKT_NOTIFICATION);
2155
2156 if (code != 6)
2157 {
2158 bgp_update_startup_delay(p);
2159 bgp_stop(p, 0, NULL, 0);
2160 }
2161 }
2162
2163 /**
2164 * bgp_store_error - store last error for status report
2165 * @p: BGP instance
2166 * @c: connection
2167 * @class: error class (BE_xxx constants)
2168 * @code: error code (class specific)
2169 *
2170 * bgp_store_error() decides whether given error is interesting enough
2171 * and store that error to last_error variables of @p
2172 */
2173 void
2174 bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code)
2175 {
2176 /* During PS_UP, we ignore errors on secondary connection */
2177 if ((p->p.proto_state == PS_UP) && c && (c != p->conn))
2178 return;
2179
2180 /* During PS_STOP, we ignore any errors, as we want to report
2181 * the error that caused transition to PS_STOP
2182 */
2183 if (p->p.proto_state == PS_STOP)
2184 return;
2185
2186 p->last_error_class = class;
2187 p->last_error_code = code;
2188 }
2189
/*
 * Read-only string tables used by the status and CLI output functions below.
 * Both the pointers and the strings are constant; the tables are only indexed.
 */

/* Indexed by connection state (see bgp_state_dsc()) */
static const char * const bgp_state_names[] = { "Idle", "Connect", "Active", "OpenSent", "OpenConfirm", "Established", "Close" };

/* Indexed by last_error_class; prefix printed in front of the error message */
static const char * const bgp_err_classes[] = { "", "Error: ", "Socket: ", "Received: ", "BGP Error: ", "Automatic shutdown: ", ""};

/* Indexed by last_error_code when the error class is BE_MISC */
static const char * const bgp_misc_errors[] = { "", "Neighbor lost", "Invalid next hop", "Kernel MD5 auth failed", "No listening socket", "Link down", "BFD session down", "Graceful restart"};

/* Indexed by last_error_code when the error class is BE_AUTO_DOWN */
static const char * const bgp_auto_errors[] = { "", "Route limit exceeded"};

/* Indexed by the channel's gr_active value */
static const char * const bgp_gr_states[] = { "None", "Regular", "Long-lived"};
2195
2196 static const char *
2197 bgp_last_errmsg(struct bgp_proto *p)
2198 {
2199 switch (p->last_error_class)
2200 {
2201 case BE_MISC:
2202 return bgp_misc_errors[p->last_error_code];
2203 case BE_SOCKET:
2204 return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code);
2205 case BE_BGP_RX:
2206 case BE_BGP_TX:
2207 return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF);
2208 case BE_AUTO_DOWN:
2209 return bgp_auto_errors[p->last_error_code];
2210 default:
2211 return "";
2212 }
2213 }
2214
2215 static const char *
2216 bgp_state_dsc(struct bgp_proto *p)
2217 {
2218 if (p->p.proto_state == PS_DOWN)
2219 return "Down";
2220
2221 int state = MAX(p->incoming_conn.state, p->outgoing_conn.state);
2222 if ((state == BS_IDLE) && (p->start_state >= BSS_CONNECT) && p->passive)
2223 return "Passive";
2224
2225 return bgp_state_names[state];
2226 }
2227
2228 static void
2229 bgp_get_status(struct proto *P, byte *buf)
2230 {
2231 struct bgp_proto *p = (struct bgp_proto *) P;
2232
2233 const char *err1 = bgp_err_classes[p->last_error_class];
2234 const char *err2 = bgp_last_errmsg(p);
2235
2236 if (P->proto_state == PS_DOWN)
2237 bsprintf(buf, "%s%s", err1, err2);
2238 else
2239 bsprintf(buf, "%-14s%s%s", bgp_state_dsc(p), err1, err2);
2240 }
2241
2242 static void
2243 bgp_show_afis(int code, char *s, u32 *afis, uint count)
2244 {
2245 buffer b;
2246 LOG_BUFFER_INIT(b);
2247
2248 buffer_puts(&b, s);
2249
2250 for (u32 *af = afis; af < (afis + count); af++)
2251 {
2252 const struct bgp_af_desc *desc = bgp_get_af_desc(*af);
2253 if (desc)
2254 buffer_print(&b, " %s", desc->name);
2255 else
2256 buffer_print(&b, " <%u/%u>", BGP_AFI(*af), BGP_SAFI(*af));
2257 }
2258
2259 if (b.pos == b.end)
2260 strcpy(b.end - 32, " ... <too long>");
2261
2262 cli_msg(code, b.start);
2263 }
2264
2265 static void
2266 bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
2267 {
2268 struct bgp_af_caps *ac;
2269 uint any_mp_bgp = 0;
2270 uint any_gr_able = 0;
2271 uint any_add_path = 0;
2272 uint any_ext_next_hop = 0;
2273 uint any_llgr_able = 0;
2274 u32 *afl1 = alloca(caps->af_count * sizeof(u32));
2275 u32 *afl2 = alloca(caps->af_count * sizeof(u32));
2276 uint afn1, afn2;
2277
2278 WALK_AF_CAPS(caps, ac)
2279 {
2280 any_mp_bgp |= ac->ready;
2281 any_gr_able |= ac->gr_able;
2282 any_add_path |= ac->add_path;
2283 any_ext_next_hop |= ac->ext_next_hop;
2284 any_llgr_able |= ac->llgr_able;
2285 }
2286
2287 if (any_mp_bgp)
2288 {
2289 cli_msg(-1006, " Multiprotocol");
2290
2291 afn1 = 0;
2292 WALK_AF_CAPS(caps, ac)
2293 if (ac->ready)
2294 afl1[afn1++] = ac->afi;
2295
2296 bgp_show_afis(-1006, " AF announced:", afl1, afn1);
2297 }
2298
2299 if (caps->route_refresh)
2300 cli_msg(-1006, " Route refresh");
2301
2302 if (any_ext_next_hop)
2303 {
2304 cli_msg(-1006, " Extended next hop");
2305
2306 afn1 = 0;
2307 WALK_AF_CAPS(caps, ac)
2308 if (ac->ext_next_hop)
2309 afl1[afn1++] = ac->afi;
2310
2311 bgp_show_afis(-1006, " IPv6 nexthop:", afl1, afn1);
2312 }
2313
2314 if (caps->ext_messages)
2315 cli_msg(-1006, " Extended message");
2316
2317 if (caps->gr_aware)
2318 cli_msg(-1006, " Graceful restart");
2319
2320 if (any_gr_able)
2321 {
2322 /* Continues from gr_aware */
2323 cli_msg(-1006, " Restart time: %u", caps->gr_time);
2324 if (caps->gr_flags & BGP_GRF_RESTART)
2325 cli_msg(-1006, " Restart recovery");
2326
2327 afn1 = afn2 = 0;
2328 WALK_AF_CAPS(caps, ac)
2329 {
2330 if (ac->gr_able)
2331 afl1[afn1++] = ac->afi;
2332
2333 if (ac->gr_af_flags & BGP_GRF_FORWARDING)
2334 afl2[afn2++] = ac->afi;
2335 }
2336
2337 bgp_show_afis(-1006, " AF supported:", afl1, afn1);
2338 bgp_show_afis(-1006, " AF preserved:", afl2, afn2);
2339 }
2340
2341 if (caps->as4_support)
2342 cli_msg(-1006, " 4-octet AS numbers");
2343
2344 if (any_add_path)
2345 {
2346 cli_msg(-1006, " ADD-PATH");
2347
2348 afn1 = afn2 = 0;
2349 WALK_AF_CAPS(caps, ac)
2350 {
2351 if (ac->add_path & BGP_ADD_PATH_RX)
2352 afl1[afn1++] = ac->afi;
2353
2354 if (ac->add_path & BGP_ADD_PATH_TX)
2355 afl2[afn2++] = ac->afi;
2356 }
2357
2358 bgp_show_afis(-1006, " RX:", afl1, afn1);
2359 bgp_show_afis(-1006, " TX:", afl2, afn2);
2360 }
2361
2362 if (caps->enhanced_refresh)
2363 cli_msg(-1006, " Enhanced refresh");
2364
2365 if (caps->llgr_aware)
2366 cli_msg(-1006, " Long-lived graceful restart");
2367
2368 if (any_llgr_able)
2369 {
2370 u32 stale_time = 0;
2371
2372 afn1 = afn2 = 0;
2373 WALK_AF_CAPS(caps, ac)
2374 {
2375 stale_time = MAX(stale_time, ac->llgr_time);
2376
2377 if (ac->llgr_able && ac->llgr_time)
2378 afl1[afn1++] = ac->afi;
2379
2380 if (ac->llgr_flags & BGP_GRF_FORWARDING)
2381 afl2[afn2++] = ac->afi;
2382 }
2383
2384 /* Continues from llgr_aware */
2385 cli_msg(-1006, " LL stale time: %u", stale_time);
2386
2387 bgp_show_afis(-1006, " AF supported:", afl1, afn1);
2388 bgp_show_afis(-1006, " AF preserved:", afl2, afn2);
2389 }
2390 }
2391
2392 static void
2393 bgp_show_proto_info(struct proto *P)
2394 {
2395 struct bgp_proto *p = (struct bgp_proto *) P;
2396
2397 cli_msg(-1006, " BGP state: %s", bgp_state_dsc(p));
2398
2399 if (bgp_is_dynamic(p) && p->cf->remote_range)
2400 cli_msg(-1006, " Neighbor range: %N", p->cf->remote_range);
2401 else
2402 cli_msg(-1006, " Neighbor address: %I%J", p->remote_ip, p->cf->iface);
2403
2404 cli_msg(-1006, " Neighbor AS: %u", p->remote_as);
2405 cli_msg(-1006, " Local AS: %u", p->cf->local_as);
2406
2407 if (p->gr_active_num)
2408 cli_msg(-1006, " Neighbor graceful restart active");
2409
2410 if (P->proto_state == PS_START)
2411 {
2412 struct bgp_conn *oc = &p->outgoing_conn;
2413
2414 if ((p->start_state < BSS_CONNECT) &&
2415 (tm_active(p->startup_timer)))
2416 cli_msg(-1006, " Error wait: %t/%u",
2417 tm_remains(p->startup_timer), p->startup_delay);
2418
2419 if ((oc->state == BS_ACTIVE) &&
2420 (tm_active(oc->connect_timer)))
2421 cli_msg(-1006, " Connect delay: %t/%u",
2422 tm_remains(oc->connect_timer), p->cf->connect_delay_time);
2423
2424 if (p->gr_active_num && tm_active(p->gr_timer))
2425 cli_msg(-1006, " Restart timer: %t/-",
2426 tm_remains(p->gr_timer));
2427 }
2428 else if (P->proto_state == PS_UP)
2429 {
2430 cli_msg(-1006, " Neighbor ID: %R", p->remote_id);
2431 cli_msg(-1006, " Local capabilities");
2432 bgp_show_capabilities(p, p->conn->local_caps);
2433 cli_msg(-1006, " Neighbor capabilities");
2434 bgp_show_capabilities(p, p->conn->remote_caps);
2435 cli_msg(-1006, " Session: %s%s%s%s%s",
2436 p->is_internal ? "internal" : "external",
2437 p->cf->multihop ? " multihop" : "",
2438 p->rr_client ? " route-reflector" : "",
2439 p->rs_client ? " route-server" : "",
2440 p->as4_session ? " AS4" : "");
2441 cli_msg(-1006, " Source address: %I", p->local_ip);
2442 cli_msg(-1006, " Hold timer: %t/%u",
2443 tm_remains(p->conn->hold_timer), p->conn->hold_time);
2444 cli_msg(-1006, " Keepalive timer: %t/%u",
2445 tm_remains(p->conn->keepalive_timer), p->conn->keepalive_time);
2446 }
2447
2448 if ((p->last_error_class != BE_NONE) &&
2449 (p->last_error_class != BE_MAN_DOWN))
2450 {
2451 const char *err1 = bgp_err_classes[p->last_error_class];
2452 const char *err2 = bgp_last_errmsg(p);
2453 cli_msg(-1006, " Last error: %s%s", err1, err2);
2454 }
2455
2456 {
2457 struct bgp_channel *c;
2458 WALK_LIST(c, p->p.channels)
2459 {
2460 channel_show_info(&c->c);
2461
2462 if (p->gr_active_num)
2463 cli_msg(-1006, " Neighbor GR: %s", bgp_gr_states[c->gr_active]);
2464
2465 if (c->stale_timer && tm_active(c->stale_timer))
2466 cli_msg(-1006, " LL stale timer: %t/-", tm_remains(c->stale_timer));
2467
2468 if (c->c.channel_state == CS_UP)
2469 {
2470 if (ipa_zero(c->link_addr))
2471 cli_msg(-1006, " BGP Next hop: %I", c->next_hop_addr);
2472 else
2473 cli_msg(-1006, " BGP Next hop: %I %I", c->next_hop_addr, c->link_addr);
2474 }
2475
2476 if (c->igp_table_ip4)
2477 cli_msg(-1006, " IGP IPv4 table: %s", c->igp_table_ip4->name);
2478
2479 if (c->igp_table_ip6)
2480 cli_msg(-1006, " IGP IPv6 table: %s", c->igp_table_ip6->name);
2481 }
2482 }
2483 }
2484
2485 struct channel_class channel_bgp = {
2486 .channel_size = sizeof(struct bgp_channel),
2487 .config_size = sizeof(struct bgp_channel_config),
2488 .init = bgp_channel_init,
2489 .start = bgp_channel_start,
2490 .shutdown = bgp_channel_shutdown,
2491 .cleanup = bgp_channel_cleanup,
2492 .reconfigure = bgp_channel_reconfigure,
2493 };
2494
2495 struct protocol proto_bgp = {
2496 .name = "BGP",
2497 .template = "bgp%d",
2498 .class = PROTOCOL_BGP,
2499 .preference = DEF_PREF_BGP,
2500 .channel_mask = NB_IP | NB_VPN | NB_FLOW,
2501 .proto_size = sizeof(struct bgp_proto),
2502 .config_size = sizeof(struct bgp_config),
2503 .postconfig = bgp_postconfig,
2504 .init = bgp_init,
2505 .start = bgp_start,
2506 .shutdown = bgp_shutdown,
2507 .reconfigure = bgp_reconfigure,
2508 .copy_config = bgp_copy_config,
2509 .get_status = bgp_get_status,
2510 .get_attr = bgp_get_attr,
2511 .get_route_info = bgp_get_route_info,
2512 .show_proto_info = bgp_show_proto_info
2513 };