]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/packets.c
BGP: split tx explicitly
[thirdparty/bird.git] / proto / bgp / packets.c
1 /*
2 * BIRD -- BGP Packet Processing
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6 * (c) 2008--2016 CZ.NIC z.s.p.o.
7 *
8 * Can be freely distributed and used under the terms of the GNU GPL.
9 */
10
11 #undef LOCAL_DEBUG
12
13 #include <stdlib.h>
14
15 #include "nest/bird.h"
16 #include "nest/iface.h"
17 #include "nest/protocol.h"
18 #include "nest/route.h"
19 #include "nest/attrs.h"
20 #include "proto/mrt/mrt.h"
21 #include "conf/conf.h"
22 #include "lib/unaligned.h"
23 #include "lib/flowspec.h"
24 #include "lib/socket.h"
25
26 #include "nest/cli.h"
27
28 #include "bgp.h"
29
30
/* Numeric codes named after route-refresh request / begin / end. They are not
   used in this part of the file -- NOTE(review): confirm against their users */
#define BGP_RR_REQUEST          0
#define BGP_RR_BEGIN            1
#define BGP_RR_END              2

/* NOTE(review): presumably an upper bound on one encoded NLRI in bytes; the
   meaning of the individual terms is not visible here -- confirm */
#define BGP_NLRI_MAX            (4 + 1 + 32)

#define BGP_MPLS_BOS            1               /* Bottom-of-stack bit */
#define BGP_MPLS_MAX            10              /* Max number of labels so that 24*n <= 255 */
#define BGP_MPLS_NULL           3               /* Implicit NULL label */
#define BGP_MPLS_MAGIC          0x800000        /* Magic withdraw label value, RFC 3107 3 */
41
42
43 static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
44 static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
45
46 /* Table for state -> RFC 6608 FSM error subcodes */
47 static byte fsm_err_subcode[BS_MAX] = {
48 [BS_OPENSENT] = 1,
49 [BS_OPENCONFIRM] = 2,
50 [BS_ESTABLISHED] = 3
51 };
52
53
54 static struct bgp_channel *
55 bgp_get_channel(struct bgp_proto *p, u32 afi)
56 {
57 uint i;
58
59 for (i = 0; i < p->channel_count; i++)
60 if (p->afi_map[i] == afi)
61 return p->channel_map[i];
62
63 return NULL;
64 }
65
66 static inline void
67 put_af3(byte *buf, u32 id)
68 {
69 put_u16(buf, id >> 16);
70 buf[2] = id & 0xff;
71 }
72
73 static inline void
74 put_af4(byte *buf, u32 id)
75 {
76 put_u16(buf, id >> 16);
77 buf[2] = 0;
78 buf[3] = id & 0xff;
79 }
80
81 static inline u32
82 get_af3(byte *buf)
83 {
84 return (get_u16(buf) << 16) | buf[2];
85 }
86
87 static inline u32
88 get_af4(byte *buf)
89 {
90 return (get_u16(buf) << 16) | buf[3];
91 }
92
93 static void
94 init_mrt_bgp_data(struct bgp_conn *conn, struct mrt_bgp_data *d)
95 {
96 struct bgp_proto *p = conn->bgp;
97 int p_ok = conn->state >= BS_OPENCONFIRM;
98
99 memset(d, 0, sizeof(struct mrt_bgp_data));
100 d->peer_as = p->remote_as;
101 d->local_as = p->local_as;
102 d->index = (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0;
103 d->af = ipa_is_ip4(p->remote_ip) ? BGP_AFI_IPV4 : BGP_AFI_IPV6;
104 d->peer_ip = conn->sk ? conn->sk->daddr : IPA_NONE;
105 d->local_ip = conn->sk ? conn->sk->saddr : IPA_NONE;
106 d->as4 = p_ok ? p->as4_session : 0;
107 }
108
109 static uint bgp_find_update_afi(byte *pos, uint len);
110
111 static int
112 bgp_estimate_add_path(struct bgp_proto *p, byte *pkt, uint len)
113 {
114 /* No need to estimate it for other messages than UPDATE */
115 if (pkt[18] != PKT_UPDATE)
116 return 0;
117
118 /* 1 -> no channel, 2 -> all channels, 3 -> some channels */
119 if (p->summary_add_path_rx < 3)
120 return p->summary_add_path_rx == 2;
121
122 uint afi = bgp_find_update_afi(pkt, len);
123 struct bgp_channel *c = bgp_get_channel(p, afi);
124 if (!c)
125 {
126 /* Either frame error (if !afi) or unknown AFI/SAFI,
127 will be reported later in regular parsing */
128 BGP_TRACE(D_PACKETS, "MRT processing noticed invalid packet");
129 return 0;
130 }
131
132 return c->add_path_rx;
133 }
134
135 static void
136 bgp_dump_message(struct bgp_conn *conn, byte *pkt, uint len)
137 {
138 struct mrt_bgp_data d;
139 init_mrt_bgp_data(conn, &d);
140
141 d.message = pkt;
142 d.msg_len = len;
143 d.add_path = bgp_estimate_add_path(conn->bgp, pkt, len);
144
145 mrt_dump_bgp_message(&d);
146 }
147
148 void
149 bgp_dump_state_change(struct bgp_conn *conn, uint old, uint new)
150 {
151 struct mrt_bgp_data d;
152 init_mrt_bgp_data(conn, &d);
153
154 d.old_state = old;
155 d.new_state = new;
156
157 mrt_dump_bgp_state_change(&d);
158 }
159
160 static byte *
161 bgp_create_notification(struct bgp_conn *conn, byte *buf)
162 {
163 struct bgp_proto *p = conn->bgp;
164
165 BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);
166 buf[0] = conn->notify_code;
167 buf[1] = conn->notify_subcode;
168 memcpy(buf+2, conn->notify_data, conn->notify_size);
169 return buf + 2 + conn->notify_size;
170 }
171
172
173 /* Capability negotiation as per RFC 5492 */
174
175 const struct bgp_af_caps *
176 bgp_find_af_caps(struct bgp_caps *caps, u32 afi)
177 {
178 struct bgp_af_caps *ac;
179
180 WALK_AF_CAPS(caps, ac)
181 if (ac->afi == afi)
182 return ac;
183
184 return NULL;
185 }
186
187 static struct bgp_af_caps *
188 bgp_get_af_caps(struct bgp_caps *caps, u32 afi)
189 {
190 struct bgp_af_caps *ac;
191
192 WALK_AF_CAPS(caps, ac)
193 if (ac->afi == afi)
194 return ac;
195
196 ac = &caps->af_data[caps->af_count++];
197 memset(ac, 0, sizeof(struct bgp_af_caps));
198 ac->afi = afi;
199
200 return ac;
201 }
202
203 static int
204 bgp_af_caps_cmp(const void *X, const void *Y)
205 {
206 const struct bgp_af_caps *x = X, *y = Y;
207 return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0;
208 }
209
210
211 void
212 bgp_prepare_capabilities(struct bgp_conn *conn)
213 {
214 struct bgp_proto *p = conn->bgp;
215 struct bgp_channel *c;
216 struct bgp_caps *caps;
217 struct bgp_af_caps *ac;
218
219 if (!p->cf->capabilities)
220 {
221 /* Just prepare empty local_caps */
222 conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));
223 return;
224 }
225
226 /* Prepare bgp_caps structure */
227 int n = list_length(&p->p.channels);
228 caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps));
229 conn->local_caps = caps;
230
231 caps->as4_support = p->cf->enable_as4;
232 caps->ext_messages = p->cf->enable_extended_messages;
233 caps->route_refresh = p->cf->enable_refresh;
234 caps->enhanced_refresh = p->cf->enable_refresh;
235
236 if (caps->as4_support)
237 caps->as4_number = p->public_as;
238
239 if (p->cf->gr_mode)
240 {
241 caps->gr_aware = 1;
242 caps->gr_time = p->cf->gr_time;
243 caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
244 }
245
246 if (p->cf->llgr_mode)
247 caps->llgr_aware = 1;
248
249 /* Allocate and fill per-AF fields */
250 WALK_LIST(c, p->p.channels)
251 {
252 ac = &caps->af_data[caps->af_count++];
253 ac->afi = c->afi;
254 ac->ready = 1;
255
256 ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop;
257 caps->any_ext_next_hop |= ac->ext_next_hop;
258
259 ac->add_path = c->cf->add_path;
260 caps->any_add_path |= ac->add_path;
261
262 if (c->cf->gr_able)
263 {
264 ac->gr_able = 1;
265
266 if (p->p.gr_recovery)
267 ac->gr_af_flags |= BGP_GRF_FORWARDING;
268 }
269
270 if (c->cf->llgr_able)
271 {
272 ac->llgr_able = 1;
273 ac->llgr_time = c->cf->llgr_time;
274
275 if (p->p.gr_recovery)
276 ac->llgr_flags |= BGP_LLGRF_FORWARDING;
277 }
278 }
279
280 /* Sort capability fields by AFI/SAFI */
281 qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp);
282 }
283
284 static byte *
285 bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
286 {
287 struct bgp_proto *p = conn->bgp;
288 struct bgp_caps *caps = conn->local_caps;
289 struct bgp_af_caps *ac;
290 byte *buf_head = buf;
291 byte *data;
292
293 /* Create capability list in buffer */
294
295 /*
296 * Note that max length is ~ 22+21*af_count. With max 12 channels that is
297 * 274. Option limit is 253 and buffer size is 4096, so we cannot overflow
298 * unless we add new capabilities or more AFs. XXXXX
299 */
300
301 WALK_AF_CAPS(caps, ac)
302 if (ac->ready)
303 {
304 *buf++ = 1; /* Capability 1: Multiprotocol extensions */
305 *buf++ = 4; /* Capability data length */
306 put_af4(buf, ac->afi);
307 buf += 4;
308 }
309
310 if (caps->route_refresh)
311 {
312 *buf++ = 2; /* Capability 2: Support for route refresh */
313 *buf++ = 0; /* Capability data length */
314 }
315
316 if (caps->any_ext_next_hop)
317 {
318 *buf++ = 5; /* Capability 5: Support for extended next hop */
319 *buf++ = 0; /* Capability data length, will be fixed later */
320 data = buf;
321
322 WALK_AF_CAPS(caps, ac)
323 if (ac->ext_next_hop)
324 {
325 put_af4(buf, ac->afi);
326 put_u16(buf+4, BGP_AFI_IPV6);
327 buf += 6;
328 }
329
330 data[-1] = buf - data;
331 }
332
333 if (caps->ext_messages)
334 {
335 *buf++ = 6; /* Capability 6: Support for extended messages */
336 *buf++ = 0; /* Capability data length */
337 }
338
339 if (caps->gr_aware)
340 {
341 *buf++ = 64; /* Capability 64: Support for graceful restart */
342 *buf++ = 0; /* Capability data length, will be fixed later */
343 data = buf;
344
345 put_u16(buf, caps->gr_time);
346 buf[0] |= caps->gr_flags;
347 buf += 2;
348
349 WALK_AF_CAPS(caps, ac)
350 if (ac->gr_able)
351 {
352 put_af3(buf, ac->afi);
353 buf[3] = ac->gr_af_flags;
354 buf += 4;
355 }
356
357 data[-1] = buf - data;
358 }
359
360 if (caps->as4_support)
361 {
362 *buf++ = 65; /* Capability 65: Support for 4-octet AS number */
363 *buf++ = 4; /* Capability data length */
364 put_u32(buf, p->public_as);
365 buf += 4;
366 }
367
368 if (caps->any_add_path)
369 {
370 *buf++ = 69; /* Capability 69: Support for ADD-PATH */
371 *buf++ = 0; /* Capability data length, will be fixed later */
372 data = buf;
373
374 WALK_AF_CAPS(caps, ac)
375 if (ac->add_path)
376 {
377 put_af3(buf, ac->afi);
378 buf[3] = ac->add_path;
379 buf += 4;
380 }
381
382 data[-1] = buf - data;
383 }
384
385 if (caps->enhanced_refresh)
386 {
387 *buf++ = 70; /* Capability 70: Support for enhanced route refresh */
388 *buf++ = 0; /* Capability data length */
389 }
390
391 if (caps->llgr_aware)
392 {
393 *buf++ = 71; /* Capability 71: Support for long-lived graceful restart */
394 *buf++ = 0; /* Capability data length, will be fixed later */
395 data = buf;
396
397 WALK_AF_CAPS(caps, ac)
398 if (ac->llgr_able)
399 {
400 put_af3(buf, ac->afi);
401 buf[3] = ac->llgr_flags;
402 put_u24(buf+4, ac->llgr_time);
403 buf += 7;
404 }
405
406 data[-1] = buf - data;
407 }
408
409 caps->length = buf - buf_head;
410
411 return buf;
412 }
413
414 static void
415 bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len)
416 {
417 struct bgp_proto *p = conn->bgp;
418 struct bgp_af_caps *ac;
419 int i, cl;
420 u32 af;
421
422 caps->length += len;
423
424 while (len > 0)
425 {
426 if (len < 2 || len < (2 + pos[1]))
427 goto err;
428
429 /* Capability length */
430 cl = pos[1];
431
432 /* Capability type */
433 switch (pos[0])
434 {
435 case 1: /* Multiprotocol capability, RFC 4760 */
436 if (cl != 4)
437 goto err;
438
439 af = get_af4(pos+2);
440 ac = bgp_get_af_caps(caps, af);
441 ac->ready = 1;
442 break;
443
444 case 2: /* Route refresh capability, RFC 2918 */
445 if (cl != 0)
446 goto err;
447
448 caps->route_refresh = 1;
449 break;
450
451 case 5: /* Extended next hop encoding capability, RFC 5549 */
452 if (cl % 6)
453 goto err;
454
455 for (i = 0; i < cl; i += 6)
456 {
457 /* Specified only for IPv4 prefixes with IPv6 next hops */
458 if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) ||
459 (get_u16(pos+2+i+4) != BGP_AFI_IPV6))
460 continue;
461
462 af = get_af4(pos+2+i);
463 ac = bgp_get_af_caps(caps, af);
464 ac->ext_next_hop = 1;
465 }
466 break;
467
468 case 6: /* Extended message length capability, RFC draft */
469 if (cl != 0)
470 goto err;
471
472 caps->ext_messages = 1;
473 break;
474
475 case 64: /* Graceful restart capability, RFC 4724 */
476 if (cl % 4 != 2)
477 goto err;
478
479 /* Only the last instance is valid */
480 WALK_AF_CAPS(caps, ac)
481 {
482 ac->gr_able = 0;
483 ac->gr_af_flags = 0;
484 }
485
486 caps->gr_aware = 1;
487 caps->gr_flags = pos[2] & 0xf0;
488 caps->gr_time = get_u16(pos + 2) & 0x0fff;
489
490 for (i = 2; i < cl; i += 4)
491 {
492 af = get_af3(pos+2+i);
493 ac = bgp_get_af_caps(caps, af);
494 ac->gr_able = 1;
495 ac->gr_af_flags = pos[2+i+3];
496 }
497 break;
498
499 case 65: /* AS4 capability, RFC 6793 */
500 if (cl != 4)
501 goto err;
502
503 caps->as4_support = 1;
504 caps->as4_number = get_u32(pos + 2);
505 break;
506
507 case 69: /* ADD-PATH capability, RFC 7911 */
508 if (cl % 4)
509 goto err;
510
511 for (i = 0; i < cl; i += 4)
512 {
513 byte val = pos[2+i+3];
514 if (!val || (val > BGP_ADD_PATH_FULL))
515 {
516 log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring",
517 p->p.name, val);
518 break;
519 }
520 }
521
522 for (i = 0; i < cl; i += 4)
523 {
524 af = get_af3(pos+2+i);
525 ac = bgp_get_af_caps(caps, af);
526 ac->add_path = pos[2+i+3];
527 }
528 break;
529
530 case 70: /* Enhanced route refresh capability, RFC 7313 */
531 if (cl != 0)
532 goto err;
533
534 caps->enhanced_refresh = 1;
535 break;
536
537 case 71: /* Long lived graceful restart capability, RFC draft */
538 if (cl % 7)
539 goto err;
540
541 /* Presumably, only the last instance is valid */
542 WALK_AF_CAPS(caps, ac)
543 {
544 ac->llgr_able = 0;
545 ac->llgr_flags = 0;
546 ac->llgr_time = 0;
547 }
548
549 caps->llgr_aware = 1;
550
551 for (i = 0; i < cl; i += 7)
552 {
553 af = get_af3(pos+2+i);
554 ac = bgp_get_af_caps(caps, af);
555 ac->llgr_able = 1;
556 ac->llgr_flags = pos[2+i+3];
557 ac->llgr_time = get_u24(pos + 2+i+4);
558 }
559 break;
560
561 /* We can safely ignore all other capabilities */
562 }
563
564 ADVANCE(pos, len, 2 + cl);
565 }
566
567 /* The LLGR capability must be advertised together with the GR capability,
568 otherwise it must be disregarded */
569 if (!caps->gr_aware && caps->llgr_aware)
570 {
571 caps->llgr_aware = 0;
572 WALK_AF_CAPS(caps, ac)
573 {
574 ac->llgr_able = 0;
575 ac->llgr_flags = 0;
576 ac->llgr_time = 0;
577 }
578 }
579
580 return;
581
582 err:
583 bgp_error(conn, 2, 0, NULL, 0);
584 return;
585 }
586
587 static int
588 bgp_check_capabilities(struct bgp_conn *conn)
589 {
590 struct bgp_proto *p = conn->bgp;
591 struct bgp_caps *local = conn->local_caps;
592 struct bgp_caps *remote = conn->remote_caps;
593 struct bgp_channel *c;
594 int count = 0;
595
596 /* This is partially overlapping with bgp_conn_enter_established_state(),
597 but we need to run this just after we receive OPEN message */
598
599 WALK_LIST(c, p->p.channels)
600 {
601 const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi);
602 const struct bgp_af_caps *rem = bgp_find_af_caps(remote, c->afi);
603
604 /* Find out whether this channel will be active */
605 int active = loc && loc->ready &&
606 ((rem && rem->ready) || (!remote->length && (c->afi == BGP_AF_IPV4)));
607
608 /* Mandatory must be active */
609 if (c->cf->mandatory && !active)
610 return 0;
611
612 if (active)
613 count++;
614 }
615
616 /* We need at least one channel active */
617 if (!count)
618 return 0;
619
620 return 1;
621 }
622
623 static int
624 bgp_read_options(struct bgp_conn *conn, byte *pos, int len)
625 {
626 struct bgp_proto *p = conn->bgp;
627 struct bgp_caps *caps;
628 int ol;
629
630 /* Max number of announced AFIs is limited by max option length (255) */
631 caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps));
632 memset(caps, 0, sizeof(struct bgp_caps));
633
634 while (len > 0)
635 {
636 if ((len < 2) || (len < (2 + pos[1])))
637 { bgp_error(conn, 2, 0, NULL, 0); return -1; }
638
639 ol = pos[1];
640 if (pos[0] == 2)
641 {
642 /* BGP capabilities, RFC 5492 */
643 if (p->cf->capabilities)
644 bgp_read_capabilities(conn, caps, pos + 2, ol);
645 }
646 else
647 {
648 /* Unknown option */
649 bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */
650 return -1;
651 }
652
653 ADVANCE(pos, len, 2 + ol);
654 }
655
656 uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps);
657 conn->remote_caps = mb_allocz(p->p.pool, n);
658 memcpy(conn->remote_caps, caps, n);
659
660 return 0;
661 }
662
663 static byte *
664 bgp_create_open(struct bgp_conn *conn, byte *buf)
665 {
666 struct bgp_proto *p = conn->bgp;
667
668 BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
669 BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id);
670
671 buf[0] = BGP_VERSION;
672 put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS);
673 put_u16(buf+3, p->cf->hold_time);
674 put_u32(buf+5, p->local_id);
675
676 if (p->cf->capabilities)
677 {
678 /* Prepare local_caps and write capabilities to buffer */
679 byte *end = bgp_write_capabilities(conn, buf+12);
680 uint len = end - (buf+12);
681
682 buf[9] = len + 2; /* Optional parameters length */
683 buf[10] = 2; /* Option 2: Capability list */
684 buf[11] = len; /* Option data length */
685
686 return end;
687 }
688 else
689 {
690 buf[9] = 0; /* No optional parameters */
691 return buf + 10;
692 }
693
694 return buf;
695 }
696
697 static void
698 bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
699 {
700 struct bgp_proto *p = conn->bgp;
701 struct bgp_conn *other;
702 u32 asn, hold, id;
703
704 /* Check state */
705 if (conn->state != BS_OPENSENT)
706 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
707
708 /* Check message contents */
709 if (len < 29 || len != 29 + (uint) pkt[28])
710 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
711
712 if (pkt[19] != BGP_VERSION)
713 { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; }
714
715 asn = get_u16(pkt+20);
716 hold = get_u16(pkt+22);
717 id = get_u32(pkt+24);
718 BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id);
719
720 if (bgp_read_options(conn, pkt+29, pkt[28]) < 0)
721 return;
722
723 if (hold > 0 && hold < 3)
724 { bgp_error(conn, 2, 6, pkt+22, 2); return; }
725
726 /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
727 if (!id || (p->is_internal && id == p->local_id))
728 { bgp_error(conn, 2, 3, pkt+24, -4); return; }
729
730 /* RFC 5492 4 - check for required capabilities */
731 if (p->cf->capabilities && !bgp_check_capabilities(conn))
732 { bgp_error(conn, 2, 7, NULL, 0); return; }
733
734 struct bgp_caps *caps = conn->remote_caps;
735
736 if (caps->as4_support)
737 {
738 u32 as4 = caps->as4_number;
739
740 if ((as4 != asn) && (asn != AS_TRANS))
741 log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
742
743 /* When remote ASN is unspecified, it must be external one */
744 if (p->remote_as ? (as4 != p->remote_as) : (as4 == p->local_as))
745 { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; }
746
747 conn->received_as = as4;
748 }
749 else
750 {
751 if (p->remote_as ? (asn != p->remote_as) : (asn == p->local_as))
752 { bgp_error(conn, 2, 2, pkt+20, 2); return; }
753
754 conn->received_as = asn;
755 }
756
757 /* Check the other connection */
758 other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
759 switch (other->state)
760 {
761 case BS_CONNECT:
762 case BS_ACTIVE:
763 /* Stop outgoing connection attempts */
764 bgp_conn_enter_idle_state(other);
765 break;
766
767 case BS_IDLE:
768 case BS_OPENSENT:
769 case BS_CLOSE:
770 break;
771
772 case BS_OPENCONFIRM:
773 /*
774 * Description of collision detection rules in RFC 4271 is confusing and
775 * contradictory, but it is essentially:
776 *
777 * 1. Router with higher ID is dominant
778 * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
779 * 3. When both connections are in OpenConfirm state, one initiated by
780 * the dominant router is kept.
781 *
782 * The first line in the expression below evaluates whether the neighbor
783 * is dominant, the second line whether the new connection was initiated
784 * by the neighbor. If both are true (or both are false), we keep the new
785 * connection, otherwise we keep the old one.
786 */
787 if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as)))
788 == (conn == &p->incoming_conn))
789 {
790 /* Should close the other connection */
791 BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
792 bgp_error(other, 6, 7, NULL, 0);
793 break;
794 }
795 /* Fall thru */
796 case BS_ESTABLISHED:
797 /* Should close this connection */
798 BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
799 bgp_error(conn, 6, 7, NULL, 0);
800 return;
801
802 default:
803 bug("bgp_rx_open: Unknown state");
804 }
805
806 /* Update our local variables */
807 conn->hold_time = MIN(hold, p->cf->hold_time);
808 conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
809 conn->as4_session = conn->local_caps->as4_support && caps->as4_support;
810 conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages;
811 p->remote_id = id;
812
813 DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
814 conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session);
815
816 bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
817 bgp_start_timer(conn->hold_timer, conn->hold_time);
818 bgp_conn_enter_openconfirm_state(conn);
819 }
820
821
822 /*
823 * Next hop handling
824 */
825
/*
 * Helpers for reporting problems during UPDATE processing. All of them expect
 * a variable 's' in scope whose s->proto->p.name is the protocol name.
 */

/* Log the message with the protocol name prepended */
#define REPORT(msg, args...) \
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })

/* Log the message and return from the calling (void) function */
#define DISCARD(msg, args...) \
  ({ REPORT(msg, ## args); return; })

/* Log the message, set s->err_withdraw and return from the calling function */
#define WITHDRAW(msg, args...) \
  ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })

/* Message texts used with the macros above */
#define BAD_AFI         "Unexpected AF <%u/%u> in UPDATE"
#define BAD_NEXT_HOP    "Invalid NEXT_HOP attribute"
#define NO_NEXT_HOP     "Missing NEXT_HOP attribute"
#define NO_LABEL_STACK  "Missing MPLS stack"
837 #define NO_NEXT_HOP "Missing NEXT_HOP attribute"
838 #define NO_LABEL_STACK "Missing MPLS stack"
839
840
841 static void
842 bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
843 {
844 struct bgp_proto *p = s->proto;
845 struct bgp_channel *c = s->channel;
846
847 if (c->cf->gw_mode == GW_DIRECT)
848 {
849 neighbor *nbr = NULL;
850
851 /* GW_DIRECT -> single_hop -> p->neigh != NULL */
852 if (ipa_nonzero(gw))
853 nbr = neigh_find(&p->p, gw, NULL, 0);
854 else if (ipa_nonzero(ll))
855 nbr = neigh_find(&p->p, ll, p->neigh->iface, 0);
856
857 if (!nbr || (nbr->scope == SCOPE_HOST))
858 WITHDRAW(BAD_NEXT_HOP);
859
860 a->dest = RTD_UNICAST;
861 a->nh.gw = nbr->addr;
862 a->nh.iface = nbr->iface;
863 }
864 else /* GW_RECURSIVE */
865 {
866 if (ipa_zero(gw))
867 WITHDRAW(BAD_NEXT_HOP);
868
869 rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6;
870 s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table);
871
872 if (!s->mpls)
873 rta_apply_hostentry(a, s->hostentry, NULL);
874
875 /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */
876 }
877 }
878
879 static void
880 bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
881 {
882 if (lnum > MPLS_MAX_LABEL_STACK)
883 {
884 REPORT("Too many MPLS labels ($u)", lnum);
885
886 a->dest = RTD_UNREACHABLE;
887 a->hostentry = NULL;
888 a->nh = (struct nexthop) { };
889 return;
890 }
891
892 /* Handle implicit NULL as empty MPLS stack */
893 if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL))
894 lnum = 0;
895
896 if (s->channel->cf->gw_mode == GW_DIRECT)
897 {
898 a->nh.labels = lnum;
899 memcpy(a->nh.label, labels, 4*lnum);
900 }
901 else /* GW_RECURSIVE */
902 {
903 mpls_label_stack ms;
904
905 ms.len = lnum;
906 memcpy(ms.stack, labels, 4*lnum);
907 rta_apply_hostentry(a, s->hostentry, &ms);
908 }
909 }
910
911
912 static int
913 bgp_match_src(struct bgp_export_state *s, int mode)
914 {
915 switch (mode)
916 {
917 case NH_NO: return 0;
918 case NH_ALL: return 1;
919 case NH_IBGP: return s->src && s->src->is_internal;
920 case NH_EBGP: return s->src && !s->src->is_internal;
921 default: return 0;
922 }
923 }
924
925 static inline int
926 bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
927 {
928 struct bgp_proto *p = s->proto;
929 struct bgp_channel *c = s->channel;
930 ip_addr *nh = (void *) a->u.ptr->data;
931
932 /* Handle next hop self option */
933 if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self))
934 return 0;
935
936 /* Handle next hop keep option */
937 if (c->cf->next_hop_keep && bgp_match_src(s, c->cf->next_hop_keep))
938 return 1;
939
940 /* Keep it when explicitly set in export filter */
941 if (a->type & EAF_FRESH)
942 return 1;
943
944 /* Check for non-matching AF */
945 if ((ipa_is_ip4(*nh) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop)
946 return 0;
947
948 /* Keep it when exported to internal peers */
949 if (p->is_interior && ipa_nonzero(*nh))
950 return 1;
951
952 /* Keep it when forwarded between single-hop BGPs on the same iface */
953 struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL;
954 return p->neigh && (p->neigh->iface == ifa);
955 }
956
957 static inline int
958 bgp_use_gateway(struct bgp_export_state *s)
959 {
960 struct bgp_proto *p = s->proto;
961 struct bgp_channel *c = s->channel;
962 rta *ra = s->route->attrs;
963
964 /* Handle next hop self option - also applies to gateway */
965 if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self))
966 return 0;
967
968 /* We need one valid global gateway */
969 if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw))
970 return 0;
971
972 /* Check for non-matching AF */
973 if ((ipa_is_ip4(ra->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop)
974 return 0;
975
976 /* Use it when exported to internal peers */
977 if (p->is_interior)
978 return 1;
979
980 /* Use it when forwarded to single-hop BGP peer on on the same iface */
981 return p->neigh && (p->neigh->iface == ra->nh.iface);
982 }
983
984 static void
985 bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
986 {
987 if (!a || !bgp_use_next_hop(s, a))
988 {
989 if (bgp_use_gateway(s))
990 {
991 rta *ra = s->route->attrs;
992 ip_addr nh[1] = { ra->nh.gw };
993 bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
994
995 if (s->mpls)
996 {
997 u32 implicit_null = BGP_MPLS_NULL;
998 u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null;
999 uint lnum = ra->nh.labels ? ra->nh.labels : 1;
1000 bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4);
1001 }
1002 }
1003 else
1004 {
1005 ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
1006 bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
1007
1008 /* TODO: Use local MPLS assigned label */
1009 if (s->mpls)
1010 {
1011 u32 implicit_null = BGP_MPLS_NULL;
1012 bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4);
1013 }
1014 }
1015 }
1016
1017 /* Check if next hop is valid */
1018 a = bgp_find_attr(*to, BA_NEXT_HOP);
1019 if (!a)
1020 WITHDRAW(NO_NEXT_HOP);
1021
1022 ip_addr *nh = (void *) a->u.ptr->data;
1023 ip_addr peer = s->proto->remote_ip;
1024 uint len = a->u.ptr->length;
1025
1026 /* Forbid zero next hop */
1027 if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1])))
1028 WITHDRAW(BAD_NEXT_HOP);
1029
1030 /* Forbid next hop equal to neighbor IP */
1031 if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
1032 WITHDRAW(BAD_NEXT_HOP);
1033
1034 /* Forbid next hop with non-matching AF */
1035 if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) &&
1036 !s->channel->ext_next_hop)
1037 WITHDRAW(BAD_NEXT_HOP);
1038
1039 /* Just check if MPLS stack */
1040 if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK))
1041 WITHDRAW(NO_LABEL_STACK);
1042 }
1043
1044 static uint
1045 bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
1046 {
1047 /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
1048 ip_addr *nh = (void *) a->u.ptr->data;
1049 uint len = a->u.ptr->length;
1050
1051 ASSERT((len == 16) || (len == 32));
1052
1053 /*
1054 * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1055 * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference
1056 * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped
1057 * IPv6 address with IPv6 NLRI.
1058 */
1059
1060 if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1061 {
1062 put_ip4(buf, ipa_to_ip4(nh[0]));
1063 return 4;
1064 }
1065
1066 put_ip6(buf, ipa_to_ip6(nh[0]));
1067
1068 if (len == 32)
1069 put_ip6(buf+16, ipa_to_ip6(nh[1]));
1070
1071 return len;
1072 }
1073
1074 static void
1075 bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1076 {
1077 struct bgp_channel *c = s->channel;
1078 struct adata *ad = lp_alloc_adata(s->pool, 32);
1079 ip_addr *nh = (void *) ad->data;
1080
1081 if (len == 4)
1082 {
1083 nh[0] = ipa_from_ip4(get_ip4(data));
1084 nh[1] = IPA_NONE;
1085 }
1086 else if (len == 16)
1087 {
1088 nh[0] = ipa_from_ip6(get_ip6(data));
1089 nh[1] = IPA_NONE;
1090
1091 if (ipa_is_link_local(nh[0]))
1092 { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1093 }
1094 else if (len == 32)
1095 {
1096 nh[0] = ipa_from_ip6(get_ip6(data));
1097 nh[1] = ipa_from_ip6(get_ip6(data+16));
1098
1099 if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1100 nh[1] = IPA_NONE;
1101 }
1102 else
1103 bgp_parse_error(s, 9);
1104
1105 if (ipa_zero(nh[1]))
1106 ad->length = 16;
1107
1108 if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1109 WITHDRAW(BAD_NEXT_HOP);
1110
1111 // XXXX validate next hop
1112
1113 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1114 bgp_apply_next_hop(s, a, nh[0], nh[1]);
1115 }
1116
1117 static uint
1118 bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
1119 {
1120 ip_addr *nh = (void *) a->u.ptr->data;
1121 uint len = a->u.ptr->length;
1122
1123 ASSERT((len == 16) || (len == 32));
1124
1125 /*
1126 * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1127 * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference
1128 * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped
1129 * IPv6 address with VPNv6 NLRI.
1130 */
1131
1132 if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1133 {
1134 put_u64(buf, 0); /* VPN RD is 0 */
1135 put_ip4(buf+8, ipa_to_ip4(nh[0]));
1136 return 12;
1137 }
1138
1139 put_u64(buf, 0); /* VPN RD is 0 */
1140 put_ip6(buf+8, ipa_to_ip6(nh[0]));
1141
1142 if (len == 16)
1143 return 24;
1144
1145 put_u64(buf+24, 0); /* VPN RD is 0 */
1146 put_ip6(buf+32, ipa_to_ip6(nh[1]));
1147
1148 return 48;
1149 }
1150
1151 static void
1152 bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1153 {
1154 struct bgp_channel *c = s->channel;
1155 struct adata *ad = lp_alloc_adata(s->pool, 32);
1156 ip_addr *nh = (void *) ad->data;
1157
1158 if (len == 12)
1159 {
1160 nh[0] = ipa_from_ip4(get_ip4(data+8));
1161 nh[1] = IPA_NONE;
1162 }
1163 else if (len == 24)
1164 {
1165 nh[0] = ipa_from_ip6(get_ip6(data+8));
1166 nh[1] = IPA_NONE;
1167
1168 if (ipa_is_link_local(nh[0]))
1169 { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1170 }
1171 else if (len == 48)
1172 {
1173 nh[0] = ipa_from_ip6(get_ip6(data+8));
1174 nh[1] = ipa_from_ip6(get_ip6(data+32));
1175
1176 if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1177 nh[1] = IPA_NONE;
1178 }
1179 else
1180 bgp_parse_error(s, 9);
1181
1182 if (ipa_zero(nh[1]))
1183 ad->length = 16;
1184
1185 /* XXXX which error */
1186 if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0)))
1187 bgp_parse_error(s, 9);
1188
1189 if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1190 WITHDRAW(BAD_NEXT_HOP);
1191
1192 // XXXX validate next hop
1193
1194 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1195 bgp_apply_next_hop(s, a, nh[0], nh[1]);
1196 }
1197
1198
1199
1200 static uint
1201 bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED)
1202 {
1203 return 0;
1204 }
1205
1206 static void
1207 bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED)
1208 {
1209 /*
1210 * Although we expect no next hop and RFC 7606 7.11 states that attribute
1211 * MP_REACH_NLRI with unexpected next hop length is considered malformed,
1212 * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt.
1213 */
1214
1215 return;
1216 }
1217
1218 static void
1219 bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to)
1220 {
1221 /* NEXT_HOP shall not pass */
1222 if (a)
1223 bgp_unset_attr(to, s->pool, BA_NEXT_HOP);
1224 }
1225
1226
1227 /*
1228 * UPDATE
1229 */
1230
1231 static void
1232 bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
1233 {
1234 if (path_id != s->last_id)
1235 {
1236 s->last_src = rt_get_source(&s->proto->p, path_id);
1237 s->last_id = path_id;
1238
1239 rta_free(s->cached_rta);
1240 s->cached_rta = NULL;
1241 }
1242
1243 if (!a0)
1244 {
1245 /* Route withdraw */
1246 rte_update3(&s->channel->c, n, NULL, s->last_src);
1247 return;
1248 }
1249
1250 /* Prepare cached route attributes */
1251 if (s->cached_rta == NULL)
1252 {
1253 a0->src = s->last_src;
1254
1255 /* Workaround for rta_lookup() breaking eattrs */
1256 ea_list *ea = a0->eattrs;
1257 s->cached_rta = rta_lookup(a0);
1258 a0->eattrs = ea;
1259 }
1260
1261 rta *a = rta_clone(s->cached_rta);
1262 rte *e = rte_get_temp(a);
1263
1264 e->pflags = 0;
1265 e->u.bgp.suppressed = 0;
1266 e->u.bgp.stale = -1;
1267 rte_update3(&s->channel->c, n, e, s->last_src);
1268 }
1269
1270 static void
1271 bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen)
1272 {
1273 u32 dummy = 0;
1274 u32 *labels = mpls ? (u32 *) mpls->data : &dummy;
1275 uint lnum = mpls ? (mpls->length / 4) : 1;
1276
1277 for (uint i = 0; i < lnum; i++)
1278 {
1279 put_u24(*pos, labels[i] << 4);
1280 ADVANCE(*pos, *size, 3);
1281 }
1282
1283 /* Add bottom-of-stack flag */
1284 (*pos)[-1] |= BGP_MPLS_BOS;
1285
1286 *pxlen += 24 * lnum;
1287 }
1288
1289 static void
1290 bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a)
1291 {
1292 u32 labels[BGP_MPLS_MAX], label;
1293 uint lnum = 0;
1294
1295 do {
1296 if (*pxlen < 24)
1297 bgp_parse_error(s, 1);
1298
1299 label = get_u24(*pos);
1300 labels[lnum++] = label >> 4;
1301 ADVANCE(*pos, *len, 3);
1302 *pxlen -= 24;
1303
1304 /* RFC 8277 2.4 - withdraw does not have variable-size MPLS stack but
1305 fixed-size 24-bit Compatibility field, which MUST be ignored */
1306 if (!a && !s->err_withdraw)
1307 return;
1308 }
1309 while (!(label & BGP_MPLS_BOS));
1310
1311 if (!a)
1312 return;
1313
1314 /* Attach MPLS attribute unless we already have one */
1315 if (!s->mpls_labels)
1316 {
1317 s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX);
1318 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels);
1319 }
1320
1321 /* Overwrite data in the attribute */
1322 s->mpls_labels->length = 4*lnum;
1323 memcpy(s->mpls_labels->data, labels, 4*lnum);
1324
1325 /* Update next hop entry in rta */
1326 bgp_apply_mpls_labels(s, a, labels, lnum);
1327
1328 /* Attributes were changed, invalidate cached entry */
1329 rta_free(s->cached_rta);
1330 s->cached_rta = NULL;
1331
1332 return;
1333 }
1334
1335 static uint
1336 bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1337 {
1338 byte *pos = buf;
1339
1340 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1341 {
1342 struct bgp_prefix *px = HEAD(buck->prefixes);
1343 struct net_addr_ip4 *net = (void *) px->net;
1344
1345 /* Encode path ID */
1346 if (s->add_path)
1347 {
1348 put_u32(pos, px->path_id);
1349 ADVANCE(pos, size, 4);
1350 }
1351
1352 /* Encode prefix length */
1353 *pos = net->pxlen;
1354 ADVANCE(pos, size, 1);
1355
1356 /* Encode MPLS labels */
1357 if (s->mpls)
1358 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1359
1360 /* Encode prefix body */
1361 ip4_addr a = ip4_hton(net->prefix);
1362 uint b = (net->pxlen + 7) / 8;
1363 memcpy(pos, &a, b);
1364 ADVANCE(pos, size, b);
1365
1366 bgp_free_prefix(s->channel, px);
1367 }
1368
1369 return pos - buf;
1370 }
1371
1372 static void
1373 bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1374 {
1375 while (len)
1376 {
1377 net_addr_ip4 net;
1378 u32 path_id = 0;
1379
1380 /* Decode path ID */
1381 if (s->add_path)
1382 {
1383 if (len < 5)
1384 bgp_parse_error(s, 1);
1385
1386 path_id = get_u32(pos);
1387 ADVANCE(pos, len, 4);
1388 }
1389
1390 /* Decode prefix length */
1391 uint l = *pos;
1392 ADVANCE(pos, len, 1);
1393
1394 if (len < ((l + 7) / 8))
1395 bgp_parse_error(s, 1);
1396
1397 /* Decode MPLS labels */
1398 if (s->mpls)
1399 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1400
1401 if (l > IP4_MAX_PREFIX_LENGTH)
1402 bgp_parse_error(s, 10);
1403
1404 /* Decode prefix body */
1405 ip4_addr addr = IP4_NONE;
1406 uint b = (l + 7) / 8;
1407 memcpy(&addr, pos, b);
1408 ADVANCE(pos, len, b);
1409
1410 net = NET_ADDR_IP4(ip4_ntoh(addr), l);
1411 net_normalize_ip4(&net);
1412
1413 // XXXX validate prefix
1414
1415 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1416 }
1417 }
1418
1419
1420 static uint
1421 bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1422 {
1423 byte *pos = buf;
1424
1425 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1426 {
1427 struct bgp_prefix *px = HEAD(buck->prefixes);
1428 struct net_addr_ip6 *net = (void *) px->net;
1429
1430 /* Encode path ID */
1431 if (s->add_path)
1432 {
1433 put_u32(pos, px->path_id);
1434 ADVANCE(pos, size, 4);
1435 }
1436
1437 /* Encode prefix length */
1438 *pos = net->pxlen;
1439 ADVANCE(pos, size, 1);
1440
1441 /* Encode MPLS labels */
1442 if (s->mpls)
1443 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1444
1445 /* Encode prefix body */
1446 ip6_addr a = ip6_hton(net->prefix);
1447 uint b = (net->pxlen + 7) / 8;
1448 memcpy(pos, &a, b);
1449 ADVANCE(pos, size, b);
1450
1451 bgp_free_prefix(s->channel, px);
1452 }
1453
1454 return pos - buf;
1455 }
1456
1457 static void
1458 bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1459 {
1460 while (len)
1461 {
1462 net_addr_ip6 net;
1463 u32 path_id = 0;
1464
1465 /* Decode path ID */
1466 if (s->add_path)
1467 {
1468 if (len < 5)
1469 bgp_parse_error(s, 1);
1470
1471 path_id = get_u32(pos);
1472 ADVANCE(pos, len, 4);
1473 }
1474
1475 /* Decode prefix length */
1476 uint l = *pos;
1477 ADVANCE(pos, len, 1);
1478
1479 if (len < ((l + 7) / 8))
1480 bgp_parse_error(s, 1);
1481
1482 /* Decode MPLS labels */
1483 if (s->mpls)
1484 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1485
1486 if (l > IP6_MAX_PREFIX_LENGTH)
1487 bgp_parse_error(s, 10);
1488
1489 /* Decode prefix body */
1490 ip6_addr addr = IP6_NONE;
1491 uint b = (l + 7) / 8;
1492 memcpy(&addr, pos, b);
1493 ADVANCE(pos, len, b);
1494
1495 net = NET_ADDR_IP6(ip6_ntoh(addr), l);
1496 net_normalize_ip6(&net);
1497
1498 // XXXX validate prefix
1499
1500 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1501 }
1502 }
1503
1504 static uint
1505 bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1506 {
1507 byte *pos = buf;
1508
1509 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1510 {
1511 struct bgp_prefix *px = HEAD(buck->prefixes);
1512 struct net_addr_vpn4 *net = (void *) px->net;
1513
1514 /* Encode path ID */
1515 if (s->add_path)
1516 {
1517 put_u32(pos, px->path_id);
1518 ADVANCE(pos, size, 4);
1519 }
1520
1521 /* Encode prefix length */
1522 *pos = 64 + net->pxlen;
1523 ADVANCE(pos, size, 1);
1524
1525 /* Encode MPLS labels */
1526 if (s->mpls)
1527 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1528
1529 /* Encode route distinguisher */
1530 put_u64(pos, net->rd);
1531 ADVANCE(pos, size, 8);
1532
1533 /* Encode prefix body */
1534 ip4_addr a = ip4_hton(net->prefix);
1535 uint b = (net->pxlen + 7) / 8;
1536 memcpy(pos, &a, b);
1537 ADVANCE(pos, size, b);
1538
1539 bgp_free_prefix(s->channel, px);
1540 }
1541
1542 return pos - buf;
1543 }
1544
1545 static void
1546 bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1547 {
1548 while (len)
1549 {
1550 net_addr_vpn4 net;
1551 u32 path_id = 0;
1552
1553 /* Decode path ID */
1554 if (s->add_path)
1555 {
1556 if (len < 5)
1557 bgp_parse_error(s, 1);
1558
1559 path_id = get_u32(pos);
1560 ADVANCE(pos, len, 4);
1561 }
1562
1563 /* Decode prefix length */
1564 uint l = *pos;
1565 ADVANCE(pos, len, 1);
1566
1567 if (len < ((l + 7) / 8))
1568 bgp_parse_error(s, 1);
1569
1570 /* Decode MPLS labels */
1571 if (s->mpls)
1572 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1573
1574 /* Decode route distinguisher */
1575 if (l < 64)
1576 bgp_parse_error(s, 1);
1577
1578 u64 rd = get_u64(pos);
1579 ADVANCE(pos, len, 8);
1580 l -= 64;
1581
1582 if (l > IP4_MAX_PREFIX_LENGTH)
1583 bgp_parse_error(s, 10);
1584
1585 /* Decode prefix body */
1586 ip4_addr addr = IP4_NONE;
1587 uint b = (l + 7) / 8;
1588 memcpy(&addr, pos, b);
1589 ADVANCE(pos, len, b);
1590
1591 net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd);
1592 net_normalize_vpn4(&net);
1593
1594 // XXXX validate prefix
1595
1596 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1597 }
1598 }
1599
1600
1601 static uint
1602 bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1603 {
1604 byte *pos = buf;
1605
1606 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1607 {
1608 struct bgp_prefix *px = HEAD(buck->prefixes);
1609 struct net_addr_vpn6 *net = (void *) px->net;
1610
1611 /* Encode path ID */
1612 if (s->add_path)
1613 {
1614 put_u32(pos, px->path_id);
1615 ADVANCE(pos, size, 4);
1616 }
1617
1618 /* Encode prefix length */
1619 *pos = 64 + net->pxlen;
1620 ADVANCE(pos, size, 1);
1621
1622 /* Encode MPLS labels */
1623 if (s->mpls)
1624 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1625
1626 /* Encode route distinguisher */
1627 put_u64(pos, net->rd);
1628 ADVANCE(pos, size, 8);
1629
1630 /* Encode prefix body */
1631 ip6_addr a = ip6_hton(net->prefix);
1632 uint b = (net->pxlen + 7) / 8;
1633 memcpy(pos, &a, b);
1634 ADVANCE(pos, size, b);
1635
1636 bgp_free_prefix(s->channel, px);
1637 }
1638
1639 return pos - buf;
1640 }
1641
1642 static void
1643 bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1644 {
1645 while (len)
1646 {
1647 net_addr_vpn6 net;
1648 u32 path_id = 0;
1649
1650 /* Decode path ID */
1651 if (s->add_path)
1652 {
1653 if (len < 5)
1654 bgp_parse_error(s, 1);
1655
1656 path_id = get_u32(pos);
1657 ADVANCE(pos, len, 4);
1658 }
1659
1660 /* Decode prefix length */
1661 uint l = *pos;
1662 ADVANCE(pos, len, 1);
1663
1664 if (len < ((l + 7) / 8))
1665 bgp_parse_error(s, 1);
1666
1667 /* Decode MPLS labels */
1668 if (s->mpls)
1669 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1670
1671 /* Decode route distinguisher */
1672 if (l < 64)
1673 bgp_parse_error(s, 1);
1674
1675 u64 rd = get_u64(pos);
1676 ADVANCE(pos, len, 8);
1677 l -= 64;
1678
1679 if (l > IP6_MAX_PREFIX_LENGTH)
1680 bgp_parse_error(s, 10);
1681
1682 /* Decode prefix body */
1683 ip6_addr addr = IP6_NONE;
1684 uint b = (l + 7) / 8;
1685 memcpy(&addr, pos, b);
1686 ADVANCE(pos, len, b);
1687
1688 net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd);
1689 net_normalize_vpn6(&net);
1690
1691 // XXXX validate prefix
1692
1693 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1694 }
1695 }
1696
1697
1698 static uint
1699 bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1700 {
1701 byte *pos = buf;
1702
1703 while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1704 {
1705 struct bgp_prefix *px = HEAD(buck->prefixes);
1706 struct net_addr_flow4 *net = (void *) px->net;
1707 uint flen = net->length - sizeof(net_addr_flow4);
1708
1709 /* Encode path ID */
1710 if (s->add_path)
1711 {
1712 put_u32(pos, px->path_id);
1713 ADVANCE(pos, size, 4);
1714 }
1715
1716 if (flen > size)
1717 break;
1718
1719 /* Copy whole flow data including length */
1720 memcpy(pos, net->data, flen);
1721 ADVANCE(pos, size, flen);
1722
1723 bgp_free_prefix(s->channel, px);
1724 }
1725
1726 return pos - buf;
1727 }
1728
1729 static void
1730 bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1731 {
1732 while (len)
1733 {
1734 u32 path_id = 0;
1735
1736 /* Decode path ID */
1737 if (s->add_path)
1738 {
1739 if (len < 4)
1740 bgp_parse_error(s, 1);
1741
1742 path_id = get_u32(pos);
1743 ADVANCE(pos, len, 4);
1744 }
1745
1746 if (len < 2)
1747 bgp_parse_error(s, 1);
1748
1749 /* Decode flow length */
1750 uint hlen = flow_hdr_length(pos);
1751 uint dlen = flow_read_length(pos);
1752 uint flen = hlen + dlen;
1753 byte *data = pos + hlen;
1754
1755 if (len < flen)
1756 bgp_parse_error(s, 1);
1757
1758 /* Validate flow data */
1759 enum flow_validated_state r = flow4_validate(data, dlen);
1760 if (r != FLOW_ST_VALID)
1761 {
1762 log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1763 bgp_parse_error(s, 1);
1764 }
1765
1766 if (data[0] != FLOW_TYPE_DST_PREFIX)
1767 {
1768 log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1769 bgp_parse_error(s, 1);
1770 }
1771
1772 /* Decode dst prefix */
1773 ip4_addr px = IP4_NONE;
1774 uint pxlen = data[1];
1775
1776 // FIXME: Use some generic function
1777 memcpy(&px, data+2, BYTES(pxlen));
1778 px = ip4_and(ip4_ntoh(px), ip4_mkmask(pxlen));
1779
1780 /* Prepare the flow */
1781 net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen);
1782 net_fill_flow4(n, px, pxlen, pos, flen);
1783 ADVANCE(pos, len, flen);
1784
1785 bgp_rte_update(s, n, path_id, a);
1786 }
1787 }
1788
1789
1790 static uint
1791 bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1792 {
1793 byte *pos = buf;
1794
1795 while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1796 {
1797 struct bgp_prefix *px = HEAD(buck->prefixes);
1798 struct net_addr_flow6 *net = (void *) px->net;
1799 uint flen = net->length - sizeof(net_addr_flow6);
1800
1801 /* Encode path ID */
1802 if (s->add_path)
1803 {
1804 put_u32(pos, px->path_id);
1805 ADVANCE(pos, size, 4);
1806 }
1807
1808 if (flen > size)
1809 break;
1810
1811 /* Copy whole flow data including length */
1812 memcpy(pos, net->data, flen);
1813 ADVANCE(pos, size, flen);
1814
1815 bgp_free_prefix(s->channel, px);
1816 }
1817
1818 return pos - buf;
1819 }
1820
1821 static void
1822 bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1823 {
1824 while (len)
1825 {
1826 u32 path_id = 0;
1827
1828 /* Decode path ID */
1829 if (s->add_path)
1830 {
1831 if (len < 4)
1832 bgp_parse_error(s, 1);
1833
1834 path_id = get_u32(pos);
1835 ADVANCE(pos, len, 4);
1836 }
1837
1838 if (len < 2)
1839 bgp_parse_error(s, 1);
1840
1841 /* Decode flow length */
1842 uint hlen = flow_hdr_length(pos);
1843 uint dlen = flow_read_length(pos);
1844 uint flen = hlen + dlen;
1845 byte *data = pos + hlen;
1846
1847 if (len < flen)
1848 bgp_parse_error(s, 1);
1849
1850 /* Validate flow data */
1851 enum flow_validated_state r = flow6_validate(data, dlen);
1852 if (r != FLOW_ST_VALID)
1853 {
1854 log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1855 bgp_parse_error(s, 1);
1856 }
1857
1858 if (data[0] != FLOW_TYPE_DST_PREFIX)
1859 {
1860 log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1861 bgp_parse_error(s, 1);
1862 }
1863
1864 /* Decode dst prefix */
1865 ip6_addr px = IP6_NONE;
1866 uint pxlen = data[1];
1867
1868 // FIXME: Use some generic function
1869 memcpy(&px, data+2, BYTES(pxlen));
1870 px = ip6_and(ip6_ntoh(px), ip6_mkmask(pxlen));
1871
1872 /* Prepare the flow */
1873 net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen);
1874 net_fill_flow6(n, px, pxlen, pos, flen);
1875 ADVANCE(pos, len, flen);
1876
1877 bgp_rte_update(s, n, path_id, a);
1878 }
1879 }
1880
1881
1882 static const struct bgp_af_desc bgp_af_table[] = {
1883 {
1884 .afi = BGP_AF_IPV4,
1885 .net = NET_IP4,
1886 .name = "ipv4",
1887 .encode_nlri = bgp_encode_nlri_ip4,
1888 .decode_nlri = bgp_decode_nlri_ip4,
1889 .encode_next_hop = bgp_encode_next_hop_ip,
1890 .decode_next_hop = bgp_decode_next_hop_ip,
1891 .update_next_hop = bgp_update_next_hop_ip,
1892 },
1893 {
1894 .afi = BGP_AF_IPV4_MC,
1895 .net = NET_IP4,
1896 .name = "ipv4-mc",
1897 .encode_nlri = bgp_encode_nlri_ip4,
1898 .decode_nlri = bgp_decode_nlri_ip4,
1899 .encode_next_hop = bgp_encode_next_hop_ip,
1900 .decode_next_hop = bgp_decode_next_hop_ip,
1901 .update_next_hop = bgp_update_next_hop_ip,
1902 },
1903 {
1904 .afi = BGP_AF_IPV4_MPLS,
1905 .net = NET_IP4,
1906 .mpls = 1,
1907 .name = "ipv4-mpls",
1908 .encode_nlri = bgp_encode_nlri_ip4,
1909 .decode_nlri = bgp_decode_nlri_ip4,
1910 .encode_next_hop = bgp_encode_next_hop_ip,
1911 .decode_next_hop = bgp_decode_next_hop_ip,
1912 .update_next_hop = bgp_update_next_hop_ip,
1913 },
1914 {
1915 .afi = BGP_AF_IPV6,
1916 .net = NET_IP6,
1917 .name = "ipv6",
1918 .encode_nlri = bgp_encode_nlri_ip6,
1919 .decode_nlri = bgp_decode_nlri_ip6,
1920 .encode_next_hop = bgp_encode_next_hop_ip,
1921 .decode_next_hop = bgp_decode_next_hop_ip,
1922 .update_next_hop = bgp_update_next_hop_ip,
1923 },
1924 {
1925 .afi = BGP_AF_IPV6_MC,
1926 .net = NET_IP6,
1927 .name = "ipv6-mc",
1928 .encode_nlri = bgp_encode_nlri_ip6,
1929 .decode_nlri = bgp_decode_nlri_ip6,
1930 .encode_next_hop = bgp_encode_next_hop_ip,
1931 .decode_next_hop = bgp_decode_next_hop_ip,
1932 .update_next_hop = bgp_update_next_hop_ip,
1933 },
1934 {
1935 .afi = BGP_AF_IPV6_MPLS,
1936 .net = NET_IP6,
1937 .mpls = 1,
1938 .name = "ipv6-mpls",
1939 .encode_nlri = bgp_encode_nlri_ip6,
1940 .decode_nlri = bgp_decode_nlri_ip6,
1941 .encode_next_hop = bgp_encode_next_hop_ip,
1942 .decode_next_hop = bgp_decode_next_hop_ip,
1943 .update_next_hop = bgp_update_next_hop_ip,
1944 },
1945 {
1946 .afi = BGP_AF_VPN4_MPLS,
1947 .net = NET_VPN4,
1948 .mpls = 1,
1949 .name = "vpn4-mpls",
1950 .encode_nlri = bgp_encode_nlri_vpn4,
1951 .decode_nlri = bgp_decode_nlri_vpn4,
1952 .encode_next_hop = bgp_encode_next_hop_vpn,
1953 .decode_next_hop = bgp_decode_next_hop_vpn,
1954 .update_next_hop = bgp_update_next_hop_ip,
1955 },
1956 {
1957 .afi = BGP_AF_VPN6_MPLS,
1958 .net = NET_VPN6,
1959 .mpls = 1,
1960 .name = "vpn6-mpls",
1961 .encode_nlri = bgp_encode_nlri_vpn6,
1962 .decode_nlri = bgp_decode_nlri_vpn6,
1963 .encode_next_hop = bgp_encode_next_hop_vpn,
1964 .decode_next_hop = bgp_decode_next_hop_vpn,
1965 .update_next_hop = bgp_update_next_hop_ip,
1966 },
1967 {
1968 .afi = BGP_AF_VPN4_MC,
1969 .net = NET_VPN4,
1970 .name = "vpn4-mc",
1971 .encode_nlri = bgp_encode_nlri_vpn4,
1972 .decode_nlri = bgp_decode_nlri_vpn4,
1973 .encode_next_hop = bgp_encode_next_hop_vpn,
1974 .decode_next_hop = bgp_decode_next_hop_vpn,
1975 .update_next_hop = bgp_update_next_hop_ip,
1976 },
1977 {
1978 .afi = BGP_AF_VPN6_MC,
1979 .net = NET_VPN6,
1980 .name = "vpn6-mc",
1981 .encode_nlri = bgp_encode_nlri_vpn6,
1982 .decode_nlri = bgp_decode_nlri_vpn6,
1983 .encode_next_hop = bgp_encode_next_hop_vpn,
1984 .decode_next_hop = bgp_decode_next_hop_vpn,
1985 .update_next_hop = bgp_update_next_hop_ip,
1986 },
1987 {
1988 .afi = BGP_AF_FLOW4,
1989 .net = NET_FLOW4,
1990 .no_igp = 1,
1991 .name = "flow4",
1992 .encode_nlri = bgp_encode_nlri_flow4,
1993 .decode_nlri = bgp_decode_nlri_flow4,
1994 .encode_next_hop = bgp_encode_next_hop_none,
1995 .decode_next_hop = bgp_decode_next_hop_none,
1996 .update_next_hop = bgp_update_next_hop_none,
1997 },
1998 {
1999 .afi = BGP_AF_FLOW6,
2000 .net = NET_FLOW6,
2001 .no_igp = 1,
2002 .name = "flow6",
2003 .encode_nlri = bgp_encode_nlri_flow6,
2004 .decode_nlri = bgp_decode_nlri_flow6,
2005 .encode_next_hop = bgp_encode_next_hop_none,
2006 .decode_next_hop = bgp_decode_next_hop_none,
2007 .update_next_hop = bgp_update_next_hop_none,
2008 },
2009 };
2010
2011 const struct bgp_af_desc *
2012 bgp_get_af_desc(u32 afi)
2013 {
2014 uint i;
2015 for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++)
2016 if (bgp_af_table[i].afi == afi)
2017 return &bgp_af_table[i];
2018
2019 return NULL;
2020 }
2021
2022 static inline uint
2023 bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2024 {
2025 return s->channel->desc->encode_nlri(s, buck, buf, end - buf);
2026 }
2027
2028 static inline uint
2029 bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf)
2030 {
2031 return s->channel->desc->encode_next_hop(s, nh, buf, 255);
2032 }
2033
2034 void
2035 bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to)
2036 {
2037 s->channel->desc->update_next_hop(s, a, to);
2038 }
2039
2040 #define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
2041
2042 static byte *
2043 bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2044 {
2045 /*
2046 * 2 B Withdrawn Routes Length (zero)
2047 * --- IPv4 Withdrawn Routes NLRI (unused)
2048 * 2 B Total Path Attribute Length
2049 * var Path Attributes
2050 * var IPv4 Network Layer Reachability Information
2051 */
2052
2053 int lr, la;
2054
2055 la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH);
2056 if (la < 0)
2057 {
2058 /* Attribute list too long */
2059 bgp_withdraw_bucket(s->channel, buck);
2060 return NULL;
2061 }
2062
2063 put_u16(buf+0, 0);
2064 put_u16(buf+2, la);
2065
2066 lr = bgp_encode_nlri(s, buck, buf+4+la, end);
2067
2068 return buf+4+la+lr;
2069 }
2070
2071 static byte *
2072 bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2073 {
2074 /*
2075 * 2 B IPv4 Withdrawn Routes Length (zero)
2076 * --- IPv4 Withdrawn Routes NLRI (unused)
2077 * 2 B Total Path Attribute Length
2078 * 1 B MP_REACH_NLRI hdr - Attribute Flags
2079 * 1 B MP_REACH_NLRI hdr - Attribute Type Code
2080 * 2 B MP_REACH_NLRI hdr - Length of Attribute Data
2081 * 2 B MP_REACH_NLRI data - Address Family Identifier
2082 * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier
2083 * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address
2084 * var MP_REACH_NLRI data - Network Address of Next Hop
2085 * 1 B MP_REACH_NLRI data - Reserved (zero)
2086 * var MP_REACH_NLRI data - Network Layer Reachability Information
2087 * var Rest of Path Attributes
2088 * --- IPv4 Network Layer Reachability Information (unused)
2089 */
2090
2091 int lh, lr, la; /* Lengths of next hop, NLRI and attributes */
2092
2093 /* Begin of MP_REACH_NLRI atribute */
2094 buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
2095 buf[5] = BA_MP_REACH_NLRI;
2096 put_u16(buf+6, 0); /* Will be fixed later */
2097 put_af3(buf+8, s->channel->afi);
2098 byte *pos = buf+11;
2099
2100 /* Encode attributes to temporary buffer */
2101 byte *abuf = alloca(MAX_ATTRS_LENGTH);
2102 la = bgp_encode_attrs(s, buck->eattrs, abuf, abuf + MAX_ATTRS_LENGTH);
2103 if (la < 0)
2104 {
2105 /* Attribute list too long */
2106 bgp_withdraw_bucket(s->channel, buck);
2107 return NULL;
2108 }
2109
2110 /* Encode the next hop */
2111 lh = bgp_encode_next_hop(s, s->mp_next_hop, pos+1);
2112 *pos = lh;
2113 pos += 1+lh;
2114
2115 /* Reserved field */
2116 *pos++ = 0;
2117
2118 /* Encode the NLRI */
2119 lr = bgp_encode_nlri(s, buck, pos, end - la);
2120 pos += lr;
2121
2122 /* End of MP_REACH_NLRI atribute, update data length */
2123 put_u16(buf+6, pos-buf-8);
2124
2125 /* Copy remaining attributes */
2126 memcpy(pos, abuf, la);
2127 pos += la;
2128
2129 /* Initial UPDATE fields */
2130 put_u16(buf+0, 0);
2131 put_u16(buf+2, pos-buf-4);
2132
2133 return pos;
2134 }
2135
2136 #undef MAX_ATTRS_LENGTH
2137
2138 static byte *
2139 bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2140 {
2141 /*
2142 * 2 B Withdrawn Routes Length
2143 * var IPv4 Withdrawn Routes NLRI
2144 * 2 B Total Path Attribute Length (zero)
2145 * --- Path Attributes (unused)
2146 * --- IPv4 Network Layer Reachability Information (unused)
2147 */
2148
2149 uint len = bgp_encode_nlri(s, buck, buf+2, end);
2150
2151 put_u16(buf+0, len);
2152 put_u16(buf+2+len, 0);
2153
2154 return buf+4+len;
2155 }
2156
2157 static byte *
2158 bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2159 {
2160 /*
2161 * 2 B Withdrawn Routes Length (zero)
2162 * --- IPv4 Withdrawn Routes NLRI (unused)
2163 * 2 B Total Path Attribute Length
2164 * 1 B MP_UNREACH_NLRI hdr - Attribute Flags
2165 * 1 B MP_UNREACH_NLRI hdr - Attribute Type Code
2166 * 2 B MP_UNREACH_NLRI hdr - Length of Attribute Data
2167 * 2 B MP_UNREACH_NLRI data - Address Family Identifier
2168 * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier
2169 * var MP_UNREACH_NLRI data - Network Layer Reachability Information
2170 * --- IPv4 Network Layer Reachability Information (unused)
2171 */
2172
2173 uint len = bgp_encode_nlri(s, buck, buf+11, end);
2174
2175 put_u16(buf+0, 0);
2176 put_u16(buf+2, 7+len);
2177
2178 /* Begin of MP_UNREACH_NLRI atribute */
2179 buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
2180 buf[5] = BA_MP_UNREACH_NLRI;
2181 put_u16(buf+6, 3+len);
2182 put_af3(buf+8, s->channel->afi);
2183
2184 return buf+11+len;
2185 }
2186
2187 static byte *
2188 bgp_create_update(struct bgp_channel *c, byte *buf)
2189 {
2190 struct bgp_proto *p = (void *) c->c.proto;
2191 struct bgp_bucket *buck;
2192 byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH);
2193 byte *res = NULL;
2194
2195 again: ;
2196
2197 /* Initialize write state */
2198 struct bgp_write_state s = {
2199 .proto = p,
2200 .channel = c,
2201 .pool = bgp_linpool,
2202 .mp_reach = (c->afi != BGP_AF_IPV4) || c->ext_next_hop,
2203 .as4_session = p->as4_session,
2204 .add_path = c->add_path_tx,
2205 .mpls = c->desc->mpls,
2206 };
2207
2208 /* Try unreachable bucket */
2209 if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
2210 {
2211 res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ?
2212 bgp_create_ip_unreach(&s, buck, buf, end):
2213 bgp_create_mp_unreach(&s, buck, buf, end);
2214
2215 goto done;
2216 }
2217
2218 /* Try reachable buckets */
2219 if (!EMPTY_LIST(c->bucket_queue))
2220 {
2221 buck = HEAD(c->bucket_queue);
2222
2223 /* Cleanup empty buckets */
2224 if (EMPTY_LIST(buck->prefixes))
2225 {
2226 bgp_free_bucket(c, buck);
2227 goto again;
2228 }
2229
2230 res = !s.mp_reach ?
2231 bgp_create_ip_reach(&s, buck, buf, end):
2232 bgp_create_mp_reach(&s, buck, buf, end);
2233
2234 if (EMPTY_LIST(buck->prefixes))
2235 bgp_free_bucket(c, buck);
2236 else
2237 bgp_defer_bucket(c, buck);
2238
2239 if (!res)
2240 goto again;
2241
2242 goto done;
2243 }
2244
2245 /* No more prefixes to send */
2246 return NULL;
2247
2248 done:
2249 BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
2250 lp_flush(s.pool);
2251
2252 return res;
2253 }
2254
2255 static byte *
2256 bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf)
2257 {
2258 /* Empty update packet */
2259 put_u32(buf, 0);
2260
2261 return buf+4;
2262 }
2263
2264 static byte *
2265 bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf)
2266 {
2267 put_u16(buf+0, 0);
2268 put_u16(buf+2, 6); /* length 4--9 */
2269
2270 /* Empty MP_UNREACH_NLRI atribute */
2271 buf[4] = BAF_OPTIONAL;
2272 buf[5] = BA_MP_UNREACH_NLRI;
2273 buf[6] = 3; /* Length 7--9 */
2274 put_af3(buf+7, c->afi);
2275
2276 return buf+10;
2277 }
2278
2279 static byte *
2280 bgp_create_end_mark(struct bgp_channel *c, byte *buf)
2281 {
2282 struct bgp_proto *p = (void *) c->c.proto;
2283
2284 BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");
2285
2286 return (c->afi == BGP_AF_IPV4) ?
2287 bgp_create_ip_end_mark(c, buf):
2288 bgp_create_mp_end_mark(c, buf);
2289 }
2290
2291 static inline void
2292 bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi)
2293 {
2294 struct bgp_proto *p = s->proto;
2295 struct bgp_channel *c = bgp_get_channel(p, afi);
2296
2297 BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
2298
2299 if (!c)
2300 DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2301
2302 if (c->load_state == BFS_LOADING)
2303 c->load_state = BFS_NONE;
2304
2305 if (p->p.gr_recovery)
2306 channel_graceful_restart_unlock(&c->c);
2307
2308 if (c->gr_active)
2309 bgp_graceful_restart_done(c);
2310 }
2311
2312 static inline void
2313 bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len)
2314 {
2315 struct bgp_channel *c = bgp_get_channel(s->proto, afi);
2316 rta *a = NULL;
2317
2318 if (!c)
2319 DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2320
2321 s->channel = c;
2322 s->add_path = c->add_path_rx;
2323 s->mpls = c->desc->mpls;
2324
2325 s->last_id = 0;
2326 s->last_src = s->proto->p.main_source;
2327
2328 /*
2329 * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
2330 * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
2331 * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
2332 * decode_next_hop hooks) by restoring a->eattrs afterwards.
2333 */
2334
2335 if (ea)
2336 {
2337 a = allocz(RTA_MAX_SIZE);
2338
2339 a->source = RTS_BGP;
2340 a->scope = SCOPE_UNIVERSE;
2341 a->from = s->proto->remote_ip;
2342 a->eattrs = ea;
2343
2344 c->desc->decode_next_hop(s, nh, nh_len, a);
2345
2346 /* Handle withdraw during next hop decoding */
2347 if (s->err_withdraw)
2348 a = NULL;
2349 }
2350
2351 c->desc->decode_nlri(s, nlri, len, a);
2352
2353 rta_free(s->cached_rta);
2354 s->cached_rta = NULL;
2355 }
2356
2357 static void
2358 bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
2359 {
2360 struct bgp_proto *p = conn->bgp;
2361 ea_list *ea = NULL;
2362
2363 BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
2364
2365 /* Workaround for some BGP implementations that skip initial KEEPALIVE */
2366 if (conn->state == BS_OPENCONFIRM)
2367 bgp_conn_enter_established_state(conn);
2368
2369 if (conn->state != BS_ESTABLISHED)
2370 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2371
2372 bgp_start_timer(conn->hold_timer, conn->hold_time);
2373
2374 /* Initialize parse state */
2375 struct bgp_parse_state s = {
2376 .proto = p,
2377 .pool = bgp_linpool,
2378 .as4_session = p->as4_session,
2379 };
2380
2381 /* Parse error handler */
2382 if (setjmp(s.err_jmpbuf))
2383 {
2384 bgp_error(conn, 3, s.err_subcode, NULL, 0);
2385 goto done;
2386 }
2387
2388 /* Check minimal length */
2389 if (len < 23)
2390 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2391
2392 /* Skip fixed header */
2393 uint pos = 19;
2394
2395 /*
2396 * UPDATE message format
2397 *
2398 * 2 B IPv4 Withdrawn Routes Length
2399 * var IPv4 Withdrawn Routes NLRI
2400 * 2 B Total Path Attribute Length
2401 * var Path Attributes
2402 * var IPv4 Reachable Routes NLRI
2403 */
2404
2405 s.ip_unreach_len = get_u16(pkt + pos);
2406 s.ip_unreach_nlri = pkt + pos + 2;
2407 pos += 2 + s.ip_unreach_len;
2408
2409 if (pos + 2 > len)
2410 bgp_parse_error(&s, 1);
2411
2412 s.attr_len = get_u16(pkt + pos);
2413 s.attrs = pkt + pos + 2;
2414 pos += 2 + s.attr_len;
2415
2416 if (pos > len)
2417 bgp_parse_error(&s, 1);
2418
2419 s.ip_reach_len = len - pos;
2420 s.ip_reach_nlri = pkt + pos;
2421
2422
2423 if (s.attr_len)
2424 ea = bgp_decode_attrs(&s, s.attrs, s.attr_len);
2425 else
2426 ea = NULL;
2427
2428 /* Check for End-of-RIB marker */
2429 if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len)
2430 { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; }
2431
2432 /* Check for MP End-of-RIB marker */
2433 if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len &&
2434 !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af)
2435 { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; }
2436
2437 if (s.ip_unreach_len)
2438 bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0);
2439
2440 if (s.mp_unreach_len)
2441 bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0);
2442
2443 if (s.ip_reach_len)
2444 bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len,
2445 ea, s.ip_next_hop_data, s.ip_next_hop_len);
2446
2447 if (s.mp_reach_len)
2448 bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len,
2449 ea, s.mp_next_hop_data, s.mp_next_hop_len);
2450
2451 done:
2452 rta_free(s.cached_rta);
2453 lp_flush(s.pool);
2454 return;
2455 }
2456
2457 static uint
2458 bgp_find_update_afi(byte *pos, uint len)
2459 {
2460 /*
2461 * This is stripped-down version of bgp_rx_update(), bgp_decode_attrs() and
2462 * bgp_decode_mp_[un]reach_nlri() used by MRT code in order to find out which
2463 * AFI/SAFI is associated with incoming UPDATE. Returns 0 for framing errors.
2464 */
2465 if (len < 23)
2466 return 0;
2467
2468 /* Assume there is no withrawn NLRI, read lengths and move to attribute list */
2469 uint wlen = get_u16(pos + 19);
2470 uint alen = get_u16(pos + 21);
2471 ADVANCE(pos, len, 23);
2472
2473 /* Either non-zero withdrawn NLRI, non-zero reachable NLRI, or IPv4 End-of-RIB */
2474 if ((wlen != 0) || (alen < len) || !alen)
2475 return BGP_AF_IPV4;
2476
2477 if (alen > len)
2478 return 0;
2479
2480 /* Process attribute list (alen == len) */
2481 while (len)
2482 {
2483 if (len < 2)
2484 return 0;
2485
2486 uint flags = pos[0];
2487 uint code = pos[1];
2488 ADVANCE(pos, len, 2);
2489
2490 uint ll = !(flags & BAF_EXT_LEN) ? 1 : 2;
2491 if (len < ll)
2492 return 0;
2493
2494 /* Read attribute length and move to attribute body */
2495 alen = (ll == 1) ? get_u8(pos) : get_u16(pos);
2496 ADVANCE(pos, len, ll);
2497
2498 if (len < alen)
2499 return 0;
2500
2501 /* Found MP NLRI */
2502 if ((code == BA_MP_REACH_NLRI) || (code == BA_MP_UNREACH_NLRI))
2503 {
2504 if (alen < 3)
2505 return 0;
2506
2507 return BGP_AF(get_u16(pos), pos[2]);
2508 }
2509
2510 /* Move to the next attribute */
2511 ADVANCE(pos, len, alen);
2512 }
2513
2514 /* No basic or MP NLRI, but there are some attributes -> error */
2515 return 0;
2516 }
2517
2518
2519 /*
2520 * ROUTE-REFRESH
2521 */
2522
2523 static inline byte *
2524 bgp_create_route_refresh(struct bgp_channel *c, byte *buf)
2525 {
2526 struct bgp_proto *p = (void *) c->c.proto;
2527
2528 BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");
2529
2530 /* Original route refresh request, RFC 2918 */
2531 put_af4(buf, c->afi);
2532 buf[2] = BGP_RR_REQUEST;
2533
2534 return buf+4;
2535 }
2536
2537 static inline byte *
2538 bgp_create_begin_refresh(struct bgp_channel *c, byte *buf)
2539 {
2540 struct bgp_proto *p = (void *) c->c.proto;
2541
2542 BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");
2543
2544 /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
2545 put_af4(buf, c->afi);
2546 buf[2] = BGP_RR_BEGIN;
2547
2548 return buf+4;
2549 }
2550
2551 static inline byte *
2552 bgp_create_end_refresh(struct bgp_channel *c, byte *buf)
2553 {
2554 struct bgp_proto *p = (void *) c->c.proto;
2555
2556 BGP_TRACE(D_PACKETS, "Sending END-OF-RR");
2557
2558 /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
2559 put_af4(buf, c->afi);
2560 buf[2] = BGP_RR_END;
2561
2562 return buf+4;
2563 }
2564
2565 static void
2566 bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
2567 {
2568 struct bgp_proto *p = conn->bgp;
2569
2570 if (conn->state != BS_ESTABLISHED)
2571 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2572
2573 if (!conn->local_caps->route_refresh)
2574 { bgp_error(conn, 1, 3, pkt+18, 1); return; }
2575
2576 if (len < (BGP_HEADER_LENGTH + 4))
2577 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2578
2579 if (len > (BGP_HEADER_LENGTH + 4))
2580 { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
2581
2582 struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19));
2583 if (!c)
2584 {
2585 log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
2586 p->p.name, pkt[21], get_u16(pkt+19), pkt[22]);
2587 return;
2588 }
2589
2590 /* RFC 7313 redefined reserved field as RR message subtype */
2591 uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST;
2592
2593 switch (subtype)
2594 {
2595 case BGP_RR_REQUEST:
2596 BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
2597 channel_request_feeding(&c->c);
2598 break;
2599
2600 case BGP_RR_BEGIN:
2601 BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
2602 bgp_refresh_begin(c);
2603 break;
2604
2605 case BGP_RR_END:
2606 BGP_TRACE(D_PACKETS, "Got END-OF-RR");
2607 bgp_refresh_end(c);
2608 break;
2609
2610 default:
2611 log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
2612 p->p.name, subtype);
2613 break;
2614 }
2615 }
2616
2617 static inline struct bgp_channel *
2618 bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn)
2619 {
2620 uint i = conn->last_channel;
2621
2622 /* Try the last channel, but at most several times */
2623 if ((conn->channels_to_send & (1 << i)) &&
2624 (conn->last_channel_count < 16))
2625 goto found;
2626
2627 /* Find channel with non-zero channels_to_send */
2628 do
2629 {
2630 i++;
2631 if (i >= p->channel_count)
2632 i = 0;
2633 }
2634 while (! (conn->channels_to_send & (1 << i)));
2635
2636 /* Use that channel */
2637 conn->last_channel = i;
2638 conn->last_channel_count = 0;
2639
2640 found:
2641 conn->last_channel_count++;
2642 return p->channel_map[i];
2643 }
2644
2645 static inline int
2646 bgp_send(struct bgp_conn *conn, uint type, uint len)
2647 {
2648 sock *sk = conn->sk;
2649 byte *buf = sk->tbuf;
2650
2651 memset(buf, 0xff, 16); /* Marker */
2652 put_u16(buf+16, len);
2653 buf[18] = type;
2654
2655 return sk_send(sk, len);
2656 }
2657
2658 /**
2659 * bgp_fire_tx - transmit packets
2660 * @conn: connection
2661 *
2662 * Whenever the transmit buffers of the underlying TCP connection
2663 * are free and we have any packets queued for sending, the socket functions
2664 * call bgp_fire_tx() which takes care of selecting the highest priority packet
2665 * queued (Notification > Keepalive > Open > Update), assembling its header
2666 * and body and sending it to the connection.
2667 */
2668 static int
2669 bgp_fire_tx(struct bgp_conn *conn)
2670 {
2671 struct bgp_proto *p = conn->bgp;
2672 struct bgp_channel *c;
2673 byte *buf, *pkt, *end;
2674 uint s;
2675
2676 if (!conn->sk)
2677 return 0;
2678
2679 buf = conn->sk->tbuf;
2680 pkt = buf + BGP_HEADER_LENGTH;
2681 s = conn->packets_to_send;
2682
2683 if (s & (1 << PKT_SCHEDULE_CLOSE))
2684 {
2685 /* We can finally close connection and enter idle state */
2686 bgp_conn_enter_idle_state(conn);
2687 return 0;
2688 }
2689 if (s & (1 << PKT_NOTIFICATION))
2690 {
2691 conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE;
2692 end = bgp_create_notification(conn, pkt);
2693 return bgp_send(conn, PKT_NOTIFICATION, end - buf);
2694 }
2695 else if (s & (1 << PKT_OPEN))
2696 {
2697 conn->packets_to_send &= ~(1 << PKT_OPEN);
2698 end = bgp_create_open(conn, pkt);
2699 return bgp_send(conn, PKT_OPEN, end - buf);
2700 }
2701 else if (s & (1 << PKT_KEEPALIVE))
2702 {
2703 conn->packets_to_send &= ~(1 << PKT_KEEPALIVE);
2704 BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
2705 bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
2706 return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH);
2707 }
2708 else while (conn->channels_to_send)
2709 {
2710 c = bgp_get_channel_to_send(p, conn);
2711 s = c->packets_to_send;
2712
2713 if (s & (1 << PKT_ROUTE_REFRESH))
2714 {
2715 c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH);
2716 end = bgp_create_route_refresh(c, pkt);
2717 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2718 }
2719 else if (s & (1 << PKT_BEGIN_REFRESH))
2720 {
2721 /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
2722 c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH);
2723 end = bgp_create_begin_refresh(c, pkt);
2724 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2725 }
2726 else if (s & (1 << PKT_UPDATE))
2727 {
2728 end = bgp_create_update(c, pkt);
2729 if (end)
2730 return bgp_send(conn, PKT_UPDATE, end - buf);
2731
2732 /* No update to send, perhaps we need to send End-of-RIB or EoRR */
2733 c->packets_to_send = 0;
2734 conn->channels_to_send &= ~(1 << c->index);
2735
2736 if (c->feed_state == BFS_LOADED)
2737 {
2738 c->feed_state = BFS_NONE;
2739 end = bgp_create_end_mark(c, pkt);
2740 return bgp_send(conn, PKT_UPDATE, end - buf);
2741 }
2742
2743 else if (c->feed_state == BFS_REFRESHED)
2744 {
2745 c->feed_state = BFS_NONE;
2746 end = bgp_create_end_refresh(c, pkt);
2747 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2748 }
2749 }
2750 else if (s)
2751 bug("Channel packets_to_send: %x", s);
2752
2753 c->packets_to_send = 0;
2754 conn->channels_to_send &= ~(1 << c->index);
2755 }
2756
2757 return 0;
2758 }
2759
2760 /**
2761 * bgp_schedule_packet - schedule a packet for transmission
2762 * @conn: connection
2763 * @c: channel
2764 * @type: packet type
2765 *
2766 * Schedule a packet of type @type to be sent as soon as possible.
2767 */
2768 void
2769 bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
2770 {
2771 ASSERT(conn->sk);
2772
2773 DBG("BGP: Scheduling packet type %d\n", type);
2774
2775 if (c)
2776 {
2777 if (! conn->channels_to_send)
2778 {
2779 conn->last_channel = c->index;
2780 conn->last_channel_count = 0;
2781 }
2782
2783 c->packets_to_send |= 1 << type;
2784 conn->channels_to_send |= 1 << c->index;
2785 }
2786 else
2787 conn->packets_to_send |= 1 << type;
2788
2789 if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev))
2790 ev_schedule(conn->tx_ev);
2791 }
2792 void
2793 bgp_kick_tx(void *vconn)
2794 {
2795 struct bgp_conn *conn = vconn;
2796
2797 DBG("BGP: kicking TX\n");
2798 uint max = 1024;
2799 while (--max && (bgp_fire_tx(conn) > 0))
2800 ;
2801
2802 if (!max && !ev_active(conn->tx_ev))
2803 ev_schedule(conn->tx_ev);
2804 }
2805
2806 void
2807 bgp_tx(sock *sk)
2808 {
2809 struct bgp_conn *conn = sk->data;
2810
2811 DBG("BGP: TX hook\n");
2812 uint max = 1024;
2813 while (--max && (bgp_fire_tx(conn) > 0))
2814 ;
2815
2816 if (!max && !ev_active(conn->tx_ev))
2817 ev_schedule(conn->tx_ev);
2818 }
2819
2820
2821 static struct {
2822 byte major, minor;
2823 byte *msg;
2824 } bgp_msg_table[] = {
2825 { 1, 0, "Invalid message header" },
2826 { 1, 1, "Connection not synchronized" },
2827 { 1, 2, "Bad message length" },
2828 { 1, 3, "Bad message type" },
2829 { 2, 0, "Invalid OPEN message" },
2830 { 2, 1, "Unsupported version number" },
2831 { 2, 2, "Bad peer AS" },
2832 { 2, 3, "Bad BGP identifier" },
2833 { 2, 4, "Unsupported optional parameter" },
2834 { 2, 5, "Authentication failure" },
2835 { 2, 6, "Unacceptable hold time" },
2836 { 2, 7, "Required capability missing" }, /* [RFC5492] */
2837 { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
2838 { 3, 0, "Invalid UPDATE message" },
2839 { 3, 1, "Malformed attribute list" },
2840 { 3, 2, "Unrecognized well-known attribute" },
2841 { 3, 3, "Missing mandatory attribute" },
2842 { 3, 4, "Invalid attribute flags" },
2843 { 3, 5, "Invalid attribute length" },
2844 { 3, 6, "Invalid ORIGIN attribute" },
2845 { 3, 7, "AS routing loop" }, /* Deprecated */
2846 { 3, 8, "Invalid NEXT_HOP attribute" },
2847 { 3, 9, "Optional attribute error" },
2848 { 3, 10, "Invalid network field" },
2849 { 3, 11, "Malformed AS_PATH" },
2850 { 4, 0, "Hold timer expired" },
2851 { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
2852 { 5, 1, "Unexpected message in OpenSent state" },
2853 { 5, 2, "Unexpected message in OpenConfirm state" },
2854 { 5, 3, "Unexpected message in Established state" },
2855 { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
2856 { 6, 1, "Maximum number of prefixes reached" },
2857 { 6, 2, "Administrative shutdown" },
2858 { 6, 3, "Peer de-configured" },
2859 { 6, 4, "Administrative reset" },
2860 { 6, 5, "Connection rejected" },
2861 { 6, 6, "Other configuration change" },
2862 { 6, 7, "Connection collision resolution" },
2863 { 6, 8, "Out of Resources" },
2864 { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
2865 { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
2866 };
2867
2868 /**
2869 * bgp_error_dsc - return BGP error description
2870 * @code: BGP error code
2871 * @subcode: BGP error subcode
2872 *
2873 * bgp_error_dsc() returns error description for BGP errors
2874 * which might be static string or given temporary buffer.
2875 */
2876 const char *
2877 bgp_error_dsc(uint code, uint subcode)
2878 {
2879 static char buff[32];
2880 uint i;
2881
2882 for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
2883 if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
2884 return bgp_msg_table[i].msg;
2885
2886 bsprintf(buff, "Unknown error %u.%u", code, subcode);
2887 return buff;
2888 }
2889
2890 /* RFC 8203 - shutdown communication message */
2891 static int
2892 bgp_handle_message(struct bgp_proto *p, byte *data, uint len, byte **bp)
2893 {
2894 byte *msg = data + 1;
2895 uint msg_len = data[0];
2896 uint i;
2897
2898 /* Handle zero length message */
2899 if (msg_len == 0)
2900 return 1;
2901
2902 /* Handle proper message */
2903 if ((msg_len > 255) && (msg_len + 1 > len))
2904 return 0;
2905
2906 /* Some elementary cleanup */
2907 for (i = 0; i < msg_len; i++)
2908 if (msg[i] < ' ')
2909 msg[i] = ' ';
2910
2911 proto_set_message(&p->p, msg, msg_len);
2912 *bp += bsprintf(*bp, ": \"%s\"", p->p.message);
2913 return 1;
2914 }
2915
2916 void
2917 bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len)
2918 {
2919 byte argbuf[256], *t = argbuf;
2920 uint i;
2921
2922 /* Don't report Cease messages generated by myself */
2923 if (code == 6 && class == BE_BGP_TX)
2924 return;
2925
2926 /* Reset shutdown message */
2927 if ((code == 6) && ((subcode == 2) || (subcode == 4)))
2928 proto_set_message(&p->p, NULL, 0);
2929
2930 if (len)
2931 {
2932 /* Bad peer AS - we would like to print the AS */
2933 if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
2934 {
2935 t += bsprintf(t, ": %u", (len == 2) ? get_u16(data) : get_u32(data));
2936 goto done;
2937 }
2938
2939 /* RFC 8203 - shutdown communication */
2940 if (((code == 6) && ((subcode == 2) || (subcode == 4))))
2941 if (bgp_handle_message(p, data, len, &t))
2942 goto done;
2943
2944 *t++ = ':';
2945 *t++ = ' ';
2946 if (len > 16)
2947 len = 16;
2948 for (i=0; i<len; i++)
2949 t += bsprintf(t, "%02x", data[i]);
2950 }
2951
2952 done:
2953 *t = 0;
2954 const byte *dsc = bgp_error_dsc(code, subcode);
2955 log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, dsc, argbuf);
2956 }
2957
2958 static void
2959 bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
2960 {
2961 struct bgp_proto *p = conn->bgp;
2962
2963 if (len < 21)
2964 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2965
2966 uint code = pkt[19];
2967 uint subcode = pkt[20];
2968 int err = (code != 6);
2969
2970 bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
2971 bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
2972
2973 bgp_conn_enter_close_state(conn);
2974 bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
2975
2976 if (err)
2977 {
2978 bgp_update_startup_delay(p);
2979 bgp_stop(p, 0, NULL, 0);
2980 }
2981 else
2982 {
2983 uint subcode_bit = 1 << ((subcode <= 8) ? subcode : 0);
2984 if (p->cf->disable_after_cease & subcode_bit)
2985 {
2986 log(L_INFO "%s: Disabled after Cease notification", p->p.name);
2987 p->startup_delay = 0;
2988 p->p.disabled = 1;
2989 }
2990 }
2991 }
2992
2993 static void
2994 bgp_rx_keepalive(struct bgp_conn *conn)
2995 {
2996 struct bgp_proto *p = conn->bgp;
2997
2998 BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
2999 bgp_start_timer(conn->hold_timer, conn->hold_time);
3000
3001 if (conn->state == BS_OPENCONFIRM)
3002 { bgp_conn_enter_established_state(conn); return; }
3003
3004 if (conn->state != BS_ESTABLISHED)
3005 bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
3006 }
3007
3008
3009 /**
3010 * bgp_rx_packet - handle a received packet
3011 * @conn: BGP connection
3012 * @pkt: start of the packet
3013 * @len: packet size
3014 *
3015 * bgp_rx_packet() takes a newly received packet and calls the corresponding
3016 * packet handler according to the packet type.
3017 */
3018 static void
3019 bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len)
3020 {
3021 byte type = pkt[18];
3022
3023 DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
3024
3025 if (conn->bgp->p.mrtdump & MD_MESSAGES)
3026 bgp_dump_message(conn, pkt, len);
3027
3028 switch (type)
3029 {
3030 case PKT_OPEN: return bgp_rx_open(conn, pkt, len);
3031 case PKT_UPDATE: return bgp_rx_update(conn, pkt, len);
3032 case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len);
3033 case PKT_KEEPALIVE: return bgp_rx_keepalive(conn);
3034 case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len);
3035 default: bgp_error(conn, 1, 3, pkt+18, 1);
3036 }
3037 }
3038
3039 /**
3040 * bgp_rx - handle received data
3041 * @sk: socket
3042 * @size: amount of data received
3043 *
3044 * bgp_rx() is called by the socket layer whenever new data arrive from
3045 * the underlying TCP connection. It assembles the data fragments to packets,
3046 * checks their headers and framing and passes complete packets to
3047 * bgp_rx_packet().
3048 */
3049 int
3050 bgp_rx(sock *sk, uint size)
3051 {
3052 struct bgp_conn *conn = sk->data;
3053 byte *pkt_start = sk->rbuf;
3054 byte *end = pkt_start + size;
3055 uint i, len;
3056
3057 DBG("BGP: RX hook: Got %d bytes\n", size);
3058 while (end >= pkt_start + BGP_HEADER_LENGTH)
3059 {
3060 if ((conn->state == BS_CLOSE) || (conn->sk != sk))
3061 return 0;
3062 for(i=0; i<16; i++)
3063 if (pkt_start[i] != 0xff)
3064 {
3065 bgp_error(conn, 1, 1, NULL, 0);
3066 break;
3067 }
3068 len = get_u16(pkt_start+16);
3069 if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn)))
3070 {
3071 bgp_error(conn, 1, 2, pkt_start+16, 2);
3072 break;
3073 }
3074 if (end < pkt_start + len)
3075 break;
3076 bgp_rx_packet(conn, pkt_start, len);
3077 pkt_start += len;
3078 }
3079 if (pkt_start != sk->rbuf)
3080 {
3081 memmove(sk->rbuf, pkt_start, end - pkt_start);
3082 sk->rpos = sk->rbuf + (end - pkt_start);
3083 }
3084 return 0;
3085 }