]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/packets.c
BGP: Fix handling of strange IPv6 link-local-only next hops
[thirdparty/bird.git] / proto / bgp / packets.c
1 /*
2 * BIRD -- BGP Packet Processing
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6 * (c) 2008--2016 CZ.NIC z.s.p.o.
7 *
8 * Can be freely distributed and used under the terms of the GNU GPL.
9 */
10
11 #undef LOCAL_DEBUG
12
13 #include <stdlib.h>
14
15 #include "nest/bird.h"
16 #include "nest/iface.h"
17 #include "nest/protocol.h"
18 #include "nest/route.h"
19 #include "nest/attrs.h"
20 #include "proto/mrt/mrt.h"
21 #include "conf/conf.h"
22 #include "lib/unaligned.h"
23 #include "lib/flowspec.h"
24 #include "lib/socket.h"
25
26 #include "nest/cli.h"
27
28 #include "bgp.h"
29
30
/* NOTE(review): presumably the ROUTE-REFRESH message subtypes (plain request,
   begin-of-refresh, end-of-refresh); the users are outside this part of the
   file - confirm there */
31 #define BGP_RR_REQUEST 0
32 #define BGP_RR_BEGIN 1
33 #define BGP_RR_END 2
34
/* NOTE(review): looks like an upper bound on one encoded NLRI entry
   (4 + 1 + 32 bytes); meaning of the individual terms to be confirmed
   against the NLRI encoders */
35 #define BGP_NLRI_MAX (4 + 1 + 32)
36
/* MPLS label handling constants */
37 #define BGP_MPLS_BOS 1 /* Bottom-of-stack bit */
38 #define BGP_MPLS_MAX 10 /* Max number of labels that 24*n <= 255 */
39 #define BGP_MPLS_NULL 3 /* Implicit NULL label */
40 #define BGP_MPLS_MAGIC 0x800000 /* Magic withdraw label value, RFC 3107 3 */
41
42
/* Log rate limiters initialized with the default log limits; not referenced in
   this part of the file - presumably used when reporting problems with
   received and sent UPDATEs (TODO confirm) */
43 static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
44 static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
45
46 /* Table for state -> RFC 6608 FSM error subcodes */
/* Indexed by connection state (see bgp_rx_open()); states without an explicit
   entry are zero-initialized and therefore map to subcode 0 */
47 static byte fsm_err_subcode[BS_MAX] = {
48 [BS_OPENSENT] = 1,
49 [BS_OPENCONFIRM] = 2,
50 [BS_ESTABLISHED] = 3
51 };
52
53
54 static struct bgp_channel *
55 bgp_get_channel(struct bgp_proto *p, u32 afi)
56 {
57 uint i;
58
59 for (i = 0; i < p->channel_count; i++)
60 if (p->afi_map[i] == afi)
61 return p->channel_map[i];
62
63 return NULL;
64 }
65
66 static inline void
67 put_af3(byte *buf, u32 id)
68 {
69 put_u16(buf, id >> 16);
70 buf[2] = id & 0xff;
71 }
72
73 static inline void
74 put_af4(byte *buf, u32 id)
75 {
76 put_u16(buf, id >> 16);
77 buf[2] = 0;
78 buf[3] = id & 0xff;
79 }
80
81 static inline u32
82 get_af3(byte *buf)
83 {
84 return (get_u16(buf) << 16) | buf[2];
85 }
86
87 static inline u32
88 get_af4(byte *buf)
89 {
90 return (get_u16(buf) << 16) | buf[3];
91 }
92
93 static void
94 init_mrt_bgp_data(struct bgp_conn *conn, struct mrt_bgp_data *d)
95 {
96 struct bgp_proto *p = conn->bgp;
97 int p_ok = conn->state >= BS_OPENCONFIRM;
98
99 memset(d, 0, sizeof(struct mrt_bgp_data));
100 d->peer_as = p->remote_as;
101 d->local_as = p->local_as;
102 d->index = (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0;
103 d->af = ipa_is_ip4(p->remote_ip) ? BGP_AFI_IPV4 : BGP_AFI_IPV6;
104 d->peer_ip = conn->sk ? conn->sk->daddr : IPA_NONE;
105 d->local_ip = conn->sk ? conn->sk->saddr : IPA_NONE;
106 d->as4 = p_ok ? p->as4_session : 0;
107 }
108
109 static uint bgp_find_update_afi(byte *pos, uint len);
110
111 static int
112 bgp_estimate_add_path(struct bgp_proto *p, byte *pkt, uint len)
113 {
114 /* No need to estimate it for other messages than UPDATE */
115 if (pkt[18] != PKT_UPDATE)
116 return 0;
117
118 /* 1 -> no channel, 2 -> all channels, 3 -> some channels */
119 if (p->summary_add_path_rx < 3)
120 return p->summary_add_path_rx == 2;
121
122 uint afi = bgp_find_update_afi(pkt, len);
123 struct bgp_channel *c = bgp_get_channel(p, afi);
124 if (!c)
125 {
126 /* Either frame error (if !afi) or unknown AFI/SAFI,
127 will be reported later in regular parsing */
128 BGP_TRACE(D_PACKETS, "MRT processing noticed invalid packet");
129 return 0;
130 }
131
132 return c->add_path_rx;
133 }
134
135 static void
136 bgp_dump_message(struct bgp_conn *conn, byte *pkt, uint len)
137 {
138 struct mrt_bgp_data d;
139 init_mrt_bgp_data(conn, &d);
140
141 d.message = pkt;
142 d.msg_len = len;
143 d.add_path = bgp_estimate_add_path(conn->bgp, pkt, len);
144
145 mrt_dump_bgp_message(&d);
146 }
147
148 void
149 bgp_dump_state_change(struct bgp_conn *conn, uint old, uint new)
150 {
151 struct mrt_bgp_data d;
152 init_mrt_bgp_data(conn, &d);
153
154 d.old_state = old;
155 d.new_state = new;
156
157 mrt_dump_bgp_state_change(&d);
158 }
159
160 static byte *
161 bgp_create_notification(struct bgp_conn *conn, byte *buf)
162 {
163 struct bgp_proto *p = conn->bgp;
164
165 BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);
166 buf[0] = conn->notify_code;
167 buf[1] = conn->notify_subcode;
168 memcpy(buf+2, conn->notify_data, conn->notify_size);
169 return buf + 2 + conn->notify_size;
170 }
171
172
173 /* Capability negotiation as per RFC 5492 */
174
175 const struct bgp_af_caps *
176 bgp_find_af_caps(struct bgp_caps *caps, u32 afi)
177 {
178 struct bgp_af_caps *ac;
179
180 WALK_AF_CAPS(caps, ac)
181 if (ac->afi == afi)
182 return ac;
183
184 return NULL;
185 }
186
187 static struct bgp_af_caps *
188 bgp_get_af_caps(struct bgp_caps **pcaps, u32 afi)
189 {
190 struct bgp_caps *caps = *pcaps;
191 struct bgp_af_caps *ac;
192
193 WALK_AF_CAPS(caps, ac)
194 if (ac->afi == afi)
195 return ac;
196
197 uint n = caps->af_count;
198 if (uint_is_pow2(n))
199 *pcaps = caps = mb_realloc(caps, sizeof(struct bgp_caps) +
200 (2 * n) * sizeof(struct bgp_af_caps));
201
202 ac = &caps->af_data[caps->af_count++];
203 memset(ac, 0, sizeof(struct bgp_af_caps));
204 ac->afi = afi;
205
206 return ac;
207 }
208
209 static int
210 bgp_af_caps_cmp(const void *X, const void *Y)
211 {
212 const struct bgp_af_caps *x = X, *y = Y;
213 return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0;
214 }
215
216
217 void
218 bgp_prepare_capabilities(struct bgp_conn *conn)
219 {
220 struct bgp_proto *p = conn->bgp;
221 struct bgp_channel *c;
222 struct bgp_caps *caps;
223 struct bgp_af_caps *ac;
224
225 if (!p->cf->capabilities)
226 {
227 /* Just prepare empty local_caps */
228 conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));
229 return;
230 }
231
232 /* Prepare bgp_caps structure */
233 int n = list_length(&p->p.channels);
234 caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps));
235 conn->local_caps = caps;
236
237 caps->as4_support = p->cf->enable_as4;
238 caps->ext_messages = p->cf->enable_extended_messages;
239 caps->route_refresh = p->cf->enable_refresh;
240 caps->enhanced_refresh = p->cf->enable_refresh;
241
242 if (caps->as4_support)
243 caps->as4_number = p->public_as;
244
245 if (p->cf->gr_mode)
246 {
247 caps->gr_aware = 1;
248 caps->gr_time = p->cf->gr_time;
249 caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
250 }
251
252 if (p->cf->llgr_mode)
253 caps->llgr_aware = 1;
254
255 /* Allocate and fill per-AF fields */
256 WALK_LIST(c, p->p.channels)
257 {
258 ac = &caps->af_data[caps->af_count++];
259 ac->afi = c->afi;
260 ac->ready = 1;
261
262 ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop;
263 caps->any_ext_next_hop |= ac->ext_next_hop;
264
265 ac->add_path = c->cf->add_path;
266 caps->any_add_path |= ac->add_path;
267
268 if (c->cf->gr_able)
269 {
270 ac->gr_able = 1;
271
272 if (p->p.gr_recovery)
273 ac->gr_af_flags |= BGP_GRF_FORWARDING;
274 }
275
276 if (c->cf->llgr_able)
277 {
278 ac->llgr_able = 1;
279 ac->llgr_time = c->cf->llgr_time;
280
281 if (p->p.gr_recovery)
282 ac->llgr_flags |= BGP_LLGRF_FORWARDING;
283 }
284 }
285
286 /* Sort capability fields by AFI/SAFI */
287 qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp);
288 }
289
290 static byte *
291 bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
292 {
293 struct bgp_proto *p = conn->bgp;
294 struct bgp_caps *caps = conn->local_caps;
295 struct bgp_af_caps *ac;
296 byte *buf_head = buf;
297 byte *data;
298
299 /* Create capability list in buffer */
300
301 /*
302 * Note that max length is ~ 22+21*af_count. With max 12 channels that is
303 * 274. We are limited just by buffer size (4096, minus header), as we support
304 * extended optional parameres. Therefore, we have enough space for expansion.
305 */
306
307 WALK_AF_CAPS(caps, ac)
308 if (ac->ready)
309 {
310 *buf++ = 1; /* Capability 1: Multiprotocol extensions */
311 *buf++ = 4; /* Capability data length */
312 put_af4(buf, ac->afi);
313 buf += 4;
314 }
315
316 if (caps->route_refresh)
317 {
318 *buf++ = 2; /* Capability 2: Support for route refresh */
319 *buf++ = 0; /* Capability data length */
320 }
321
322 if (caps->any_ext_next_hop)
323 {
324 *buf++ = 5; /* Capability 5: Support for extended next hop */
325 *buf++ = 0; /* Capability data length, will be fixed later */
326 data = buf;
327
328 WALK_AF_CAPS(caps, ac)
329 if (ac->ext_next_hop)
330 {
331 put_af4(buf, ac->afi);
332 put_u16(buf+4, BGP_AFI_IPV6);
333 buf += 6;
334 }
335
336 data[-1] = buf - data;
337 }
338
339 if (caps->ext_messages)
340 {
341 *buf++ = 6; /* Capability 6: Support for extended messages */
342 *buf++ = 0; /* Capability data length */
343 }
344
345 if (caps->gr_aware)
346 {
347 *buf++ = 64; /* Capability 64: Support for graceful restart */
348 *buf++ = 0; /* Capability data length, will be fixed later */
349 data = buf;
350
351 put_u16(buf, caps->gr_time);
352 buf[0] |= caps->gr_flags;
353 buf += 2;
354
355 WALK_AF_CAPS(caps, ac)
356 if (ac->gr_able)
357 {
358 put_af3(buf, ac->afi);
359 buf[3] = ac->gr_af_flags;
360 buf += 4;
361 }
362
363 data[-1] = buf - data;
364 }
365
366 if (caps->as4_support)
367 {
368 *buf++ = 65; /* Capability 65: Support for 4-octet AS number */
369 *buf++ = 4; /* Capability data length */
370 put_u32(buf, p->public_as);
371 buf += 4;
372 }
373
374 if (caps->any_add_path)
375 {
376 *buf++ = 69; /* Capability 69: Support for ADD-PATH */
377 *buf++ = 0; /* Capability data length, will be fixed later */
378 data = buf;
379
380 WALK_AF_CAPS(caps, ac)
381 if (ac->add_path)
382 {
383 put_af3(buf, ac->afi);
384 buf[3] = ac->add_path;
385 buf += 4;
386 }
387
388 data[-1] = buf - data;
389 }
390
391 if (caps->enhanced_refresh)
392 {
393 *buf++ = 70; /* Capability 70: Support for enhanced route refresh */
394 *buf++ = 0; /* Capability data length */
395 }
396
397 if (caps->llgr_aware)
398 {
399 *buf++ = 71; /* Capability 71: Support for long-lived graceful restart */
400 *buf++ = 0; /* Capability data length, will be fixed later */
401 data = buf;
402
403 WALK_AF_CAPS(caps, ac)
404 if (ac->llgr_able)
405 {
406 put_af3(buf, ac->afi);
407 buf[3] = ac->llgr_flags;
408 put_u24(buf+4, ac->llgr_time);
409 buf += 7;
410 }
411
412 data[-1] = buf - data;
413 }
414
415 caps->length = buf - buf_head;
416
417 return buf;
418 }
419
420 static int
421 bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len)
422 {
423 struct bgp_proto *p = conn->bgp;
424 struct bgp_caps *caps;
425 struct bgp_af_caps *ac;
426 int i, cl;
427 u32 af;
428
429 if (!conn->remote_caps)
430 caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + sizeof(struct bgp_af_caps));
431 else
432 {
433 caps = conn->remote_caps;
434 conn->remote_caps = NULL;
435 }
436
437 caps->length += len;
438
439 while (len > 0)
440 {
441 if (len < 2 || len < (2 + pos[1]))
442 goto err;
443
444 /* Capability length */
445 cl = pos[1];
446
447 /* Capability type */
448 switch (pos[0])
449 {
450 case 1: /* Multiprotocol capability, RFC 4760 */
451 if (cl != 4)
452 goto err;
453
454 af = get_af4(pos+2);
455 ac = bgp_get_af_caps(&caps, af);
456 ac->ready = 1;
457 break;
458
459 case 2: /* Route refresh capability, RFC 2918 */
460 if (cl != 0)
461 goto err;
462
463 caps->route_refresh = 1;
464 break;
465
466 case 5: /* Extended next hop encoding capability, RFC 5549 */
467 if (cl % 6)
468 goto err;
469
470 for (i = 0; i < cl; i += 6)
471 {
472 /* Specified only for IPv4 prefixes with IPv6 next hops */
473 if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) ||
474 (get_u16(pos+2+i+4) != BGP_AFI_IPV6))
475 continue;
476
477 af = get_af4(pos+2+i);
478 ac = bgp_get_af_caps(&caps, af);
479 ac->ext_next_hop = 1;
480 }
481 break;
482
483 case 6: /* Extended message length capability, RFC draft */
484 if (cl != 0)
485 goto err;
486
487 caps->ext_messages = 1;
488 break;
489
490 case 64: /* Graceful restart capability, RFC 4724 */
491 if (cl % 4 != 2)
492 goto err;
493
494 /* Only the last instance is valid */
495 WALK_AF_CAPS(caps, ac)
496 {
497 ac->gr_able = 0;
498 ac->gr_af_flags = 0;
499 }
500
501 caps->gr_aware = 1;
502 caps->gr_flags = pos[2] & 0xf0;
503 caps->gr_time = get_u16(pos + 2) & 0x0fff;
504
505 for (i = 2; i < cl; i += 4)
506 {
507 af = get_af3(pos+2+i);
508 ac = bgp_get_af_caps(&caps, af);
509 ac->gr_able = 1;
510 ac->gr_af_flags = pos[2+i+3];
511 }
512 break;
513
514 case 65: /* AS4 capability, RFC 6793 */
515 if (cl != 4)
516 goto err;
517
518 caps->as4_support = 1;
519 caps->as4_number = get_u32(pos + 2);
520 break;
521
522 case 69: /* ADD-PATH capability, RFC 7911 */
523 if (cl % 4)
524 goto err;
525
526 for (i = 0; i < cl; i += 4)
527 {
528 byte val = pos[2+i+3];
529 if (!val || (val > BGP_ADD_PATH_FULL))
530 {
531 log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring",
532 p->p.name, val);
533 break;
534 }
535 }
536
537 for (i = 0; i < cl; i += 4)
538 {
539 af = get_af3(pos+2+i);
540 ac = bgp_get_af_caps(&caps, af);
541 ac->add_path = pos[2+i+3];
542 }
543 break;
544
545 case 70: /* Enhanced route refresh capability, RFC 7313 */
546 if (cl != 0)
547 goto err;
548
549 caps->enhanced_refresh = 1;
550 break;
551
552 case 71: /* Long lived graceful restart capability, RFC draft */
553 if (cl % 7)
554 goto err;
555
556 /* Presumably, only the last instance is valid */
557 WALK_AF_CAPS(caps, ac)
558 {
559 ac->llgr_able = 0;
560 ac->llgr_flags = 0;
561 ac->llgr_time = 0;
562 }
563
564 caps->llgr_aware = 1;
565
566 for (i = 0; i < cl; i += 7)
567 {
568 af = get_af3(pos+2+i);
569 ac = bgp_get_af_caps(&caps, af);
570 ac->llgr_able = 1;
571 ac->llgr_flags = pos[2+i+3];
572 ac->llgr_time = get_u24(pos + 2+i+4);
573 }
574 break;
575
576 /* We can safely ignore all other capabilities */
577 }
578
579 ADVANCE(pos, len, 2 + cl);
580 }
581
582 /* The LLGR capability must be advertised together with the GR capability,
583 otherwise it must be disregarded */
584 if (!caps->gr_aware && caps->llgr_aware)
585 {
586 caps->llgr_aware = 0;
587 WALK_AF_CAPS(caps, ac)
588 {
589 ac->llgr_able = 0;
590 ac->llgr_flags = 0;
591 ac->llgr_time = 0;
592 }
593 }
594
595 conn->remote_caps = caps;
596 return 0;
597
598 err:
599 mb_free(caps);
600 bgp_error(conn, 2, 0, NULL, 0);
601 return -1;
602 }
603
604 static int
605 bgp_check_capabilities(struct bgp_conn *conn)
606 {
607 struct bgp_proto *p = conn->bgp;
608 struct bgp_caps *local = conn->local_caps;
609 struct bgp_caps *remote = conn->remote_caps;
610 struct bgp_channel *c;
611 int count = 0;
612
613 /* This is partially overlapping with bgp_conn_enter_established_state(),
614 but we need to run this just after we receive OPEN message */
615
616 WALK_LIST(c, p->p.channels)
617 {
618 const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi);
619 const struct bgp_af_caps *rem = bgp_find_af_caps(remote, c->afi);
620
621 /* Find out whether this channel will be active */
622 int active = loc && loc->ready &&
623 ((rem && rem->ready) || (!remote->length && (c->afi == BGP_AF_IPV4)));
624
625 /* Mandatory must be active */
626 if (c->cf->mandatory && !active)
627 return 0;
628
629 if (active)
630 count++;
631 }
632
633 /* We need at least one channel active */
634 if (!count)
635 return 0;
636
637 return 1;
638 }
639
640 static int
641 bgp_read_options(struct bgp_conn *conn, byte *pos, uint len, uint rest)
642 {
643 struct bgp_proto *p = conn->bgp;
644 int ext = 0;
645
646 /* Handle extended length (draft-ietf-idr-ext-opt-param-07) */
647 if ((len > 0) && (rest > 0) && (pos[0] == 255))
648 {
649 if (rest < 3)
650 goto err;
651
652 /* Update pos/len to describe optional data */
653 len = get_u16(pos+1);
654 ext = 1;
655 pos += 3;
656 rest -= 3;
657 }
658
659 /* Verify that optional data fits into OPEN packet */
660 if (len > rest)
661 goto err;
662
663 /* Length of option parameter header */
664 uint hlen = ext ? 3 : 2;
665
666 while (len > 0)
667 {
668 if (len < hlen)
669 goto err;
670
671 uint otype = get_u8(pos);
672 uint olen = ext ? get_u16(pos+1) : get_u8(pos+1);
673
674 if (len < (hlen + olen))
675 goto err;
676
677 if (otype == 2)
678 {
679 /* BGP capabilities, RFC 5492 */
680 if (p->cf->capabilities)
681 if (bgp_read_capabilities(conn, pos + hlen, olen) < 0)
682 return -1;
683 }
684 else
685 {
686 /* Unknown option */
687 bgp_error(conn, 2, 4, pos, hlen + olen);
688 return -1;
689 }
690
691 ADVANCE(pos, len, hlen + olen);
692 }
693
694 /* Prepare empty caps if no capability option was announced */
695 if (!conn->remote_caps)
696 conn->remote_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));
697
698 return 0;
699
700 err:
701 bgp_error(conn, 2, 0, NULL, 0);
702 return -1;
703 }
704
705 static byte *
706 bgp_create_open(struct bgp_conn *conn, byte *buf)
707 {
708 struct bgp_proto *p = conn->bgp;
709
710 BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
711 BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id);
712
713 buf[0] = BGP_VERSION;
714 put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS);
715 put_u16(buf+3, p->cf->hold_time);
716 put_u32(buf+5, p->local_id);
717
718 if (p->cf->capabilities)
719 {
720 /* Prepare local_caps and write capabilities to buffer */
721 byte *pos = buf+12;
722 byte *end = bgp_write_capabilities(conn, pos);
723 uint len = end - pos;
724
725 if (len < 254)
726 {
727 buf[9] = len + 2; /* Optional parameters length */
728 buf[10] = 2; /* Option 2: Capability list */
729 buf[11] = len; /* Option data length */
730 }
731 else /* draft-ietf-idr-ext-opt-param-07 */
732 {
733 /* Move capabilities 4 B forward */
734 memmove(buf + 16, pos, len);
735 pos = buf + 16;
736 end = pos + len;
737
738 buf[9] = 255; /* Non-ext OP length, fake */
739 buf[10] = 255; /* Non-ext OP type, signals extended length */
740 put_u16(buf+11, len + 3); /* Extended optional parameters length */
741 buf[13] = 2; /* Option 2: Capability list */
742 put_u16(buf+14, len); /* Option extended data length */
743 }
744
745 return end;
746 }
747 else
748 {
749 buf[9] = 0; /* No optional parameters */
750 return buf + 10;
751 }
752
753 return buf;
754 }
755
756 static void
757 bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
758 {
759 struct bgp_proto *p = conn->bgp;
760 struct bgp_conn *other;
761 u32 asn, hold, id;
762
763 /* Check state */
764 if (conn->state != BS_OPENSENT)
765 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
766
767 /* Check message length */
768 if (len < 29)
769 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
770
771 if (pkt[19] != BGP_VERSION)
772 { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; }
773
774 asn = get_u16(pkt+20);
775 hold = get_u16(pkt+22);
776 id = get_u32(pkt+24);
777 BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id);
778
779 if (bgp_read_options(conn, pkt+29, pkt[28], len-29) < 0)
780 return;
781
782 if (hold > 0 && hold < 3)
783 { bgp_error(conn, 2, 6, pkt+22, 2); return; }
784
785 /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
786 if (!id || (p->is_internal && id == p->local_id))
787 { bgp_error(conn, 2, 3, pkt+24, -4); return; }
788
789 /* RFC 5492 4 - check for required capabilities */
790 if (p->cf->capabilities && !bgp_check_capabilities(conn))
791 { bgp_error(conn, 2, 7, NULL, 0); return; }
792
793 struct bgp_caps *caps = conn->remote_caps;
794
795 if (caps->as4_support)
796 {
797 u32 as4 = caps->as4_number;
798
799 if ((as4 != asn) && (asn != AS_TRANS))
800 log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
801
802 /* When remote ASN is unspecified, it must be external one */
803 if (p->remote_as ? (as4 != p->remote_as) : (as4 == p->local_as))
804 { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; }
805
806 conn->received_as = as4;
807 }
808 else
809 {
810 if (p->remote_as ? (asn != p->remote_as) : (asn == p->local_as))
811 { bgp_error(conn, 2, 2, pkt+20, 2); return; }
812
813 conn->received_as = asn;
814 }
815
816 /* Check the other connection */
817 other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
818 switch (other->state)
819 {
820 case BS_CONNECT:
821 case BS_ACTIVE:
822 /* Stop outgoing connection attempts */
823 bgp_conn_enter_idle_state(other);
824 break;
825
826 case BS_IDLE:
827 case BS_OPENSENT:
828 case BS_CLOSE:
829 break;
830
831 case BS_OPENCONFIRM:
832 /*
833 * Description of collision detection rules in RFC 4271 is confusing and
834 * contradictory, but it is essentially:
835 *
836 * 1. Router with higher ID is dominant
837 * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
838 * 3. When both connections are in OpenConfirm state, one initiated by
839 * the dominant router is kept.
840 *
841 * The first line in the expression below evaluates whether the neighbor
842 * is dominant, the second line whether the new connection was initiated
843 * by the neighbor. If both are true (or both are false), we keep the new
844 * connection, otherwise we keep the old one.
845 */
846 if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as)))
847 == (conn == &p->incoming_conn))
848 {
849 /* Should close the other connection */
850 BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
851 bgp_error(other, 6, 7, NULL, 0);
852 break;
853 }
854 /* Fall thru */
855 case BS_ESTABLISHED:
856 /* Should close this connection */
857 BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
858 bgp_error(conn, 6, 7, NULL, 0);
859 return;
860
861 default:
862 bug("bgp_rx_open: Unknown state");
863 }
864
865 /* Update our local variables */
866 conn->hold_time = MIN(hold, p->cf->hold_time);
867 conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
868 conn->as4_session = conn->local_caps->as4_support && caps->as4_support;
869 conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages;
870 p->remote_id = id;
871
872 DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
873 conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session);
874
875 bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
876 bgp_start_timer(conn->hold_timer, conn->hold_time);
877 bgp_conn_enter_openconfirm_state(conn);
878 }
879
880
881 /*
882 * Next hop handling
883 */
884
/*
 * Error reporting helpers. All three expand to code that reads a variable
 * named `s` from the calling function (s->proto->p.name), so they can be used
 * only where such a state pointer is in scope. DISCARD and WITHDRAW contain a
 * bare `return;`, i.e. they leave the calling function and fit only functions
 * returning void.
 */

/* Log the message with L_REMOTE class, prefixed by the protocol name */
885 #define REPORT(msg, args...) \
886 ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })
887
/* Log the message and return from the calling function */
888 #define DISCARD(msg, args...) \
889 ({ REPORT(msg, ## args); return; })
890
/* Log the message, set s->err_withdraw and return from the calling function */
891 #define WITHDRAW(msg, args...) \
892 ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })
893
/* Message texts for the macros above; BAD_AFI expects two unsigned arguments */
894 #define BAD_AFI "Unexpected AF <%u/%u> in UPDATE"
895 #define BAD_NEXT_HOP "Invalid NEXT_HOP attribute"
896 #define NO_NEXT_HOP "Missing NEXT_HOP attribute"
897 #define NO_LABEL_STACK "Missing MPLS stack"
896 #define NO_NEXT_HOP "Missing NEXT_HOP attribute"
897 #define NO_LABEL_STACK "Missing MPLS stack"
898
899
900 static void
901 bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
902 {
903 struct bgp_proto *p = s->proto;
904 struct bgp_channel *c = s->channel;
905
906 if (c->cf->gw_mode == GW_DIRECT)
907 {
908 neighbor *nbr = NULL;
909
910 /* GW_DIRECT -> single_hop -> p->neigh != NULL */
911 if (ipa_nonzero(gw))
912 nbr = neigh_find(&p->p, gw, NULL, 0);
913 else if (ipa_nonzero(ll))
914 nbr = neigh_find(&p->p, ll, p->neigh->iface, 0);
915
916 if (!nbr || (nbr->scope == SCOPE_HOST))
917 WITHDRAW(BAD_NEXT_HOP);
918
919 a->dest = RTD_UNICAST;
920 a->nh.gw = nbr->addr;
921 a->nh.iface = nbr->iface;
922 a->igp_metric = c->cf->cost;
923 }
924 else /* GW_RECURSIVE */
925 {
926 if (ipa_zero(gw))
927 WITHDRAW(BAD_NEXT_HOP);
928
929 rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6;
930 s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table);
931
932 if (!s->mpls)
933 rta_apply_hostentry(a, s->hostentry, NULL);
934
935 /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */
936 }
937 }
938
939 static void
940 bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
941 {
942 if (lnum > MPLS_MAX_LABEL_STACK)
943 {
944 REPORT("Too many MPLS labels ($u)", lnum);
945
946 a->dest = RTD_UNREACHABLE;
947 a->hostentry = NULL;
948 a->nh = (struct nexthop) { };
949 return;
950 }
951
952 /* Handle implicit NULL as empty MPLS stack */
953 if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL))
954 lnum = 0;
955
956 if (s->channel->cf->gw_mode == GW_DIRECT)
957 {
958 a->nh.labels = lnum;
959 memcpy(a->nh.label, labels, 4*lnum);
960 }
961 else /* GW_RECURSIVE */
962 {
963 mpls_label_stack ms;
964
965 ms.len = lnum;
966 memcpy(ms.stack, labels, 4*lnum);
967 rta_apply_hostentry(a, s->hostentry, &ms);
968 }
969 }
970
971
972 static int
973 bgp_match_src(struct bgp_export_state *s, int mode)
974 {
975 switch (mode)
976 {
977 case NH_NO: return 0;
978 case NH_ALL: return 1;
979 case NH_IBGP: return s->src && s->src->is_internal;
980 case NH_EBGP: return s->src && !s->src->is_internal;
981 default: return 0;
982 }
983 }
984
985 static inline int
986 bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
987 {
988 struct bgp_proto *p = s->proto;
989 struct bgp_channel *c = s->channel;
990 ip_addr *nh = (void *) a->u.ptr->data;
991
992 /* Handle next hop self option */
993 if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self))
994 return 0;
995
996 /* Handle next hop keep option */
997 if (c->cf->next_hop_keep && bgp_match_src(s, c->cf->next_hop_keep))
998 return 1;
999
1000 /* Keep it when explicitly set in export filter */
1001 if (a->type & EAF_FRESH)
1002 return 1;
1003
1004 /* Check for non-matching AF */
1005 if ((ipa_is_ip4(*nh) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop)
1006 return 0;
1007
1008 /* Keep it when exported to internal peers */
1009 if (p->is_interior && ipa_nonzero(*nh))
1010 return 1;
1011
1012 /* Keep it when forwarded between single-hop BGPs on the same iface */
1013 struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL;
1014 return p->neigh && (p->neigh->iface == ifa);
1015 }
1016
1017 static inline int
1018 bgp_use_gateway(struct bgp_export_state *s)
1019 {
1020 struct bgp_proto *p = s->proto;
1021 struct bgp_channel *c = s->channel;
1022 rta *ra = s->route->attrs;
1023
1024 /* Handle next hop self option - also applies to gateway */
1025 if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self))
1026 return 0;
1027
1028 /* We need one valid global gateway */
1029 if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw))
1030 return 0;
1031
1032 /* Check for non-matching AF */
1033 if ((ipa_is_ip4(ra->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop)
1034 return 0;
1035
1036 /* Use it when exported to internal peers */
1037 if (p->is_interior)
1038 return 1;
1039
1040 /* Use it when forwarded to single-hop BGP peer on on the same iface */
1041 return p->neigh && (p->neigh->iface == ra->nh.iface);
1042 }
1043
1044 static void
1045 bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
1046 {
1047 if (!a || !bgp_use_next_hop(s, a))
1048 {
1049 if (bgp_use_gateway(s))
1050 {
1051 rta *ra = s->route->attrs;
1052 ip_addr nh[1] = { ra->nh.gw };
1053 bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
1054
1055 if (s->mpls)
1056 {
1057 u32 implicit_null = BGP_MPLS_NULL;
1058 u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null;
1059 uint lnum = ra->nh.labels ? ra->nh.labels : 1;
1060 bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4);
1061 }
1062 }
1063 else
1064 {
1065 ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
1066 bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
1067 s->local_next_hop = 1;
1068
1069 /* TODO: Use local MPLS assigned label */
1070 if (s->mpls)
1071 {
1072 u32 implicit_null = BGP_MPLS_NULL;
1073 bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4);
1074 }
1075 }
1076 }
1077
1078 /* Check if next hop is valid */
1079 a = bgp_find_attr(*to, BA_NEXT_HOP);
1080 if (!a)
1081 WITHDRAW(NO_NEXT_HOP);
1082
1083 ip_addr *nh = (void *) a->u.ptr->data;
1084 ip_addr peer = s->proto->remote_ip;
1085 uint len = a->u.ptr->length;
1086
1087 /* Forbid zero next hop */
1088 if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1])))
1089 WITHDRAW(BAD_NEXT_HOP);
1090
1091 /* Forbid next hop equal to neighbor IP */
1092 if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
1093 WITHDRAW(BAD_NEXT_HOP);
1094
1095 /* Forbid next hop with non-matching AF */
1096 if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) &&
1097 !s->channel->ext_next_hop)
1098 WITHDRAW(BAD_NEXT_HOP);
1099
1100 /* Just check if MPLS stack */
1101 if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK))
1102 WITHDRAW(NO_LABEL_STACK);
1103 }
1104
1105 static uint
1106 bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
1107 {
1108 /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
1109 ip_addr *nh = (void *) a->u.ptr->data;
1110 uint len = a->u.ptr->length;
1111
1112 ASSERT((len == 16) || (len == 32));
1113
1114 /*
1115 * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1116 * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference
1117 * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped
1118 * IPv6 address with IPv6 NLRI.
1119 */
1120
1121 if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1122 {
1123 put_ip4(buf, ipa_to_ip4(nh[0]));
1124 return 4;
1125 }
1126
1127 put_ip6(buf, ipa_to_ip6(nh[0]));
1128
1129 if (len == 32)
1130 put_ip6(buf+16, ipa_to_ip6(nh[1]));
1131
1132 return len;
1133 }
1134
1135 static void
1136 bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1137 {
1138 struct bgp_channel *c = s->channel;
1139 struct adata *ad = lp_alloc_adata(s->pool, 32);
1140 ip_addr *nh = (void *) ad->data;
1141
1142 if (len == 4)
1143 {
1144 nh[0] = ipa_from_ip4(get_ip4(data));
1145 nh[1] = IPA_NONE;
1146 }
1147 else if (len == 16)
1148 {
1149 nh[0] = ipa_from_ip6(get_ip6(data));
1150 nh[1] = IPA_NONE;
1151
1152 if (ipa_is_link_local(nh[0]))
1153 { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1154 }
1155 else if (len == 32)
1156 {
1157 nh[0] = ipa_from_ip6(get_ip6(data));
1158 nh[1] = ipa_from_ip6(get_ip6(data+16));
1159
1160 if (ipa_is_link_local(nh[0]))
1161 { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1162
1163 if (ipa_is_ip4(nh[0]) || !ipa_is_link_local(nh[1]))
1164 nh[1] = IPA_NONE;
1165 }
1166 else
1167 bgp_parse_error(s, 9);
1168
1169 if (ipa_zero(nh[1]))
1170 ad->length = 16;
1171
1172 if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1173 WITHDRAW(BAD_NEXT_HOP);
1174
1175 // XXXX validate next hop
1176
1177 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1178 bgp_apply_next_hop(s, a, nh[0], nh[1]);
1179 }
1180
1181 static uint
1182 bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
1183 {
1184 ip_addr *nh = (void *) a->u.ptr->data;
1185 uint len = a->u.ptr->length;
1186
1187 ASSERT((len == 16) || (len == 32));
1188
1189 /*
1190 * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1191 * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference
1192 * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped
1193 * IPv6 address with VPNv6 NLRI.
1194 */
1195
1196 if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1197 {
1198 put_u64(buf, 0); /* VPN RD is 0 */
1199 put_ip4(buf+8, ipa_to_ip4(nh[0]));
1200 return 12;
1201 }
1202
1203 put_u64(buf, 0); /* VPN RD is 0 */
1204 put_ip6(buf+8, ipa_to_ip6(nh[0]));
1205
1206 if (len == 16)
1207 return 24;
1208
1209 put_u64(buf+24, 0); /* VPN RD is 0 */
1210 put_ip6(buf+32, ipa_to_ip6(nh[1]));
1211
1212 return 48;
1213 }
1214
1215 static void
1216 bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1217 {
1218 struct bgp_channel *c = s->channel;
1219 struct adata *ad = lp_alloc_adata(s->pool, 32);
1220 ip_addr *nh = (void *) ad->data;
1221
1222 if (len == 12)
1223 {
1224 nh[0] = ipa_from_ip4(get_ip4(data+8));
1225 nh[1] = IPA_NONE;
1226 }
1227 else if (len == 24)
1228 {
1229 nh[0] = ipa_from_ip6(get_ip6(data+8));
1230 nh[1] = IPA_NONE;
1231
1232 if (ipa_is_link_local(nh[0]))
1233 { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1234 }
1235 else if (len == 48)
1236 {
1237 nh[0] = ipa_from_ip6(get_ip6(data+8));
1238 nh[1] = ipa_from_ip6(get_ip6(data+32));
1239
1240 if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1241 nh[1] = IPA_NONE;
1242 }
1243 else
1244 bgp_parse_error(s, 9);
1245
1246 if (ipa_zero(nh[1]))
1247 ad->length = 16;
1248
1249 /* XXXX which error */
1250 if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0)))
1251 bgp_parse_error(s, 9);
1252
1253 if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1254 WITHDRAW(BAD_NEXT_HOP);
1255
1256 // XXXX validate next hop
1257
1258 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1259 bgp_apply_next_hop(s, a, nh[0], nh[1]);
1260 }
1261
1262
1263
1264 static uint
1265 bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED)
1266 {
1267 return 0;
1268 }
1269
1270 static void
1271 bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED)
1272 {
1273 /*
1274 * Although we expect no next hop and RFC 7606 7.11 states that attribute
1275 * MP_REACH_NLRI with unexpected next hop length is considered malformed,
1276 * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt.
1277 */
1278
1279 return;
1280 }
1281
1282 static void
1283 bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to)
1284 {
1285 /* NEXT_HOP shall not pass */
1286 if (a)
1287 bgp_unset_attr(to, s->pool, BA_NEXT_HOP);
1288 }
1289
1290
1291 /*
1292 * UPDATE
1293 */
1294
1295 static void
1296 bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
1297 {
1298 if (path_id != s->last_id)
1299 {
1300 s->last_src = rt_get_source(&s->proto->p, path_id);
1301 s->last_id = path_id;
1302
1303 rta_free(s->cached_rta);
1304 s->cached_rta = NULL;
1305 }
1306
1307 if (!a0)
1308 {
1309 /* Route withdraw */
1310 rte_update3(&s->channel->c, n, NULL, s->last_src);
1311 return;
1312 }
1313
1314 /* Prepare cached route attributes */
1315 if (s->cached_rta == NULL)
1316 {
1317 a0->src = s->last_src;
1318
1319 /* Workaround for rta_lookup() breaking eattrs */
1320 ea_list *ea = a0->eattrs;
1321 s->cached_rta = rta_lookup(a0);
1322 a0->eattrs = ea;
1323 }
1324
1325 rta *a = rta_clone(s->cached_rta);
1326 rte *e = rte_get_temp(a);
1327
1328 e->pflags = 0;
1329 e->u.bgp.suppressed = 0;
1330 e->u.bgp.stale = -1;
1331 rte_update3(&s->channel->c, n, e, s->last_src);
1332 }
1333
1334 static void
1335 bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, const adata *mpls, byte **pos, uint *size, byte *pxlen)
1336 {
1337 const u32 dummy = 0;
1338 const u32 *labels = mpls ? (const u32 *) mpls->data : &dummy;
1339 uint lnum = mpls ? (mpls->length / 4) : 1;
1340
1341 for (uint i = 0; i < lnum; i++)
1342 {
1343 put_u24(*pos, labels[i] << 4);
1344 ADVANCE(*pos, *size, 3);
1345 }
1346
1347 /* Add bottom-of-stack flag */
1348 (*pos)[-1] |= BGP_MPLS_BOS;
1349
1350 *pxlen += 24 * lnum;
1351 }
1352
1353 static void
1354 bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a)
1355 {
1356 u32 labels[BGP_MPLS_MAX], label;
1357 uint lnum = 0;
1358
1359 do {
1360 if (*pxlen < 24)
1361 bgp_parse_error(s, 1);
1362
1363 label = get_u24(*pos);
1364 labels[lnum++] = label >> 4;
1365 ADVANCE(*pos, *len, 3);
1366 *pxlen -= 24;
1367
1368 /* RFC 8277 2.4 - withdraw does not have variable-size MPLS stack but
1369 fixed-size 24-bit Compatibility field, which MUST be ignored */
1370 if (!a && !s->err_withdraw)
1371 return;
1372 }
1373 while (!(label & BGP_MPLS_BOS));
1374
1375 if (!a)
1376 return;
1377
1378 /* Attach MPLS attribute unless we already have one */
1379 if (!s->mpls_labels)
1380 {
1381 s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX);
1382 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels);
1383 }
1384
1385 /* Overwrite data in the attribute */
1386 s->mpls_labels->length = 4*lnum;
1387 memcpy(s->mpls_labels->data, labels, 4*lnum);
1388
1389 /* Update next hop entry in rta */
1390 bgp_apply_mpls_labels(s, a, labels, lnum);
1391
1392 /* Attributes were changed, invalidate cached entry */
1393 rta_free(s->cached_rta);
1394 s->cached_rta = NULL;
1395
1396 return;
1397 }
1398
1399 static uint
1400 bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1401 {
1402 byte *pos = buf;
1403
1404 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1405 {
1406 struct bgp_prefix *px = HEAD(buck->prefixes);
1407 struct net_addr_ip4 *net = (void *) px->net;
1408
1409 /* Encode path ID */
1410 if (s->add_path)
1411 {
1412 put_u32(pos, px->path_id);
1413 ADVANCE(pos, size, 4);
1414 }
1415
1416 /* Encode prefix length */
1417 *pos = net->pxlen;
1418 ADVANCE(pos, size, 1);
1419
1420 /* Encode MPLS labels */
1421 if (s->mpls)
1422 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1423
1424 /* Encode prefix body */
1425 ip4_addr a = ip4_hton(net->prefix);
1426 uint b = (net->pxlen + 7) / 8;
1427 memcpy(pos, &a, b);
1428 ADVANCE(pos, size, b);
1429
1430 bgp_free_prefix(s->channel, px);
1431 }
1432
1433 return pos - buf;
1434 }
1435
1436 static void
1437 bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1438 {
1439 while (len)
1440 {
1441 net_addr_ip4 net;
1442 u32 path_id = 0;
1443
1444 /* Decode path ID */
1445 if (s->add_path)
1446 {
1447 if (len < 5)
1448 bgp_parse_error(s, 1);
1449
1450 path_id = get_u32(pos);
1451 ADVANCE(pos, len, 4);
1452 }
1453
1454 /* Decode prefix length */
1455 uint l = *pos;
1456 ADVANCE(pos, len, 1);
1457
1458 if (len < ((l + 7) / 8))
1459 bgp_parse_error(s, 1);
1460
1461 /* Decode MPLS labels */
1462 if (s->mpls)
1463 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1464
1465 if (l > IP4_MAX_PREFIX_LENGTH)
1466 bgp_parse_error(s, 10);
1467
1468 /* Decode prefix body */
1469 ip4_addr addr = IP4_NONE;
1470 uint b = (l + 7) / 8;
1471 memcpy(&addr, pos, b);
1472 ADVANCE(pos, len, b);
1473
1474 net = NET_ADDR_IP4(ip4_ntoh(addr), l);
1475 net_normalize_ip4(&net);
1476
1477 // XXXX validate prefix
1478
1479 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1480 }
1481 }
1482
1483
1484 static uint
1485 bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1486 {
1487 byte *pos = buf;
1488
1489 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1490 {
1491 struct bgp_prefix *px = HEAD(buck->prefixes);
1492 struct net_addr_ip6 *net = (void *) px->net;
1493
1494 /* Encode path ID */
1495 if (s->add_path)
1496 {
1497 put_u32(pos, px->path_id);
1498 ADVANCE(pos, size, 4);
1499 }
1500
1501 /* Encode prefix length */
1502 *pos = net->pxlen;
1503 ADVANCE(pos, size, 1);
1504
1505 /* Encode MPLS labels */
1506 if (s->mpls)
1507 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1508
1509 /* Encode prefix body */
1510 ip6_addr a = ip6_hton(net->prefix);
1511 uint b = (net->pxlen + 7) / 8;
1512 memcpy(pos, &a, b);
1513 ADVANCE(pos, size, b);
1514
1515 bgp_free_prefix(s->channel, px);
1516 }
1517
1518 return pos - buf;
1519 }
1520
1521 static void
1522 bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1523 {
1524 while (len)
1525 {
1526 net_addr_ip6 net;
1527 u32 path_id = 0;
1528
1529 /* Decode path ID */
1530 if (s->add_path)
1531 {
1532 if (len < 5)
1533 bgp_parse_error(s, 1);
1534
1535 path_id = get_u32(pos);
1536 ADVANCE(pos, len, 4);
1537 }
1538
1539 /* Decode prefix length */
1540 uint l = *pos;
1541 ADVANCE(pos, len, 1);
1542
1543 if (len < ((l + 7) / 8))
1544 bgp_parse_error(s, 1);
1545
1546 /* Decode MPLS labels */
1547 if (s->mpls)
1548 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1549
1550 if (l > IP6_MAX_PREFIX_LENGTH)
1551 bgp_parse_error(s, 10);
1552
1553 /* Decode prefix body */
1554 ip6_addr addr = IP6_NONE;
1555 uint b = (l + 7) / 8;
1556 memcpy(&addr, pos, b);
1557 ADVANCE(pos, len, b);
1558
1559 net = NET_ADDR_IP6(ip6_ntoh(addr), l);
1560 net_normalize_ip6(&net);
1561
1562 // XXXX validate prefix
1563
1564 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1565 }
1566 }
1567
1568 static uint
1569 bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1570 {
1571 byte *pos = buf;
1572
1573 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1574 {
1575 struct bgp_prefix *px = HEAD(buck->prefixes);
1576 struct net_addr_vpn4 *net = (void *) px->net;
1577
1578 /* Encode path ID */
1579 if (s->add_path)
1580 {
1581 put_u32(pos, px->path_id);
1582 ADVANCE(pos, size, 4);
1583 }
1584
1585 /* Encode prefix length */
1586 *pos = 64 + net->pxlen;
1587 ADVANCE(pos, size, 1);
1588
1589 /* Encode MPLS labels */
1590 if (s->mpls)
1591 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1592
1593 /* Encode route distinguisher */
1594 put_u64(pos, net->rd);
1595 ADVANCE(pos, size, 8);
1596
1597 /* Encode prefix body */
1598 ip4_addr a = ip4_hton(net->prefix);
1599 uint b = (net->pxlen + 7) / 8;
1600 memcpy(pos, &a, b);
1601 ADVANCE(pos, size, b);
1602
1603 bgp_free_prefix(s->channel, px);
1604 }
1605
1606 return pos - buf;
1607 }
1608
1609 static void
1610 bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1611 {
1612 while (len)
1613 {
1614 net_addr_vpn4 net;
1615 u32 path_id = 0;
1616
1617 /* Decode path ID */
1618 if (s->add_path)
1619 {
1620 if (len < 5)
1621 bgp_parse_error(s, 1);
1622
1623 path_id = get_u32(pos);
1624 ADVANCE(pos, len, 4);
1625 }
1626
1627 /* Decode prefix length */
1628 uint l = *pos;
1629 ADVANCE(pos, len, 1);
1630
1631 if (len < ((l + 7) / 8))
1632 bgp_parse_error(s, 1);
1633
1634 /* Decode MPLS labels */
1635 if (s->mpls)
1636 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1637
1638 /* Decode route distinguisher */
1639 if (l < 64)
1640 bgp_parse_error(s, 1);
1641
1642 u64 rd = get_u64(pos);
1643 ADVANCE(pos, len, 8);
1644 l -= 64;
1645
1646 if (l > IP4_MAX_PREFIX_LENGTH)
1647 bgp_parse_error(s, 10);
1648
1649 /* Decode prefix body */
1650 ip4_addr addr = IP4_NONE;
1651 uint b = (l + 7) / 8;
1652 memcpy(&addr, pos, b);
1653 ADVANCE(pos, len, b);
1654
1655 net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd);
1656 net_normalize_vpn4(&net);
1657
1658 // XXXX validate prefix
1659
1660 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1661 }
1662 }
1663
1664
1665 static uint
1666 bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1667 {
1668 byte *pos = buf;
1669
1670 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1671 {
1672 struct bgp_prefix *px = HEAD(buck->prefixes);
1673 struct net_addr_vpn6 *net = (void *) px->net;
1674
1675 /* Encode path ID */
1676 if (s->add_path)
1677 {
1678 put_u32(pos, px->path_id);
1679 ADVANCE(pos, size, 4);
1680 }
1681
1682 /* Encode prefix length */
1683 *pos = 64 + net->pxlen;
1684 ADVANCE(pos, size, 1);
1685
1686 /* Encode MPLS labels */
1687 if (s->mpls)
1688 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1689
1690 /* Encode route distinguisher */
1691 put_u64(pos, net->rd);
1692 ADVANCE(pos, size, 8);
1693
1694 /* Encode prefix body */
1695 ip6_addr a = ip6_hton(net->prefix);
1696 uint b = (net->pxlen + 7) / 8;
1697 memcpy(pos, &a, b);
1698 ADVANCE(pos, size, b);
1699
1700 bgp_free_prefix(s->channel, px);
1701 }
1702
1703 return pos - buf;
1704 }
1705
1706 static void
1707 bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1708 {
1709 while (len)
1710 {
1711 net_addr_vpn6 net;
1712 u32 path_id = 0;
1713
1714 /* Decode path ID */
1715 if (s->add_path)
1716 {
1717 if (len < 5)
1718 bgp_parse_error(s, 1);
1719
1720 path_id = get_u32(pos);
1721 ADVANCE(pos, len, 4);
1722 }
1723
1724 /* Decode prefix length */
1725 uint l = *pos;
1726 ADVANCE(pos, len, 1);
1727
1728 if (len < ((l + 7) / 8))
1729 bgp_parse_error(s, 1);
1730
1731 /* Decode MPLS labels */
1732 if (s->mpls)
1733 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1734
1735 /* Decode route distinguisher */
1736 if (l < 64)
1737 bgp_parse_error(s, 1);
1738
1739 u64 rd = get_u64(pos);
1740 ADVANCE(pos, len, 8);
1741 l -= 64;
1742
1743 if (l > IP6_MAX_PREFIX_LENGTH)
1744 bgp_parse_error(s, 10);
1745
1746 /* Decode prefix body */
1747 ip6_addr addr = IP6_NONE;
1748 uint b = (l + 7) / 8;
1749 memcpy(&addr, pos, b);
1750 ADVANCE(pos, len, b);
1751
1752 net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd);
1753 net_normalize_vpn6(&net);
1754
1755 // XXXX validate prefix
1756
1757 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1758 }
1759 }
1760
1761
1762 static uint
1763 bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1764 {
1765 byte *pos = buf;
1766
1767 while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1768 {
1769 struct bgp_prefix *px = HEAD(buck->prefixes);
1770 struct net_addr_flow4 *net = (void *) px->net;
1771 uint flen = net->length - sizeof(net_addr_flow4);
1772
1773 /* Encode path ID */
1774 if (s->add_path)
1775 {
1776 put_u32(pos, px->path_id);
1777 ADVANCE(pos, size, 4);
1778 }
1779
1780 if (flen > size)
1781 break;
1782
1783 /* Copy whole flow data including length */
1784 memcpy(pos, net->data, flen);
1785 ADVANCE(pos, size, flen);
1786
1787 bgp_free_prefix(s->channel, px);
1788 }
1789
1790 return pos - buf;
1791 }
1792
1793 static void
1794 bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1795 {
1796 while (len)
1797 {
1798 u32 path_id = 0;
1799
1800 /* Decode path ID */
1801 if (s->add_path)
1802 {
1803 if (len < 4)
1804 bgp_parse_error(s, 1);
1805
1806 path_id = get_u32(pos);
1807 ADVANCE(pos, len, 4);
1808 }
1809
1810 if (len < 2)
1811 bgp_parse_error(s, 1);
1812
1813 /* Decode flow length */
1814 uint hlen = flow_hdr_length(pos);
1815 uint dlen = flow_read_length(pos);
1816 uint flen = hlen + dlen;
1817 byte *data = pos + hlen;
1818
1819 if (len < flen)
1820 bgp_parse_error(s, 1);
1821
1822 /* Validate flow data */
1823 enum flow_validated_state r = flow4_validate(data, dlen);
1824 if (r != FLOW_ST_VALID)
1825 {
1826 log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1827 bgp_parse_error(s, 1);
1828 }
1829
1830 ip4_addr px = IP4_NONE;
1831 uint pxlen = 0;
1832
1833 /* Decode dst prefix */
1834 if (data[0] == FLOW_TYPE_DST_PREFIX)
1835 {
1836 px = flow_read_ip4_part(data);
1837 pxlen = flow_read_pxlen(data);
1838 }
1839
1840 /* Prepare the flow */
1841 net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen);
1842 net_fill_flow4(n, px, pxlen, pos, flen);
1843 ADVANCE(pos, len, flen);
1844
1845 bgp_rte_update(s, n, path_id, a);
1846 }
1847 }
1848
1849
1850 static uint
1851 bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1852 {
1853 byte *pos = buf;
1854
1855 while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1856 {
1857 struct bgp_prefix *px = HEAD(buck->prefixes);
1858 struct net_addr_flow6 *net = (void *) px->net;
1859 uint flen = net->length - sizeof(net_addr_flow6);
1860
1861 /* Encode path ID */
1862 if (s->add_path)
1863 {
1864 put_u32(pos, px->path_id);
1865 ADVANCE(pos, size, 4);
1866 }
1867
1868 if (flen > size)
1869 break;
1870
1871 /* Copy whole flow data including length */
1872 memcpy(pos, net->data, flen);
1873 ADVANCE(pos, size, flen);
1874
1875 bgp_free_prefix(s->channel, px);
1876 }
1877
1878 return pos - buf;
1879 }
1880
1881 static void
1882 bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1883 {
1884 while (len)
1885 {
1886 u32 path_id = 0;
1887
1888 /* Decode path ID */
1889 if (s->add_path)
1890 {
1891 if (len < 4)
1892 bgp_parse_error(s, 1);
1893
1894 path_id = get_u32(pos);
1895 ADVANCE(pos, len, 4);
1896 }
1897
1898 if (len < 2)
1899 bgp_parse_error(s, 1);
1900
1901 /* Decode flow length */
1902 uint hlen = flow_hdr_length(pos);
1903 uint dlen = flow_read_length(pos);
1904 uint flen = hlen + dlen;
1905 byte *data = pos + hlen;
1906
1907 if (len < flen)
1908 bgp_parse_error(s, 1);
1909
1910 /* Validate flow data */
1911 enum flow_validated_state r = flow6_validate(data, dlen);
1912 if (r != FLOW_ST_VALID)
1913 {
1914 log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1915 bgp_parse_error(s, 1);
1916 }
1917
1918 ip6_addr px = IP6_NONE;
1919 uint pxlen = 0;
1920
1921 /* Decode dst prefix */
1922 if (data[0] == FLOW_TYPE_DST_PREFIX)
1923 {
1924 px = flow_read_ip6_part(data);
1925 pxlen = flow_read_pxlen(data);
1926 }
1927
1928 /* Prepare the flow */
1929 net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen);
1930 net_fill_flow6(n, px, pxlen, pos, flen);
1931 ADVANCE(pos, len, flen);
1932
1933 bgp_rte_update(s, n, path_id, a);
1934 }
1935 }
1936
1937
1938 static const struct bgp_af_desc bgp_af_table[] = {
1939 {
1940 .afi = BGP_AF_IPV4,
1941 .net = NET_IP4,
1942 .name = "ipv4",
1943 .encode_nlri = bgp_encode_nlri_ip4,
1944 .decode_nlri = bgp_decode_nlri_ip4,
1945 .encode_next_hop = bgp_encode_next_hop_ip,
1946 .decode_next_hop = bgp_decode_next_hop_ip,
1947 .update_next_hop = bgp_update_next_hop_ip,
1948 },
1949 {
1950 .afi = BGP_AF_IPV4_MC,
1951 .net = NET_IP4,
1952 .name = "ipv4-mc",
1953 .encode_nlri = bgp_encode_nlri_ip4,
1954 .decode_nlri = bgp_decode_nlri_ip4,
1955 .encode_next_hop = bgp_encode_next_hop_ip,
1956 .decode_next_hop = bgp_decode_next_hop_ip,
1957 .update_next_hop = bgp_update_next_hop_ip,
1958 },
1959 {
1960 .afi = BGP_AF_IPV4_MPLS,
1961 .net = NET_IP4,
1962 .mpls = 1,
1963 .name = "ipv4-mpls",
1964 .encode_nlri = bgp_encode_nlri_ip4,
1965 .decode_nlri = bgp_decode_nlri_ip4,
1966 .encode_next_hop = bgp_encode_next_hop_ip,
1967 .decode_next_hop = bgp_decode_next_hop_ip,
1968 .update_next_hop = bgp_update_next_hop_ip,
1969 },
1970 {
1971 .afi = BGP_AF_IPV6,
1972 .net = NET_IP6,
1973 .name = "ipv6",
1974 .encode_nlri = bgp_encode_nlri_ip6,
1975 .decode_nlri = bgp_decode_nlri_ip6,
1976 .encode_next_hop = bgp_encode_next_hop_ip,
1977 .decode_next_hop = bgp_decode_next_hop_ip,
1978 .update_next_hop = bgp_update_next_hop_ip,
1979 },
1980 {
1981 .afi = BGP_AF_IPV6_MC,
1982 .net = NET_IP6,
1983 .name = "ipv6-mc",
1984 .encode_nlri = bgp_encode_nlri_ip6,
1985 .decode_nlri = bgp_decode_nlri_ip6,
1986 .encode_next_hop = bgp_encode_next_hop_ip,
1987 .decode_next_hop = bgp_decode_next_hop_ip,
1988 .update_next_hop = bgp_update_next_hop_ip,
1989 },
1990 {
1991 .afi = BGP_AF_IPV6_MPLS,
1992 .net = NET_IP6,
1993 .mpls = 1,
1994 .name = "ipv6-mpls",
1995 .encode_nlri = bgp_encode_nlri_ip6,
1996 .decode_nlri = bgp_decode_nlri_ip6,
1997 .encode_next_hop = bgp_encode_next_hop_ip,
1998 .decode_next_hop = bgp_decode_next_hop_ip,
1999 .update_next_hop = bgp_update_next_hop_ip,
2000 },
2001 {
2002 .afi = BGP_AF_VPN4_MPLS,
2003 .net = NET_VPN4,
2004 .mpls = 1,
2005 .name = "vpn4-mpls",
2006 .encode_nlri = bgp_encode_nlri_vpn4,
2007 .decode_nlri = bgp_decode_nlri_vpn4,
2008 .encode_next_hop = bgp_encode_next_hop_vpn,
2009 .decode_next_hop = bgp_decode_next_hop_vpn,
2010 .update_next_hop = bgp_update_next_hop_ip,
2011 },
2012 {
2013 .afi = BGP_AF_VPN6_MPLS,
2014 .net = NET_VPN6,
2015 .mpls = 1,
2016 .name = "vpn6-mpls",
2017 .encode_nlri = bgp_encode_nlri_vpn6,
2018 .decode_nlri = bgp_decode_nlri_vpn6,
2019 .encode_next_hop = bgp_encode_next_hop_vpn,
2020 .decode_next_hop = bgp_decode_next_hop_vpn,
2021 .update_next_hop = bgp_update_next_hop_ip,
2022 },
2023 {
2024 .afi = BGP_AF_VPN4_MC,
2025 .net = NET_VPN4,
2026 .name = "vpn4-mc",
2027 .encode_nlri = bgp_encode_nlri_vpn4,
2028 .decode_nlri = bgp_decode_nlri_vpn4,
2029 .encode_next_hop = bgp_encode_next_hop_vpn,
2030 .decode_next_hop = bgp_decode_next_hop_vpn,
2031 .update_next_hop = bgp_update_next_hop_ip,
2032 },
2033 {
2034 .afi = BGP_AF_VPN6_MC,
2035 .net = NET_VPN6,
2036 .name = "vpn6-mc",
2037 .encode_nlri = bgp_encode_nlri_vpn6,
2038 .decode_nlri = bgp_decode_nlri_vpn6,
2039 .encode_next_hop = bgp_encode_next_hop_vpn,
2040 .decode_next_hop = bgp_decode_next_hop_vpn,
2041 .update_next_hop = bgp_update_next_hop_ip,
2042 },
2043 {
2044 .afi = BGP_AF_FLOW4,
2045 .net = NET_FLOW4,
2046 .no_igp = 1,
2047 .name = "flow4",
2048 .encode_nlri = bgp_encode_nlri_flow4,
2049 .decode_nlri = bgp_decode_nlri_flow4,
2050 .encode_next_hop = bgp_encode_next_hop_none,
2051 .decode_next_hop = bgp_decode_next_hop_none,
2052 .update_next_hop = bgp_update_next_hop_none,
2053 },
2054 {
2055 .afi = BGP_AF_FLOW6,
2056 .net = NET_FLOW6,
2057 .no_igp = 1,
2058 .name = "flow6",
2059 .encode_nlri = bgp_encode_nlri_flow6,
2060 .decode_nlri = bgp_decode_nlri_flow6,
2061 .encode_next_hop = bgp_encode_next_hop_none,
2062 .decode_next_hop = bgp_decode_next_hop_none,
2063 .update_next_hop = bgp_update_next_hop_none,
2064 },
2065 };
2066
2067 const struct bgp_af_desc *
2068 bgp_get_af_desc(u32 afi)
2069 {
2070 uint i;
2071 for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++)
2072 if (bgp_af_table[i].afi == afi)
2073 return &bgp_af_table[i];
2074
2075 return NULL;
2076 }
2077
2078 static inline uint
2079 bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2080 {
2081 return s->channel->desc->encode_nlri(s, buck, buf, end - buf);
2082 }
2083
2084 static inline uint
2085 bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf)
2086 {
2087 return s->channel->desc->encode_next_hop(s, nh, buf, 255);
2088 }
2089
2090 void
2091 bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to)
2092 {
2093 s->channel->desc->update_next_hop(s, a, to);
2094 }
2095
2096 #define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
2097
2098 static byte *
2099 bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2100 {
2101 /*
2102 * 2 B Withdrawn Routes Length (zero)
2103 * --- IPv4 Withdrawn Routes NLRI (unused)
2104 * 2 B Total Path Attribute Length
2105 * var Path Attributes
2106 * var IPv4 Network Layer Reachability Information
2107 */
2108
2109 int lr, la;
2110
2111 la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH);
2112 if (la < 0)
2113 {
2114 /* Attribute list too long */
2115 bgp_withdraw_bucket(s->channel, buck);
2116 return NULL;
2117 }
2118
2119 put_u16(buf+0, 0);
2120 put_u16(buf+2, la);
2121
2122 lr = bgp_encode_nlri(s, buck, buf+4+la, end);
2123
2124 return buf+4+la+lr;
2125 }
2126
2127 static byte *
2128 bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2129 {
2130 /*
2131 * 2 B IPv4 Withdrawn Routes Length (zero)
2132 * --- IPv4 Withdrawn Routes NLRI (unused)
2133 * 2 B Total Path Attribute Length
2134 * 1 B MP_REACH_NLRI hdr - Attribute Flags
2135 * 1 B MP_REACH_NLRI hdr - Attribute Type Code
2136 * 2 B MP_REACH_NLRI hdr - Length of Attribute Data
2137 * 2 B MP_REACH_NLRI data - Address Family Identifier
2138 * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier
2139 * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address
2140 * var MP_REACH_NLRI data - Network Address of Next Hop
2141 * 1 B MP_REACH_NLRI data - Reserved (zero)
2142 * var MP_REACH_NLRI data - Network Layer Reachability Information
2143 * var Rest of Path Attributes
2144 * --- IPv4 Network Layer Reachability Information (unused)
2145 */
2146
2147 int lh, lr, la; /* Lengths of next hop, NLRI and attributes */
2148
2149 /* Begin of MP_REACH_NLRI atribute */
2150 buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
2151 buf[5] = BA_MP_REACH_NLRI;
2152 put_u16(buf+6, 0); /* Will be fixed later */
2153 put_af3(buf+8, s->channel->afi);
2154 byte *pos = buf+11;
2155
2156 /* Encode attributes to temporary buffer */
2157 byte *abuf = alloca(MAX_ATTRS_LENGTH);
2158 la = bgp_encode_attrs(s, buck->eattrs, abuf, abuf + MAX_ATTRS_LENGTH);
2159 if (la < 0)
2160 {
2161 /* Attribute list too long */
2162 bgp_withdraw_bucket(s->channel, buck);
2163 return NULL;
2164 }
2165
2166 /* Encode the next hop */
2167 lh = bgp_encode_next_hop(s, s->mp_next_hop, pos+1);
2168 *pos = lh;
2169 pos += 1+lh;
2170
2171 /* Reserved field */
2172 *pos++ = 0;
2173
2174 /* Encode the NLRI */
2175 lr = bgp_encode_nlri(s, buck, pos, end - la);
2176 pos += lr;
2177
2178 /* End of MP_REACH_NLRI atribute, update data length */
2179 put_u16(buf+6, pos-buf-8);
2180
2181 /* Copy remaining attributes */
2182 memcpy(pos, abuf, la);
2183 pos += la;
2184
2185 /* Initial UPDATE fields */
2186 put_u16(buf+0, 0);
2187 put_u16(buf+2, pos-buf-4);
2188
2189 return pos;
2190 }
2191
2192 #undef MAX_ATTRS_LENGTH
2193
2194 static byte *
2195 bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2196 {
2197 /*
2198 * 2 B Withdrawn Routes Length
2199 * var IPv4 Withdrawn Routes NLRI
2200 * 2 B Total Path Attribute Length (zero)
2201 * --- Path Attributes (unused)
2202 * --- IPv4 Network Layer Reachability Information (unused)
2203 */
2204
2205 uint len = bgp_encode_nlri(s, buck, buf+2, end);
2206
2207 put_u16(buf+0, len);
2208 put_u16(buf+2+len, 0);
2209
2210 return buf+4+len;
2211 }
2212
2213 static byte *
2214 bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2215 {
2216 /*
2217 * 2 B Withdrawn Routes Length (zero)
2218 * --- IPv4 Withdrawn Routes NLRI (unused)
2219 * 2 B Total Path Attribute Length
2220 * 1 B MP_UNREACH_NLRI hdr - Attribute Flags
2221 * 1 B MP_UNREACH_NLRI hdr - Attribute Type Code
2222 * 2 B MP_UNREACH_NLRI hdr - Length of Attribute Data
2223 * 2 B MP_UNREACH_NLRI data - Address Family Identifier
2224 * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier
2225 * var MP_UNREACH_NLRI data - Network Layer Reachability Information
2226 * --- IPv4 Network Layer Reachability Information (unused)
2227 */
2228
2229 uint len = bgp_encode_nlri(s, buck, buf+11, end);
2230
2231 put_u16(buf+0, 0);
2232 put_u16(buf+2, 7+len);
2233
2234 /* Begin of MP_UNREACH_NLRI atribute */
2235 buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
2236 buf[5] = BA_MP_UNREACH_NLRI;
2237 put_u16(buf+6, 3+len);
2238 put_af3(buf+8, s->channel->afi);
2239
2240 return buf+11+len;
2241 }
2242
2243 static byte *
2244 bgp_create_update(struct bgp_channel *c, byte *buf)
2245 {
2246 struct bgp_proto *p = (void *) c->c.proto;
2247 struct bgp_bucket *buck;
2248 byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH);
2249 byte *res = NULL;
2250
2251 again: ;
2252
2253 /* Initialize write state */
2254 struct bgp_write_state s = {
2255 .proto = p,
2256 .channel = c,
2257 .pool = bgp_linpool,
2258 .mp_reach = (c->afi != BGP_AF_IPV4) || c->ext_next_hop,
2259 .as4_session = p->as4_session,
2260 .add_path = c->add_path_tx,
2261 .mpls = c->desc->mpls,
2262 };
2263
2264 /* Try unreachable bucket */
2265 if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
2266 {
2267 res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ?
2268 bgp_create_ip_unreach(&s, buck, buf, end):
2269 bgp_create_mp_unreach(&s, buck, buf, end);
2270
2271 goto done;
2272 }
2273
2274 /* Try reachable buckets */
2275 if (!EMPTY_LIST(c->bucket_queue))
2276 {
2277 buck = HEAD(c->bucket_queue);
2278
2279 /* Cleanup empty buckets */
2280 if (EMPTY_LIST(buck->prefixes))
2281 {
2282 bgp_free_bucket(c, buck);
2283 goto again;
2284 }
2285
2286 res = !s.mp_reach ?
2287 bgp_create_ip_reach(&s, buck, buf, end):
2288 bgp_create_mp_reach(&s, buck, buf, end);
2289
2290 if (EMPTY_LIST(buck->prefixes))
2291 bgp_free_bucket(c, buck);
2292 else
2293 bgp_defer_bucket(c, buck);
2294
2295 if (!res)
2296 goto again;
2297
2298 goto done;
2299 }
2300
2301 /* No more prefixes to send */
2302 return NULL;
2303
2304 done:
2305 BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
2306 p->stats.tx_updates++;
2307 lp_flush(s.pool);
2308
2309 return res;
2310 }
2311
2312 static byte *
2313 bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf)
2314 {
2315 /* Empty update packet */
2316 put_u32(buf, 0);
2317
2318 return buf+4;
2319 }
2320
2321 static byte *
2322 bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf)
2323 {
2324 put_u16(buf+0, 0);
2325 put_u16(buf+2, 6); /* length 4--9 */
2326
2327 /* Empty MP_UNREACH_NLRI atribute */
2328 buf[4] = BAF_OPTIONAL;
2329 buf[5] = BA_MP_UNREACH_NLRI;
2330 buf[6] = 3; /* Length 7--9 */
2331 put_af3(buf+7, c->afi);
2332
2333 return buf+10;
2334 }
2335
2336 static byte *
2337 bgp_create_end_mark(struct bgp_channel *c, byte *buf)
2338 {
2339 struct bgp_proto *p = (void *) c->c.proto;
2340
2341 BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");
2342 p->stats.tx_updates++;
2343
2344 return (c->afi == BGP_AF_IPV4) ?
2345 bgp_create_ip_end_mark(c, buf):
2346 bgp_create_mp_end_mark(c, buf);
2347 }
2348
2349 static inline void
2350 bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi)
2351 {
2352 struct bgp_proto *p = s->proto;
2353 struct bgp_channel *c = bgp_get_channel(p, afi);
2354
2355 BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
2356
2357 if (!c)
2358 DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2359
2360 if (c->load_state == BFS_LOADING)
2361 c->load_state = BFS_NONE;
2362
2363 if (p->p.gr_recovery)
2364 channel_graceful_restart_unlock(&c->c);
2365
2366 if (c->gr_active)
2367 bgp_graceful_restart_done(c);
2368 }
2369
2370 static inline void
2371 bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len)
2372 {
2373 struct bgp_channel *c = bgp_get_channel(s->proto, afi);
2374 rta *a = NULL;
2375
2376 if (!c)
2377 DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2378
2379 s->channel = c;
2380 s->add_path = c->add_path_rx;
2381 s->mpls = c->desc->mpls;
2382
2383 s->last_id = 0;
2384 s->last_src = s->proto->p.main_source;
2385
2386 /*
2387 * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
2388 * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
2389 * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
2390 * decode_next_hop hooks) by restoring a->eattrs afterwards.
2391 */
2392
2393 if (ea)
2394 {
2395 a = allocz(RTA_MAX_SIZE);
2396
2397 a->source = RTS_BGP;
2398 a->scope = SCOPE_UNIVERSE;
2399 a->from = s->proto->remote_ip;
2400 a->eattrs = ea;
2401
2402 c->desc->decode_next_hop(s, nh, nh_len, a);
2403 bgp_finish_attrs(s, a);
2404
2405 /* Handle withdraw during next hop decoding */
2406 if (s->err_withdraw)
2407 a = NULL;
2408 }
2409
2410 c->desc->decode_nlri(s, nlri, len, a);
2411
2412 rta_free(s->cached_rta);
2413 s->cached_rta = NULL;
2414 }
2415
2416 static void
2417 bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
2418 {
2419 struct bgp_proto *p = conn->bgp;
2420 ea_list *ea = NULL;
2421
2422 BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
2423 p->last_rx_update = current_time();
2424 p->stats.rx_updates++;
2425
2426 /* Workaround for some BGP implementations that skip initial KEEPALIVE */
2427 if (conn->state == BS_OPENCONFIRM)
2428 bgp_conn_enter_established_state(conn);
2429
2430 if (conn->state != BS_ESTABLISHED)
2431 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2432
2433 bgp_start_timer(conn->hold_timer, conn->hold_time);
2434
2435 /* Initialize parse state */
2436 struct bgp_parse_state s = {
2437 .proto = p,
2438 .pool = bgp_linpool,
2439 .as4_session = p->as4_session,
2440 };
2441
2442 /* Parse error handler */
2443 if (setjmp(s.err_jmpbuf))
2444 {
2445 bgp_error(conn, 3, s.err_subcode, NULL, 0);
2446 goto done;
2447 }
2448
2449 /* Check minimal length */
2450 if (len < 23)
2451 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2452
2453 /* Skip fixed header */
2454 uint pos = 19;
2455
2456 /*
2457 * UPDATE message format
2458 *
2459 * 2 B IPv4 Withdrawn Routes Length
2460 * var IPv4 Withdrawn Routes NLRI
2461 * 2 B Total Path Attribute Length
2462 * var Path Attributes
2463 * var IPv4 Reachable Routes NLRI
2464 */
2465
2466 s.ip_unreach_len = get_u16(pkt + pos);
2467 s.ip_unreach_nlri = pkt + pos + 2;
2468 pos += 2 + s.ip_unreach_len;
2469
2470 if (pos + 2 > len)
2471 bgp_parse_error(&s, 1);
2472
2473 s.attr_len = get_u16(pkt + pos);
2474 s.attrs = pkt + pos + 2;
2475 pos += 2 + s.attr_len;
2476
2477 if (pos > len)
2478 bgp_parse_error(&s, 1);
2479
2480 s.ip_reach_len = len - pos;
2481 s.ip_reach_nlri = pkt + pos;
2482
2483
2484 if (s.attr_len)
2485 ea = bgp_decode_attrs(&s, s.attrs, s.attr_len);
2486 else
2487 ea = NULL;
2488
2489 /* Check for End-of-RIB marker */
2490 if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len)
2491 { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; }
2492
2493 /* Check for MP End-of-RIB marker */
2494 if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len &&
2495 !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af)
2496 { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; }
2497
2498 if (s.ip_unreach_len)
2499 bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0);
2500
2501 if (s.mp_unreach_len)
2502 bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0);
2503
2504 if (s.ip_reach_len)
2505 bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len,
2506 ea, s.ip_next_hop_data, s.ip_next_hop_len);
2507
2508 if (s.mp_reach_len)
2509 bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len,
2510 ea, s.mp_next_hop_data, s.mp_next_hop_len);
2511
2512 done:
2513 rta_free(s.cached_rta);
2514 lp_flush(s.pool);
2515 return;
2516 }
2517
2518 static uint
2519 bgp_find_update_afi(byte *pos, uint len)
2520 {
2521 /*
2522 * This is stripped-down version of bgp_rx_update(), bgp_decode_attrs() and
2523 * bgp_decode_mp_[un]reach_nlri() used by MRT code in order to find out which
2524 * AFI/SAFI is associated with incoming UPDATE. Returns 0 for framing errors.
2525 */
2526 if (len < 23)
2527 return 0;
2528
2529 /* Assume there is no withrawn NLRI, read lengths and move to attribute list */
2530 uint wlen = get_u16(pos + 19);
2531 uint alen = get_u16(pos + 21);
2532 ADVANCE(pos, len, 23);
2533
2534 /* Either non-zero withdrawn NLRI, non-zero reachable NLRI, or IPv4 End-of-RIB */
2535 if ((wlen != 0) || (alen < len) || !alen)
2536 return BGP_AF_IPV4;
2537
2538 if (alen > len)
2539 return 0;
2540
2541 /* Process attribute list (alen == len) */
2542 while (len)
2543 {
2544 if (len < 2)
2545 return 0;
2546
2547 uint flags = pos[0];
2548 uint code = pos[1];
2549 ADVANCE(pos, len, 2);
2550
2551 uint ll = !(flags & BAF_EXT_LEN) ? 1 : 2;
2552 if (len < ll)
2553 return 0;
2554
2555 /* Read attribute length and move to attribute body */
2556 alen = (ll == 1) ? get_u8(pos) : get_u16(pos);
2557 ADVANCE(pos, len, ll);
2558
2559 if (len < alen)
2560 return 0;
2561
2562 /* Found MP NLRI */
2563 if ((code == BA_MP_REACH_NLRI) || (code == BA_MP_UNREACH_NLRI))
2564 {
2565 if (alen < 3)
2566 return 0;
2567
2568 return BGP_AF(get_u16(pos), pos[2]);
2569 }
2570
2571 /* Move to the next attribute */
2572 ADVANCE(pos, len, alen);
2573 }
2574
2575 /* No basic or MP NLRI, but there are some attributes -> error */
2576 return 0;
2577 }
2578
2579
2580 /*
2581 * ROUTE-REFRESH
2582 */
2583
2584 static inline byte *
2585 bgp_create_route_refresh(struct bgp_channel *c, byte *buf)
2586 {
2587 struct bgp_proto *p = (void *) c->c.proto;
2588
2589 BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");
2590
2591 /* Original route refresh request, RFC 2918 */
2592 put_af4(buf, c->afi);
2593 buf[2] = BGP_RR_REQUEST;
2594
2595 return buf+4;
2596 }
2597
2598 static inline byte *
2599 bgp_create_begin_refresh(struct bgp_channel *c, byte *buf)
2600 {
2601 struct bgp_proto *p = (void *) c->c.proto;
2602
2603 BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");
2604
2605 /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
2606 put_af4(buf, c->afi);
2607 buf[2] = BGP_RR_BEGIN;
2608
2609 return buf+4;
2610 }
2611
2612 static inline byte *
2613 bgp_create_end_refresh(struct bgp_channel *c, byte *buf)
2614 {
2615 struct bgp_proto *p = (void *) c->c.proto;
2616
2617 BGP_TRACE(D_PACKETS, "Sending END-OF-RR");
2618
2619 /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
2620 put_af4(buf, c->afi);
2621 buf[2] = BGP_RR_END;
2622
2623 return buf+4;
2624 }
2625
2626 static void
2627 bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
2628 {
2629 struct bgp_proto *p = conn->bgp;
2630
2631 if (conn->state != BS_ESTABLISHED)
2632 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2633
2634 if (!conn->local_caps->route_refresh)
2635 { bgp_error(conn, 1, 3, pkt+18, 1); return; }
2636
2637 if (len < (BGP_HEADER_LENGTH + 4))
2638 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2639
2640 if (len > (BGP_HEADER_LENGTH + 4))
2641 { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
2642
2643 struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19));
2644 if (!c)
2645 {
2646 log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
2647 p->p.name, pkt[21], get_u16(pkt+19), pkt[22]);
2648 return;
2649 }
2650
2651 /* RFC 7313 redefined reserved field as RR message subtype */
2652 uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST;
2653
2654 switch (subtype)
2655 {
2656 case BGP_RR_REQUEST:
2657 BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
2658 channel_request_feeding(&c->c);
2659 break;
2660
2661 case BGP_RR_BEGIN:
2662 BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
2663 bgp_refresh_begin(c);
2664 break;
2665
2666 case BGP_RR_END:
2667 BGP_TRACE(D_PACKETS, "Got END-OF-RR");
2668 bgp_refresh_end(c);
2669 break;
2670
2671 default:
2672 log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
2673 p->p.name, subtype);
2674 break;
2675 }
2676 }
2677
2678 static inline struct bgp_channel *
2679 bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn)
2680 {
2681 uint i = conn->last_channel;
2682
2683 /* Try the last channel, but at most several times */
2684 if ((conn->channels_to_send & (1 << i)) &&
2685 (conn->last_channel_count < 16))
2686 goto found;
2687
2688 /* Find channel with non-zero channels_to_send */
2689 do
2690 {
2691 i++;
2692 if (i >= p->channel_count)
2693 i = 0;
2694 }
2695 while (! (conn->channels_to_send & (1 << i)));
2696
2697 /* Use that channel */
2698 conn->last_channel = i;
2699 conn->last_channel_count = 0;
2700
2701 found:
2702 conn->last_channel_count++;
2703 return p->channel_map[i];
2704 }
2705
2706 static inline int
2707 bgp_send(struct bgp_conn *conn, uint type, uint len)
2708 {
2709 sock *sk = conn->sk;
2710 byte *buf = sk->tbuf;
2711
2712 conn->bgp->stats.tx_messages++;
2713 conn->bgp->stats.tx_bytes += len;
2714
2715 memset(buf, 0xff, 16); /* Marker */
2716 put_u16(buf+16, len);
2717 buf[18] = type;
2718
2719 return sk_send(sk, len);
2720 }
2721
2722 /**
2723 * bgp_fire_tx - transmit packets
2724 * @conn: connection
2725 *
2726 * Whenever the transmit buffers of the underlying TCP connection
2727 * are free and we have any packets queued for sending, the socket functions
2728 * call bgp_fire_tx() which takes care of selecting the highest priority packet
2729 * queued (Notification > Keepalive > Open > Update), assembling its header
2730 * and body and sending it to the connection.
2731 */
2732 static int
2733 bgp_fire_tx(struct bgp_conn *conn)
2734 {
2735 struct bgp_proto *p = conn->bgp;
2736 struct bgp_channel *c;
2737 byte *buf, *pkt, *end;
2738 uint s;
2739
2740 if (!conn->sk)
2741 return 0;
2742
2743 buf = conn->sk->tbuf;
2744 pkt = buf + BGP_HEADER_LENGTH;
2745 s = conn->packets_to_send;
2746
2747 if (s & (1 << PKT_SCHEDULE_CLOSE))
2748 {
2749 /* We can finally close connection and enter idle state */
2750 bgp_conn_enter_idle_state(conn);
2751 return 0;
2752 }
2753 if (s & (1 << PKT_NOTIFICATION))
2754 {
2755 conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE;
2756 end = bgp_create_notification(conn, pkt);
2757 return bgp_send(conn, PKT_NOTIFICATION, end - buf);
2758 }
2759 else if (s & (1 << PKT_OPEN))
2760 {
2761 conn->packets_to_send &= ~(1 << PKT_OPEN);
2762 end = bgp_create_open(conn, pkt);
2763 return bgp_send(conn, PKT_OPEN, end - buf);
2764 }
2765 else if (s & (1 << PKT_KEEPALIVE))
2766 {
2767 conn->packets_to_send &= ~(1 << PKT_KEEPALIVE);
2768 BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
2769 bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
2770 return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH);
2771 }
2772 else while (conn->channels_to_send)
2773 {
2774 c = bgp_get_channel_to_send(p, conn);
2775 s = c->packets_to_send;
2776
2777 if (s & (1 << PKT_ROUTE_REFRESH))
2778 {
2779 c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH);
2780 end = bgp_create_route_refresh(c, pkt);
2781 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2782 }
2783 else if (s & (1 << PKT_BEGIN_REFRESH))
2784 {
2785 /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
2786 c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH);
2787 end = bgp_create_begin_refresh(c, pkt);
2788 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2789 }
2790 else if (s & (1 << PKT_UPDATE))
2791 {
2792 end = bgp_create_update(c, pkt);
2793 if (end)
2794 return bgp_send(conn, PKT_UPDATE, end - buf);
2795
2796 /* No update to send, perhaps we need to send End-of-RIB or EoRR */
2797 c->packets_to_send = 0;
2798 conn->channels_to_send &= ~(1 << c->index);
2799
2800 if (c->feed_state == BFS_LOADED)
2801 {
2802 c->feed_state = BFS_NONE;
2803 end = bgp_create_end_mark(c, pkt);
2804 return bgp_send(conn, PKT_UPDATE, end - buf);
2805 }
2806
2807 else if (c->feed_state == BFS_REFRESHED)
2808 {
2809 c->feed_state = BFS_NONE;
2810 end = bgp_create_end_refresh(c, pkt);
2811 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2812 }
2813 }
2814 else if (s)
2815 bug("Channel packets_to_send: %x", s);
2816
2817 c->packets_to_send = 0;
2818 conn->channels_to_send &= ~(1 << c->index);
2819 }
2820
2821 return 0;
2822 }
2823
2824 /**
2825 * bgp_schedule_packet - schedule a packet for transmission
2826 * @conn: connection
2827 * @c: channel
2828 * @type: packet type
2829 *
2830 * Schedule a packet of type @type to be sent as soon as possible.
2831 */
2832 void
2833 bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
2834 {
2835 ASSERT(conn->sk);
2836
2837 DBG("BGP: Scheduling packet type %d\n", type);
2838
2839 if (c)
2840 {
2841 if (! conn->channels_to_send)
2842 {
2843 conn->last_channel = c->index;
2844 conn->last_channel_count = 0;
2845 }
2846
2847 c->packets_to_send |= 1 << type;
2848 conn->channels_to_send |= 1 << c->index;
2849 }
2850 else
2851 conn->packets_to_send |= 1 << type;
2852
2853 if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev))
2854 ev_schedule(conn->tx_ev);
2855 }
2856 void
2857 bgp_kick_tx(void *vconn)
2858 {
2859 struct bgp_conn *conn = vconn;
2860
2861 DBG("BGP: kicking TX\n");
2862 uint max = 1024;
2863 while (--max && (bgp_fire_tx(conn) > 0))
2864 ;
2865
2866 if (!max && !ev_active(conn->tx_ev))
2867 ev_schedule(conn->tx_ev);
2868 }
2869
2870 void
2871 bgp_tx(sock *sk)
2872 {
2873 struct bgp_conn *conn = sk->data;
2874
2875 DBG("BGP: TX hook\n");
2876 uint max = 1024;
2877 while (--max && (bgp_fire_tx(conn) > 0))
2878 ;
2879
2880 if (!max && !ev_active(conn->tx_ev))
2881 ev_schedule(conn->tx_ev);
2882 }
2883
2884
2885 static struct {
2886 byte major, minor;
2887 byte *msg;
2888 } bgp_msg_table[] = {
2889 { 1, 0, "Invalid message header" },
2890 { 1, 1, "Connection not synchronized" },
2891 { 1, 2, "Bad message length" },
2892 { 1, 3, "Bad message type" },
2893 { 2, 0, "Invalid OPEN message" },
2894 { 2, 1, "Unsupported version number" },
2895 { 2, 2, "Bad peer AS" },
2896 { 2, 3, "Bad BGP identifier" },
2897 { 2, 4, "Unsupported optional parameter" },
2898 { 2, 5, "Authentication failure" },
2899 { 2, 6, "Unacceptable hold time" },
2900 { 2, 7, "Required capability missing" }, /* [RFC5492] */
2901 { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
2902 { 3, 0, "Invalid UPDATE message" },
2903 { 3, 1, "Malformed attribute list" },
2904 { 3, 2, "Unrecognized well-known attribute" },
2905 { 3, 3, "Missing mandatory attribute" },
2906 { 3, 4, "Invalid attribute flags" },
2907 { 3, 5, "Invalid attribute length" },
2908 { 3, 6, "Invalid ORIGIN attribute" },
2909 { 3, 7, "AS routing loop" }, /* Deprecated */
2910 { 3, 8, "Invalid NEXT_HOP attribute" },
2911 { 3, 9, "Optional attribute error" },
2912 { 3, 10, "Invalid network field" },
2913 { 3, 11, "Malformed AS_PATH" },
2914 { 4, 0, "Hold timer expired" },
2915 { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
2916 { 5, 1, "Unexpected message in OpenSent state" },
2917 { 5, 2, "Unexpected message in OpenConfirm state" },
2918 { 5, 3, "Unexpected message in Established state" },
2919 { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
2920 { 6, 1, "Maximum number of prefixes reached" },
2921 { 6, 2, "Administrative shutdown" },
2922 { 6, 3, "Peer de-configured" },
2923 { 6, 4, "Administrative reset" },
2924 { 6, 5, "Connection rejected" },
2925 { 6, 6, "Other configuration change" },
2926 { 6, 7, "Connection collision resolution" },
2927 { 6, 8, "Out of Resources" },
2928 { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
2929 { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
2930 };
2931
2932 /**
2933 * bgp_error_dsc - return BGP error description
2934 * @code: BGP error code
2935 * @subcode: BGP error subcode
2936 *
2937 * bgp_error_dsc() returns error description for BGP errors
2938 * which might be static string or given temporary buffer.
2939 */
2940 const char *
2941 bgp_error_dsc(uint code, uint subcode)
2942 {
2943 static char buff[32];
2944 uint i;
2945
2946 for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
2947 if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
2948 return bgp_msg_table[i].msg;
2949
2950 bsprintf(buff, "Unknown error %u.%u", code, subcode);
2951 return buff;
2952 }
2953
2954 /* RFC 8203 - shutdown communication message */
2955 static int
2956 bgp_handle_message(struct bgp_proto *p, byte *data, uint len, byte **bp)
2957 {
2958 byte *msg = data + 1;
2959 uint msg_len = data[0];
2960 uint i;
2961
2962 /* Handle zero length message */
2963 if (msg_len == 0)
2964 return 1;
2965
2966 /* Handle proper message */
2967 if (msg_len + 1 > len)
2968 return 0;
2969
2970 /* Some elementary cleanup */
2971 for (i = 0; i < msg_len; i++)
2972 if (msg[i] < ' ')
2973 msg[i] = ' ';
2974
2975 proto_set_message(&p->p, msg, msg_len);
2976 *bp += bsprintf(*bp, ": \"%s\"", p->p.message);
2977 return 1;
2978 }
2979
2980 void
2981 bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len)
2982 {
2983 byte argbuf[256+16], *t = argbuf;
2984 uint i;
2985
2986 /* Don't report Cease messages generated by myself */
2987 if (code == 6 && class == BE_BGP_TX)
2988 return;
2989
2990 /* Reset shutdown message */
2991 if ((code == 6) && ((subcode == 2) || (subcode == 4)))
2992 proto_set_message(&p->p, NULL, 0);
2993
2994 if (len)
2995 {
2996 /* Bad peer AS - we would like to print the AS */
2997 if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
2998 {
2999 t += bsprintf(t, ": %u", (len == 2) ? get_u16(data) : get_u32(data));
3000 goto done;
3001 }
3002
3003 /* RFC 8203 - shutdown communication */
3004 if (((code == 6) && ((subcode == 2) || (subcode == 4))))
3005 if (bgp_handle_message(p, data, len, &t))
3006 goto done;
3007
3008 *t++ = ':';
3009 *t++ = ' ';
3010 if (len > 16)
3011 len = 16;
3012 for (i=0; i<len; i++)
3013 t += bsprintf(t, "%02x", data[i]);
3014 }
3015
3016 done:
3017 *t = 0;
3018 const byte *dsc = bgp_error_dsc(code, subcode);
3019 log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, dsc, argbuf);
3020 }
3021
3022 static void
3023 bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
3024 {
3025 struct bgp_proto *p = conn->bgp;
3026
3027 if (len < 21)
3028 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
3029
3030 uint code = pkt[19];
3031 uint subcode = pkt[20];
3032 int err = (code != 6);
3033
3034 bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
3035 bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
3036
3037 bgp_conn_enter_close_state(conn);
3038 bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
3039
3040 if (err)
3041 {
3042 bgp_update_startup_delay(p);
3043 bgp_stop(p, 0, NULL, 0);
3044 }
3045 else
3046 {
3047 uint subcode_bit = 1 << ((subcode <= 8) ? subcode : 0);
3048 if (p->cf->disable_after_cease & subcode_bit)
3049 {
3050 log(L_INFO "%s: Disabled after Cease notification", p->p.name);
3051 p->startup_delay = 0;
3052 p->p.disabled = 1;
3053 }
3054 }
3055 }
3056
3057 static void
3058 bgp_rx_keepalive(struct bgp_conn *conn)
3059 {
3060 struct bgp_proto *p = conn->bgp;
3061
3062 BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
3063 bgp_start_timer(conn->hold_timer, conn->hold_time);
3064
3065 if (conn->state == BS_OPENCONFIRM)
3066 { bgp_conn_enter_established_state(conn); return; }
3067
3068 if (conn->state != BS_ESTABLISHED)
3069 bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
3070 }
3071
3072
3073 /**
3074 * bgp_rx_packet - handle a received packet
3075 * @conn: BGP connection
3076 * @pkt: start of the packet
3077 * @len: packet size
3078 *
3079 * bgp_rx_packet() takes a newly received packet and calls the corresponding
3080 * packet handler according to the packet type.
3081 */
3082 static void
3083 bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len)
3084 {
3085 byte type = pkt[18];
3086
3087 DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
3088 conn->bgp->stats.rx_messages++;
3089 conn->bgp->stats.rx_bytes += len;
3090
3091 if (conn->bgp->p.mrtdump & MD_MESSAGES)
3092 bgp_dump_message(conn, pkt, len);
3093
3094 switch (type)
3095 {
3096 case PKT_OPEN: return bgp_rx_open(conn, pkt, len);
3097 case PKT_UPDATE: return bgp_rx_update(conn, pkt, len);
3098 case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len);
3099 case PKT_KEEPALIVE: return bgp_rx_keepalive(conn);
3100 case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len);
3101 default: bgp_error(conn, 1, 3, pkt+18, 1);
3102 }
3103 }
3104
3105 /**
3106 * bgp_rx - handle received data
3107 * @sk: socket
3108 * @size: amount of data received
3109 *
3110 * bgp_rx() is called by the socket layer whenever new data arrive from
3111 * the underlying TCP connection. It assembles the data fragments to packets,
3112 * checks their headers and framing and passes complete packets to
3113 * bgp_rx_packet().
3114 */
3115 int
3116 bgp_rx(sock *sk, uint size)
3117 {
3118 struct bgp_conn *conn = sk->data;
3119 byte *pkt_start = sk->rbuf;
3120 byte *end = pkt_start + size;
3121 uint i, len;
3122
3123 DBG("BGP: RX hook: Got %d bytes\n", size);
3124 while (end >= pkt_start + BGP_HEADER_LENGTH)
3125 {
3126 if ((conn->state == BS_CLOSE) || (conn->sk != sk))
3127 return 0;
3128 for(i=0; i<16; i++)
3129 if (pkt_start[i] != 0xff)
3130 {
3131 bgp_error(conn, 1, 1, NULL, 0);
3132 break;
3133 }
3134 len = get_u16(pkt_start+16);
3135 if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn)))
3136 {
3137 bgp_error(conn, 1, 2, pkt_start+16, 2);
3138 break;
3139 }
3140 if (end < pkt_start + len)
3141 break;
3142 bgp_rx_packet(conn, pkt_start, len);
3143 pkt_start += len;
3144 }
3145 if (pkt_start != sk->rbuf)
3146 {
3147 memmove(sk->rbuf, pkt_start, end - pkt_start);
3148 sk->rpos = sk->rbuf + (end - pkt_start);
3149 }
3150 return 0;
3151 }