]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/packets.c
3be48c008e7999970e14cb08356be9e9b4b03814
[thirdparty/bird.git] / proto / bgp / packets.c
1 /*
2 * BIRD -- BGP Packet Processing
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6 * (c) 2008--2016 CZ.NIC z.s.p.o.
7 *
8 * Can be freely distributed and used under the terms of the GNU GPL.
9 */
10
11 #undef LOCAL_DEBUG
12
13 #include <stdlib.h>
14
15 #include "nest/bird.h"
16 #include "nest/iface.h"
17 #include "nest/protocol.h"
18 #include "nest/route.h"
19 #include "nest/attrs.h"
20 #include "proto/mrt/mrt.h"
21 #include "conf/conf.h"
22 #include "lib/unaligned.h"
23 #include "lib/flowspec.h"
24 #include "lib/socket.h"
25
26 #include "nest/cli.h"
27
28 #include "bgp.h"
29
30
/* BGP_RR_* codes (presumably ROUTE-REFRESH message subtypes -- confirm at use sites) */
#define BGP_RR_REQUEST		0
#define BGP_RR_BEGIN		1
#define BGP_RR_END		2

/* Free buffer space the NLRI encoders require before writing one more entry */
#define BGP_NLRI_MAX		(4 + 1 + 32)

/* MPLS label handling in labeled NLRI */
#define BGP_MPLS_BOS		1		/* Bottom-of-stack bit */
#define BGP_MPLS_MAX		10		/* Max number of labels that 24*n <= 255 */
#define BGP_MPLS_NULL		3		/* Implicit NULL label */
#define BGP_MPLS_MAGIC		0x800000	/* Magic withdraw label value, RFC 3107 3 */
41
42
43 static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
44 static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
45
46 /* Table for state -> RFC 6608 FSM error subcodes */
47 static byte fsm_err_subcode[BS_MAX] = {
48 [BS_OPENSENT] = 1,
49 [BS_OPENCONFIRM] = 2,
50 [BS_ESTABLISHED] = 3
51 };
52
53
54 static struct bgp_channel *
55 bgp_get_channel(struct bgp_proto *p, u32 afi)
56 {
57 uint i;
58
59 for (i = 0; i < p->channel_count; i++)
60 if (p->afi_map[i] == afi)
61 return p->channel_map[i];
62
63 return NULL;
64 }
65
66 static inline void
67 put_af3(byte *buf, u32 id)
68 {
69 put_u16(buf, id >> 16);
70 buf[2] = id & 0xff;
71 }
72
73 static inline void
74 put_af4(byte *buf, u32 id)
75 {
76 put_u16(buf, id >> 16);
77 buf[2] = 0;
78 buf[3] = id & 0xff;
79 }
80
81 static inline u32
82 get_af3(byte *buf)
83 {
84 return (get_u16(buf) << 16) | buf[2];
85 }
86
87 static inline u32
88 get_af4(byte *buf)
89 {
90 return (get_u16(buf) << 16) | buf[3];
91 }
92
93 static void
94 init_mrt_bgp_data(struct bgp_conn *conn, struct mrt_bgp_data *d)
95 {
96 struct bgp_proto *p = conn->bgp;
97 int p_ok = conn->state >= BS_OPENCONFIRM;
98
99 memset(d, 0, sizeof(struct mrt_bgp_data));
100 d->peer_as = p->remote_as;
101 d->local_as = p->local_as;
102 d->index = (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0;
103 d->af = ipa_is_ip4(p->cf->remote_ip) ? BGP_AFI_IPV4 : BGP_AFI_IPV6;
104 d->peer_ip = conn->sk ? conn->sk->daddr : IPA_NONE;
105 d->local_ip = conn->sk ? conn->sk->saddr : IPA_NONE;
106 d->as4 = p_ok ? p->as4_session : 0;
107 }
108
109 static uint bgp_find_update_afi(byte *pos, uint len);
110
111 static int
112 bgp_estimate_add_path(struct bgp_proto *p, byte *pkt, uint len)
113 {
114 /* No need to estimate it for other messages than UPDATE */
115 if (pkt[18] != PKT_UPDATE)
116 return 0;
117
118 /* 1 -> no channel, 2 -> all channels, 3 -> some channels */
119 if (p->summary_add_path_rx < 3)
120 return p->summary_add_path_rx == 2;
121
122 uint afi = bgp_find_update_afi(pkt, len);
123 struct bgp_channel *c = bgp_get_channel(p, afi);
124 if (!c)
125 {
126 /* Either frame error (if !afi) or unknown AFI/SAFI,
127 will be reported later in regular parsing */
128 BGP_TRACE(D_PACKETS, "MRT processing noticed invalid packet");
129 return 0;
130 }
131
132 return c->add_path_rx;
133 }
134
135 static void
136 bgp_dump_message(struct bgp_conn *conn, byte *pkt, uint len)
137 {
138 struct mrt_bgp_data d;
139 init_mrt_bgp_data(conn, &d);
140
141 d.message = pkt;
142 d.msg_len = len;
143 d.add_path = bgp_estimate_add_path(conn->bgp, pkt, len);
144
145 mrt_dump_bgp_message(&d);
146 }
147
148 void
149 bgp_dump_state_change(struct bgp_conn *conn, uint old, uint new)
150 {
151 struct mrt_bgp_data d;
152 init_mrt_bgp_data(conn, &d);
153
154 d.old_state = old;
155 d.new_state = new;
156
157 mrt_dump_bgp_state_change(&d);
158 }
159
160 static byte *
161 bgp_create_notification(struct bgp_conn *conn, byte *buf)
162 {
163 struct bgp_proto *p = conn->bgp;
164
165 BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);
166 buf[0] = conn->notify_code;
167 buf[1] = conn->notify_subcode;
168 memcpy(buf+2, conn->notify_data, conn->notify_size);
169 return buf + 2 + conn->notify_size;
170 }
171
172
173 /* Capability negotiation as per RFC 5492 */
174
175 const struct bgp_af_caps *
176 bgp_find_af_caps(struct bgp_caps *caps, u32 afi)
177 {
178 struct bgp_af_caps *ac;
179
180 WALK_AF_CAPS(caps, ac)
181 if (ac->afi == afi)
182 return ac;
183
184 return NULL;
185 }
186
187 static struct bgp_af_caps *
188 bgp_get_af_caps(struct bgp_caps *caps, u32 afi)
189 {
190 struct bgp_af_caps *ac;
191
192 WALK_AF_CAPS(caps, ac)
193 if (ac->afi == afi)
194 return ac;
195
196 ac = &caps->af_data[caps->af_count++];
197 memset(ac, 0, sizeof(struct bgp_af_caps));
198 ac->afi = afi;
199
200 return ac;
201 }
202
203 static int
204 bgp_af_caps_cmp(const void *X, const void *Y)
205 {
206 const struct bgp_af_caps *x = X, *y = Y;
207 return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0;
208 }
209
210
211 static byte *
212 bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
213 {
214 struct bgp_proto *p = conn->bgp;
215 struct bgp_channel *c;
216 struct bgp_caps *caps;
217 struct bgp_af_caps *ac;
218 uint any_ext_next_hop = 0;
219 uint any_add_path = 0;
220 byte *data;
221
222 /* Prepare bgp_caps structure */
223
224 int n = list_length(&p->p.channels);
225 caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps));
226 conn->local_caps = caps;
227
228 caps->as4_support = p->cf->enable_as4;
229 caps->ext_messages = p->cf->enable_extended_messages;
230 caps->route_refresh = p->cf->enable_refresh;
231 caps->enhanced_refresh = p->cf->enable_refresh;
232
233 if (caps->as4_support)
234 caps->as4_number = p->public_as;
235
236 if (p->cf->gr_mode)
237 {
238 caps->gr_aware = 1;
239 caps->gr_time = p->cf->gr_time;
240 caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
241 }
242
243 if (p->cf->llgr_mode)
244 caps->llgr_aware = 1;
245
246 /* Allocate and fill per-AF fields */
247 WALK_LIST(c, p->p.channels)
248 {
249 ac = &caps->af_data[caps->af_count++];
250 ac->afi = c->afi;
251 ac->ready = 1;
252
253 ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop;
254 any_ext_next_hop |= ac->ext_next_hop;
255
256 ac->add_path = c->cf->add_path;
257 any_add_path |= ac->add_path;
258
259 if (c->cf->gr_able)
260 {
261 ac->gr_able = 1;
262
263 if (p->p.gr_recovery)
264 ac->gr_af_flags |= BGP_GRF_FORWARDING;
265 }
266
267 if (c->cf->llgr_able)
268 {
269 ac->llgr_able = 1;
270 ac->llgr_time = c->cf->llgr_time;
271
272 if (p->p.gr_recovery)
273 ac->llgr_flags |= BGP_LLGRF_FORWARDING;
274 }
275 }
276
277 /* Sort capability fields by AFI/SAFI */
278 qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp);
279
280
281 /* Create capability list in buffer */
282
283 /*
284 * Note that max length is ~ 22+21*af_count. With max 12 channels that is
285 * 274. Option limit is 253 and buffer size is 4096, so we cannot overflow
286 * unless we add new capabilities or more AFs. XXXXX
287 */
288
289 WALK_AF_CAPS(caps, ac)
290 if (ac->ready)
291 {
292 *buf++ = 1; /* Capability 1: Multiprotocol extensions */
293 *buf++ = 4; /* Capability data length */
294 put_af4(buf, ac->afi);
295 buf += 4;
296 }
297
298 if (caps->route_refresh)
299 {
300 *buf++ = 2; /* Capability 2: Support for route refresh */
301 *buf++ = 0; /* Capability data length */
302 }
303
304 if (any_ext_next_hop)
305 {
306 *buf++ = 5; /* Capability 5: Support for extended next hop */
307 *buf++ = 0; /* Capability data length, will be fixed later */
308 data = buf;
309
310 WALK_AF_CAPS(caps, ac)
311 if (ac->ext_next_hop)
312 {
313 put_af4(buf, ac->afi);
314 put_u16(buf+4, BGP_AFI_IPV6);
315 buf += 6;
316 }
317
318 data[-1] = buf - data;
319 }
320
321 if (caps->ext_messages)
322 {
323 *buf++ = 6; /* Capability 6: Support for extended messages */
324 *buf++ = 0; /* Capability data length */
325 }
326
327 if (caps->gr_aware)
328 {
329 *buf++ = 64; /* Capability 64: Support for graceful restart */
330 *buf++ = 0; /* Capability data length, will be fixed later */
331 data = buf;
332
333 put_u16(buf, caps->gr_time);
334 buf[0] |= caps->gr_flags;
335 buf += 2;
336
337 WALK_AF_CAPS(caps, ac)
338 if (ac->gr_able)
339 {
340 put_af3(buf, ac->afi);
341 buf[3] = ac->gr_af_flags;
342 buf += 4;
343 }
344
345 data[-1] = buf - data;
346 }
347
348 if (caps->as4_support)
349 {
350 *buf++ = 65; /* Capability 65: Support for 4-octet AS number */
351 *buf++ = 4; /* Capability data length */
352 put_u32(buf, p->public_as);
353 buf += 4;
354 }
355
356 if (any_add_path)
357 {
358 *buf++ = 69; /* Capability 69: Support for ADD-PATH */
359 *buf++ = 0; /* Capability data length, will be fixed later */
360 data = buf;
361
362 WALK_AF_CAPS(caps, ac)
363 if (ac->add_path)
364 {
365 put_af3(buf, ac->afi);
366 buf[3] = ac->add_path;
367 buf += 4;
368 }
369
370 data[-1] = buf - data;
371 }
372
373 if (caps->enhanced_refresh)
374 {
375 *buf++ = 70; /* Capability 70: Support for enhanced route refresh */
376 *buf++ = 0; /* Capability data length */
377 }
378
379 if (caps->llgr_aware)
380 {
381 *buf++ = 71; /* Capability 71: Support for long-lived graceful restart */
382 *buf++ = 0; /* Capability data length, will be fixed later */
383 data = buf;
384
385 WALK_AF_CAPS(caps, ac)
386 if (ac->llgr_able)
387 {
388 put_af3(buf, ac->afi);
389 buf[3] = ac->llgr_flags;
390 put_u24(buf+4, ac->llgr_time);
391 buf += 7;
392 }
393
394 data[-1] = buf - data;
395 }
396
397 return buf;
398 }
399
400 static void
401 bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len)
402 {
403 struct bgp_proto *p = conn->bgp;
404 struct bgp_af_caps *ac;
405 int i, cl;
406 u32 af;
407
408 while (len > 0)
409 {
410 if (len < 2 || len < (2 + pos[1]))
411 goto err;
412
413 /* Capability length */
414 cl = pos[1];
415
416 /* Capability type */
417 switch (pos[0])
418 {
419 case 1: /* Multiprotocol capability, RFC 4760 */
420 if (cl != 4)
421 goto err;
422
423 af = get_af4(pos+2);
424 ac = bgp_get_af_caps(caps, af);
425 ac->ready = 1;
426 break;
427
428 case 2: /* Route refresh capability, RFC 2918 */
429 if (cl != 0)
430 goto err;
431
432 caps->route_refresh = 1;
433 break;
434
435 case 5: /* Extended next hop encoding capability, RFC 5549 */
436 if (cl % 6)
437 goto err;
438
439 for (i = 0; i < cl; i += 6)
440 {
441 /* Specified only for IPv4 prefixes with IPv6 next hops */
442 if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) ||
443 (get_u16(pos+2+i+4) != BGP_AFI_IPV6))
444 continue;
445
446 af = get_af4(pos+2+i);
447 ac = bgp_get_af_caps(caps, af);
448 ac->ext_next_hop = 1;
449 }
450 break;
451
452 case 6: /* Extended message length capability, RFC draft */
453 if (cl != 0)
454 goto err;
455
456 caps->ext_messages = 1;
457 break;
458
459 case 64: /* Graceful restart capability, RFC 4724 */
460 if (cl % 4 != 2)
461 goto err;
462
463 /* Only the last instance is valid */
464 WALK_AF_CAPS(caps, ac)
465 {
466 ac->gr_able = 0;
467 ac->gr_af_flags = 0;
468 }
469
470 caps->gr_aware = 1;
471 caps->gr_flags = pos[2] & 0xf0;
472 caps->gr_time = get_u16(pos + 2) & 0x0fff;
473
474 for (i = 2; i < cl; i += 4)
475 {
476 af = get_af3(pos+2+i);
477 ac = bgp_get_af_caps(caps, af);
478 ac->gr_able = 1;
479 ac->gr_af_flags = pos[2+i+3];
480 }
481 break;
482
483 case 65: /* AS4 capability, RFC 6793 */
484 if (cl != 4)
485 goto err;
486
487 caps->as4_support = 1;
488 caps->as4_number = get_u32(pos + 2);
489 break;
490
491 case 69: /* ADD-PATH capability, RFC 7911 */
492 if (cl % 4)
493 goto err;
494
495 for (i = 0; i < cl; i += 4)
496 {
497 byte val = pos[2+i+3];
498 if (!val || (val > BGP_ADD_PATH_FULL))
499 {
500 log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring",
501 p->p.name, val);
502 break;
503 }
504 }
505
506 for (i = 0; i < cl; i += 4)
507 {
508 af = get_af3(pos+2+i);
509 ac = bgp_get_af_caps(caps, af);
510 ac->add_path = pos[2+i+3];
511 }
512 break;
513
514 case 70: /* Enhanced route refresh capability, RFC 7313 */
515 if (cl != 0)
516 goto err;
517
518 caps->enhanced_refresh = 1;
519 break;
520
521 case 71: /* Long lived graceful restart capability, RFC draft */
522 if (cl % 7)
523 goto err;
524
525 /* Presumably, only the last instance is valid */
526 WALK_AF_CAPS(caps, ac)
527 {
528 ac->llgr_able = 0;
529 ac->llgr_flags = 0;
530 ac->llgr_time = 0;
531 }
532
533 caps->llgr_aware = 1;
534
535 for (i = 0; i < cl; i += 7)
536 {
537 af = get_af3(pos+2+i);
538 ac = bgp_get_af_caps(caps, af);
539 ac->llgr_able = 1;
540 ac->llgr_flags = pos[2+i+3];
541 ac->llgr_time = get_u24(pos + 2+i+4);
542 }
543 break;
544
545 /* We can safely ignore all other capabilities */
546 }
547
548 ADVANCE(pos, len, 2 + cl);
549 }
550
551 /* The LLGR capability must be advertised together with the GR capability,
552 otherwise it must be disregarded */
553 if (!caps->gr_aware && caps->llgr_aware)
554 {
555 caps->llgr_aware = 0;
556 WALK_AF_CAPS(caps, ac)
557 {
558 ac->llgr_able = 0;
559 ac->llgr_flags = 0;
560 ac->llgr_time = 0;
561 }
562 }
563
564 return;
565
566 err:
567 bgp_error(conn, 2, 0, NULL, 0);
568 return;
569 }
570
571 static int
572 bgp_read_options(struct bgp_conn *conn, byte *pos, int len)
573 {
574 struct bgp_proto *p = conn->bgp;
575 struct bgp_caps *caps;
576 int ol;
577
578 /* Max number of announced AFIs is limited by max option length (255) */
579 caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps));
580 memset(caps, 0, sizeof(struct bgp_caps));
581
582 while (len > 0)
583 {
584 if ((len < 2) || (len < (2 + pos[1])))
585 { bgp_error(conn, 2, 0, NULL, 0); return -1; }
586
587 ol = pos[1];
588 if (pos[0] == 2)
589 {
590 /* BGP capabilities, RFC 5492 */
591 if (p->cf->capabilities)
592 bgp_read_capabilities(conn, caps, pos + 2, ol);
593 }
594 else
595 {
596 /* Unknown option */
597 bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */
598 return -1;
599 }
600
601 ADVANCE(pos, len, 2 + ol);
602 }
603
604 uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps);
605 conn->remote_caps = mb_allocz(p->p.pool, n);
606 memcpy(conn->remote_caps, caps, n);
607
608 return 0;
609 }
610
611 static byte *
612 bgp_create_open(struct bgp_conn *conn, byte *buf)
613 {
614 struct bgp_proto *p = conn->bgp;
615
616 BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
617 BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id);
618
619 buf[0] = BGP_VERSION;
620 put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS);
621 put_u16(buf+3, p->cf->hold_time);
622 put_u32(buf+5, p->local_id);
623
624 if (p->cf->capabilities)
625 {
626 /* Prepare local_caps and write capabilities to buffer */
627 byte *end = bgp_write_capabilities(conn, buf+12);
628 uint len = end - (buf+12);
629
630 buf[9] = len + 2; /* Optional parameters length */
631 buf[10] = 2; /* Option 2: Capability list */
632 buf[11] = len; /* Option data length */
633
634 return end;
635 }
636 else
637 {
638 /* Prepare empty local_caps */
639 conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));
640
641 buf[9] = 0; /* No optional parameters */
642 return buf + 10;
643 }
644
645 return buf;
646 }
647
648 static void
649 bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
650 {
651 struct bgp_proto *p = conn->bgp;
652 struct bgp_conn *other;
653 u32 asn, hold, id;
654
655 /* Check state */
656 if (conn->state != BS_OPENSENT)
657 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
658
659 /* Check message contents */
660 if (len < 29 || len != 29 + (uint) pkt[28])
661 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
662
663 if (pkt[19] != BGP_VERSION)
664 { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; }
665
666 asn = get_u16(pkt+20);
667 hold = get_u16(pkt+22);
668 id = get_u32(pkt+24);
669 BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id);
670
671 if (bgp_read_options(conn, pkt+29, pkt[28]) < 0)
672 return;
673
674 if (hold > 0 && hold < 3)
675 { bgp_error(conn, 2, 6, pkt+22, 2); return; }
676
677 /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
678 if (!id || (p->is_internal && id == p->local_id))
679 { bgp_error(conn, 2, 3, pkt+24, -4); return; }
680
681 struct bgp_caps *caps = conn->remote_caps;
682
683 if (caps->as4_support)
684 {
685 u32 as4 = caps->as4_number;
686
687 if ((as4 != asn) && (asn != AS_TRANS))
688 log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
689
690 if (as4 != p->remote_as)
691 { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; }
692 }
693 else
694 {
695 if (asn != p->remote_as)
696 { bgp_error(conn, 2, 2, pkt+20, 2); return; }
697 }
698
699 /* Check the other connection */
700 other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
701 switch (other->state)
702 {
703 case BS_CONNECT:
704 case BS_ACTIVE:
705 /* Stop outgoing connection attempts */
706 bgp_conn_enter_idle_state(other);
707 break;
708
709 case BS_IDLE:
710 case BS_OPENSENT:
711 case BS_CLOSE:
712 break;
713
714 case BS_OPENCONFIRM:
715 /*
716 * Description of collision detection rules in RFC 4271 is confusing and
717 * contradictory, but it is essentially:
718 *
719 * 1. Router with higher ID is dominant
720 * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
721 * 3. When both connections are in OpenConfirm state, one initiated by
722 * the dominant router is kept.
723 *
724 * The first line in the expression below evaluates whether the neighbor
725 * is dominant, the second line whether the new connection was initiated
726 * by the neighbor. If both are true (or both are false), we keep the new
727 * connection, otherwise we keep the old one.
728 */
729 if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as)))
730 == (conn == &p->incoming_conn))
731 {
732 /* Should close the other connection */
733 BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
734 bgp_error(other, 6, 7, NULL, 0);
735 break;
736 }
737 /* Fall thru */
738 case BS_ESTABLISHED:
739 /* Should close this connection */
740 BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
741 bgp_error(conn, 6, 7, NULL, 0);
742 return;
743
744 default:
745 bug("bgp_rx_open: Unknown state");
746 }
747
748 /* Update our local variables */
749 conn->hold_time = MIN(hold, p->cf->hold_time);
750 conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
751 conn->as4_session = conn->local_caps->as4_support && caps->as4_support;
752 conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages;
753 p->remote_id = id;
754
755 DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
756 conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session);
757
758 bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
759 bgp_start_timer(conn->hold_timer, conn->hold_time);
760 bgp_conn_enter_openconfirm_state(conn);
761 }
762
763
764 /*
765 * Next hop handling
766 */
767
/*
 * Error reporting helpers for UPDATE processing. All of them expect a state
 * pointer named `s` in the calling scope; DISCARD() and WITHDRAW() also
 * return from the calling function, so they fit only functions returning void.
 */

/* Log a remote-error message prefixed with the protocol name */
#define REPORT(msg, args...)						\
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })

/* Log the message and leave the calling function */
#define DISCARD(msg, args...)						\
  ({ REPORT(msg, ## args); return; })

/* Log the message, set s->err_withdraw and leave the calling function */
#define WITHDRAW(msg, args...)						\
  ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })

/* Message texts used with the macros above */
#define BAD_AFI		"Unexpected AF <%u/%u> in UPDATE"
#define BAD_NEXT_HOP	"Invalid NEXT_HOP attribute"
#define NO_NEXT_HOP	"Missing NEXT_HOP attribute"
#define NO_LABEL_STACK	"Missing MPLS stack"
782
783 static void
784 bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
785 {
786 struct bgp_proto *p = s->proto;
787 struct bgp_channel *c = s->channel;
788
789 if (c->cf->gw_mode == GW_DIRECT)
790 {
791 neighbor *nbr = NULL;
792
793 /* GW_DIRECT -> single_hop -> p->neigh != NULL */
794 if (ipa_nonzero(gw))
795 nbr = neigh_find(&p->p, gw, NULL, 0);
796 else if (ipa_nonzero(ll))
797 nbr = neigh_find(&p->p, ll, p->neigh->iface, 0);
798
799 if (!nbr || (nbr->scope == SCOPE_HOST))
800 WITHDRAW(BAD_NEXT_HOP);
801
802 a->dest = RTD_UNICAST;
803 a->nh.gw = nbr->addr;
804 a->nh.iface = nbr->iface;
805 }
806 else /* GW_RECURSIVE */
807 {
808 if (ipa_zero(gw))
809 WITHDRAW(BAD_NEXT_HOP);
810
811 rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6;
812 s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table);
813
814 if (!s->mpls)
815 rta_apply_hostentry(a, s->hostentry, NULL);
816
817 /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */
818 }
819 }
820
821 static void
822 bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
823 {
824 if (lnum > MPLS_MAX_LABEL_STACK)
825 {
826 REPORT("Too many MPLS labels ($u)", lnum);
827
828 a->dest = RTD_UNREACHABLE;
829 a->hostentry = NULL;
830 a->nh = (struct nexthop) { };
831 return;
832 }
833
834 /* Handle implicit NULL as empty MPLS stack */
835 if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL))
836 lnum = 0;
837
838 if (s->channel->cf->gw_mode == GW_DIRECT)
839 {
840 a->nh.labels = lnum;
841 memcpy(a->nh.label, labels, 4*lnum);
842 }
843 else /* GW_RECURSIVE */
844 {
845 mpls_label_stack ms;
846
847 ms.len = lnum;
848 memcpy(ms.stack, labels, 4*lnum);
849 rta_apply_hostentry(a, s->hostentry, &ms);
850 }
851 }
852
853
854 static inline int
855 bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
856 {
857 struct bgp_proto *p = s->proto;
858 ip_addr *nh = (void *) a->u.ptr->data;
859
860 if (s->channel->cf->next_hop_self)
861 return 0;
862
863 if (s->channel->cf->next_hop_keep)
864 return 1;
865
866 /* Keep it when explicitly set in export filter */
867 if (a->type & EAF_FRESH)
868 return 1;
869
870 /* Keep it when exported to internal peers */
871 if (p->is_interior && ipa_nonzero(*nh))
872 return 1;
873
874 /* Keep it when forwarded between single-hop BGPs on the same iface */
875 struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL;
876 return p->neigh && (p->neigh->iface == ifa);
877 }
878
879 static inline int
880 bgp_use_gateway(struct bgp_export_state *s)
881 {
882 struct bgp_proto *p = s->proto;
883 rta *ra = s->route->attrs;
884
885 if (s->channel->cf->next_hop_self)
886 return 0;
887
888 /* We need one valid global gateway */
889 if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw))
890 return 0;
891
892 /* Use it when exported to internal peers */
893 if (p->is_interior)
894 return 1;
895
896 /* Use it when forwarded to single-hop BGP peer on on the same iface */
897 return p->neigh && (p->neigh->iface == ra->nh.iface);
898 }
899
900 static void
901 bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
902 {
903 if (!a || !bgp_use_next_hop(s, a))
904 {
905 if (bgp_use_gateway(s))
906 {
907 rta *ra = s->route->attrs;
908 ip_addr nh[1] = { ra->nh.gw };
909 bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
910
911 if (s->mpls)
912 {
913 u32 implicit_null = BGP_MPLS_NULL;
914 u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null;
915 uint lnum = ra->nh.labels ? ra->nh.labels : 1;
916 bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4);
917 }
918 }
919 else
920 {
921 ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
922 bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
923
924 /* TODO: Use local MPLS assigned label */
925 if (s->mpls)
926 {
927 u32 implicit_null = BGP_MPLS_NULL;
928 bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4);
929 }
930 }
931 }
932
933 /* Check if next hop is valid */
934 a = bgp_find_attr(*to, BA_NEXT_HOP);
935 if (!a)
936 WITHDRAW(NO_NEXT_HOP);
937
938 ip_addr *nh = (void *) a->u.ptr->data;
939 ip_addr peer = s->proto->cf->remote_ip;
940 uint len = a->u.ptr->length;
941
942 /* Forbid zero next hop */
943 if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1])))
944 WITHDRAW(BAD_NEXT_HOP);
945
946 /* Forbid next hop equal to neighbor IP */
947 if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
948 WITHDRAW(BAD_NEXT_HOP);
949
950 /* Forbid next hop with non-matching AF */
951 if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) &&
952 !s->channel->ext_next_hop)
953 WITHDRAW(BAD_NEXT_HOP);
954
955 /* Just check if MPLS stack */
956 if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK))
957 WITHDRAW(NO_LABEL_STACK);
958 }
959
960 static uint
961 bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
962 {
963 /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
964 ip_addr *nh = (void *) a->u.ptr->data;
965 uint len = a->u.ptr->length;
966
967 ASSERT((len == 16) || (len == 32));
968
969 /*
970 * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
971 * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference
972 * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped
973 * IPv6 address with IPv6 NLRI.
974 */
975
976 if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
977 {
978 put_ip4(buf, ipa_to_ip4(nh[0]));
979 return 4;
980 }
981
982 put_ip6(buf, ipa_to_ip6(nh[0]));
983
984 if (len == 32)
985 put_ip6(buf+16, ipa_to_ip6(nh[1]));
986
987 return len;
988 }
989
990 static void
991 bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a)
992 {
993 struct bgp_channel *c = s->channel;
994 struct adata *ad = lp_alloc_adata(s->pool, 32);
995 ip_addr *nh = (void *) ad->data;
996
997 if (len == 4)
998 {
999 nh[0] = ipa_from_ip4(get_ip4(data));
1000 nh[1] = IPA_NONE;
1001 }
1002 else if (len == 16)
1003 {
1004 nh[0] = ipa_from_ip6(get_ip6(data));
1005 nh[1] = IPA_NONE;
1006
1007 if (ipa_is_link_local(nh[0]))
1008 { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1009 }
1010 else if (len == 32)
1011 {
1012 nh[0] = ipa_from_ip6(get_ip6(data));
1013 nh[1] = ipa_from_ip6(get_ip6(data+16));
1014
1015 if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1016 nh[1] = IPA_NONE;
1017 }
1018 else
1019 bgp_parse_error(s, 9);
1020
1021 if (ipa_zero(nh[1]))
1022 ad->length = 16;
1023
1024 if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1025 WITHDRAW(BAD_NEXT_HOP);
1026
1027 // XXXX validate next hop
1028
1029 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1030 bgp_apply_next_hop(s, a, nh[0], nh[1]);
1031 }
1032
1033 static uint
1034 bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
1035 {
1036 ip_addr *nh = (void *) a->u.ptr->data;
1037 uint len = a->u.ptr->length;
1038
1039 ASSERT((len == 16) || (len == 32));
1040
1041 /*
1042 * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1043 * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference
1044 * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped
1045 * IPv6 address with VPNv6 NLRI.
1046 */
1047
1048 if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1049 {
1050 put_u64(buf, 0); /* VPN RD is 0 */
1051 put_ip4(buf+8, ipa_to_ip4(nh[0]));
1052 return 12;
1053 }
1054
1055 put_u64(buf, 0); /* VPN RD is 0 */
1056 put_ip6(buf+8, ipa_to_ip6(nh[0]));
1057
1058 if (len == 16)
1059 return 24;
1060
1061 put_u64(buf+24, 0); /* VPN RD is 0 */
1062 put_ip6(buf+32, ipa_to_ip6(nh[1]));
1063
1064 return 48;
1065 }
1066
1067 static void
1068 bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1069 {
1070 struct bgp_channel *c = s->channel;
1071 struct adata *ad = lp_alloc_adata(s->pool, 32);
1072 ip_addr *nh = (void *) ad->data;
1073
1074 if (len == 12)
1075 {
1076 nh[0] = ipa_from_ip4(get_ip4(data+8));
1077 nh[1] = IPA_NONE;
1078 }
1079 else if (len == 24)
1080 {
1081 nh[0] = ipa_from_ip6(get_ip6(data+8));
1082 nh[1] = IPA_NONE;
1083
1084 if (ipa_is_link_local(nh[0]))
1085 { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1086 }
1087 else if (len == 48)
1088 {
1089 nh[0] = ipa_from_ip6(get_ip6(data+8));
1090 nh[1] = ipa_from_ip6(get_ip6(data+32));
1091
1092 if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1093 nh[1] = IPA_NONE;
1094 }
1095 else
1096 bgp_parse_error(s, 9);
1097
1098 if (ipa_zero(nh[1]))
1099 ad->length = 16;
1100
1101 /* XXXX which error */
1102 if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0)))
1103 bgp_parse_error(s, 9);
1104
1105 if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1106 WITHDRAW(BAD_NEXT_HOP);
1107
1108 // XXXX validate next hop
1109
1110 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1111 bgp_apply_next_hop(s, a, nh[0], nh[1]);
1112 }
1113
1114
1115
1116 static uint
1117 bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED)
1118 {
1119 return 0;
1120 }
1121
1122 static void
1123 bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED)
1124 {
1125 /*
1126 * Although we expect no next hop and RFC 7606 7.11 states that attribute
1127 * MP_REACH_NLRI with unexpected next hop length is considered malformed,
1128 * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt.
1129 */
1130
1131 return;
1132 }
1133
1134 static void
1135 bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to)
1136 {
1137 /* NEXT_HOP shall not pass */
1138 if (a)
1139 bgp_unset_attr(to, s->pool, BA_NEXT_HOP);
1140 }
1141
1142
1143 /*
1144 * UPDATE
1145 */
1146
1147 static void
1148 bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
1149 {
1150 if (path_id != s->last_id)
1151 {
1152 s->last_src = rt_get_source(&s->proto->p, path_id);
1153 s->last_id = path_id;
1154
1155 rta_free(s->cached_rta);
1156 s->cached_rta = NULL;
1157 }
1158
1159 if (!a0)
1160 {
1161 /* Route withdraw */
1162 rte_update2(&s->channel->c, n, NULL, s->last_src);
1163 return;
1164 }
1165
1166 /* Prepare cached route attributes */
1167 if (s->cached_rta == NULL)
1168 {
1169 a0->src = s->last_src;
1170
1171 /* Workaround for rta_lookup() breaking eattrs */
1172 ea_list *ea = a0->eattrs;
1173 s->cached_rta = rta_lookup(a0);
1174 a0->eattrs = ea;
1175 }
1176
1177 rta *a = rta_clone(s->cached_rta);
1178 rte *e = rte_get_temp(a);
1179
1180 e->pflags = 0;
1181 e->u.bgp.suppressed = 0;
1182 e->u.bgp.stale = -1;
1183 rte_update2(&s->channel->c, n, e, s->last_src);
1184 }
1185
1186 static void
1187 bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen)
1188 {
1189 u32 dummy = 0;
1190 u32 *labels = mpls ? (u32 *) mpls->data : &dummy;
1191 uint lnum = mpls ? (mpls->length / 4) : 1;
1192
1193 for (uint i = 0; i < lnum; i++)
1194 {
1195 put_u24(*pos, labels[i] << 4);
1196 ADVANCE(*pos, *size, 3);
1197 }
1198
1199 /* Add bottom-of-stack flag */
1200 (*pos)[-1] |= BGP_MPLS_BOS;
1201
1202 *pxlen += 24 * lnum;
1203 }
1204
1205 static void
1206 bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a)
1207 {
1208 u32 labels[BGP_MPLS_MAX], label;
1209 uint lnum = 0;
1210
1211 do {
1212 if (*pxlen < 24)
1213 bgp_parse_error(s, 1);
1214
1215 label = get_u24(*pos);
1216 labels[lnum++] = label >> 4;
1217 ADVANCE(*pos, *len, 3);
1218 *pxlen -= 24;
1219
1220 /* RFC 8277 2.4 - withdraw does not have variable-size MPLS stack but
1221 fixed-size 24-bit Compatibility field, which MUST be ignored */
1222 if (!a && !s->err_withdraw)
1223 return;
1224 }
1225 while (!(label & BGP_MPLS_BOS));
1226
1227 if (!a)
1228 return;
1229
1230 /* Attach MPLS attribute unless we already have one */
1231 if (!s->mpls_labels)
1232 {
1233 s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX);
1234 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels);
1235 }
1236
1237 /* Overwrite data in the attribute */
1238 s->mpls_labels->length = 4*lnum;
1239 memcpy(s->mpls_labels->data, labels, 4*lnum);
1240
1241 /* Update next hop entry in rta */
1242 bgp_apply_mpls_labels(s, a, labels, lnum);
1243
1244 /* Attributes were changed, invalidate cached entry */
1245 rta_free(s->cached_rta);
1246 s->cached_rta = NULL;
1247
1248 return;
1249 }
1250
1251 static uint
1252 bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1253 {
1254 byte *pos = buf;
1255
1256 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1257 {
1258 struct bgp_prefix *px = HEAD(buck->prefixes);
1259 struct net_addr_ip4 *net = (void *) px->net;
1260
1261 /* Encode path ID */
1262 if (s->add_path)
1263 {
1264 put_u32(pos, px->path_id);
1265 ADVANCE(pos, size, 4);
1266 }
1267
1268 /* Encode prefix length */
1269 *pos = net->pxlen;
1270 ADVANCE(pos, size, 1);
1271
1272 /* Encode MPLS labels */
1273 if (s->mpls)
1274 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1275
1276 /* Encode prefix body */
1277 ip4_addr a = ip4_hton(net->prefix);
1278 uint b = (net->pxlen + 7) / 8;
1279 memcpy(pos, &a, b);
1280 ADVANCE(pos, size, b);
1281
1282 bgp_free_prefix(s->channel, px);
1283 }
1284
1285 return pos - buf;
1286 }
1287
1288 static void
1289 bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1290 {
1291 while (len)
1292 {
1293 net_addr_ip4 net;
1294 u32 path_id = 0;
1295
1296 /* Decode path ID */
1297 if (s->add_path)
1298 {
1299 if (len < 5)
1300 bgp_parse_error(s, 1);
1301
1302 path_id = get_u32(pos);
1303 ADVANCE(pos, len, 4);
1304 }
1305
1306 /* Decode prefix length */
1307 uint l = *pos;
1308 ADVANCE(pos, len, 1);
1309
1310 if (len < ((l + 7) / 8))
1311 bgp_parse_error(s, 1);
1312
1313 /* Decode MPLS labels */
1314 if (s->mpls)
1315 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1316
1317 if (l > IP4_MAX_PREFIX_LENGTH)
1318 bgp_parse_error(s, 10);
1319
1320 /* Decode prefix body */
1321 ip4_addr addr = IP4_NONE;
1322 uint b = (l + 7) / 8;
1323 memcpy(&addr, pos, b);
1324 ADVANCE(pos, len, b);
1325
1326 net = NET_ADDR_IP4(ip4_ntoh(addr), l);
1327 net_normalize_ip4(&net);
1328
1329 // XXXX validate prefix
1330
1331 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1332 }
1333 }
1334
1335
1336 static uint
1337 bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1338 {
1339 byte *pos = buf;
1340
1341 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1342 {
1343 struct bgp_prefix *px = HEAD(buck->prefixes);
1344 struct net_addr_ip6 *net = (void *) px->net;
1345
1346 /* Encode path ID */
1347 if (s->add_path)
1348 {
1349 put_u32(pos, px->path_id);
1350 ADVANCE(pos, size, 4);
1351 }
1352
1353 /* Encode prefix length */
1354 *pos = net->pxlen;
1355 ADVANCE(pos, size, 1);
1356
1357 /* Encode MPLS labels */
1358 if (s->mpls)
1359 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1360
1361 /* Encode prefix body */
1362 ip6_addr a = ip6_hton(net->prefix);
1363 uint b = (net->pxlen + 7) / 8;
1364 memcpy(pos, &a, b);
1365 ADVANCE(pos, size, b);
1366
1367 bgp_free_prefix(s->channel, px);
1368 }
1369
1370 return pos - buf;
1371 }
1372
1373 static void
1374 bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1375 {
1376 while (len)
1377 {
1378 net_addr_ip6 net;
1379 u32 path_id = 0;
1380
1381 /* Decode path ID */
1382 if (s->add_path)
1383 {
1384 if (len < 5)
1385 bgp_parse_error(s, 1);
1386
1387 path_id = get_u32(pos);
1388 ADVANCE(pos, len, 4);
1389 }
1390
1391 /* Decode prefix length */
1392 uint l = *pos;
1393 ADVANCE(pos, len, 1);
1394
1395 if (len < ((l + 7) / 8))
1396 bgp_parse_error(s, 1);
1397
1398 /* Decode MPLS labels */
1399 if (s->mpls)
1400 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1401
1402 if (l > IP6_MAX_PREFIX_LENGTH)
1403 bgp_parse_error(s, 10);
1404
1405 /* Decode prefix body */
1406 ip6_addr addr = IP6_NONE;
1407 uint b = (l + 7) / 8;
1408 memcpy(&addr, pos, b);
1409 ADVANCE(pos, len, b);
1410
1411 net = NET_ADDR_IP6(ip6_ntoh(addr), l);
1412 net_normalize_ip6(&net);
1413
1414 // XXXX validate prefix
1415
1416 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1417 }
1418 }
1419
1420 static uint
1421 bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1422 {
1423 byte *pos = buf;
1424
1425 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1426 {
1427 struct bgp_prefix *px = HEAD(buck->prefixes);
1428 struct net_addr_vpn4 *net = (void *) px->net;
1429
1430 /* Encode path ID */
1431 if (s->add_path)
1432 {
1433 put_u32(pos, px->path_id);
1434 ADVANCE(pos, size, 4);
1435 }
1436
1437 /* Encode prefix length */
1438 *pos = 64 + net->pxlen;
1439 ADVANCE(pos, size, 1);
1440
1441 /* Encode MPLS labels */
1442 if (s->mpls)
1443 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1444
1445 /* Encode route distinguisher */
1446 put_u64(pos, net->rd);
1447 ADVANCE(pos, size, 8);
1448
1449 /* Encode prefix body */
1450 ip4_addr a = ip4_hton(net->prefix);
1451 uint b = (net->pxlen + 7) / 8;
1452 memcpy(pos, &a, b);
1453 ADVANCE(pos, size, b);
1454
1455 bgp_free_prefix(s->channel, px);
1456 }
1457
1458 return pos - buf;
1459 }
1460
1461 static void
1462 bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1463 {
1464 while (len)
1465 {
1466 net_addr_vpn4 net;
1467 u32 path_id = 0;
1468
1469 /* Decode path ID */
1470 if (s->add_path)
1471 {
1472 if (len < 5)
1473 bgp_parse_error(s, 1);
1474
1475 path_id = get_u32(pos);
1476 ADVANCE(pos, len, 4);
1477 }
1478
1479 /* Decode prefix length */
1480 uint l = *pos;
1481 ADVANCE(pos, len, 1);
1482
1483 if (len < ((l + 7) / 8))
1484 bgp_parse_error(s, 1);
1485
1486 /* Decode MPLS labels */
1487 if (s->mpls)
1488 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1489
1490 /* Decode route distinguisher */
1491 if (l < 64)
1492 bgp_parse_error(s, 1);
1493
1494 u64 rd = get_u64(pos);
1495 ADVANCE(pos, len, 8);
1496 l -= 64;
1497
1498 if (l > IP4_MAX_PREFIX_LENGTH)
1499 bgp_parse_error(s, 10);
1500
1501 /* Decode prefix body */
1502 ip4_addr addr = IP4_NONE;
1503 uint b = (l + 7) / 8;
1504 memcpy(&addr, pos, b);
1505 ADVANCE(pos, len, b);
1506
1507 net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd);
1508 net_normalize_vpn4(&net);
1509
1510 // XXXX validate prefix
1511
1512 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1513 }
1514 }
1515
1516
1517 static uint
1518 bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1519 {
1520 byte *pos = buf;
1521
1522 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1523 {
1524 struct bgp_prefix *px = HEAD(buck->prefixes);
1525 struct net_addr_vpn6 *net = (void *) px->net;
1526
1527 /* Encode path ID */
1528 if (s->add_path)
1529 {
1530 put_u32(pos, px->path_id);
1531 ADVANCE(pos, size, 4);
1532 }
1533
1534 /* Encode prefix length */
1535 *pos = 64 + net->pxlen;
1536 ADVANCE(pos, size, 1);
1537
1538 /* Encode MPLS labels */
1539 if (s->mpls)
1540 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1541
1542 /* Encode route distinguisher */
1543 put_u64(pos, net->rd);
1544 ADVANCE(pos, size, 8);
1545
1546 /* Encode prefix body */
1547 ip6_addr a = ip6_hton(net->prefix);
1548 uint b = (net->pxlen + 7) / 8;
1549 memcpy(pos, &a, b);
1550 ADVANCE(pos, size, b);
1551
1552 bgp_free_prefix(s->channel, px);
1553 }
1554
1555 return pos - buf;
1556 }
1557
1558 static void
1559 bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1560 {
1561 while (len)
1562 {
1563 net_addr_vpn6 net;
1564 u32 path_id = 0;
1565
1566 /* Decode path ID */
1567 if (s->add_path)
1568 {
1569 if (len < 5)
1570 bgp_parse_error(s, 1);
1571
1572 path_id = get_u32(pos);
1573 ADVANCE(pos, len, 4);
1574 }
1575
1576 /* Decode prefix length */
1577 uint l = *pos;
1578 ADVANCE(pos, len, 1);
1579
1580 if (len < ((l + 7) / 8))
1581 bgp_parse_error(s, 1);
1582
1583 /* Decode MPLS labels */
1584 if (s->mpls)
1585 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1586
1587 /* Decode route distinguisher */
1588 if (l < 64)
1589 bgp_parse_error(s, 1);
1590
1591 u64 rd = get_u64(pos);
1592 ADVANCE(pos, len, 8);
1593 l -= 64;
1594
1595 if (l > IP6_MAX_PREFIX_LENGTH)
1596 bgp_parse_error(s, 10);
1597
1598 /* Decode prefix body */
1599 ip6_addr addr = IP6_NONE;
1600 uint b = (l + 7) / 8;
1601 memcpy(&addr, pos, b);
1602 ADVANCE(pos, len, b);
1603
1604 net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd);
1605 net_normalize_vpn6(&net);
1606
1607 // XXXX validate prefix
1608
1609 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1610 }
1611 }
1612
1613
1614 static uint
1615 bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1616 {
1617 byte *pos = buf;
1618
1619 while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1620 {
1621 struct bgp_prefix *px = HEAD(buck->prefixes);
1622 struct net_addr_flow4 *net = (void *) px->net;
1623 uint flen = net->length - sizeof(net_addr_flow4);
1624
1625 /* Encode path ID */
1626 if (s->add_path)
1627 {
1628 put_u32(pos, px->path_id);
1629 ADVANCE(pos, size, 4);
1630 }
1631
1632 if (flen > size)
1633 break;
1634
1635 /* Copy whole flow data including length */
1636 memcpy(pos, net->data, flen);
1637 ADVANCE(pos, size, flen);
1638
1639 bgp_free_prefix(s->channel, px);
1640 }
1641
1642 return pos - buf;
1643 }
1644
1645 static void
1646 bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1647 {
1648 while (len)
1649 {
1650 u32 path_id = 0;
1651
1652 /* Decode path ID */
1653 if (s->add_path)
1654 {
1655 if (len < 4)
1656 bgp_parse_error(s, 1);
1657
1658 path_id = get_u32(pos);
1659 ADVANCE(pos, len, 4);
1660 }
1661
1662 if (len < 2)
1663 bgp_parse_error(s, 1);
1664
1665 /* Decode flow length */
1666 uint hlen = flow_hdr_length(pos);
1667 uint dlen = flow_read_length(pos);
1668 uint flen = hlen + dlen;
1669 byte *data = pos + hlen;
1670
1671 if (len < flen)
1672 bgp_parse_error(s, 1);
1673
1674 /* Validate flow data */
1675 enum flow_validated_state r = flow4_validate(data, dlen);
1676 if (r != FLOW_ST_VALID)
1677 {
1678 log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1679 bgp_parse_error(s, 1);
1680 }
1681
1682 if (data[0] != FLOW_TYPE_DST_PREFIX)
1683 {
1684 log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1685 bgp_parse_error(s, 1);
1686 }
1687
1688 /* Decode dst prefix */
1689 ip4_addr px = IP4_NONE;
1690 uint pxlen = data[1];
1691
1692 // FIXME: Use some generic function
1693 memcpy(&px, data+2, BYTES(pxlen));
1694 px = ip4_and(ip4_ntoh(px), ip4_mkmask(pxlen));
1695
1696 /* Prepare the flow */
1697 net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen);
1698 net_fill_flow4(n, px, pxlen, pos, flen);
1699 ADVANCE(pos, len, flen);
1700
1701 bgp_rte_update(s, n, path_id, a);
1702 }
1703 }
1704
1705
1706 static uint
1707 bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1708 {
1709 byte *pos = buf;
1710
1711 while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1712 {
1713 struct bgp_prefix *px = HEAD(buck->prefixes);
1714 struct net_addr_flow6 *net = (void *) px->net;
1715 uint flen = net->length - sizeof(net_addr_flow6);
1716
1717 /* Encode path ID */
1718 if (s->add_path)
1719 {
1720 put_u32(pos, px->path_id);
1721 ADVANCE(pos, size, 4);
1722 }
1723
1724 if (flen > size)
1725 break;
1726
1727 /* Copy whole flow data including length */
1728 memcpy(pos, net->data, flen);
1729 ADVANCE(pos, size, flen);
1730
1731 bgp_free_prefix(s->channel, px);
1732 }
1733
1734 return pos - buf;
1735 }
1736
1737 static void
1738 bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1739 {
1740 while (len)
1741 {
1742 u32 path_id = 0;
1743
1744 /* Decode path ID */
1745 if (s->add_path)
1746 {
1747 if (len < 4)
1748 bgp_parse_error(s, 1);
1749
1750 path_id = get_u32(pos);
1751 ADVANCE(pos, len, 4);
1752 }
1753
1754 if (len < 2)
1755 bgp_parse_error(s, 1);
1756
1757 /* Decode flow length */
1758 uint hlen = flow_hdr_length(pos);
1759 uint dlen = flow_read_length(pos);
1760 uint flen = hlen + dlen;
1761 byte *data = pos + hlen;
1762
1763 if (len < flen)
1764 bgp_parse_error(s, 1);
1765
1766 /* Validate flow data */
1767 enum flow_validated_state r = flow6_validate(data, dlen);
1768 if (r != FLOW_ST_VALID)
1769 {
1770 log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1771 bgp_parse_error(s, 1);
1772 }
1773
1774 if (data[0] != FLOW_TYPE_DST_PREFIX)
1775 {
1776 log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1777 bgp_parse_error(s, 1);
1778 }
1779
1780 /* Decode dst prefix */
1781 ip6_addr px = IP6_NONE;
1782 uint pxlen = data[1];
1783
1784 // FIXME: Use some generic function
1785 memcpy(&px, data+2, BYTES(pxlen));
1786 px = ip6_and(ip6_ntoh(px), ip6_mkmask(pxlen));
1787
1788 /* Prepare the flow */
1789 net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen);
1790 net_fill_flow6(n, px, pxlen, pos, flen);
1791 ADVANCE(pos, len, flen);
1792
1793 bgp_rte_update(s, n, path_id, a);
1794 }
1795 }
1796
1797
/*
 * Table of supported address families. Each entry binds an AFI/SAFI
 * identifier to the net type, a printable name and the hooks used to
 * encode/decode NLRI and to encode/decode/update the next hop. Entries with
 * .mpls set have MPLS labels processed by the NLRI hooks; .no_igp is set for
 * the flowspec families only. The table is searched by bgp_get_af_desc().
 */
static const struct bgp_af_desc bgp_af_table[] = {
  /* Plain IPv4 */
  {
    .afi = BGP_AF_IPV4,
    .net = NET_IP4,
    .name = "ipv4",
    .encode_nlri = bgp_encode_nlri_ip4,
    .decode_nlri = bgp_decode_nlri_ip4,
    .encode_next_hop = bgp_encode_next_hop_ip,
    .decode_next_hop = bgp_decode_next_hop_ip,
    .update_next_hop = bgp_update_next_hop_ip,
  },
  /* IPv4, "mc" variant - shares the IPv4 hooks */
  {
    .afi = BGP_AF_IPV4_MC,
    .net = NET_IP4,
    .name = "ipv4-mc",
    .encode_nlri = bgp_encode_nlri_ip4,
    .decode_nlri = bgp_decode_nlri_ip4,
    .encode_next_hop = bgp_encode_next_hop_ip,
    .decode_next_hop = bgp_decode_next_hop_ip,
    .update_next_hop = bgp_update_next_hop_ip,
  },
  /* IPv4 with MPLS labels */
  {
    .afi = BGP_AF_IPV4_MPLS,
    .net = NET_IP4,
    .mpls = 1,
    .name = "ipv4-mpls",
    .encode_nlri = bgp_encode_nlri_ip4,
    .decode_nlri = bgp_decode_nlri_ip4,
    .encode_next_hop = bgp_encode_next_hop_ip,
    .decode_next_hop = bgp_decode_next_hop_ip,
    .update_next_hop = bgp_update_next_hop_ip,
  },
  /* Plain IPv6 */
  {
    .afi = BGP_AF_IPV6,
    .net = NET_IP6,
    .name = "ipv6",
    .encode_nlri = bgp_encode_nlri_ip6,
    .decode_nlri = bgp_decode_nlri_ip6,
    .encode_next_hop = bgp_encode_next_hop_ip,
    .decode_next_hop = bgp_decode_next_hop_ip,
    .update_next_hop = bgp_update_next_hop_ip,
  },
  /* IPv6, "mc" variant - shares the IPv6 hooks */
  {
    .afi = BGP_AF_IPV6_MC,
    .net = NET_IP6,
    .name = "ipv6-mc",
    .encode_nlri = bgp_encode_nlri_ip6,
    .decode_nlri = bgp_decode_nlri_ip6,
    .encode_next_hop = bgp_encode_next_hop_ip,
    .decode_next_hop = bgp_decode_next_hop_ip,
    .update_next_hop = bgp_update_next_hop_ip,
  },
  /* IPv6 with MPLS labels */
  {
    .afi = BGP_AF_IPV6_MPLS,
    .net = NET_IP6,
    .mpls = 1,
    .name = "ipv6-mpls",
    .encode_nlri = bgp_encode_nlri_ip6,
    .decode_nlri = bgp_decode_nlri_ip6,
    .encode_next_hop = bgp_encode_next_hop_ip,
    .decode_next_hop = bgp_decode_next_hop_ip,
    .update_next_hop = bgp_update_next_hop_ip,
  },
  /* VPNv4 with MPLS labels; uses the VPN next hop encoding */
  {
    .afi = BGP_AF_VPN4_MPLS,
    .net = NET_VPN4,
    .mpls = 1,
    .name = "vpn4-mpls",
    .encode_nlri = bgp_encode_nlri_vpn4,
    .decode_nlri = bgp_decode_nlri_vpn4,
    .encode_next_hop = bgp_encode_next_hop_vpn,
    .decode_next_hop = bgp_decode_next_hop_vpn,
    .update_next_hop = bgp_update_next_hop_ip,
  },
  /* VPNv6 with MPLS labels; uses the VPN next hop encoding */
  {
    .afi = BGP_AF_VPN6_MPLS,
    .net = NET_VPN6,
    .mpls = 1,
    .name = "vpn6-mpls",
    .encode_nlri = bgp_encode_nlri_vpn6,
    .decode_nlri = bgp_decode_nlri_vpn6,
    .encode_next_hop = bgp_encode_next_hop_vpn,
    .decode_next_hop = bgp_decode_next_hop_vpn,
    .update_next_hop = bgp_update_next_hop_ip,
  },
  /* VPNv4, "mc" variant - no MPLS labels */
  {
    .afi = BGP_AF_VPN4_MC,
    .net = NET_VPN4,
    .name = "vpn4-mc",
    .encode_nlri = bgp_encode_nlri_vpn4,
    .decode_nlri = bgp_decode_nlri_vpn4,
    .encode_next_hop = bgp_encode_next_hop_vpn,
    .decode_next_hop = bgp_decode_next_hop_vpn,
    .update_next_hop = bgp_update_next_hop_ip,
  },
  /* VPNv6, "mc" variant - no MPLS labels */
  {
    .afi = BGP_AF_VPN6_MC,
    .net = NET_VPN6,
    .name = "vpn6-mc",
    .encode_nlri = bgp_encode_nlri_vpn6,
    .decode_nlri = bgp_decode_nlri_vpn6,
    .encode_next_hop = bgp_encode_next_hop_vpn,
    .decode_next_hop = bgp_decode_next_hop_vpn,
    .update_next_hop = bgp_update_next_hop_ip,
  },
  /* IPv4 flowspec; uses the "none" next hop hooks */
  {
    .afi = BGP_AF_FLOW4,
    .net = NET_FLOW4,
    .no_igp = 1,
    .name = "flow4",
    .encode_nlri = bgp_encode_nlri_flow4,
    .decode_nlri = bgp_decode_nlri_flow4,
    .encode_next_hop = bgp_encode_next_hop_none,
    .decode_next_hop = bgp_decode_next_hop_none,
    .update_next_hop = bgp_update_next_hop_none,
  },
  /* IPv6 flowspec; uses the "none" next hop hooks */
  {
    .afi = BGP_AF_FLOW6,
    .net = NET_FLOW6,
    .no_igp = 1,
    .name = "flow6",
    .encode_nlri = bgp_encode_nlri_flow6,
    .decode_nlri = bgp_decode_nlri_flow6,
    .encode_next_hop = bgp_encode_next_hop_none,
    .decode_next_hop = bgp_decode_next_hop_none,
    .update_next_hop = bgp_update_next_hop_none,
  },
};
1926
1927 const struct bgp_af_desc *
1928 bgp_get_af_desc(u32 afi)
1929 {
1930 uint i;
1931 for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++)
1932 if (bgp_af_table[i].afi == afi)
1933 return &bgp_af_table[i];
1934
1935 return NULL;
1936 }
1937
1938 static inline uint
1939 bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
1940 {
1941 return s->channel->desc->encode_nlri(s, buck, buf, end - buf);
1942 }
1943
1944 static inline uint
1945 bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf)
1946 {
1947 return s->channel->desc->encode_next_hop(s, nh, buf, 255);
1948 }
1949
1950 void
1951 bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to)
1952 {
1953 s->channel->desc->update_next_hop(s, a, to);
1954 }
1955
1956 #define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
1957
1958 static byte *
1959 bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
1960 {
1961 /*
1962 * 2 B Withdrawn Routes Length (zero)
1963 * --- IPv4 Withdrawn Routes NLRI (unused)
1964 * 2 B Total Path Attribute Length
1965 * var Path Attributes
1966 * var IPv4 Network Layer Reachability Information
1967 */
1968
1969 int lr, la;
1970
1971 la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH);
1972 if (la < 0)
1973 {
1974 /* Attribute list too long */
1975 bgp_withdraw_bucket(s->channel, buck);
1976 return NULL;
1977 }
1978
1979 put_u16(buf+0, 0);
1980 put_u16(buf+2, la);
1981
1982 lr = bgp_encode_nlri(s, buck, buf+4+la, end);
1983
1984 return buf+4+la+lr;
1985 }
1986
1987 static byte *
1988 bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
1989 {
1990 /*
1991 * 2 B IPv4 Withdrawn Routes Length (zero)
1992 * --- IPv4 Withdrawn Routes NLRI (unused)
1993 * 2 B Total Path Attribute Length
1994 * 1 B MP_REACH_NLRI hdr - Attribute Flags
1995 * 1 B MP_REACH_NLRI hdr - Attribute Type Code
1996 * 2 B MP_REACH_NLRI hdr - Length of Attribute Data
1997 * 2 B MP_REACH_NLRI data - Address Family Identifier
1998 * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier
1999 * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address
2000 * var MP_REACH_NLRI data - Network Address of Next Hop
2001 * 1 B MP_REACH_NLRI data - Reserved (zero)
2002 * var MP_REACH_NLRI data - Network Layer Reachability Information
2003 * var Rest of Path Attributes
2004 * --- IPv4 Network Layer Reachability Information (unused)
2005 */
2006
2007 int lh, lr, la; /* Lengths of next hop, NLRI and attributes */
2008
2009 /* Begin of MP_REACH_NLRI atribute */
2010 buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
2011 buf[5] = BA_MP_REACH_NLRI;
2012 put_u16(buf+6, 0); /* Will be fixed later */
2013 put_af3(buf+8, s->channel->afi);
2014 byte *pos = buf+11;
2015
2016 /* Encode attributes to temporary buffer */
2017 byte *abuf = alloca(MAX_ATTRS_LENGTH);
2018 la = bgp_encode_attrs(s, buck->eattrs, abuf, abuf + MAX_ATTRS_LENGTH);
2019 if (la < 0)
2020 {
2021 /* Attribute list too long */
2022 bgp_withdraw_bucket(s->channel, buck);
2023 return NULL;
2024 }
2025
2026 /* Encode the next hop */
2027 lh = bgp_encode_next_hop(s, s->mp_next_hop, pos+1);
2028 *pos = lh;
2029 pos += 1+lh;
2030
2031 /* Reserved field */
2032 *pos++ = 0;
2033
2034 /* Encode the NLRI */
2035 lr = bgp_encode_nlri(s, buck, pos, end - la);
2036 pos += lr;
2037
2038 /* End of MP_REACH_NLRI atribute, update data length */
2039 put_u16(buf+6, pos-buf-8);
2040
2041 /* Copy remaining attributes */
2042 memcpy(pos, abuf, la);
2043 pos += la;
2044
2045 /* Initial UPDATE fields */
2046 put_u16(buf+0, 0);
2047 put_u16(buf+2, pos-buf-4);
2048
2049 return pos;
2050 }
2051
2052 #undef MAX_ATTRS_LENGTH
2053
2054 static byte *
2055 bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2056 {
2057 /*
2058 * 2 B Withdrawn Routes Length
2059 * var IPv4 Withdrawn Routes NLRI
2060 * 2 B Total Path Attribute Length (zero)
2061 * --- Path Attributes (unused)
2062 * --- IPv4 Network Layer Reachability Information (unused)
2063 */
2064
2065 uint len = bgp_encode_nlri(s, buck, buf+2, end);
2066
2067 put_u16(buf+0, len);
2068 put_u16(buf+2+len, 0);
2069
2070 return buf+4+len;
2071 }
2072
2073 static byte *
2074 bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2075 {
2076 /*
2077 * 2 B Withdrawn Routes Length (zero)
2078 * --- IPv4 Withdrawn Routes NLRI (unused)
2079 * 2 B Total Path Attribute Length
2080 * 1 B MP_UNREACH_NLRI hdr - Attribute Flags
2081 * 1 B MP_UNREACH_NLRI hdr - Attribute Type Code
2082 * 2 B MP_UNREACH_NLRI hdr - Length of Attribute Data
2083 * 2 B MP_UNREACH_NLRI data - Address Family Identifier
2084 * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier
2085 * var MP_UNREACH_NLRI data - Network Layer Reachability Information
2086 * --- IPv4 Network Layer Reachability Information (unused)
2087 */
2088
2089 uint len = bgp_encode_nlri(s, buck, buf+11, end);
2090
2091 put_u16(buf+0, 0);
2092 put_u16(buf+2, 7+len);
2093
2094 /* Begin of MP_UNREACH_NLRI atribute */
2095 buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
2096 buf[5] = BA_MP_UNREACH_NLRI;
2097 put_u16(buf+6, 3+len);
2098 put_af3(buf+8, s->channel->afi);
2099
2100 return buf+11+len;
2101 }
2102
2103 static byte *
2104 bgp_create_update(struct bgp_channel *c, byte *buf)
2105 {
2106 struct bgp_proto *p = (void *) c->c.proto;
2107 struct bgp_bucket *buck;
2108 byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH);
2109 byte *res = NULL;
2110
2111 again: ;
2112
2113 /* Initialize write state */
2114 struct bgp_write_state s = {
2115 .proto = p,
2116 .channel = c,
2117 .pool = bgp_linpool,
2118 .mp_reach = (c->afi != BGP_AF_IPV4) || c->ext_next_hop,
2119 .as4_session = p->as4_session,
2120 .add_path = c->add_path_tx,
2121 .mpls = c->desc->mpls,
2122 };
2123
2124 /* Try unreachable bucket */
2125 if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
2126 {
2127 res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ?
2128 bgp_create_ip_unreach(&s, buck, buf, end):
2129 bgp_create_mp_unreach(&s, buck, buf, end);
2130
2131 goto done;
2132 }
2133
2134 /* Try reachable buckets */
2135 if (!EMPTY_LIST(c->bucket_queue))
2136 {
2137 buck = HEAD(c->bucket_queue);
2138
2139 /* Cleanup empty buckets */
2140 if (EMPTY_LIST(buck->prefixes))
2141 {
2142 bgp_free_bucket(c, buck);
2143 goto again;
2144 }
2145
2146 res = !s.mp_reach ?
2147 bgp_create_ip_reach(&s, buck, buf, end):
2148 bgp_create_mp_reach(&s, buck, buf, end);
2149
2150 if (EMPTY_LIST(buck->prefixes))
2151 bgp_free_bucket(c, buck);
2152 else
2153 bgp_defer_bucket(c, buck);
2154
2155 if (!res)
2156 goto again;
2157
2158 goto done;
2159 }
2160
2161 /* No more prefixes to send */
2162 return NULL;
2163
2164 done:
2165 BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
2166 lp_flush(s.pool);
2167
2168 return res;
2169 }
2170
2171 static byte *
2172 bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf)
2173 {
2174 /* Empty update packet */
2175 put_u32(buf, 0);
2176
2177 return buf+4;
2178 }
2179
2180 static byte *
2181 bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf)
2182 {
2183 put_u16(buf+0, 0);
2184 put_u16(buf+2, 6); /* length 4--9 */
2185
2186 /* Empty MP_UNREACH_NLRI atribute */
2187 buf[4] = BAF_OPTIONAL;
2188 buf[5] = BA_MP_UNREACH_NLRI;
2189 buf[6] = 3; /* Length 7--9 */
2190 put_af3(buf+7, c->afi);
2191
2192 return buf+10;
2193 }
2194
2195 static byte *
2196 bgp_create_end_mark(struct bgp_channel *c, byte *buf)
2197 {
2198 struct bgp_proto *p = (void *) c->c.proto;
2199
2200 BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");
2201
2202 return (c->afi == BGP_AF_IPV4) ?
2203 bgp_create_ip_end_mark(c, buf):
2204 bgp_create_mp_end_mark(c, buf);
2205 }
2206
2207 static inline void
2208 bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi)
2209 {
2210 struct bgp_proto *p = s->proto;
2211 struct bgp_channel *c = bgp_get_channel(p, afi);
2212
2213 BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
2214
2215 if (!c)
2216 DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2217
2218 if (c->load_state == BFS_LOADING)
2219 c->load_state = BFS_NONE;
2220
2221 if (p->p.gr_recovery)
2222 channel_graceful_restart_unlock(&c->c);
2223
2224 if (c->gr_active)
2225 bgp_graceful_restart_done(c);
2226 }
2227
2228 static inline void
2229 bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len)
2230 {
2231 struct bgp_channel *c = bgp_get_channel(s->proto, afi);
2232 rta *a = NULL;
2233
2234 if (!c)
2235 DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2236
2237 s->channel = c;
2238 s->add_path = c->add_path_rx;
2239 s->mpls = c->desc->mpls;
2240
2241 s->last_id = 0;
2242 s->last_src = s->proto->p.main_source;
2243
2244 /*
2245 * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
2246 * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
2247 * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
2248 * decode_next_hop hooks) by restoring a->eattrs afterwards.
2249 */
2250
2251 if (ea)
2252 {
2253 a = allocz(RTA_MAX_SIZE);
2254
2255 a->source = RTS_BGP;
2256 a->scope = SCOPE_UNIVERSE;
2257 a->from = s->proto->cf->remote_ip;
2258 a->eattrs = ea;
2259
2260 c->desc->decode_next_hop(s, nh, nh_len, a);
2261
2262 /* Handle withdraw during next hop decoding */
2263 if (s->err_withdraw)
2264 a = NULL;
2265 }
2266
2267 c->desc->decode_nlri(s, nlri, len, a);
2268
2269 rta_free(s->cached_rta);
2270 s->cached_rta = NULL;
2271 }
2272
/*
 * Process a received UPDATE message.
 *
 * @conn: connection the message arrived on
 * @pkt:  whole message including the fixed header
 * @len:  message length
 *
 * The message is split into IPv4 withdrawn NLRI, path attributes and IPv4
 * reachable NLRI, the attributes are decoded and every NLRI block (basic and
 * MP) is handed to bgp_decode_nlri(). End-of-RIB markers are recognized and
 * handled separately. Parse errors raised through bgp_parse_error() land in
 * the setjmp() handler below, which reports them with bgp_error().
 */
static void
bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
{
  struct bgp_proto *p = conn->bgp;
  ea_list *ea = NULL;

  BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");

  /* Workaround for some BGP implementations that skip initial KEEPALIVE */
  if (conn->state == BS_OPENCONFIRM)
    bgp_conn_enter_established_state(conn);

  /* UPDATE in any other state is an FSM error */
  if (conn->state != BS_ESTABLISHED)
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }

  bgp_start_timer(conn->hold_timer, conn->hold_time);

  /* Initialize parse state */
  struct bgp_parse_state s = {
    .proto = p,
    .pool = bgp_linpool,
    .as4_session = p->as4_session,
  };

  /* Parse error handler */
  if (setjmp(s.err_jmpbuf))
  {
    bgp_error(conn, 3, s.err_subcode, NULL, 0);
    goto done;
  }

  /* Check minimal length */
  if (len < 23)
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }

  /* Skip fixed header */
  uint pos = 19;

  /*
   *	UPDATE message format
   *
   *	2 B	IPv4 Withdrawn Routes Length
   *	var	IPv4 Withdrawn Routes NLRI
   *	2 B	Total Path Attribute Length
   *	var	Path Attributes
   *	var	IPv4 Reachable Routes NLRI
   */

  s.ip_unreach_len = get_u16(pkt + pos);
  s.ip_unreach_nlri = pkt + pos + 2;
  pos += 2 + s.ip_unreach_len;

  /* The attribute length field must still fit into the message */
  if (pos + 2 > len)
    bgp_parse_error(&s, 1);

  s.attr_len = get_u16(pkt + pos);
  s.attrs = pkt + pos + 2;
  pos += 2 + s.attr_len;

  if (pos > len)
    bgp_parse_error(&s, 1);

  /* Whatever remains is the IPv4 reachable NLRI */
  s.ip_reach_len = len - pos;
  s.ip_reach_nlri = pkt + pos;


  if (s.attr_len)
    ea = bgp_decode_attrs(&s, s.attrs, s.attr_len);
  else
    ea = NULL;

  /* Check for End-of-RIB marker */
  if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len)
  { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; }

  /* Check for MP End-of-RIB marker */
  if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len &&
      !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af)
  { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; }

  /* Withdraws are decoded without attributes */
  if (s.ip_unreach_len)
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0);

  if (s.mp_unreach_len)
    bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0);

  /* Announcements share the decoded attributes but have separate next hops */
  if (s.ip_reach_len)
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len,
		    ea, s.ip_next_hop_data, s.ip_next_hop_len);

  if (s.mp_reach_len)
    bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len,
		    ea, s.mp_next_hop_data, s.mp_next_hop_len);

done:
  /* Release the cached rta and flush the parse state's pool */
  rta_free(s.cached_rta);
  lp_flush(s.pool);
  return;
}
2372
2373 static uint
2374 bgp_find_update_afi(byte *pos, uint len)
2375 {
2376 /*
2377 * This is stripped-down version of bgp_rx_update(), bgp_decode_attrs() and
2378 * bgp_decode_mp_[un]reach_nlri() used by MRT code in order to find out which
2379 * AFI/SAFI is associated with incoming UPDATE. Returns 0 for framing errors.
2380 */
2381 if (len < 23)
2382 return 0;
2383
2384 /* Assume there is no withrawn NLRI, read lengths and move to attribute list */
2385 uint wlen = get_u16(pos + 19);
2386 uint alen = get_u16(pos + 21);
2387 ADVANCE(pos, len, 23);
2388
2389 /* Either non-zero withdrawn NLRI, non-zero reachable NLRI, or IPv4 End-of-RIB */
2390 if ((wlen != 0) || (alen < len) || !alen)
2391 return BGP_AF_IPV4;
2392
2393 if (alen > len)
2394 return 0;
2395
2396 /* Process attribute list (alen == len) */
2397 while (len)
2398 {
2399 if (len < 2)
2400 return 0;
2401
2402 uint flags = pos[0];
2403 uint code = pos[1];
2404 ADVANCE(pos, len, 2);
2405
2406 uint ll = !(flags & BAF_EXT_LEN) ? 1 : 2;
2407 if (len < ll)
2408 return 0;
2409
2410 /* Read attribute length and move to attribute body */
2411 alen = (ll == 1) ? get_u8(pos) : get_u16(pos);
2412 ADVANCE(pos, len, ll);
2413
2414 if (len < alen)
2415 return 0;
2416
2417 /* Found MP NLRI */
2418 if ((code == BA_MP_REACH_NLRI) || (code == BA_MP_UNREACH_NLRI))
2419 {
2420 if (alen < 3)
2421 return 0;
2422
2423 return BGP_AF(get_u16(pos), pos[2]);
2424 }
2425
2426 /* Move to the next attribute */
2427 ADVANCE(pos, len, alen);
2428 }
2429
2430 /* No basic or MP NLRI, but there are some attributes -> error */
2431 return 0;
2432 }
2433
2434
2435 /*
2436 * ROUTE-REFRESH
2437 */
2438
2439 static inline byte *
2440 bgp_create_route_refresh(struct bgp_channel *c, byte *buf)
2441 {
2442 struct bgp_proto *p = (void *) c->c.proto;
2443
2444 BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");
2445
2446 /* Original route refresh request, RFC 2918 */
2447 put_af4(buf, c->afi);
2448 buf[2] = BGP_RR_REQUEST;
2449
2450 return buf+4;
2451 }
2452
2453 static inline byte *
2454 bgp_create_begin_refresh(struct bgp_channel *c, byte *buf)
2455 {
2456 struct bgp_proto *p = (void *) c->c.proto;
2457
2458 BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");
2459
2460 /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
2461 put_af4(buf, c->afi);
2462 buf[2] = BGP_RR_BEGIN;
2463
2464 return buf+4;
2465 }
2466
2467 static inline byte *
2468 bgp_create_end_refresh(struct bgp_channel *c, byte *buf)
2469 {
2470 struct bgp_proto *p = (void *) c->c.proto;
2471
2472 BGP_TRACE(D_PACKETS, "Sending END-OF-RR");
2473
2474 /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
2475 put_af4(buf, c->afi);
2476 buf[2] = BGP_RR_END;
2477
2478 return buf+4;
2479 }
2480
2481 static void
2482 bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
2483 {
2484 struct bgp_proto *p = conn->bgp;
2485
2486 if (conn->state != BS_ESTABLISHED)
2487 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2488
2489 if (!conn->local_caps->route_refresh)
2490 { bgp_error(conn, 1, 3, pkt+18, 1); return; }
2491
2492 if (len < (BGP_HEADER_LENGTH + 4))
2493 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2494
2495 if (len > (BGP_HEADER_LENGTH + 4))
2496 { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
2497
2498 struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19));
2499 if (!c)
2500 {
2501 log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
2502 p->p.name, pkt[21], get_u16(pkt+19), pkt[22]);
2503 return;
2504 }
2505
2506 /* RFC 7313 redefined reserved field as RR message subtype */
2507 uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST;
2508
2509 switch (subtype)
2510 {
2511 case BGP_RR_REQUEST:
2512 BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
2513 channel_request_feeding(&c->c);
2514 break;
2515
2516 case BGP_RR_BEGIN:
2517 BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
2518 bgp_refresh_begin(c);
2519 break;
2520
2521 case BGP_RR_END:
2522 BGP_TRACE(D_PACKETS, "Got END-OF-RR");
2523 bgp_refresh_end(c);
2524 break;
2525
2526 default:
2527 log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
2528 p->p.name, subtype);
2529 break;
2530 }
2531 }
2532
2533 static inline struct bgp_channel *
2534 bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn)
2535 {
2536 uint i = conn->last_channel;
2537
2538 /* Try the last channel, but at most several times */
2539 if ((conn->channels_to_send & (1 << i)) &&
2540 (conn->last_channel_count < 16))
2541 goto found;
2542
2543 /* Find channel with non-zero channels_to_send */
2544 do
2545 {
2546 i++;
2547 if (i >= p->channel_count)
2548 i = 0;
2549 }
2550 while (! (conn->channels_to_send & (1 << i)));
2551
2552 /* Use that channel */
2553 conn->last_channel = i;
2554 conn->last_channel_count = 0;
2555
2556 found:
2557 conn->last_channel_count++;
2558 return p->channel_map[i];
2559 }
2560
2561 static inline int
2562 bgp_send(struct bgp_conn *conn, uint type, uint len)
2563 {
2564 sock *sk = conn->sk;
2565 byte *buf = sk->tbuf;
2566
2567 memset(buf, 0xff, 16); /* Marker */
2568 put_u16(buf+16, len);
2569 buf[18] = type;
2570
2571 return sk_send(sk, len);
2572 }
2573
2574 /**
2575 * bgp_fire_tx - transmit packets
2576 * @conn: connection
2577 *
2578 * Whenever the transmit buffers of the underlying TCP connection
2579 * are free and we have any packets queued for sending, the socket functions
2580 * call bgp_fire_tx() which takes care of selecting the highest priority packet
2581 * queued (Notification > Keepalive > Open > Update), assembling its header
2582 * and body and sending it to the connection.
2583 */
2584 static int
2585 bgp_fire_tx(struct bgp_conn *conn)
2586 {
2587 struct bgp_proto *p = conn->bgp;
2588 struct bgp_channel *c;
2589 byte *buf, *pkt, *end;
2590 uint s;
2591
2592 if (!conn->sk)
2593 return 0;
2594
2595 buf = conn->sk->tbuf;
2596 pkt = buf + BGP_HEADER_LENGTH;
2597 s = conn->packets_to_send;
2598
2599 if (s & (1 << PKT_SCHEDULE_CLOSE))
2600 {
2601 /* We can finally close connection and enter idle state */
2602 bgp_conn_enter_idle_state(conn);
2603 return 0;
2604 }
2605 if (s & (1 << PKT_NOTIFICATION))
2606 {
2607 conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE;
2608 end = bgp_create_notification(conn, pkt);
2609 return bgp_send(conn, PKT_NOTIFICATION, end - buf);
2610 }
2611 else if (s & (1 << PKT_KEEPALIVE))
2612 {
2613 conn->packets_to_send &= ~(1 << PKT_KEEPALIVE);
2614 BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
2615 bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
2616 return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH);
2617 }
2618 else if (s & (1 << PKT_OPEN))
2619 {
2620 conn->packets_to_send &= ~(1 << PKT_OPEN);
2621 end = bgp_create_open(conn, pkt);
2622 return bgp_send(conn, PKT_OPEN, end - buf);
2623 }
2624 else while (conn->channels_to_send)
2625 {
2626 c = bgp_get_channel_to_send(p, conn);
2627 s = c->packets_to_send;
2628
2629 if (s & (1 << PKT_ROUTE_REFRESH))
2630 {
2631 c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH);
2632 end = bgp_create_route_refresh(c, pkt);
2633 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2634 }
2635 else if (s & (1 << PKT_BEGIN_REFRESH))
2636 {
2637 /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
2638 c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH);
2639 end = bgp_create_begin_refresh(c, pkt);
2640 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2641 }
2642 else if (s & (1 << PKT_UPDATE))
2643 {
2644 end = bgp_create_update(c, pkt);
2645 if (end)
2646 return bgp_send(conn, PKT_UPDATE, end - buf);
2647
2648 /* No update to send, perhaps we need to send End-of-RIB or EoRR */
2649 c->packets_to_send = 0;
2650 conn->channels_to_send &= ~(1 << c->index);
2651
2652 if (c->feed_state == BFS_LOADED)
2653 {
2654 c->feed_state = BFS_NONE;
2655 end = bgp_create_end_mark(c, pkt);
2656 return bgp_send(conn, PKT_UPDATE, end - buf);
2657 }
2658
2659 else if (c->feed_state == BFS_REFRESHED)
2660 {
2661 c->feed_state = BFS_NONE;
2662 end = bgp_create_end_refresh(c, pkt);
2663 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2664 }
2665 }
2666 else if (s)
2667 bug("Channel packets_to_send: %x", s);
2668
2669 c->packets_to_send = 0;
2670 conn->channels_to_send &= ~(1 << c->index);
2671 }
2672
2673 return 0;
2674 }
2675
2676 /**
2677 * bgp_schedule_packet - schedule a packet for transmission
2678 * @conn: connection
2679 * @c: channel
2680 * @type: packet type
2681 *
2682 * Schedule a packet of type @type to be sent as soon as possible.
2683 */
2684 void
2685 bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
2686 {
2687 ASSERT(conn->sk);
2688
2689 DBG("BGP: Scheduling packet type %d\n", type);
2690
2691 if (c)
2692 {
2693 if (! conn->channels_to_send)
2694 {
2695 conn->last_channel = c->index;
2696 conn->last_channel_count = 0;
2697 }
2698
2699 c->packets_to_send |= 1 << type;
2700 conn->channels_to_send |= 1 << c->index;
2701 }
2702 else
2703 conn->packets_to_send |= 1 << type;
2704
2705 if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev))
2706 ev_schedule(conn->tx_ev);
2707 }
2708
/*
 * Event-style hook (takes the connection as an opaque pointer): keep sending
 * queued packets until bgp_fire_tx() reports a non-positive result.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;

  DBG("BGP: kicking TX\n");

  for (;;)
  {
    if (bgp_fire_tx(conn) <= 0)
      break;
  }
}
2718
2719 void
2720 bgp_tx(sock *sk)
2721 {
2722 struct bgp_conn *conn = sk->data;
2723
2724 DBG("BGP: TX hook\n");
2725 while (bgp_fire_tx(conn) > 0)
2726 ;
2727 }
2728
2729
2730 static struct {
2731 byte major, minor;
2732 byte *msg;
2733 } bgp_msg_table[] = {
2734 { 1, 0, "Invalid message header" },
2735 { 1, 1, "Connection not synchronized" },
2736 { 1, 2, "Bad message length" },
2737 { 1, 3, "Bad message type" },
2738 { 2, 0, "Invalid OPEN message" },
2739 { 2, 1, "Unsupported version number" },
2740 { 2, 2, "Bad peer AS" },
2741 { 2, 3, "Bad BGP identifier" },
2742 { 2, 4, "Unsupported optional parameter" },
2743 { 2, 5, "Authentication failure" },
2744 { 2, 6, "Unacceptable hold time" },
2745 { 2, 7, "Required capability missing" }, /* [RFC5492] */
2746 { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
2747 { 3, 0, "Invalid UPDATE message" },
2748 { 3, 1, "Malformed attribute list" },
2749 { 3, 2, "Unrecognized well-known attribute" },
2750 { 3, 3, "Missing mandatory attribute" },
2751 { 3, 4, "Invalid attribute flags" },
2752 { 3, 5, "Invalid attribute length" },
2753 { 3, 6, "Invalid ORIGIN attribute" },
2754 { 3, 7, "AS routing loop" }, /* Deprecated */
2755 { 3, 8, "Invalid NEXT_HOP attribute" },
2756 { 3, 9, "Optional attribute error" },
2757 { 3, 10, "Invalid network field" },
2758 { 3, 11, "Malformed AS_PATH" },
2759 { 4, 0, "Hold timer expired" },
2760 { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
2761 { 5, 1, "Unexpected message in OpenSent state" },
2762 { 5, 2, "Unexpected message in OpenConfirm state" },
2763 { 5, 3, "Unexpected message in Established state" },
2764 { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
2765 { 6, 1, "Maximum number of prefixes reached" },
2766 { 6, 2, "Administrative shutdown" },
2767 { 6, 3, "Peer de-configured" },
2768 { 6, 4, "Administrative reset" },
2769 { 6, 5, "Connection rejected" },
2770 { 6, 6, "Other configuration change" },
2771 { 6, 7, "Connection collision resolution" },
2772 { 6, 8, "Out of Resources" },
2773 { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
2774 { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
2775 };
2776
2777 /**
2778 * bgp_error_dsc - return BGP error description
2779 * @code: BGP error code
2780 * @subcode: BGP error subcode
2781 *
2782 * bgp_error_dsc() returns error description for BGP errors
2783 * which might be static string or given temporary buffer.
2784 */
2785 const char *
2786 bgp_error_dsc(uint code, uint subcode)
2787 {
2788 static char buff[32];
2789 uint i;
2790
2791 for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
2792 if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
2793 return bgp_msg_table[i].msg;
2794
2795 bsprintf(buff, "Unknown error %u.%u", code, subcode);
2796 return buff;
2797 }
2798
2799 /* RFC 8203 - shutdown communication message */
2800 static int
2801 bgp_handle_message(struct bgp_proto *p, byte *data, uint len, byte **bp)
2802 {
2803 byte *msg = data + 1;
2804 uint msg_len = data[0];
2805 uint i;
2806
2807 /* Handle zero length message */
2808 if (msg_len == 0)
2809 return 1;
2810
2811 /* Handle proper message */
2812 if ((msg_len > 128) && (msg_len + 1 > len))
2813 return 0;
2814
2815 /* Some elementary cleanup */
2816 for (i = 0; i < msg_len; i++)
2817 if (msg[i] < ' ')
2818 msg[i] = ' ';
2819
2820 proto_set_message(&p->p, msg, msg_len);
2821 *bp += bsprintf(*bp, ": \"%s\"", p->p.message);
2822 return 1;
2823 }
2824
2825 void
2826 bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len)
2827 {
2828 byte argbuf[256], *t = argbuf;
2829 uint i;
2830
2831 /* Don't report Cease messages generated by myself */
2832 if (code == 6 && class == BE_BGP_TX)
2833 return;
2834
2835 /* Reset shutdown message */
2836 if ((code == 6) && ((subcode == 2) || (subcode == 4)))
2837 proto_set_message(&p->p, NULL, 0);
2838
2839 if (len)
2840 {
2841 /* Bad peer AS - we would like to print the AS */
2842 if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
2843 {
2844 t += bsprintf(t, ": %u", (len == 2) ? get_u16(data) : get_u32(data));
2845 goto done;
2846 }
2847
2848 /* RFC 8203 - shutdown communication */
2849 if (((code == 6) && ((subcode == 2) || (subcode == 4))))
2850 if (bgp_handle_message(p, data, len, &t))
2851 goto done;
2852
2853 *t++ = ':';
2854 *t++ = ' ';
2855 if (len > 16)
2856 len = 16;
2857 for (i=0; i<len; i++)
2858 t += bsprintf(t, "%02x", data[i]);
2859 }
2860
2861 done:
2862 *t = 0;
2863 const byte *dsc = bgp_error_dsc(code, subcode);
2864 log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, dsc, argbuf);
2865 }
2866
2867 static void
2868 bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
2869 {
2870 struct bgp_proto *p = conn->bgp;
2871
2872 if (len < 21)
2873 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2874
2875 uint code = pkt[19];
2876 uint subcode = pkt[20];
2877 int err = (code != 6);
2878
2879 bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
2880 bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
2881
2882 bgp_conn_enter_close_state(conn);
2883 bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
2884
2885 if (err)
2886 {
2887 bgp_update_startup_delay(p);
2888 bgp_stop(p, 0, NULL, 0);
2889 }
2890 else
2891 {
2892 uint subcode_bit = 1 << ((subcode <= 8) ? subcode : 0);
2893 if (p->cf->disable_after_cease & subcode_bit)
2894 {
2895 log(L_INFO "%s: Disabled after Cease notification", p->p.name);
2896 p->startup_delay = 0;
2897 p->p.disabled = 1;
2898 }
2899 }
2900 }
2901
2902 static void
2903 bgp_rx_keepalive(struct bgp_conn *conn)
2904 {
2905 struct bgp_proto *p = conn->bgp;
2906
2907 BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
2908 bgp_start_timer(conn->hold_timer, conn->hold_time);
2909
2910 if (conn->state == BS_OPENCONFIRM)
2911 { bgp_conn_enter_established_state(conn); return; }
2912
2913 if (conn->state != BS_ESTABLISHED)
2914 bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
2915 }
2916
2917
2918 /**
2919 * bgp_rx_packet - handle a received packet
2920 * @conn: BGP connection
2921 * @pkt: start of the packet
2922 * @len: packet size
2923 *
2924 * bgp_rx_packet() takes a newly received packet and calls the corresponding
2925 * packet handler according to the packet type.
2926 */
2927 static void
2928 bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len)
2929 {
2930 byte type = pkt[18];
2931
2932 DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
2933
2934 if (conn->bgp->p.mrtdump & MD_MESSAGES)
2935 bgp_dump_message(conn, pkt, len);
2936
2937 switch (type)
2938 {
2939 case PKT_OPEN: return bgp_rx_open(conn, pkt, len);
2940 case PKT_UPDATE: return bgp_rx_update(conn, pkt, len);
2941 case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len);
2942 case PKT_KEEPALIVE: return bgp_rx_keepalive(conn);
2943 case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len);
2944 default: bgp_error(conn, 1, 3, pkt+18, 1);
2945 }
2946 }
2947
2948 /**
2949 * bgp_rx - handle received data
2950 * @sk: socket
2951 * @size: amount of data received
2952 *
2953 * bgp_rx() is called by the socket layer whenever new data arrive from
2954 * the underlying TCP connection. It assembles the data fragments to packets,
2955 * checks their headers and framing and passes complete packets to
2956 * bgp_rx_packet().
2957 */
2958 int
2959 bgp_rx(sock *sk, uint size)
2960 {
2961 struct bgp_conn *conn = sk->data;
2962 byte *pkt_start = sk->rbuf;
2963 byte *end = pkt_start + size;
2964 uint i, len;
2965
2966 DBG("BGP: RX hook: Got %d bytes\n", size);
2967 while (end >= pkt_start + BGP_HEADER_LENGTH)
2968 {
2969 if ((conn->state == BS_CLOSE) || (conn->sk != sk))
2970 return 0;
2971 for(i=0; i<16; i++)
2972 if (pkt_start[i] != 0xff)
2973 {
2974 bgp_error(conn, 1, 1, NULL, 0);
2975 break;
2976 }
2977 len = get_u16(pkt_start+16);
2978 if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn)))
2979 {
2980 bgp_error(conn, 1, 2, pkt_start+16, 2);
2981 break;
2982 }
2983 if (end < pkt_start + len)
2984 break;
2985 bgp_rx_packet(conn, pkt_start, len);
2986 pkt_start += len;
2987 }
2988 if (pkt_start != sk->rbuf)
2989 {
2990 memmove(sk->rbuf, pkt_start, end - pkt_start);
2991 sk->rpos = sk->rbuf + (end - pkt_start);
2992 }
2993 return 0;
2994 }