]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/packets.c
4ae6c5cf6f191f77844742a98ac33a9c6a80cd03
[thirdparty/bird.git] / proto / bgp / packets.c
1 /*
2 * BIRD -- BGP Packet Processing
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6 * (c) 2008--2016 CZ.NIC z.s.p.o.
7 *
8 * Can be freely distributed and used under the terms of the GNU GPL.
9 */
10
11 #undef LOCAL_DEBUG
12
13 #include <stdlib.h>
14
15 #include "nest/bird.h"
16 #include "nest/iface.h"
17 #include "nest/protocol.h"
18 #include "nest/route.h"
19 #include "nest/attrs.h"
20 #include "proto/mrt/mrt.h"
21 #include "conf/conf.h"
22 #include "lib/unaligned.h"
23 #include "lib/flowspec.h"
24 #include "lib/socket.h"
25
26 #include "nest/cli.h"
27
28 #include "bgp.h"
29
30
/* BGP_RR_* selector values (presumably ROUTE-REFRESH message subtypes -- confirm at the use sites) */
#define BGP_RR_REQUEST		0
#define BGP_RR_BEGIN		1
#define BGP_RR_END		2

/*
 * Free buffer space the NLRI encoders require before writing another entry:
 * 4 bytes of path ID + 1 byte of prefix length + 32 bytes (presumably labels
 * and prefix data -- confirm against the individual encoders).
 */
#define BGP_NLRI_MAX		(4 + 1 + 32)

#define BGP_MPLS_BOS		1	/* Bottom-of-stack bit */
#define BGP_MPLS_MAX		10	/* Max number of labels, largest n such that 24*n <= 255 */
#define BGP_MPLS_NULL		3	/* Implicit NULL label */
#define BGP_MPLS_MAGIC		0x800000 /* Magic withdraw label value, RFC 3107 3 */
41
42
/*
 * Token bucket filters initialized with the default log limits; presumably
 * they rate-limit log messages about received and sent UPDATEs -- confirm at
 * the use sites.
 */
static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;

/*
 * Table for state -> RFC 6608 FSM error subcodes. Indexed by connection state;
 * states without an explicit initializer map to subcode 0.
 */
static byte fsm_err_subcode[BS_MAX] = {
  [BS_OPENSENT] = 1,
  [BS_OPENCONFIRM] = 2,
  [BS_ESTABLISHED] = 3
};
52
53
54 static struct bgp_channel *
55 bgp_get_channel(struct bgp_proto *p, u32 afi)
56 {
57 uint i;
58
59 for (i = 0; i < p->channel_count; i++)
60 if (p->afi_map[i] == afi)
61 return p->channel_map[i];
62
63 return NULL;
64 }
65
66 static inline void
67 put_af3(byte *buf, u32 id)
68 {
69 put_u16(buf, id >> 16);
70 buf[2] = id & 0xff;
71 }
72
73 static inline void
74 put_af4(byte *buf, u32 id)
75 {
76 put_u16(buf, id >> 16);
77 buf[2] = 0;
78 buf[3] = id & 0xff;
79 }
80
81 static inline u32
82 get_af3(byte *buf)
83 {
84 return (get_u16(buf) << 16) | buf[2];
85 }
86
87 static inline u32
88 get_af4(byte *buf)
89 {
90 return (get_u16(buf) << 16) | buf[3];
91 }
92
93 static void
94 init_mrt_bgp_data(struct bgp_conn *conn, struct mrt_bgp_data *d)
95 {
96 struct bgp_proto *p = conn->bgp;
97 int p_ok = conn->state >= BS_OPENCONFIRM;
98
99 memset(d, 0, sizeof(struct mrt_bgp_data));
100 d->peer_as = p->remote_as;
101 d->local_as = p->local_as;
102 d->index = (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0;
103 d->af = ipa_is_ip4(p->cf->remote_ip) ? BGP_AFI_IPV4 : BGP_AFI_IPV6;
104 d->peer_ip = conn->sk ? conn->sk->daddr : IPA_NONE;
105 d->local_ip = conn->sk ? conn->sk->saddr : IPA_NONE;
106 d->as4 = p_ok ? p->as4_session : 0;
107 }
108
109 static uint bgp_find_update_afi(byte *pos, uint len);
110
111 static int
112 bgp_estimate_add_path(struct bgp_proto *p, byte *pkt, uint len)
113 {
114 /* No need to estimate it for other messages than UPDATE */
115 if (pkt[18] != PKT_UPDATE)
116 return 0;
117
118 /* 1 -> no channel, 2 -> all channels, 3 -> some channels */
119 if (p->summary_add_path_rx < 3)
120 return p->summary_add_path_rx == 2;
121
122 uint afi = bgp_find_update_afi(pkt, len);
123 struct bgp_channel *c = bgp_get_channel(p, afi);
124 if (!c)
125 {
126 /* Either frame error (if !afi) or unknown AFI/SAFI,
127 will be reported later in regular parsing */
128 BGP_TRACE(D_PACKETS, "MRT processing noticed invalid packet");
129 return 0;
130 }
131
132 return c->add_path_rx;
133 }
134
135 static void
136 bgp_dump_message(struct bgp_conn *conn, byte *pkt, uint len)
137 {
138 struct mrt_bgp_data d;
139 init_mrt_bgp_data(conn, &d);
140
141 d.message = pkt;
142 d.msg_len = len;
143 d.add_path = bgp_estimate_add_path(conn->bgp, pkt, len);
144
145 mrt_dump_bgp_message(&d);
146 }
147
148 void
149 bgp_dump_state_change(struct bgp_conn *conn, uint old, uint new)
150 {
151 struct mrt_bgp_data d;
152 init_mrt_bgp_data(conn, &d);
153
154 d.old_state = old;
155 d.new_state = new;
156
157 mrt_dump_bgp_state_change(&d);
158 }
159
160 static byte *
161 bgp_create_notification(struct bgp_conn *conn, byte *buf)
162 {
163 struct bgp_proto *p = conn->bgp;
164
165 BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);
166 buf[0] = conn->notify_code;
167 buf[1] = conn->notify_subcode;
168 memcpy(buf+2, conn->notify_data, conn->notify_size);
169 return buf + 2 + conn->notify_size;
170 }
171
172
173 /* Capability negotiation as per RFC 5492 */
174
175 const struct bgp_af_caps *
176 bgp_find_af_caps(struct bgp_caps *caps, u32 afi)
177 {
178 struct bgp_af_caps *ac;
179
180 WALK_AF_CAPS(caps, ac)
181 if (ac->afi == afi)
182 return ac;
183
184 return NULL;
185 }
186
187 static struct bgp_af_caps *
188 bgp_get_af_caps(struct bgp_caps *caps, u32 afi)
189 {
190 struct bgp_af_caps *ac;
191
192 WALK_AF_CAPS(caps, ac)
193 if (ac->afi == afi)
194 return ac;
195
196 ac = &caps->af_data[caps->af_count++];
197 memset(ac, 0, sizeof(struct bgp_af_caps));
198 ac->afi = afi;
199
200 return ac;
201 }
202
203 static int
204 bgp_af_caps_cmp(const void *X, const void *Y)
205 {
206 const struct bgp_af_caps *x = X, *y = Y;
207 return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0;
208 }
209
210
211 static byte *
212 bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
213 {
214 struct bgp_proto *p = conn->bgp;
215 struct bgp_channel *c;
216 struct bgp_caps *caps;
217 struct bgp_af_caps *ac;
218 uint any_ext_next_hop = 0;
219 uint any_add_path = 0;
220 byte *buf_head = buf;
221 byte *data;
222
223 /* Prepare bgp_caps structure */
224
225 int n = list_length(&p->p.channels);
226 caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps));
227 conn->local_caps = caps;
228
229 caps->as4_support = p->cf->enable_as4;
230 caps->ext_messages = p->cf->enable_extended_messages;
231 caps->route_refresh = p->cf->enable_refresh;
232 caps->enhanced_refresh = p->cf->enable_refresh;
233
234 if (caps->as4_support)
235 caps->as4_number = p->public_as;
236
237 if (p->cf->gr_mode)
238 {
239 caps->gr_aware = 1;
240 caps->gr_time = p->cf->gr_time;
241 caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
242 }
243
244 if (p->cf->llgr_mode)
245 caps->llgr_aware = 1;
246
247 /* Allocate and fill per-AF fields */
248 WALK_LIST(c, p->p.channels)
249 {
250 ac = &caps->af_data[caps->af_count++];
251 ac->afi = c->afi;
252 ac->ready = 1;
253
254 ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop;
255 any_ext_next_hop |= ac->ext_next_hop;
256
257 ac->add_path = c->cf->add_path;
258 any_add_path |= ac->add_path;
259
260 if (c->cf->gr_able)
261 {
262 ac->gr_able = 1;
263
264 if (p->p.gr_recovery)
265 ac->gr_af_flags |= BGP_GRF_FORWARDING;
266 }
267
268 if (c->cf->llgr_able)
269 {
270 ac->llgr_able = 1;
271 ac->llgr_time = c->cf->llgr_time;
272
273 if (p->p.gr_recovery)
274 ac->llgr_flags |= BGP_LLGRF_FORWARDING;
275 }
276 }
277
278 /* Sort capability fields by AFI/SAFI */
279 qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp);
280
281
282 /* Create capability list in buffer */
283
284 /*
285 * Note that max length is ~ 22+21*af_count. With max 12 channels that is
286 * 274. Option limit is 253 and buffer size is 4096, so we cannot overflow
287 * unless we add new capabilities or more AFs. XXXXX
288 */
289
290 WALK_AF_CAPS(caps, ac)
291 if (ac->ready)
292 {
293 *buf++ = 1; /* Capability 1: Multiprotocol extensions */
294 *buf++ = 4; /* Capability data length */
295 put_af4(buf, ac->afi);
296 buf += 4;
297 }
298
299 if (caps->route_refresh)
300 {
301 *buf++ = 2; /* Capability 2: Support for route refresh */
302 *buf++ = 0; /* Capability data length */
303 }
304
305 if (any_ext_next_hop)
306 {
307 *buf++ = 5; /* Capability 5: Support for extended next hop */
308 *buf++ = 0; /* Capability data length, will be fixed later */
309 data = buf;
310
311 WALK_AF_CAPS(caps, ac)
312 if (ac->ext_next_hop)
313 {
314 put_af4(buf, ac->afi);
315 put_u16(buf+4, BGP_AFI_IPV6);
316 buf += 6;
317 }
318
319 data[-1] = buf - data;
320 }
321
322 if (caps->ext_messages)
323 {
324 *buf++ = 6; /* Capability 6: Support for extended messages */
325 *buf++ = 0; /* Capability data length */
326 }
327
328 if (caps->gr_aware)
329 {
330 *buf++ = 64; /* Capability 64: Support for graceful restart */
331 *buf++ = 0; /* Capability data length, will be fixed later */
332 data = buf;
333
334 put_u16(buf, caps->gr_time);
335 buf[0] |= caps->gr_flags;
336 buf += 2;
337
338 WALK_AF_CAPS(caps, ac)
339 if (ac->gr_able)
340 {
341 put_af3(buf, ac->afi);
342 buf[3] = ac->gr_af_flags;
343 buf += 4;
344 }
345
346 data[-1] = buf - data;
347 }
348
349 if (caps->as4_support)
350 {
351 *buf++ = 65; /* Capability 65: Support for 4-octet AS number */
352 *buf++ = 4; /* Capability data length */
353 put_u32(buf, p->public_as);
354 buf += 4;
355 }
356
357 if (any_add_path)
358 {
359 *buf++ = 69; /* Capability 69: Support for ADD-PATH */
360 *buf++ = 0; /* Capability data length, will be fixed later */
361 data = buf;
362
363 WALK_AF_CAPS(caps, ac)
364 if (ac->add_path)
365 {
366 put_af3(buf, ac->afi);
367 buf[3] = ac->add_path;
368 buf += 4;
369 }
370
371 data[-1] = buf - data;
372 }
373
374 if (caps->enhanced_refresh)
375 {
376 *buf++ = 70; /* Capability 70: Support for enhanced route refresh */
377 *buf++ = 0; /* Capability data length */
378 }
379
380 if (caps->llgr_aware)
381 {
382 *buf++ = 71; /* Capability 71: Support for long-lived graceful restart */
383 *buf++ = 0; /* Capability data length, will be fixed later */
384 data = buf;
385
386 WALK_AF_CAPS(caps, ac)
387 if (ac->llgr_able)
388 {
389 put_af3(buf, ac->afi);
390 buf[3] = ac->llgr_flags;
391 put_u24(buf+4, ac->llgr_time);
392 buf += 7;
393 }
394
395 data[-1] = buf - data;
396 }
397
398 caps->length = buf - buf_head;
399
400 return buf;
401 }
402
403 static void
404 bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len)
405 {
406 struct bgp_proto *p = conn->bgp;
407 struct bgp_af_caps *ac;
408 int i, cl;
409 u32 af;
410
411 caps->length += len;
412
413 while (len > 0)
414 {
415 if (len < 2 || len < (2 + pos[1]))
416 goto err;
417
418 /* Capability length */
419 cl = pos[1];
420
421 /* Capability type */
422 switch (pos[0])
423 {
424 case 1: /* Multiprotocol capability, RFC 4760 */
425 if (cl != 4)
426 goto err;
427
428 af = get_af4(pos+2);
429 ac = bgp_get_af_caps(caps, af);
430 ac->ready = 1;
431 break;
432
433 case 2: /* Route refresh capability, RFC 2918 */
434 if (cl != 0)
435 goto err;
436
437 caps->route_refresh = 1;
438 break;
439
440 case 5: /* Extended next hop encoding capability, RFC 5549 */
441 if (cl % 6)
442 goto err;
443
444 for (i = 0; i < cl; i += 6)
445 {
446 /* Specified only for IPv4 prefixes with IPv6 next hops */
447 if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) ||
448 (get_u16(pos+2+i+4) != BGP_AFI_IPV6))
449 continue;
450
451 af = get_af4(pos+2+i);
452 ac = bgp_get_af_caps(caps, af);
453 ac->ext_next_hop = 1;
454 }
455 break;
456
457 case 6: /* Extended message length capability, RFC draft */
458 if (cl != 0)
459 goto err;
460
461 caps->ext_messages = 1;
462 break;
463
464 case 64: /* Graceful restart capability, RFC 4724 */
465 if (cl % 4 != 2)
466 goto err;
467
468 /* Only the last instance is valid */
469 WALK_AF_CAPS(caps, ac)
470 {
471 ac->gr_able = 0;
472 ac->gr_af_flags = 0;
473 }
474
475 caps->gr_aware = 1;
476 caps->gr_flags = pos[2] & 0xf0;
477 caps->gr_time = get_u16(pos + 2) & 0x0fff;
478
479 for (i = 2; i < cl; i += 4)
480 {
481 af = get_af3(pos+2+i);
482 ac = bgp_get_af_caps(caps, af);
483 ac->gr_able = 1;
484 ac->gr_af_flags = pos[2+i+3];
485 }
486 break;
487
488 case 65: /* AS4 capability, RFC 6793 */
489 if (cl != 4)
490 goto err;
491
492 caps->as4_support = 1;
493 caps->as4_number = get_u32(pos + 2);
494 break;
495
496 case 69: /* ADD-PATH capability, RFC 7911 */
497 if (cl % 4)
498 goto err;
499
500 for (i = 0; i < cl; i += 4)
501 {
502 byte val = pos[2+i+3];
503 if (!val || (val > BGP_ADD_PATH_FULL))
504 {
505 log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring",
506 p->p.name, val);
507 break;
508 }
509 }
510
511 for (i = 0; i < cl; i += 4)
512 {
513 af = get_af3(pos+2+i);
514 ac = bgp_get_af_caps(caps, af);
515 ac->add_path = pos[2+i+3];
516 }
517 break;
518
519 case 70: /* Enhanced route refresh capability, RFC 7313 */
520 if (cl != 0)
521 goto err;
522
523 caps->enhanced_refresh = 1;
524 break;
525
526 case 71: /* Long lived graceful restart capability, RFC draft */
527 if (cl % 7)
528 goto err;
529
530 /* Presumably, only the last instance is valid */
531 WALK_AF_CAPS(caps, ac)
532 {
533 ac->llgr_able = 0;
534 ac->llgr_flags = 0;
535 ac->llgr_time = 0;
536 }
537
538 caps->llgr_aware = 1;
539
540 for (i = 0; i < cl; i += 7)
541 {
542 af = get_af3(pos+2+i);
543 ac = bgp_get_af_caps(caps, af);
544 ac->llgr_able = 1;
545 ac->llgr_flags = pos[2+i+3];
546 ac->llgr_time = get_u24(pos + 2+i+4);
547 }
548 break;
549
550 /* We can safely ignore all other capabilities */
551 }
552
553 ADVANCE(pos, len, 2 + cl);
554 }
555
556 /* The LLGR capability must be advertised together with the GR capability,
557 otherwise it must be disregarded */
558 if (!caps->gr_aware && caps->llgr_aware)
559 {
560 caps->llgr_aware = 0;
561 WALK_AF_CAPS(caps, ac)
562 {
563 ac->llgr_able = 0;
564 ac->llgr_flags = 0;
565 ac->llgr_time = 0;
566 }
567 }
568
569 return;
570
571 err:
572 bgp_error(conn, 2, 0, NULL, 0);
573 return;
574 }
575
576 static int
577 bgp_read_options(struct bgp_conn *conn, byte *pos, int len)
578 {
579 struct bgp_proto *p = conn->bgp;
580 struct bgp_caps *caps;
581 int ol;
582
583 /* Max number of announced AFIs is limited by max option length (255) */
584 caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps));
585 memset(caps, 0, sizeof(struct bgp_caps));
586
587 while (len > 0)
588 {
589 if ((len < 2) || (len < (2 + pos[1])))
590 { bgp_error(conn, 2, 0, NULL, 0); return -1; }
591
592 ol = pos[1];
593 if (pos[0] == 2)
594 {
595 /* BGP capabilities, RFC 5492 */
596 if (p->cf->capabilities)
597 bgp_read_capabilities(conn, caps, pos + 2, ol);
598 }
599 else
600 {
601 /* Unknown option */
602 bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */
603 return -1;
604 }
605
606 ADVANCE(pos, len, 2 + ol);
607 }
608
609 uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps);
610 conn->remote_caps = mb_allocz(p->p.pool, n);
611 memcpy(conn->remote_caps, caps, n);
612
613 return 0;
614 }
615
616 static byte *
617 bgp_create_open(struct bgp_conn *conn, byte *buf)
618 {
619 struct bgp_proto *p = conn->bgp;
620
621 BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
622 BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id);
623
624 buf[0] = BGP_VERSION;
625 put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS);
626 put_u16(buf+3, p->cf->hold_time);
627 put_u32(buf+5, p->local_id);
628
629 if (p->cf->capabilities)
630 {
631 /* Prepare local_caps and write capabilities to buffer */
632 byte *end = bgp_write_capabilities(conn, buf+12);
633 uint len = end - (buf+12);
634
635 buf[9] = len + 2; /* Optional parameters length */
636 buf[10] = 2; /* Option 2: Capability list */
637 buf[11] = len; /* Option data length */
638
639 return end;
640 }
641 else
642 {
643 /* Prepare empty local_caps */
644 conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));
645
646 buf[9] = 0; /* No optional parameters */
647 return buf + 10;
648 }
649
650 return buf;
651 }
652
653 static void
654 bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
655 {
656 struct bgp_proto *p = conn->bgp;
657 struct bgp_conn *other;
658 u32 asn, hold, id;
659
660 /* Check state */
661 if (conn->state != BS_OPENSENT)
662 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
663
664 /* Check message contents */
665 if (len < 29 || len != 29 + (uint) pkt[28])
666 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
667
668 if (pkt[19] != BGP_VERSION)
669 { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; }
670
671 asn = get_u16(pkt+20);
672 hold = get_u16(pkt+22);
673 id = get_u32(pkt+24);
674 BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id);
675
676 if (bgp_read_options(conn, pkt+29, pkt[28]) < 0)
677 return;
678
679 if (hold > 0 && hold < 3)
680 { bgp_error(conn, 2, 6, pkt+22, 2); return; }
681
682 /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
683 if (!id || (p->is_internal && id == p->local_id))
684 { bgp_error(conn, 2, 3, pkt+24, -4); return; }
685
686 struct bgp_caps *caps = conn->remote_caps;
687
688 if (caps->as4_support)
689 {
690 u32 as4 = caps->as4_number;
691
692 if ((as4 != asn) && (asn != AS_TRANS))
693 log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
694
695 if (as4 != p->remote_as)
696 { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; }
697 }
698 else
699 {
700 if (asn != p->remote_as)
701 { bgp_error(conn, 2, 2, pkt+20, 2); return; }
702 }
703
704 /* Check the other connection */
705 other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
706 switch (other->state)
707 {
708 case BS_CONNECT:
709 case BS_ACTIVE:
710 /* Stop outgoing connection attempts */
711 bgp_conn_enter_idle_state(other);
712 break;
713
714 case BS_IDLE:
715 case BS_OPENSENT:
716 case BS_CLOSE:
717 break;
718
719 case BS_OPENCONFIRM:
720 /*
721 * Description of collision detection rules in RFC 4271 is confusing and
722 * contradictory, but it is essentially:
723 *
724 * 1. Router with higher ID is dominant
725 * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
726 * 3. When both connections are in OpenConfirm state, one initiated by
727 * the dominant router is kept.
728 *
729 * The first line in the expression below evaluates whether the neighbor
730 * is dominant, the second line whether the new connection was initiated
731 * by the neighbor. If both are true (or both are false), we keep the new
732 * connection, otherwise we keep the old one.
733 */
734 if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as)))
735 == (conn == &p->incoming_conn))
736 {
737 /* Should close the other connection */
738 BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
739 bgp_error(other, 6, 7, NULL, 0);
740 break;
741 }
742 /* Fall thru */
743 case BS_ESTABLISHED:
744 /* Should close this connection */
745 BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
746 bgp_error(conn, 6, 7, NULL, 0);
747 return;
748
749 default:
750 bug("bgp_rx_open: Unknown state");
751 }
752
753 /* Update our local variables */
754 conn->hold_time = MIN(hold, p->cf->hold_time);
755 conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
756 conn->as4_session = conn->local_caps->as4_support && caps->as4_support;
757 conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages;
758 p->remote_id = id;
759
760 DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
761 conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session);
762
763 bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
764 bgp_start_timer(conn->hold_timer, conn->hold_time);
765 bgp_conn_enter_openconfirm_state(conn);
766 }
767
768
769 /*
770 * Next hop handling
771 */
772
/*
 * Helpers for reporting problems found during UPDATE processing. All of them
 * expect a variable `s` with a `proto` member in scope; DISCARD and WITHDRAW
 * also return from the enclosing (void) function, and WITHDRAW additionally
 * sets s->err_withdraw.
 */
#define REPORT(msg, ...)	\
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## __VA_ARGS__); })

#define DISCARD(msg, ...)	\
  ({ REPORT(msg, ## __VA_ARGS__); return; })

#define WITHDRAW(msg, ...)	\
  ({ REPORT(msg, ## __VA_ARGS__); s->err_withdraw = 1; return; })

/* Message texts used with the macros above */
#define BAD_AFI		"Unexpected AF <%u/%u> in UPDATE"
#define BAD_NEXT_HOP	"Invalid NEXT_HOP attribute"
#define NO_NEXT_HOP	"Missing NEXT_HOP attribute"
#define NO_LABEL_STACK	"Missing MPLS stack"
786
787
788 static void
789 bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
790 {
791 struct bgp_proto *p = s->proto;
792 struct bgp_channel *c = s->channel;
793
794 if (c->cf->gw_mode == GW_DIRECT)
795 {
796 neighbor *nbr = NULL;
797
798 /* GW_DIRECT -> single_hop -> p->neigh != NULL */
799 if (ipa_nonzero(gw))
800 nbr = neigh_find(&p->p, gw, NULL, 0);
801 else if (ipa_nonzero(ll))
802 nbr = neigh_find(&p->p, ll, p->neigh->iface, 0);
803
804 if (!nbr || (nbr->scope == SCOPE_HOST))
805 WITHDRAW(BAD_NEXT_HOP);
806
807 a->dest = RTD_UNICAST;
808 a->nh.gw = nbr->addr;
809 a->nh.iface = nbr->iface;
810 }
811 else /* GW_RECURSIVE */
812 {
813 if (ipa_zero(gw))
814 WITHDRAW(BAD_NEXT_HOP);
815
816 rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6;
817 s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table);
818
819 if (!s->mpls)
820 rta_apply_hostentry(a, s->hostentry, NULL);
821
822 /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */
823 }
824 }
825
826 static void
827 bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
828 {
829 if (lnum > MPLS_MAX_LABEL_STACK)
830 {
831 REPORT("Too many MPLS labels ($u)", lnum);
832
833 a->dest = RTD_UNREACHABLE;
834 a->hostentry = NULL;
835 a->nh = (struct nexthop) { };
836 return;
837 }
838
839 /* Handle implicit NULL as empty MPLS stack */
840 if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL))
841 lnum = 0;
842
843 if (s->channel->cf->gw_mode == GW_DIRECT)
844 {
845 a->nh.labels = lnum;
846 memcpy(a->nh.label, labels, 4*lnum);
847 }
848 else /* GW_RECURSIVE */
849 {
850 mpls_label_stack ms;
851
852 ms.len = lnum;
853 memcpy(ms.stack, labels, 4*lnum);
854 rta_apply_hostentry(a, s->hostentry, &ms);
855 }
856 }
857
858
859 static int
860 bgp_match_src(struct bgp_export_state *s, int mode)
861 {
862 switch (mode)
863 {
864 case NH_NO: return 0;
865 case NH_ALL: return 1;
866 case NH_IBGP: return s->src && s->src->is_internal;
867 case NH_EBGP: return s->src && !s->src->is_internal;
868 default: return 0;
869 }
870 }
871
872 static inline int
873 bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
874 {
875 struct bgp_proto *p = s->proto;
876 struct bgp_channel *c = s->channel;
877 ip_addr *nh = (void *) a->u.ptr->data;
878
879 /* Handle next hop self option */
880 if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self))
881 return 0;
882
883 /* Handle next hop keep option */
884 if (c->cf->next_hop_keep && bgp_match_src(s, c->cf->next_hop_keep))
885 return 1;
886
887 /* Keep it when explicitly set in export filter */
888 if (a->type & EAF_FRESH)
889 return 1;
890
891 /* Check for non-matching AF */
892 if ((ipa_is_ip4(*nh) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop)
893 return 0;
894
895 /* Keep it when exported to internal peers */
896 if (p->is_interior && ipa_nonzero(*nh))
897 return 1;
898
899 /* Keep it when forwarded between single-hop BGPs on the same iface */
900 struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL;
901 return p->neigh && (p->neigh->iface == ifa);
902 }
903
904 static inline int
905 bgp_use_gateway(struct bgp_export_state *s)
906 {
907 struct bgp_proto *p = s->proto;
908 struct bgp_channel *c = s->channel;
909 rta *ra = s->route->attrs;
910
911 /* Handle next hop self option - also applies to gateway */
912 if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self))
913 return 0;
914
915 /* We need one valid global gateway */
916 if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw))
917 return 0;
918
919 /* Check for non-matching AF */
920 if ((ipa_is_ip4(ra->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop)
921 return 0;
922
923 /* Use it when exported to internal peers */
924 if (p->is_interior)
925 return 1;
926
927 /* Use it when forwarded to single-hop BGP peer on on the same iface */
928 return p->neigh && (p->neigh->iface == ra->nh.iface);
929 }
930
931 static void
932 bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
933 {
934 if (!a || !bgp_use_next_hop(s, a))
935 {
936 if (bgp_use_gateway(s))
937 {
938 rta *ra = s->route->attrs;
939 ip_addr nh[1] = { ra->nh.gw };
940 bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
941
942 if (s->mpls)
943 {
944 u32 implicit_null = BGP_MPLS_NULL;
945 u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null;
946 uint lnum = ra->nh.labels ? ra->nh.labels : 1;
947 bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4);
948 }
949 }
950 else
951 {
952 ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
953 bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
954
955 /* TODO: Use local MPLS assigned label */
956 if (s->mpls)
957 {
958 u32 implicit_null = BGP_MPLS_NULL;
959 bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4);
960 }
961 }
962 }
963
964 /* Check if next hop is valid */
965 a = bgp_find_attr(*to, BA_NEXT_HOP);
966 if (!a)
967 WITHDRAW(NO_NEXT_HOP);
968
969 ip_addr *nh = (void *) a->u.ptr->data;
970 ip_addr peer = s->proto->cf->remote_ip;
971 uint len = a->u.ptr->length;
972
973 /* Forbid zero next hop */
974 if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1])))
975 WITHDRAW(BAD_NEXT_HOP);
976
977 /* Forbid next hop equal to neighbor IP */
978 if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
979 WITHDRAW(BAD_NEXT_HOP);
980
981 /* Forbid next hop with non-matching AF */
982 if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) &&
983 !s->channel->ext_next_hop)
984 WITHDRAW(BAD_NEXT_HOP);
985
986 /* Just check if MPLS stack */
987 if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK))
988 WITHDRAW(NO_LABEL_STACK);
989 }
990
991 static uint
992 bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
993 {
994 /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
995 ip_addr *nh = (void *) a->u.ptr->data;
996 uint len = a->u.ptr->length;
997
998 ASSERT((len == 16) || (len == 32));
999
1000 /*
1001 * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1002 * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference
1003 * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped
1004 * IPv6 address with IPv6 NLRI.
1005 */
1006
1007 if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1008 {
1009 put_ip4(buf, ipa_to_ip4(nh[0]));
1010 return 4;
1011 }
1012
1013 put_ip6(buf, ipa_to_ip6(nh[0]));
1014
1015 if (len == 32)
1016 put_ip6(buf+16, ipa_to_ip6(nh[1]));
1017
1018 return len;
1019 }
1020
1021 static void
1022 bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1023 {
1024 struct bgp_channel *c = s->channel;
1025 struct adata *ad = lp_alloc_adata(s->pool, 32);
1026 ip_addr *nh = (void *) ad->data;
1027
1028 if (len == 4)
1029 {
1030 nh[0] = ipa_from_ip4(get_ip4(data));
1031 nh[1] = IPA_NONE;
1032 }
1033 else if (len == 16)
1034 {
1035 nh[0] = ipa_from_ip6(get_ip6(data));
1036 nh[1] = IPA_NONE;
1037
1038 if (ipa_is_link_local(nh[0]))
1039 { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1040 }
1041 else if (len == 32)
1042 {
1043 nh[0] = ipa_from_ip6(get_ip6(data));
1044 nh[1] = ipa_from_ip6(get_ip6(data+16));
1045
1046 if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1047 nh[1] = IPA_NONE;
1048 }
1049 else
1050 bgp_parse_error(s, 9);
1051
1052 if (ipa_zero(nh[1]))
1053 ad->length = 16;
1054
1055 if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1056 WITHDRAW(BAD_NEXT_HOP);
1057
1058 // XXXX validate next hop
1059
1060 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1061 bgp_apply_next_hop(s, a, nh[0], nh[1]);
1062 }
1063
1064 static uint
1065 bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
1066 {
1067 ip_addr *nh = (void *) a->u.ptr->data;
1068 uint len = a->u.ptr->length;
1069
1070 ASSERT((len == 16) || (len == 32));
1071
1072 /*
1073 * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1074 * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference
1075 * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped
1076 * IPv6 address with VPNv6 NLRI.
1077 */
1078
1079 if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1080 {
1081 put_u64(buf, 0); /* VPN RD is 0 */
1082 put_ip4(buf+8, ipa_to_ip4(nh[0]));
1083 return 12;
1084 }
1085
1086 put_u64(buf, 0); /* VPN RD is 0 */
1087 put_ip6(buf+8, ipa_to_ip6(nh[0]));
1088
1089 if (len == 16)
1090 return 24;
1091
1092 put_u64(buf+24, 0); /* VPN RD is 0 */
1093 put_ip6(buf+32, ipa_to_ip6(nh[1]));
1094
1095 return 48;
1096 }
1097
1098 static void
1099 bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1100 {
1101 struct bgp_channel *c = s->channel;
1102 struct adata *ad = lp_alloc_adata(s->pool, 32);
1103 ip_addr *nh = (void *) ad->data;
1104
1105 if (len == 12)
1106 {
1107 nh[0] = ipa_from_ip4(get_ip4(data+8));
1108 nh[1] = IPA_NONE;
1109 }
1110 else if (len == 24)
1111 {
1112 nh[0] = ipa_from_ip6(get_ip6(data+8));
1113 nh[1] = IPA_NONE;
1114
1115 if (ipa_is_link_local(nh[0]))
1116 { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1117 }
1118 else if (len == 48)
1119 {
1120 nh[0] = ipa_from_ip6(get_ip6(data+8));
1121 nh[1] = ipa_from_ip6(get_ip6(data+32));
1122
1123 if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1124 nh[1] = IPA_NONE;
1125 }
1126 else
1127 bgp_parse_error(s, 9);
1128
1129 if (ipa_zero(nh[1]))
1130 ad->length = 16;
1131
1132 /* XXXX which error */
1133 if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0)))
1134 bgp_parse_error(s, 9);
1135
1136 if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1137 WITHDRAW(BAD_NEXT_HOP);
1138
1139 // XXXX validate next hop
1140
1141 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1142 bgp_apply_next_hop(s, a, nh[0], nh[1]);
1143 }
1144
1145
1146
1147 static uint
1148 bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED)
1149 {
1150 return 0;
1151 }
1152
1153 static void
1154 bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED)
1155 {
1156 /*
1157 * Although we expect no next hop and RFC 7606 7.11 states that attribute
1158 * MP_REACH_NLRI with unexpected next hop length is considered malformed,
1159 * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt.
1160 */
1161
1162 return;
1163 }
1164
1165 static void
1166 bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to)
1167 {
1168 /* NEXT_HOP shall not pass */
1169 if (a)
1170 bgp_unset_attr(to, s->pool, BA_NEXT_HOP);
1171 }
1172
1173
1174 /*
1175 * UPDATE
1176 */
1177
1178 static void
1179 bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
1180 {
1181 if (path_id != s->last_id)
1182 {
1183 s->last_src = rt_get_source(&s->proto->p, path_id);
1184 s->last_id = path_id;
1185
1186 rta_free(s->cached_rta);
1187 s->cached_rta = NULL;
1188 }
1189
1190 if (!a0)
1191 {
1192 /* Route withdraw */
1193 rte_update3(&s->channel->c, n, NULL, s->last_src);
1194 return;
1195 }
1196
1197 /* Prepare cached route attributes */
1198 if (s->cached_rta == NULL)
1199 {
1200 a0->src = s->last_src;
1201
1202 /* Workaround for rta_lookup() breaking eattrs */
1203 ea_list *ea = a0->eattrs;
1204 s->cached_rta = rta_lookup(a0);
1205 a0->eattrs = ea;
1206 }
1207
1208 rta *a = rta_clone(s->cached_rta);
1209 rte *e = rte_get_temp(a);
1210
1211 e->pflags = 0;
1212 e->u.bgp.suppressed = 0;
1213 e->u.bgp.stale = -1;
1214 rte_update3(&s->channel->c, n, e, s->last_src);
1215 }
1216
1217 static void
1218 bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen)
1219 {
1220 u32 dummy = 0;
1221 u32 *labels = mpls ? (u32 *) mpls->data : &dummy;
1222 uint lnum = mpls ? (mpls->length / 4) : 1;
1223
1224 for (uint i = 0; i < lnum; i++)
1225 {
1226 put_u24(*pos, labels[i] << 4);
1227 ADVANCE(*pos, *size, 3);
1228 }
1229
1230 /* Add bottom-of-stack flag */
1231 (*pos)[-1] |= BGP_MPLS_BOS;
1232
1233 *pxlen += 24 * lnum;
1234 }
1235
1236 static void
1237 bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a)
1238 {
1239 u32 labels[BGP_MPLS_MAX], label;
1240 uint lnum = 0;
1241
1242 do {
1243 if (*pxlen < 24)
1244 bgp_parse_error(s, 1);
1245
1246 label = get_u24(*pos);
1247 labels[lnum++] = label >> 4;
1248 ADVANCE(*pos, *len, 3);
1249 *pxlen -= 24;
1250
1251 /* RFC 8277 2.4 - withdraw does not have variable-size MPLS stack but
1252 fixed-size 24-bit Compatibility field, which MUST be ignored */
1253 if (!a && !s->err_withdraw)
1254 return;
1255 }
1256 while (!(label & BGP_MPLS_BOS));
1257
1258 if (!a)
1259 return;
1260
1261 /* Attach MPLS attribute unless we already have one */
1262 if (!s->mpls_labels)
1263 {
1264 s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX);
1265 bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels);
1266 }
1267
1268 /* Overwrite data in the attribute */
1269 s->mpls_labels->length = 4*lnum;
1270 memcpy(s->mpls_labels->data, labels, 4*lnum);
1271
1272 /* Update next hop entry in rta */
1273 bgp_apply_mpls_labels(s, a, labels, lnum);
1274
1275 /* Attributes were changed, invalidate cached entry */
1276 rta_free(s->cached_rta);
1277 s->cached_rta = NULL;
1278
1279 return;
1280 }
1281
1282 static uint
1283 bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1284 {
1285 byte *pos = buf;
1286
1287 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1288 {
1289 struct bgp_prefix *px = HEAD(buck->prefixes);
1290 struct net_addr_ip4 *net = (void *) px->net;
1291
1292 /* Encode path ID */
1293 if (s->add_path)
1294 {
1295 put_u32(pos, px->path_id);
1296 ADVANCE(pos, size, 4);
1297 }
1298
1299 /* Encode prefix length */
1300 *pos = net->pxlen;
1301 ADVANCE(pos, size, 1);
1302
1303 /* Encode MPLS labels */
1304 if (s->mpls)
1305 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1306
1307 /* Encode prefix body */
1308 ip4_addr a = ip4_hton(net->prefix);
1309 uint b = (net->pxlen + 7) / 8;
1310 memcpy(pos, &a, b);
1311 ADVANCE(pos, size, b);
1312
1313 bgp_free_prefix(s->channel, px);
1314 }
1315
1316 return pos - buf;
1317 }
1318
1319 static void
1320 bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1321 {
1322 while (len)
1323 {
1324 net_addr_ip4 net;
1325 u32 path_id = 0;
1326
1327 /* Decode path ID */
1328 if (s->add_path)
1329 {
1330 if (len < 5)
1331 bgp_parse_error(s, 1);
1332
1333 path_id = get_u32(pos);
1334 ADVANCE(pos, len, 4);
1335 }
1336
1337 /* Decode prefix length */
1338 uint l = *pos;
1339 ADVANCE(pos, len, 1);
1340
1341 if (len < ((l + 7) / 8))
1342 bgp_parse_error(s, 1);
1343
1344 /* Decode MPLS labels */
1345 if (s->mpls)
1346 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1347
1348 if (l > IP4_MAX_PREFIX_LENGTH)
1349 bgp_parse_error(s, 10);
1350
1351 /* Decode prefix body */
1352 ip4_addr addr = IP4_NONE;
1353 uint b = (l + 7) / 8;
1354 memcpy(&addr, pos, b);
1355 ADVANCE(pos, len, b);
1356
1357 net = NET_ADDR_IP4(ip4_ntoh(addr), l);
1358 net_normalize_ip4(&net);
1359
1360 // XXXX validate prefix
1361
1362 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1363 }
1364 }
1365
1366
1367 static uint
1368 bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1369 {
1370 byte *pos = buf;
1371
1372 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1373 {
1374 struct bgp_prefix *px = HEAD(buck->prefixes);
1375 struct net_addr_ip6 *net = (void *) px->net;
1376
1377 /* Encode path ID */
1378 if (s->add_path)
1379 {
1380 put_u32(pos, px->path_id);
1381 ADVANCE(pos, size, 4);
1382 }
1383
1384 /* Encode prefix length */
1385 *pos = net->pxlen;
1386 ADVANCE(pos, size, 1);
1387
1388 /* Encode MPLS labels */
1389 if (s->mpls)
1390 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1391
1392 /* Encode prefix body */
1393 ip6_addr a = ip6_hton(net->prefix);
1394 uint b = (net->pxlen + 7) / 8;
1395 memcpy(pos, &a, b);
1396 ADVANCE(pos, size, b);
1397
1398 bgp_free_prefix(s->channel, px);
1399 }
1400
1401 return pos - buf;
1402 }
1403
1404 static void
1405 bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1406 {
1407 while (len)
1408 {
1409 net_addr_ip6 net;
1410 u32 path_id = 0;
1411
1412 /* Decode path ID */
1413 if (s->add_path)
1414 {
1415 if (len < 5)
1416 bgp_parse_error(s, 1);
1417
1418 path_id = get_u32(pos);
1419 ADVANCE(pos, len, 4);
1420 }
1421
1422 /* Decode prefix length */
1423 uint l = *pos;
1424 ADVANCE(pos, len, 1);
1425
1426 if (len < ((l + 7) / 8))
1427 bgp_parse_error(s, 1);
1428
1429 /* Decode MPLS labels */
1430 if (s->mpls)
1431 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1432
1433 if (l > IP6_MAX_PREFIX_LENGTH)
1434 bgp_parse_error(s, 10);
1435
1436 /* Decode prefix body */
1437 ip6_addr addr = IP6_NONE;
1438 uint b = (l + 7) / 8;
1439 memcpy(&addr, pos, b);
1440 ADVANCE(pos, len, b);
1441
1442 net = NET_ADDR_IP6(ip6_ntoh(addr), l);
1443 net_normalize_ip6(&net);
1444
1445 // XXXX validate prefix
1446
1447 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1448 }
1449 }
1450
1451 static uint
1452 bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1453 {
1454 byte *pos = buf;
1455
1456 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1457 {
1458 struct bgp_prefix *px = HEAD(buck->prefixes);
1459 struct net_addr_vpn4 *net = (void *) px->net;
1460
1461 /* Encode path ID */
1462 if (s->add_path)
1463 {
1464 put_u32(pos, px->path_id);
1465 ADVANCE(pos, size, 4);
1466 }
1467
1468 /* Encode prefix length */
1469 *pos = 64 + net->pxlen;
1470 ADVANCE(pos, size, 1);
1471
1472 /* Encode MPLS labels */
1473 if (s->mpls)
1474 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1475
1476 /* Encode route distinguisher */
1477 put_u64(pos, net->rd);
1478 ADVANCE(pos, size, 8);
1479
1480 /* Encode prefix body */
1481 ip4_addr a = ip4_hton(net->prefix);
1482 uint b = (net->pxlen + 7) / 8;
1483 memcpy(pos, &a, b);
1484 ADVANCE(pos, size, b);
1485
1486 bgp_free_prefix(s->channel, px);
1487 }
1488
1489 return pos - buf;
1490 }
1491
1492 static void
1493 bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1494 {
1495 while (len)
1496 {
1497 net_addr_vpn4 net;
1498 u32 path_id = 0;
1499
1500 /* Decode path ID */
1501 if (s->add_path)
1502 {
1503 if (len < 5)
1504 bgp_parse_error(s, 1);
1505
1506 path_id = get_u32(pos);
1507 ADVANCE(pos, len, 4);
1508 }
1509
1510 /* Decode prefix length */
1511 uint l = *pos;
1512 ADVANCE(pos, len, 1);
1513
1514 if (len < ((l + 7) / 8))
1515 bgp_parse_error(s, 1);
1516
1517 /* Decode MPLS labels */
1518 if (s->mpls)
1519 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1520
1521 /* Decode route distinguisher */
1522 if (l < 64)
1523 bgp_parse_error(s, 1);
1524
1525 u64 rd = get_u64(pos);
1526 ADVANCE(pos, len, 8);
1527 l -= 64;
1528
1529 if (l > IP4_MAX_PREFIX_LENGTH)
1530 bgp_parse_error(s, 10);
1531
1532 /* Decode prefix body */
1533 ip4_addr addr = IP4_NONE;
1534 uint b = (l + 7) / 8;
1535 memcpy(&addr, pos, b);
1536 ADVANCE(pos, len, b);
1537
1538 net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd);
1539 net_normalize_vpn4(&net);
1540
1541 // XXXX validate prefix
1542
1543 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1544 }
1545 }
1546
1547
1548 static uint
1549 bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1550 {
1551 byte *pos = buf;
1552
1553 while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1554 {
1555 struct bgp_prefix *px = HEAD(buck->prefixes);
1556 struct net_addr_vpn6 *net = (void *) px->net;
1557
1558 /* Encode path ID */
1559 if (s->add_path)
1560 {
1561 put_u32(pos, px->path_id);
1562 ADVANCE(pos, size, 4);
1563 }
1564
1565 /* Encode prefix length */
1566 *pos = 64 + net->pxlen;
1567 ADVANCE(pos, size, 1);
1568
1569 /* Encode MPLS labels */
1570 if (s->mpls)
1571 bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1572
1573 /* Encode route distinguisher */
1574 put_u64(pos, net->rd);
1575 ADVANCE(pos, size, 8);
1576
1577 /* Encode prefix body */
1578 ip6_addr a = ip6_hton(net->prefix);
1579 uint b = (net->pxlen + 7) / 8;
1580 memcpy(pos, &a, b);
1581 ADVANCE(pos, size, b);
1582
1583 bgp_free_prefix(s->channel, px);
1584 }
1585
1586 return pos - buf;
1587 }
1588
1589 static void
1590 bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1591 {
1592 while (len)
1593 {
1594 net_addr_vpn6 net;
1595 u32 path_id = 0;
1596
1597 /* Decode path ID */
1598 if (s->add_path)
1599 {
1600 if (len < 5)
1601 bgp_parse_error(s, 1);
1602
1603 path_id = get_u32(pos);
1604 ADVANCE(pos, len, 4);
1605 }
1606
1607 /* Decode prefix length */
1608 uint l = *pos;
1609 ADVANCE(pos, len, 1);
1610
1611 if (len < ((l + 7) / 8))
1612 bgp_parse_error(s, 1);
1613
1614 /* Decode MPLS labels */
1615 if (s->mpls)
1616 bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1617
1618 /* Decode route distinguisher */
1619 if (l < 64)
1620 bgp_parse_error(s, 1);
1621
1622 u64 rd = get_u64(pos);
1623 ADVANCE(pos, len, 8);
1624 l -= 64;
1625
1626 if (l > IP6_MAX_PREFIX_LENGTH)
1627 bgp_parse_error(s, 10);
1628
1629 /* Decode prefix body */
1630 ip6_addr addr = IP6_NONE;
1631 uint b = (l + 7) / 8;
1632 memcpy(&addr, pos, b);
1633 ADVANCE(pos, len, b);
1634
1635 net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd);
1636 net_normalize_vpn6(&net);
1637
1638 // XXXX validate prefix
1639
1640 bgp_rte_update(s, (net_addr *) &net, path_id, a);
1641 }
1642 }
1643
1644
1645 static uint
1646 bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1647 {
1648 byte *pos = buf;
1649
1650 while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1651 {
1652 struct bgp_prefix *px = HEAD(buck->prefixes);
1653 struct net_addr_flow4 *net = (void *) px->net;
1654 uint flen = net->length - sizeof(net_addr_flow4);
1655
1656 /* Encode path ID */
1657 if (s->add_path)
1658 {
1659 put_u32(pos, px->path_id);
1660 ADVANCE(pos, size, 4);
1661 }
1662
1663 if (flen > size)
1664 break;
1665
1666 /* Copy whole flow data including length */
1667 memcpy(pos, net->data, flen);
1668 ADVANCE(pos, size, flen);
1669
1670 bgp_free_prefix(s->channel, px);
1671 }
1672
1673 return pos - buf;
1674 }
1675
1676 static void
1677 bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1678 {
1679 while (len)
1680 {
1681 u32 path_id = 0;
1682
1683 /* Decode path ID */
1684 if (s->add_path)
1685 {
1686 if (len < 4)
1687 bgp_parse_error(s, 1);
1688
1689 path_id = get_u32(pos);
1690 ADVANCE(pos, len, 4);
1691 }
1692
1693 if (len < 2)
1694 bgp_parse_error(s, 1);
1695
1696 /* Decode flow length */
1697 uint hlen = flow_hdr_length(pos);
1698 uint dlen = flow_read_length(pos);
1699 uint flen = hlen + dlen;
1700 byte *data = pos + hlen;
1701
1702 if (len < flen)
1703 bgp_parse_error(s, 1);
1704
1705 /* Validate flow data */
1706 enum flow_validated_state r = flow4_validate(data, dlen);
1707 if (r != FLOW_ST_VALID)
1708 {
1709 log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1710 bgp_parse_error(s, 1);
1711 }
1712
1713 if (data[0] != FLOW_TYPE_DST_PREFIX)
1714 {
1715 log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1716 bgp_parse_error(s, 1);
1717 }
1718
1719 /* Decode dst prefix */
1720 ip4_addr px = IP4_NONE;
1721 uint pxlen = data[1];
1722
1723 // FIXME: Use some generic function
1724 memcpy(&px, data+2, BYTES(pxlen));
1725 px = ip4_and(ip4_ntoh(px), ip4_mkmask(pxlen));
1726
1727 /* Prepare the flow */
1728 net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen);
1729 net_fill_flow4(n, px, pxlen, pos, flen);
1730 ADVANCE(pos, len, flen);
1731
1732 bgp_rte_update(s, n, path_id, a);
1733 }
1734 }
1735
1736
1737 static uint
1738 bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1739 {
1740 byte *pos = buf;
1741
1742 while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1743 {
1744 struct bgp_prefix *px = HEAD(buck->prefixes);
1745 struct net_addr_flow6 *net = (void *) px->net;
1746 uint flen = net->length - sizeof(net_addr_flow6);
1747
1748 /* Encode path ID */
1749 if (s->add_path)
1750 {
1751 put_u32(pos, px->path_id);
1752 ADVANCE(pos, size, 4);
1753 }
1754
1755 if (flen > size)
1756 break;
1757
1758 /* Copy whole flow data including length */
1759 memcpy(pos, net->data, flen);
1760 ADVANCE(pos, size, flen);
1761
1762 bgp_free_prefix(s->channel, px);
1763 }
1764
1765 return pos - buf;
1766 }
1767
1768 static void
1769 bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1770 {
1771 while (len)
1772 {
1773 u32 path_id = 0;
1774
1775 /* Decode path ID */
1776 if (s->add_path)
1777 {
1778 if (len < 4)
1779 bgp_parse_error(s, 1);
1780
1781 path_id = get_u32(pos);
1782 ADVANCE(pos, len, 4);
1783 }
1784
1785 if (len < 2)
1786 bgp_parse_error(s, 1);
1787
1788 /* Decode flow length */
1789 uint hlen = flow_hdr_length(pos);
1790 uint dlen = flow_read_length(pos);
1791 uint flen = hlen + dlen;
1792 byte *data = pos + hlen;
1793
1794 if (len < flen)
1795 bgp_parse_error(s, 1);
1796
1797 /* Validate flow data */
1798 enum flow_validated_state r = flow6_validate(data, dlen);
1799 if (r != FLOW_ST_VALID)
1800 {
1801 log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1802 bgp_parse_error(s, 1);
1803 }
1804
1805 if (data[0] != FLOW_TYPE_DST_PREFIX)
1806 {
1807 log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1808 bgp_parse_error(s, 1);
1809 }
1810
1811 /* Decode dst prefix */
1812 ip6_addr px = IP6_NONE;
1813 uint pxlen = data[1];
1814
1815 // FIXME: Use some generic function
1816 memcpy(&px, data+2, BYTES(pxlen));
1817 px = ip6_and(ip6_ntoh(px), ip6_mkmask(pxlen));
1818
1819 /* Prepare the flow */
1820 net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen);
1821 net_fill_flow6(n, px, pxlen, pos, flen);
1822 ADVANCE(pos, len, flen);
1823
1824 bgp_rte_update(s, n, path_id, a);
1825 }
1826 }
1827
1828
1829 static const struct bgp_af_desc bgp_af_table[] = {
1830 {
1831 .afi = BGP_AF_IPV4,
1832 .net = NET_IP4,
1833 .name = "ipv4",
1834 .encode_nlri = bgp_encode_nlri_ip4,
1835 .decode_nlri = bgp_decode_nlri_ip4,
1836 .encode_next_hop = bgp_encode_next_hop_ip,
1837 .decode_next_hop = bgp_decode_next_hop_ip,
1838 .update_next_hop = bgp_update_next_hop_ip,
1839 },
1840 {
1841 .afi = BGP_AF_IPV4_MC,
1842 .net = NET_IP4,
1843 .name = "ipv4-mc",
1844 .encode_nlri = bgp_encode_nlri_ip4,
1845 .decode_nlri = bgp_decode_nlri_ip4,
1846 .encode_next_hop = bgp_encode_next_hop_ip,
1847 .decode_next_hop = bgp_decode_next_hop_ip,
1848 .update_next_hop = bgp_update_next_hop_ip,
1849 },
1850 {
1851 .afi = BGP_AF_IPV4_MPLS,
1852 .net = NET_IP4,
1853 .mpls = 1,
1854 .name = "ipv4-mpls",
1855 .encode_nlri = bgp_encode_nlri_ip4,
1856 .decode_nlri = bgp_decode_nlri_ip4,
1857 .encode_next_hop = bgp_encode_next_hop_ip,
1858 .decode_next_hop = bgp_decode_next_hop_ip,
1859 .update_next_hop = bgp_update_next_hop_ip,
1860 },
1861 {
1862 .afi = BGP_AF_IPV6,
1863 .net = NET_IP6,
1864 .name = "ipv6",
1865 .encode_nlri = bgp_encode_nlri_ip6,
1866 .decode_nlri = bgp_decode_nlri_ip6,
1867 .encode_next_hop = bgp_encode_next_hop_ip,
1868 .decode_next_hop = bgp_decode_next_hop_ip,
1869 .update_next_hop = bgp_update_next_hop_ip,
1870 },
1871 {
1872 .afi = BGP_AF_IPV6_MC,
1873 .net = NET_IP6,
1874 .name = "ipv6-mc",
1875 .encode_nlri = bgp_encode_nlri_ip6,
1876 .decode_nlri = bgp_decode_nlri_ip6,
1877 .encode_next_hop = bgp_encode_next_hop_ip,
1878 .decode_next_hop = bgp_decode_next_hop_ip,
1879 .update_next_hop = bgp_update_next_hop_ip,
1880 },
1881 {
1882 .afi = BGP_AF_IPV6_MPLS,
1883 .net = NET_IP6,
1884 .mpls = 1,
1885 .name = "ipv6-mpls",
1886 .encode_nlri = bgp_encode_nlri_ip6,
1887 .decode_nlri = bgp_decode_nlri_ip6,
1888 .encode_next_hop = bgp_encode_next_hop_ip,
1889 .decode_next_hop = bgp_decode_next_hop_ip,
1890 .update_next_hop = bgp_update_next_hop_ip,
1891 },
1892 {
1893 .afi = BGP_AF_VPN4_MPLS,
1894 .net = NET_VPN4,
1895 .mpls = 1,
1896 .name = "vpn4-mpls",
1897 .encode_nlri = bgp_encode_nlri_vpn4,
1898 .decode_nlri = bgp_decode_nlri_vpn4,
1899 .encode_next_hop = bgp_encode_next_hop_vpn,
1900 .decode_next_hop = bgp_decode_next_hop_vpn,
1901 .update_next_hop = bgp_update_next_hop_ip,
1902 },
1903 {
1904 .afi = BGP_AF_VPN6_MPLS,
1905 .net = NET_VPN6,
1906 .mpls = 1,
1907 .name = "vpn6-mpls",
1908 .encode_nlri = bgp_encode_nlri_vpn6,
1909 .decode_nlri = bgp_decode_nlri_vpn6,
1910 .encode_next_hop = bgp_encode_next_hop_vpn,
1911 .decode_next_hop = bgp_decode_next_hop_vpn,
1912 .update_next_hop = bgp_update_next_hop_ip,
1913 },
1914 {
1915 .afi = BGP_AF_VPN4_MC,
1916 .net = NET_VPN4,
1917 .name = "vpn4-mc",
1918 .encode_nlri = bgp_encode_nlri_vpn4,
1919 .decode_nlri = bgp_decode_nlri_vpn4,
1920 .encode_next_hop = bgp_encode_next_hop_vpn,
1921 .decode_next_hop = bgp_decode_next_hop_vpn,
1922 .update_next_hop = bgp_update_next_hop_ip,
1923 },
1924 {
1925 .afi = BGP_AF_VPN6_MC,
1926 .net = NET_VPN6,
1927 .name = "vpn6-mc",
1928 .encode_nlri = bgp_encode_nlri_vpn6,
1929 .decode_nlri = bgp_decode_nlri_vpn6,
1930 .encode_next_hop = bgp_encode_next_hop_vpn,
1931 .decode_next_hop = bgp_decode_next_hop_vpn,
1932 .update_next_hop = bgp_update_next_hop_ip,
1933 },
1934 {
1935 .afi = BGP_AF_FLOW4,
1936 .net = NET_FLOW4,
1937 .no_igp = 1,
1938 .name = "flow4",
1939 .encode_nlri = bgp_encode_nlri_flow4,
1940 .decode_nlri = bgp_decode_nlri_flow4,
1941 .encode_next_hop = bgp_encode_next_hop_none,
1942 .decode_next_hop = bgp_decode_next_hop_none,
1943 .update_next_hop = bgp_update_next_hop_none,
1944 },
1945 {
1946 .afi = BGP_AF_FLOW6,
1947 .net = NET_FLOW6,
1948 .no_igp = 1,
1949 .name = "flow6",
1950 .encode_nlri = bgp_encode_nlri_flow6,
1951 .decode_nlri = bgp_decode_nlri_flow6,
1952 .encode_next_hop = bgp_encode_next_hop_none,
1953 .decode_next_hop = bgp_decode_next_hop_none,
1954 .update_next_hop = bgp_update_next_hop_none,
1955 },
1956 };
1957
1958 const struct bgp_af_desc *
1959 bgp_get_af_desc(u32 afi)
1960 {
1961 uint i;
1962 for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++)
1963 if (bgp_af_table[i].afi == afi)
1964 return &bgp_af_table[i];
1965
1966 return NULL;
1967 }
1968
1969 static inline uint
1970 bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
1971 {
1972 return s->channel->desc->encode_nlri(s, buck, buf, end - buf);
1973 }
1974
1975 static inline uint
1976 bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf)
1977 {
1978 return s->channel->desc->encode_next_hop(s, nh, buf, 255);
1979 }
1980
1981 void
1982 bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to)
1983 {
1984 s->channel->desc->update_next_hop(s, a, to);
1985 }
1986
1987 #define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
1988
1989 static byte *
1990 bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
1991 {
1992 /*
1993 * 2 B Withdrawn Routes Length (zero)
1994 * --- IPv4 Withdrawn Routes NLRI (unused)
1995 * 2 B Total Path Attribute Length
1996 * var Path Attributes
1997 * var IPv4 Network Layer Reachability Information
1998 */
1999
2000 int lr, la;
2001
2002 la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH);
2003 if (la < 0)
2004 {
2005 /* Attribute list too long */
2006 bgp_withdraw_bucket(s->channel, buck);
2007 return NULL;
2008 }
2009
2010 put_u16(buf+0, 0);
2011 put_u16(buf+2, la);
2012
2013 lr = bgp_encode_nlri(s, buck, buf+4+la, end);
2014
2015 return buf+4+la+lr;
2016 }
2017
2018 static byte *
2019 bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2020 {
2021 /*
2022 * 2 B IPv4 Withdrawn Routes Length (zero)
2023 * --- IPv4 Withdrawn Routes NLRI (unused)
2024 * 2 B Total Path Attribute Length
2025 * 1 B MP_REACH_NLRI hdr - Attribute Flags
2026 * 1 B MP_REACH_NLRI hdr - Attribute Type Code
2027 * 2 B MP_REACH_NLRI hdr - Length of Attribute Data
2028 * 2 B MP_REACH_NLRI data - Address Family Identifier
2029 * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier
2030 * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address
2031 * var MP_REACH_NLRI data - Network Address of Next Hop
2032 * 1 B MP_REACH_NLRI data - Reserved (zero)
2033 * var MP_REACH_NLRI data - Network Layer Reachability Information
2034 * var Rest of Path Attributes
2035 * --- IPv4 Network Layer Reachability Information (unused)
2036 */
2037
2038 int lh, lr, la; /* Lengths of next hop, NLRI and attributes */
2039
2040 /* Begin of MP_REACH_NLRI atribute */
2041 buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
2042 buf[5] = BA_MP_REACH_NLRI;
2043 put_u16(buf+6, 0); /* Will be fixed later */
2044 put_af3(buf+8, s->channel->afi);
2045 byte *pos = buf+11;
2046
2047 /* Encode attributes to temporary buffer */
2048 byte *abuf = alloca(MAX_ATTRS_LENGTH);
2049 la = bgp_encode_attrs(s, buck->eattrs, abuf, abuf + MAX_ATTRS_LENGTH);
2050 if (la < 0)
2051 {
2052 /* Attribute list too long */
2053 bgp_withdraw_bucket(s->channel, buck);
2054 return NULL;
2055 }
2056
2057 /* Encode the next hop */
2058 lh = bgp_encode_next_hop(s, s->mp_next_hop, pos+1);
2059 *pos = lh;
2060 pos += 1+lh;
2061
2062 /* Reserved field */
2063 *pos++ = 0;
2064
2065 /* Encode the NLRI */
2066 lr = bgp_encode_nlri(s, buck, pos, end - la);
2067 pos += lr;
2068
2069 /* End of MP_REACH_NLRI atribute, update data length */
2070 put_u16(buf+6, pos-buf-8);
2071
2072 /* Copy remaining attributes */
2073 memcpy(pos, abuf, la);
2074 pos += la;
2075
2076 /* Initial UPDATE fields */
2077 put_u16(buf+0, 0);
2078 put_u16(buf+2, pos-buf-4);
2079
2080 return pos;
2081 }
2082
2083 #undef MAX_ATTRS_LENGTH
2084
2085 static byte *
2086 bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2087 {
2088 /*
2089 * 2 B Withdrawn Routes Length
2090 * var IPv4 Withdrawn Routes NLRI
2091 * 2 B Total Path Attribute Length (zero)
2092 * --- Path Attributes (unused)
2093 * --- IPv4 Network Layer Reachability Information (unused)
2094 */
2095
2096 uint len = bgp_encode_nlri(s, buck, buf+2, end);
2097
2098 put_u16(buf+0, len);
2099 put_u16(buf+2+len, 0);
2100
2101 return buf+4+len;
2102 }
2103
2104 static byte *
2105 bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2106 {
2107 /*
2108 * 2 B Withdrawn Routes Length (zero)
2109 * --- IPv4 Withdrawn Routes NLRI (unused)
2110 * 2 B Total Path Attribute Length
2111 * 1 B MP_UNREACH_NLRI hdr - Attribute Flags
2112 * 1 B MP_UNREACH_NLRI hdr - Attribute Type Code
2113 * 2 B MP_UNREACH_NLRI hdr - Length of Attribute Data
2114 * 2 B MP_UNREACH_NLRI data - Address Family Identifier
2115 * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier
2116 * var MP_UNREACH_NLRI data - Network Layer Reachability Information
2117 * --- IPv4 Network Layer Reachability Information (unused)
2118 */
2119
2120 uint len = bgp_encode_nlri(s, buck, buf+11, end);
2121
2122 put_u16(buf+0, 0);
2123 put_u16(buf+2, 7+len);
2124
2125 /* Begin of MP_UNREACH_NLRI atribute */
2126 buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
2127 buf[5] = BA_MP_UNREACH_NLRI;
2128 put_u16(buf+6, 3+len);
2129 put_af3(buf+8, s->channel->afi);
2130
2131 return buf+11+len;
2132 }
2133
2134 static byte *
2135 bgp_create_update(struct bgp_channel *c, byte *buf)
2136 {
2137 struct bgp_proto *p = (void *) c->c.proto;
2138 struct bgp_bucket *buck;
2139 byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH);
2140 byte *res = NULL;
2141
2142 again: ;
2143
2144 /* Initialize write state */
2145 struct bgp_write_state s = {
2146 .proto = p,
2147 .channel = c,
2148 .pool = bgp_linpool,
2149 .mp_reach = (c->afi != BGP_AF_IPV4) || c->ext_next_hop,
2150 .as4_session = p->as4_session,
2151 .add_path = c->add_path_tx,
2152 .mpls = c->desc->mpls,
2153 };
2154
2155 /* Try unreachable bucket */
2156 if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
2157 {
2158 res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ?
2159 bgp_create_ip_unreach(&s, buck, buf, end):
2160 bgp_create_mp_unreach(&s, buck, buf, end);
2161
2162 goto done;
2163 }
2164
2165 /* Try reachable buckets */
2166 if (!EMPTY_LIST(c->bucket_queue))
2167 {
2168 buck = HEAD(c->bucket_queue);
2169
2170 /* Cleanup empty buckets */
2171 if (EMPTY_LIST(buck->prefixes))
2172 {
2173 bgp_free_bucket(c, buck);
2174 goto again;
2175 }
2176
2177 res = !s.mp_reach ?
2178 bgp_create_ip_reach(&s, buck, buf, end):
2179 bgp_create_mp_reach(&s, buck, buf, end);
2180
2181 if (EMPTY_LIST(buck->prefixes))
2182 bgp_free_bucket(c, buck);
2183 else
2184 bgp_defer_bucket(c, buck);
2185
2186 if (!res)
2187 goto again;
2188
2189 goto done;
2190 }
2191
2192 /* No more prefixes to send */
2193 return NULL;
2194
2195 done:
2196 BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
2197 lp_flush(s.pool);
2198
2199 return res;
2200 }
2201
2202 static byte *
2203 bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf)
2204 {
2205 /* Empty update packet */
2206 put_u32(buf, 0);
2207
2208 return buf+4;
2209 }
2210
2211 static byte *
2212 bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf)
2213 {
2214 put_u16(buf+0, 0);
2215 put_u16(buf+2, 6); /* length 4--9 */
2216
2217 /* Empty MP_UNREACH_NLRI atribute */
2218 buf[4] = BAF_OPTIONAL;
2219 buf[5] = BA_MP_UNREACH_NLRI;
2220 buf[6] = 3; /* Length 7--9 */
2221 put_af3(buf+7, c->afi);
2222
2223 return buf+10;
2224 }
2225
2226 static byte *
2227 bgp_create_end_mark(struct bgp_channel *c, byte *buf)
2228 {
2229 struct bgp_proto *p = (void *) c->c.proto;
2230
2231 BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");
2232
2233 return (c->afi == BGP_AF_IPV4) ?
2234 bgp_create_ip_end_mark(c, buf):
2235 bgp_create_mp_end_mark(c, buf);
2236 }
2237
2238 static inline void
2239 bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi)
2240 {
2241 struct bgp_proto *p = s->proto;
2242 struct bgp_channel *c = bgp_get_channel(p, afi);
2243
2244 BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
2245
2246 if (!c)
2247 DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2248
2249 if (c->load_state == BFS_LOADING)
2250 c->load_state = BFS_NONE;
2251
2252 if (p->p.gr_recovery)
2253 channel_graceful_restart_unlock(&c->c);
2254
2255 if (c->gr_active)
2256 bgp_graceful_restart_done(c);
2257 }
2258
2259 static inline void
2260 bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len)
2261 {
2262 struct bgp_channel *c = bgp_get_channel(s->proto, afi);
2263 rta *a = NULL;
2264
2265 if (!c)
2266 DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2267
2268 s->channel = c;
2269 s->add_path = c->add_path_rx;
2270 s->mpls = c->desc->mpls;
2271
2272 s->last_id = 0;
2273 s->last_src = s->proto->p.main_source;
2274
2275 /*
2276 * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
2277 * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
2278 * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
2279 * decode_next_hop hooks) by restoring a->eattrs afterwards.
2280 */
2281
2282 if (ea)
2283 {
2284 a = allocz(RTA_MAX_SIZE);
2285
2286 a->source = RTS_BGP;
2287 a->scope = SCOPE_UNIVERSE;
2288 a->from = s->proto->cf->remote_ip;
2289 a->eattrs = ea;
2290
2291 c->desc->decode_next_hop(s, nh, nh_len, a);
2292
2293 /* Handle withdraw during next hop decoding */
2294 if (s->err_withdraw)
2295 a = NULL;
2296 }
2297
2298 c->desc->decode_nlri(s, nlri, len, a);
2299
2300 rta_free(s->cached_rta);
2301 s->cached_rta = NULL;
2302 }
2303
2304 static void
2305 bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
2306 {
2307 struct bgp_proto *p = conn->bgp;
2308 ea_list *ea = NULL;
2309
2310 BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
2311
2312 /* Workaround for some BGP implementations that skip initial KEEPALIVE */
2313 if (conn->state == BS_OPENCONFIRM)
2314 bgp_conn_enter_established_state(conn);
2315
2316 if (conn->state != BS_ESTABLISHED)
2317 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2318
2319 bgp_start_timer(conn->hold_timer, conn->hold_time);
2320
2321 /* Initialize parse state */
2322 struct bgp_parse_state s = {
2323 .proto = p,
2324 .pool = bgp_linpool,
2325 .as4_session = p->as4_session,
2326 };
2327
2328 /* Parse error handler */
2329 if (setjmp(s.err_jmpbuf))
2330 {
2331 bgp_error(conn, 3, s.err_subcode, NULL, 0);
2332 goto done;
2333 }
2334
2335 /* Check minimal length */
2336 if (len < 23)
2337 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2338
2339 /* Skip fixed header */
2340 uint pos = 19;
2341
2342 /*
2343 * UPDATE message format
2344 *
2345 * 2 B IPv4 Withdrawn Routes Length
2346 * var IPv4 Withdrawn Routes NLRI
2347 * 2 B Total Path Attribute Length
2348 * var Path Attributes
2349 * var IPv4 Reachable Routes NLRI
2350 */
2351
2352 s.ip_unreach_len = get_u16(pkt + pos);
2353 s.ip_unreach_nlri = pkt + pos + 2;
2354 pos += 2 + s.ip_unreach_len;
2355
2356 if (pos + 2 > len)
2357 bgp_parse_error(&s, 1);
2358
2359 s.attr_len = get_u16(pkt + pos);
2360 s.attrs = pkt + pos + 2;
2361 pos += 2 + s.attr_len;
2362
2363 if (pos > len)
2364 bgp_parse_error(&s, 1);
2365
2366 s.ip_reach_len = len - pos;
2367 s.ip_reach_nlri = pkt + pos;
2368
2369
2370 if (s.attr_len)
2371 ea = bgp_decode_attrs(&s, s.attrs, s.attr_len);
2372 else
2373 ea = NULL;
2374
2375 /* Check for End-of-RIB marker */
2376 if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len)
2377 { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; }
2378
2379 /* Check for MP End-of-RIB marker */
2380 if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len &&
2381 !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af)
2382 { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; }
2383
2384 if (s.ip_unreach_len)
2385 bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0);
2386
2387 if (s.mp_unreach_len)
2388 bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0);
2389
2390 if (s.ip_reach_len)
2391 bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len,
2392 ea, s.ip_next_hop_data, s.ip_next_hop_len);
2393
2394 if (s.mp_reach_len)
2395 bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len,
2396 ea, s.mp_next_hop_data, s.mp_next_hop_len);
2397
2398 done:
2399 rta_free(s.cached_rta);
2400 lp_flush(s.pool);
2401 return;
2402 }
2403
2404 static uint
2405 bgp_find_update_afi(byte *pos, uint len)
2406 {
2407 /*
2408 * This is stripped-down version of bgp_rx_update(), bgp_decode_attrs() and
2409 * bgp_decode_mp_[un]reach_nlri() used by MRT code in order to find out which
2410 * AFI/SAFI is associated with incoming UPDATE. Returns 0 for framing errors.
2411 */
2412 if (len < 23)
2413 return 0;
2414
2415 /* Assume there is no withrawn NLRI, read lengths and move to attribute list */
2416 uint wlen = get_u16(pos + 19);
2417 uint alen = get_u16(pos + 21);
2418 ADVANCE(pos, len, 23);
2419
2420 /* Either non-zero withdrawn NLRI, non-zero reachable NLRI, or IPv4 End-of-RIB */
2421 if ((wlen != 0) || (alen < len) || !alen)
2422 return BGP_AF_IPV4;
2423
2424 if (alen > len)
2425 return 0;
2426
2427 /* Process attribute list (alen == len) */
2428 while (len)
2429 {
2430 if (len < 2)
2431 return 0;
2432
2433 uint flags = pos[0];
2434 uint code = pos[1];
2435 ADVANCE(pos, len, 2);
2436
2437 uint ll = !(flags & BAF_EXT_LEN) ? 1 : 2;
2438 if (len < ll)
2439 return 0;
2440
2441 /* Read attribute length and move to attribute body */
2442 alen = (ll == 1) ? get_u8(pos) : get_u16(pos);
2443 ADVANCE(pos, len, ll);
2444
2445 if (len < alen)
2446 return 0;
2447
2448 /* Found MP NLRI */
2449 if ((code == BA_MP_REACH_NLRI) || (code == BA_MP_UNREACH_NLRI))
2450 {
2451 if (alen < 3)
2452 return 0;
2453
2454 return BGP_AF(get_u16(pos), pos[2]);
2455 }
2456
2457 /* Move to the next attribute */
2458 ADVANCE(pos, len, alen);
2459 }
2460
2461 /* No basic or MP NLRI, but there are some attributes -> error */
2462 return 0;
2463 }
2464
2465
2466 /*
2467 * ROUTE-REFRESH
2468 */
2469
2470 static inline byte *
2471 bgp_create_route_refresh(struct bgp_channel *c, byte *buf)
2472 {
2473 struct bgp_proto *p = (void *) c->c.proto;
2474
2475 BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");
2476
2477 /* Original route refresh request, RFC 2918 */
2478 put_af4(buf, c->afi);
2479 buf[2] = BGP_RR_REQUEST;
2480
2481 return buf+4;
2482 }
2483
2484 static inline byte *
2485 bgp_create_begin_refresh(struct bgp_channel *c, byte *buf)
2486 {
2487 struct bgp_proto *p = (void *) c->c.proto;
2488
2489 BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");
2490
2491 /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
2492 put_af4(buf, c->afi);
2493 buf[2] = BGP_RR_BEGIN;
2494
2495 return buf+4;
2496 }
2497
2498 static inline byte *
2499 bgp_create_end_refresh(struct bgp_channel *c, byte *buf)
2500 {
2501 struct bgp_proto *p = (void *) c->c.proto;
2502
2503 BGP_TRACE(D_PACKETS, "Sending END-OF-RR");
2504
2505 /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
2506 put_af4(buf, c->afi);
2507 buf[2] = BGP_RR_END;
2508
2509 return buf+4;
2510 }
2511
2512 static void
2513 bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
2514 {
2515 struct bgp_proto *p = conn->bgp;
2516
2517 if (conn->state != BS_ESTABLISHED)
2518 { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2519
2520 if (!conn->local_caps->route_refresh)
2521 { bgp_error(conn, 1, 3, pkt+18, 1); return; }
2522
2523 if (len < (BGP_HEADER_LENGTH + 4))
2524 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2525
2526 if (len > (BGP_HEADER_LENGTH + 4))
2527 { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
2528
2529 struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19));
2530 if (!c)
2531 {
2532 log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
2533 p->p.name, pkt[21], get_u16(pkt+19), pkt[22]);
2534 return;
2535 }
2536
2537 /* RFC 7313 redefined reserved field as RR message subtype */
2538 uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST;
2539
2540 switch (subtype)
2541 {
2542 case BGP_RR_REQUEST:
2543 BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
2544 channel_request_feeding(&c->c);
2545 break;
2546
2547 case BGP_RR_BEGIN:
2548 BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
2549 bgp_refresh_begin(c);
2550 break;
2551
2552 case BGP_RR_END:
2553 BGP_TRACE(D_PACKETS, "Got END-OF-RR");
2554 bgp_refresh_end(c);
2555 break;
2556
2557 default:
2558 log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
2559 p->p.name, subtype);
2560 break;
2561 }
2562 }
2563
2564 static inline struct bgp_channel *
2565 bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn)
2566 {
2567 uint i = conn->last_channel;
2568
2569 /* Try the last channel, but at most several times */
2570 if ((conn->channels_to_send & (1 << i)) &&
2571 (conn->last_channel_count < 16))
2572 goto found;
2573
2574 /* Find channel with non-zero channels_to_send */
2575 do
2576 {
2577 i++;
2578 if (i >= p->channel_count)
2579 i = 0;
2580 }
2581 while (! (conn->channels_to_send & (1 << i)));
2582
2583 /* Use that channel */
2584 conn->last_channel = i;
2585 conn->last_channel_count = 0;
2586
2587 found:
2588 conn->last_channel_count++;
2589 return p->channel_map[i];
2590 }
2591
2592 static inline int
2593 bgp_send(struct bgp_conn *conn, uint type, uint len)
2594 {
2595 sock *sk = conn->sk;
2596 byte *buf = sk->tbuf;
2597
2598 memset(buf, 0xff, 16); /* Marker */
2599 put_u16(buf+16, len);
2600 buf[18] = type;
2601
2602 return sk_send(sk, len);
2603 }
2604
2605 /**
2606 * bgp_fire_tx - transmit packets
2607 * @conn: connection
2608 *
2609 * Whenever the transmit buffers of the underlying TCP connection
2610 * are free and we have any packets queued for sending, the socket functions
2611 * call bgp_fire_tx() which takes care of selecting the highest priority packet
2612 * queued (Notification > Keepalive > Open > Update), assembling its header
2613 * and body and sending it to the connection.
2614 */
2615 static int
2616 bgp_fire_tx(struct bgp_conn *conn)
2617 {
2618 struct bgp_proto *p = conn->bgp;
2619 struct bgp_channel *c;
2620 byte *buf, *pkt, *end;
2621 uint s;
2622
2623 if (!conn->sk)
2624 return 0;
2625
2626 buf = conn->sk->tbuf;
2627 pkt = buf + BGP_HEADER_LENGTH;
2628 s = conn->packets_to_send;
2629
2630 if (s & (1 << PKT_SCHEDULE_CLOSE))
2631 {
2632 /* We can finally close connection and enter idle state */
2633 bgp_conn_enter_idle_state(conn);
2634 return 0;
2635 }
2636 if (s & (1 << PKT_NOTIFICATION))
2637 {
2638 conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE;
2639 end = bgp_create_notification(conn, pkt);
2640 return bgp_send(conn, PKT_NOTIFICATION, end - buf);
2641 }
2642 else if (s & (1 << PKT_KEEPALIVE))
2643 {
2644 conn->packets_to_send &= ~(1 << PKT_KEEPALIVE);
2645 BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
2646 bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
2647 return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH);
2648 }
2649 else if (s & (1 << PKT_OPEN))
2650 {
2651 conn->packets_to_send &= ~(1 << PKT_OPEN);
2652 end = bgp_create_open(conn, pkt);
2653 return bgp_send(conn, PKT_OPEN, end - buf);
2654 }
2655 else while (conn->channels_to_send)
2656 {
2657 c = bgp_get_channel_to_send(p, conn);
2658 s = c->packets_to_send;
2659
2660 if (s & (1 << PKT_ROUTE_REFRESH))
2661 {
2662 c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH);
2663 end = bgp_create_route_refresh(c, pkt);
2664 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2665 }
2666 else if (s & (1 << PKT_BEGIN_REFRESH))
2667 {
2668 /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
2669 c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH);
2670 end = bgp_create_begin_refresh(c, pkt);
2671 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2672 }
2673 else if (s & (1 << PKT_UPDATE))
2674 {
2675 end = bgp_create_update(c, pkt);
2676 if (end)
2677 return bgp_send(conn, PKT_UPDATE, end - buf);
2678
2679 /* No update to send, perhaps we need to send End-of-RIB or EoRR */
2680 c->packets_to_send = 0;
2681 conn->channels_to_send &= ~(1 << c->index);
2682
2683 if (c->feed_state == BFS_LOADED)
2684 {
2685 c->feed_state = BFS_NONE;
2686 end = bgp_create_end_mark(c, pkt);
2687 return bgp_send(conn, PKT_UPDATE, end - buf);
2688 }
2689
2690 else if (c->feed_state == BFS_REFRESHED)
2691 {
2692 c->feed_state = BFS_NONE;
2693 end = bgp_create_end_refresh(c, pkt);
2694 return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2695 }
2696 }
2697 else if (s)
2698 bug("Channel packets_to_send: %x", s);
2699
2700 c->packets_to_send = 0;
2701 conn->channels_to_send &= ~(1 << c->index);
2702 }
2703
2704 return 0;
2705 }
2706
2707 /**
2708 * bgp_schedule_packet - schedule a packet for transmission
2709 * @conn: connection
2710 * @c: channel
2711 * @type: packet type
2712 *
2713 * Schedule a packet of type @type to be sent as soon as possible.
2714 */
2715 void
2716 bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
2717 {
2718 ASSERT(conn->sk);
2719
2720 DBG("BGP: Scheduling packet type %d\n", type);
2721
2722 if (c)
2723 {
2724 if (! conn->channels_to_send)
2725 {
2726 conn->last_channel = c->index;
2727 conn->last_channel_count = 0;
2728 }
2729
2730 c->packets_to_send |= 1 << type;
2731 conn->channels_to_send |= 1 << c->index;
2732 }
2733 else
2734 conn->packets_to_send |= 1 << type;
2735
2736 if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev))
2737 ev_schedule(conn->tx_ev);
2738 }
2739
/*
 * Event-style hook: @vconn is the BGP connection. Keeps calling
 * bgp_fire_tx() for as long as it reports a positive result.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;

  DBG("BGP: kicking TX\n");

  for (;;)
  {
    if (bgp_fire_tx(conn) <= 0)
      break;
  }
}
2749
2750 void
2751 bgp_tx(sock *sk)
2752 {
2753 struct bgp_conn *conn = sk->data;
2754
2755 DBG("BGP: TX hook\n");
2756 while (bgp_fire_tx(conn) > 0)
2757 ;
2758 }
2759
2760
2761 static struct {
2762 byte major, minor;
2763 byte *msg;
2764 } bgp_msg_table[] = {
2765 { 1, 0, "Invalid message header" },
2766 { 1, 1, "Connection not synchronized" },
2767 { 1, 2, "Bad message length" },
2768 { 1, 3, "Bad message type" },
2769 { 2, 0, "Invalid OPEN message" },
2770 { 2, 1, "Unsupported version number" },
2771 { 2, 2, "Bad peer AS" },
2772 { 2, 3, "Bad BGP identifier" },
2773 { 2, 4, "Unsupported optional parameter" },
2774 { 2, 5, "Authentication failure" },
2775 { 2, 6, "Unacceptable hold time" },
2776 { 2, 7, "Required capability missing" }, /* [RFC5492] */
2777 { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
2778 { 3, 0, "Invalid UPDATE message" },
2779 { 3, 1, "Malformed attribute list" },
2780 { 3, 2, "Unrecognized well-known attribute" },
2781 { 3, 3, "Missing mandatory attribute" },
2782 { 3, 4, "Invalid attribute flags" },
2783 { 3, 5, "Invalid attribute length" },
2784 { 3, 6, "Invalid ORIGIN attribute" },
2785 { 3, 7, "AS routing loop" }, /* Deprecated */
2786 { 3, 8, "Invalid NEXT_HOP attribute" },
2787 { 3, 9, "Optional attribute error" },
2788 { 3, 10, "Invalid network field" },
2789 { 3, 11, "Malformed AS_PATH" },
2790 { 4, 0, "Hold timer expired" },
2791 { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
2792 { 5, 1, "Unexpected message in OpenSent state" },
2793 { 5, 2, "Unexpected message in OpenConfirm state" },
2794 { 5, 3, "Unexpected message in Established state" },
2795 { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
2796 { 6, 1, "Maximum number of prefixes reached" },
2797 { 6, 2, "Administrative shutdown" },
2798 { 6, 3, "Peer de-configured" },
2799 { 6, 4, "Administrative reset" },
2800 { 6, 5, "Connection rejected" },
2801 { 6, 6, "Other configuration change" },
2802 { 6, 7, "Connection collision resolution" },
2803 { 6, 8, "Out of Resources" },
2804 { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
2805 { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
2806 };
2807
2808 /**
2809 * bgp_error_dsc - return BGP error description
2810 * @code: BGP error code
2811 * @subcode: BGP error subcode
2812 *
2813 * bgp_error_dsc() returns error description for BGP errors
2814 * which might be static string or given temporary buffer.
2815 */
2816 const char *
2817 bgp_error_dsc(uint code, uint subcode)
2818 {
2819 static char buff[32];
2820 uint i;
2821
2822 for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
2823 if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
2824 return bgp_msg_table[i].msg;
2825
2826 bsprintf(buff, "Unknown error %u.%u", code, subcode);
2827 return buff;
2828 }
2829
2830 /* RFC 8203 - shutdown communication message */
2831 static int
2832 bgp_handle_message(struct bgp_proto *p, byte *data, uint len, byte **bp)
2833 {
2834 byte *msg = data + 1;
2835 uint msg_len = data[0];
2836 uint i;
2837
2838 /* Handle zero length message */
2839 if (msg_len == 0)
2840 return 1;
2841
2842 /* Handle proper message */
2843 if ((msg_len > 128) && (msg_len + 1 > len))
2844 return 0;
2845
2846 /* Some elementary cleanup */
2847 for (i = 0; i < msg_len; i++)
2848 if (msg[i] < ' ')
2849 msg[i] = ' ';
2850
2851 proto_set_message(&p->p, msg, msg_len);
2852 *bp += bsprintf(*bp, ": \"%s\"", p->p.message);
2853 return 1;
2854 }
2855
2856 void
2857 bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len)
2858 {
2859 byte argbuf[256], *t = argbuf;
2860 uint i;
2861
2862 /* Don't report Cease messages generated by myself */
2863 if (code == 6 && class == BE_BGP_TX)
2864 return;
2865
2866 /* Reset shutdown message */
2867 if ((code == 6) && ((subcode == 2) || (subcode == 4)))
2868 proto_set_message(&p->p, NULL, 0);
2869
2870 if (len)
2871 {
2872 /* Bad peer AS - we would like to print the AS */
2873 if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
2874 {
2875 t += bsprintf(t, ": %u", (len == 2) ? get_u16(data) : get_u32(data));
2876 goto done;
2877 }
2878
2879 /* RFC 8203 - shutdown communication */
2880 if (((code == 6) && ((subcode == 2) || (subcode == 4))))
2881 if (bgp_handle_message(p, data, len, &t))
2882 goto done;
2883
2884 *t++ = ':';
2885 *t++ = ' ';
2886 if (len > 16)
2887 len = 16;
2888 for (i=0; i<len; i++)
2889 t += bsprintf(t, "%02x", data[i]);
2890 }
2891
2892 done:
2893 *t = 0;
2894 const byte *dsc = bgp_error_dsc(code, subcode);
2895 log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, dsc, argbuf);
2896 }
2897
2898 static void
2899 bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
2900 {
2901 struct bgp_proto *p = conn->bgp;
2902
2903 if (len < 21)
2904 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2905
2906 uint code = pkt[19];
2907 uint subcode = pkt[20];
2908 int err = (code != 6);
2909
2910 bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
2911 bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
2912
2913 bgp_conn_enter_close_state(conn);
2914 bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
2915
2916 if (err)
2917 {
2918 bgp_update_startup_delay(p);
2919 bgp_stop(p, 0, NULL, 0);
2920 }
2921 else
2922 {
2923 uint subcode_bit = 1 << ((subcode <= 8) ? subcode : 0);
2924 if (p->cf->disable_after_cease & subcode_bit)
2925 {
2926 log(L_INFO "%s: Disabled after Cease notification", p->p.name);
2927 p->startup_delay = 0;
2928 p->p.disabled = 1;
2929 }
2930 }
2931 }
2932
2933 static void
2934 bgp_rx_keepalive(struct bgp_conn *conn)
2935 {
2936 struct bgp_proto *p = conn->bgp;
2937
2938 BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
2939 bgp_start_timer(conn->hold_timer, conn->hold_time);
2940
2941 if (conn->state == BS_OPENCONFIRM)
2942 { bgp_conn_enter_established_state(conn); return; }
2943
2944 if (conn->state != BS_ESTABLISHED)
2945 bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
2946 }
2947
2948
2949 /**
2950 * bgp_rx_packet - handle a received packet
2951 * @conn: BGP connection
2952 * @pkt: start of the packet
2953 * @len: packet size
2954 *
2955 * bgp_rx_packet() takes a newly received packet and calls the corresponding
2956 * packet handler according to the packet type.
2957 */
2958 static void
2959 bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len)
2960 {
2961 byte type = pkt[18];
2962
2963 DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
2964
2965 if (conn->bgp->p.mrtdump & MD_MESSAGES)
2966 bgp_dump_message(conn, pkt, len);
2967
2968 switch (type)
2969 {
2970 case PKT_OPEN: return bgp_rx_open(conn, pkt, len);
2971 case PKT_UPDATE: return bgp_rx_update(conn, pkt, len);
2972 case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len);
2973 case PKT_KEEPALIVE: return bgp_rx_keepalive(conn);
2974 case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len);
2975 default: bgp_error(conn, 1, 3, pkt+18, 1);
2976 }
2977 }
2978
2979 /**
2980 * bgp_rx - handle received data
2981 * @sk: socket
2982 * @size: amount of data received
2983 *
2984 * bgp_rx() is called by the socket layer whenever new data arrive from
2985 * the underlying TCP connection. It assembles the data fragments to packets,
2986 * checks their headers and framing and passes complete packets to
2987 * bgp_rx_packet().
2988 */
2989 int
2990 bgp_rx(sock *sk, uint size)
2991 {
2992 struct bgp_conn *conn = sk->data;
2993 byte *pkt_start = sk->rbuf;
2994 byte *end = pkt_start + size;
2995 uint i, len;
2996
2997 DBG("BGP: RX hook: Got %d bytes\n", size);
2998 while (end >= pkt_start + BGP_HEADER_LENGTH)
2999 {
3000 if ((conn->state == BS_CLOSE) || (conn->sk != sk))
3001 return 0;
3002 for(i=0; i<16; i++)
3003 if (pkt_start[i] != 0xff)
3004 {
3005 bgp_error(conn, 1, 1, NULL, 0);
3006 break;
3007 }
3008 len = get_u16(pkt_start+16);
3009 if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn)))
3010 {
3011 bgp_error(conn, 1, 2, pkt_start+16, 2);
3012 break;
3013 }
3014 if (end < pkt_start + len)
3015 break;
3016 bgp_rx_packet(conn, pkt_start, len);
3017 pkt_start += len;
3018 }
3019 if (pkt_start != sk->rbuf)
3020 {
3021 memmove(sk->rbuf, pkt_start, end - pkt_start);
3022 sk->rpos = sk->rbuf + (end - pkt_start);
3023 }
3024 return 0;
3025 }