]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/packets.c
Merge branch 'birdcl'
[thirdparty/bird.git] / proto / bgp / packets.c
1 /*
2 * BIRD -- BGP Packet Processing
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 *
6 * Can be freely distributed and used under the terms of the GNU GPL.
7 */
8
9 #undef LOCAL_DEBUG
10
11 #include "nest/bird.h"
12 #include "nest/iface.h"
13 #include "nest/protocol.h"
14 #include "nest/route.h"
15 #include "nest/attrs.h"
16 #include "nest/mrtdump.h"
17 #include "conf/conf.h"
18 #include "lib/unaligned.h"
19 #include "lib/socket.h"
20
21 #include "nest/cli.h"
22
23 #include "bgp.h"
24
25 static struct rate_limit rl_rcv_update, rl_snd_update;
26
/*
 * MRT Dump format is not semantically specified.
 * We will use these values in appropriate fields:
 *
 * Local AS, Remote AS - configured AS numbers for given BGP instance.
 * Local IP, Remote IP - IP addresses of the TCP connection (0 if no connection)
 *
 * We dump two kinds of MRT messages: STATE_CHANGE (for BGP state
 * changes) and MESSAGE (for received BGP messages).
 *
 * STATE_CHANGE always uses the AS4 variant, but MESSAGE uses the AS4
 * variant only when an AS4 session is established, and even in that case
 * MESSAGE does not use the AS4 variant for the initial OPEN message. This
 * strange behavior is here for compatibility with Quagga and Bgpdump.
 */
42
43 static byte *
44 mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4)
45 {
46 struct bgp_proto *p = conn->bgp;
47
48 if (as4)
49 {
50 put_u32(buf+0, p->remote_as);
51 put_u32(buf+4, p->local_as);
52 buf+=8;
53 }
54 else
55 {
56 put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS);
57 put_u16(buf+2, (p->local_as <= 0xFFFF) ? p->local_as : AS_TRANS);
58 buf+=4;
59 }
60
61 put_u16(buf+0, p->neigh ? p->neigh->iface->index : 0);
62 put_u16(buf+2, BGP_AF);
63 buf+=4;
64 buf = ipa_put_addr(buf, conn->sk ? conn->sk->daddr : IPA_NONE);
65 buf = ipa_put_addr(buf, conn->sk ? conn->sk->saddr : IPA_NONE);
66
67 return buf;
68 }
69
70 static void
71 mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
72 {
73 byte buf[BGP_MAX_PACKET_LENGTH + 128];
74 byte *bp = buf + MRTDUMP_HDR_LENGTH;
75 int as4 = conn->bgp->as4_session;
76
77 bp = mrt_put_bgp4_hdr(bp, conn, as4);
78 memcpy(bp, pkt, len);
79 bp += len;
80 mrt_dump_message(&conn->bgp->p, BGP4MP, as4 ? BGP4MP_MESSAGE_AS4 : BGP4MP_MESSAGE,
81 buf, bp-buf);
82 }
83
84 static inline u16
85 convert_state(unsigned state)
86 {
87 /* Convert state from our BS_* values to values used in MRTDump */
88 return (state == BS_CLOSE) ? 1 : state + 1;
89 }
90
91 void
92 mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new)
93 {
94 byte buf[128];
95 byte *bp = buf + MRTDUMP_HDR_LENGTH;
96
97 bp = mrt_put_bgp4_hdr(bp, conn, 1);
98 put_u16(bp+0, convert_state(old));
99 put_u16(bp+2, convert_state(new));
100 bp += 4;
101 mrt_dump_message(&conn->bgp->p, BGP4MP, BGP4MP_STATE_CHANGE_AS4, buf, bp-buf);
102 }
103
104 static byte *
105 bgp_create_notification(struct bgp_conn *conn, byte *buf)
106 {
107 struct bgp_proto *p = conn->bgp;
108
109 BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);
110 buf[0] = conn->notify_code;
111 buf[1] = conn->notify_subcode;
112 memcpy(buf+2, conn->notify_data, conn->notify_size);
113 return buf + 2 + conn->notify_size;
114 }
115
116 #ifdef IPV6
117 static byte *
118 bgp_put_cap_ipv6(struct bgp_conn *conn UNUSED, byte *buf)
119 {
120 *buf++ = 1; /* Capability 1: Multiprotocol extensions */
121 *buf++ = 4; /* Capability data length */
122 *buf++ = 0; /* We support AF IPv6 */
123 *buf++ = BGP_AF_IPV6;
124 *buf++ = 0; /* RFU */
125 *buf++ = 1; /* and SAFI 1 */
126 return buf;
127 }
128
129 #else
130
131 static byte *
132 bgp_put_cap_ipv4(struct bgp_conn *conn UNUSED, byte *buf)
133 {
134 *buf++ = 1; /* Capability 1: Multiprotocol extensions */
135 *buf++ = 4; /* Capability data length */
136 *buf++ = 0; /* We support AF IPv4 */
137 *buf++ = BGP_AF_IPV4;
138 *buf++ = 0; /* RFU */
139 *buf++ = 1; /* and SAFI 1 */
140 return buf;
141 }
142 #endif
143
144 static byte *
145 bgp_put_cap_rr(struct bgp_conn *conn UNUSED, byte *buf)
146 {
147 *buf++ = 2; /* Capability 2: Support for route refresh */
148 *buf++ = 0; /* Capability data length */
149 return buf;
150 }
151
152 static byte *
153 bgp_put_cap_as4(struct bgp_conn *conn, byte *buf)
154 {
155 *buf++ = 65; /* Capability 65: Support for 4-octet AS number */
156 *buf++ = 4; /* Capability data length */
157 put_u32(buf, conn->bgp->local_as);
158 return buf + 4;
159 }
160
161 static byte *
162 bgp_create_open(struct bgp_conn *conn, byte *buf)
163 {
164 struct bgp_proto *p = conn->bgp;
165 byte *cap;
166 int cap_len;
167
168 BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
169 BGP_VERSION, p->local_as, p->cf->hold_time, p->local_id);
170 buf[0] = BGP_VERSION;
171 put_u16(buf+1, (p->local_as < 0xFFFF) ? p->local_as : AS_TRANS);
172 put_u16(buf+3, p->cf->hold_time);
173 put_u32(buf+5, p->local_id);
174
175 if (conn->start_state == BSS_CONNECT_NOCAP)
176 {
177 BGP_TRACE(D_PACKETS, "Skipping capabilities");
178 buf[9] = 0;
179 return buf + 10;
180 }
181
182 /* Skipped 3 B for length field and Capabilities parameter header */
183 cap = buf + 12;
184
185 #ifndef IPV6
186 if (p->cf->advertise_ipv4)
187 cap = bgp_put_cap_ipv4(conn, cap);
188 #endif
189
190 #ifdef IPV6
191 cap = bgp_put_cap_ipv6(conn, cap);
192 #endif
193
194 if (p->cf->enable_refresh)
195 cap = bgp_put_cap_rr(conn, cap);
196
197 if (conn->want_as4_support)
198 cap = bgp_put_cap_as4(conn, cap);
199
200 cap_len = cap - buf - 12;
201 if (cap_len > 0)
202 {
203 buf[9] = cap_len + 2; /* Optional params len */
204 buf[10] = 2; /* Option: Capability list */
205 buf[11] = cap_len; /* Option length */
206 return cap;
207 }
208 else
209 {
210 buf[9] = 0; /* No optional parameters */
211 return buf + 10;
212 }
213 }
214
215 static unsigned int
216 bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, unsigned int remains)
217 {
218 byte *start = w;
219 ip_addr a;
220 int bytes;
221
222 while (!EMPTY_LIST(buck->prefixes) && remains >= (1+sizeof(ip_addr)))
223 {
224 struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes));
225 DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen);
226 *w++ = px->n.pxlen;
227 bytes = (px->n.pxlen + 7) / 8;
228 a = px->n.prefix;
229 ipa_hton(a);
230 memcpy(w, &a, bytes);
231 w += bytes;
232 remains -= bytes + 1;
233 rem_node(&px->bucket_node);
234 fib_delete(&p->prefix_fib, px);
235 }
236 return w - start;
237 }
238
239 static void
240 bgp_flush_prefixes(struct bgp_proto *p, struct bgp_bucket *buck)
241 {
242 while (!EMPTY_LIST(buck->prefixes))
243 {
244 struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes));
245 log(L_ERR "%s: - route %I/%d skipped", p->p.name, px->n.prefix, px->n.pxlen);
246 rem_node(&px->bucket_node);
247 fib_delete(&p->prefix_fib, px);
248 }
249 }
250
251 #ifndef IPV6 /* IPv4 version */
252
253 static byte *
254 bgp_create_update(struct bgp_conn *conn, byte *buf)
255 {
256 struct bgp_proto *p = conn->bgp;
257 struct bgp_bucket *buck;
258 int remains = BGP_MAX_PACKET_LENGTH - BGP_HEADER_LENGTH - 4;
259 byte *w;
260 int wd_size = 0;
261 int r_size = 0;
262 int a_size = 0;
263
264 w = buf+2;
265 if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
266 {
267 DBG("Withdrawn routes:\n");
268 wd_size = bgp_encode_prefixes(p, w, buck, remains);
269 w += wd_size;
270 remains -= wd_size;
271 }
272 put_u16(buf, wd_size);
273
274 if (remains >= 3072)
275 {
276 while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next)
277 {
278 if (EMPTY_LIST(buck->prefixes))
279 {
280 DBG("Deleting empty bucket %p\n", buck);
281 rem_node(&buck->send_node);
282 bgp_free_bucket(p, buck);
283 continue;
284 }
285
286 DBG("Processing bucket %p\n", buck);
287 a_size = bgp_encode_attrs(p, w+2, buck->eattrs, 2048);
288
289 if (a_size < 0)
290 {
291 log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name);
292 bgp_flush_prefixes(p, buck);
293 rem_node(&buck->send_node);
294 bgp_free_bucket(p, buck);
295 continue;
296 }
297
298 put_u16(w, a_size);
299 w += a_size + 2;
300 r_size = bgp_encode_prefixes(p, w, buck, remains - a_size);
301 w += r_size;
302 break;
303 }
304 }
305 if (!a_size) /* Attributes not already encoded */
306 {
307 put_u16(w, 0);
308 w += 2;
309 }
310 if (wd_size || r_size)
311 {
312 BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
313 return w;
314 }
315 else
316 return NULL;
317 }
318
319 #else /* IPv6 version */
320
321 static inline int
322 same_iface(struct bgp_proto *p, ip_addr *ip)
323 {
324 neighbor *n = neigh_find(&p->p, ip, 0);
325 return n && p->neigh && n->iface == p->neigh->iface;
326 }
327
328 static byte *
329 bgp_create_update(struct bgp_conn *conn, byte *buf)
330 {
331 struct bgp_proto *p = conn->bgp;
332 struct bgp_bucket *buck;
333 int size, second, rem_stored;
334 int remains = BGP_MAX_PACKET_LENGTH - BGP_HEADER_LENGTH - 4;
335 byte *w, *w_stored, *tmp, *tstart;
336 ip_addr *ipp, ip, ip_ll;
337 ea_list *ea;
338 eattr *nh;
339
340 put_u16(buf, 0);
341 w = buf+4;
342
343 if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
344 {
345 DBG("Withdrawn routes:\n");
346 tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_UNREACH_NLRI, remains-8);
347 *tmp++ = 0;
348 *tmp++ = BGP_AF_IPV6;
349 *tmp++ = 1;
350 ea->attrs[0].u.ptr->length = 3 + bgp_encode_prefixes(p, tmp, buck, remains-11);
351 size = bgp_encode_attrs(p, w, ea, remains);
352 ASSERT(size >= 0);
353 w += size;
354 remains -= size;
355 }
356
357 if (remains >= 3072)
358 {
359 while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next)
360 {
361 if (EMPTY_LIST(buck->prefixes))
362 {
363 DBG("Deleting empty bucket %p\n", buck);
364 rem_node(&buck->send_node);
365 bgp_free_bucket(p, buck);
366 continue;
367 }
368
369 DBG("Processing bucket %p\n", buck);
370 rem_stored = remains;
371 w_stored = w;
372
373 size = bgp_encode_attrs(p, w, buck->eattrs, 2048);
374 if (size < 0)
375 {
376 log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name);
377 bgp_flush_prefixes(p, buck);
378 rem_node(&buck->send_node);
379 bgp_free_bucket(p, buck);
380 continue;
381 }
382 w += size;
383 remains -= size;
384
385 /* We have two addresses here in NEXT_HOP eattr. Really.
386 Unless NEXT_HOP was modified by filter */
387 nh = ea_find(buck->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
388 ASSERT(nh);
389 second = (nh->u.ptr->length == NEXT_HOP_LENGTH);
390 ipp = (ip_addr *) nh->u.ptr->data;
391 ip = ipp[0];
392 ip_ll = IPA_NONE;
393
394 if (ipa_equal(ip, p->source_addr))
395 ip_ll = p->local_link;
396 else
397 {
398 /* If we send a route with 'third party' next hop destinated
399 * in the same interface, we should also send a link local
400 * next hop address. We use the received one (stored in the
401 * other part of BA_NEXT_HOP eattr). If we didn't received
402 * it (for example it is a static route), we can't use
403 * 'third party' next hop and we have to use local IP address
404 * as next hop. Sending original next hop address without
405 * link local address seems to be a natural way to solve that
406 * problem, but it is contrary to RFC 2545 and Quagga does not
407 * accept such routes.
408 *
409 * There are two cases, either we have global IP, or
410 * IPA_NONE if the neighbor is link-local. For IPA_NONE,
411 * we suppose it is on the same iface, see bgp_update_attrs().
412 */
413
414 if (ipa_zero(ip) || same_iface(p, &ip))
415 {
416 if (second && ipa_nonzero(ipp[1]))
417 ip_ll = ipp[1];
418 else
419 {
420 switch (p->cf->missing_lladdr)
421 {
422 case MLL_SELF:
423 ip = p->source_addr;
424 ip_ll = p->local_link;
425 break;
426 case MLL_DROP:
427 log(L_ERR "%s: Missing link-local next hop address, skipping corresponding routes", p->p.name);
428 w = w_stored;
429 remains = rem_stored;
430 bgp_flush_prefixes(p, buck);
431 rem_node(&buck->send_node);
432 bgp_free_bucket(p, buck);
433 continue;
434 case MLL_IGNORE:
435 break;
436 }
437 }
438 }
439 }
440
441 tstart = tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_REACH_NLRI, remains-8);
442 *tmp++ = 0;
443 *tmp++ = BGP_AF_IPV6;
444 *tmp++ = 1;
445
446 if (ipa_has_link_scope(ip))
447 ip = IPA_NONE;
448
449 if (ipa_nonzero(ip_ll))
450 {
451 *tmp++ = 32;
452 ipa_hton(ip);
453 memcpy(tmp, &ip, 16);
454 ipa_hton(ip_ll);
455 memcpy(tmp+16, &ip_ll, 16);
456 tmp += 32;
457 }
458 else
459 {
460 *tmp++ = 16;
461 ipa_hton(ip);
462 memcpy(tmp, &ip, 16);
463 tmp += 16;
464 }
465
466 *tmp++ = 0; /* No SNPA information */
467 tmp += bgp_encode_prefixes(p, tmp, buck, remains - (8+3+32+1));
468 ea->attrs[0].u.ptr->length = tmp - tstart;
469 size = bgp_encode_attrs(p, w, ea, remains);
470 ASSERT(size >= 0);
471 w += size;
472 break;
473 }
474 }
475
476 size = w - (buf+4);
477 put_u16(buf+2, size);
478 lp_flush(bgp_linpool);
479 if (size)
480 {
481 BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
482 return w;
483 }
484 else
485 return NULL;
486 }
487
488 #endif
489
490 static byte *
491 bgp_create_route_refresh(struct bgp_conn *conn, byte *buf)
492 {
493 struct bgp_proto *p = conn->bgp;
494 BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");
495
496 *buf++ = 0;
497 *buf++ = BGP_AF;
498 *buf++ = 0; /* RFU */
499 *buf++ = 1; /* and SAFI 1 */
500 return buf;
501 }
502
503 static void
504 bgp_create_header(byte *buf, unsigned int len, unsigned int type)
505 {
506 memset(buf, 0xff, 16); /* Marker */
507 put_u16(buf+16, len);
508 buf[18] = type;
509 }
510
511 /**
512 * bgp_fire_tx - transmit packets
513 * @conn: connection
514 *
515 * Whenever the transmit buffers of the underlying TCP connection
516 * are free and we have any packets queued for sending, the socket functions
517 * call bgp_fire_tx() which takes care of selecting the highest priority packet
518 * queued (Notification > Keepalive > Open > Update), assembling its header
519 * and body and sending it to the connection.
520 */
521 static int
522 bgp_fire_tx(struct bgp_conn *conn)
523 {
524 struct bgp_proto *p = conn->bgp;
525 unsigned int s = conn->packets_to_send;
526 sock *sk = conn->sk;
527 byte *buf, *pkt, *end;
528 int type;
529
530 if (!sk)
531 {
532 conn->packets_to_send = 0;
533 return 0;
534 }
535 buf = sk->tbuf;
536 pkt = buf + BGP_HEADER_LENGTH;
537
538 if (s & (1 << PKT_SCHEDULE_CLOSE))
539 {
540 /* We can finally close connection and enter idle state */
541 bgp_conn_enter_idle_state(conn);
542 return 0;
543 }
544 if (s & (1 << PKT_NOTIFICATION))
545 {
546 s = 1 << PKT_SCHEDULE_CLOSE;
547 type = PKT_NOTIFICATION;
548 end = bgp_create_notification(conn, pkt);
549 }
550 else if (s & (1 << PKT_KEEPALIVE))
551 {
552 s &= ~(1 << PKT_KEEPALIVE);
553 type = PKT_KEEPALIVE;
554 end = pkt; /* Keepalives carry no data */
555 BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
556 bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
557 }
558 else if (s & (1 << PKT_OPEN))
559 {
560 s &= ~(1 << PKT_OPEN);
561 type = PKT_OPEN;
562 end = bgp_create_open(conn, pkt);
563 }
564 else if (s & (1 << PKT_ROUTE_REFRESH))
565 {
566 s &= ~(1 << PKT_ROUTE_REFRESH);
567 type = PKT_ROUTE_REFRESH;
568 end = bgp_create_route_refresh(conn, pkt);
569 }
570 else if (s & (1 << PKT_UPDATE))
571 {
572 end = bgp_create_update(conn, pkt);
573 type = PKT_UPDATE;
574 if (!end)
575 {
576 conn->packets_to_send = 0;
577 return 0;
578 }
579 }
580 else
581 return 0;
582 conn->packets_to_send = s;
583 bgp_create_header(buf, end - buf, type);
584 return sk_send(sk, end - buf);
585 }
586
587 /**
588 * bgp_schedule_packet - schedule a packet for transmission
589 * @conn: connection
590 * @type: packet type
591 *
592 * Schedule a packet of type @type to be sent as soon as possible.
593 */
594 void
595 bgp_schedule_packet(struct bgp_conn *conn, int type)
596 {
597 DBG("BGP: Scheduling packet type %d\n", type);
598 conn->packets_to_send |= 1 << type;
599 if (conn->sk && conn->sk->tpos == conn->sk->tbuf)
600 ev_schedule(conn->tx_ev);
601 }
602
/*
 * Event hook: @vconn is the struct bgp_conn to transmit on. Keeps calling
 * bgp_fire_tx() for as long as it returns a positive value.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;
  int sent;

  DBG("BGP: kicking TX\n");

  do
    sent = bgp_fire_tx(conn);
  while (sent > 0);
}
612
613 void
614 bgp_tx(sock *sk)
615 {
616 struct bgp_conn *conn = sk->data;
617
618 DBG("BGP: TX hook\n");
619 while (bgp_fire_tx(conn) > 0)
620 ;
621 }
622
/* Capability negotiation as per RFC 2842 */
624
625 void
626 bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
627 {
628 // struct bgp_proto *p = conn->bgp;
629 int cl;
630
631 while (len > 0)
632 {
633 if (len < 2 || len < 2 + opt[1])
634 goto err;
635
636 cl = opt[1];
637
638 switch (opt[0])
639 {
640 case 2: /* Route refresh capability, RFC 2918 */
641 if (cl != 0)
642 goto err;
643 conn->peer_refresh_support = 1;
644 break;
645
646 case 65: /* AS4 capability, RFC 4893 */
647 if (cl != 4)
648 goto err;
649 conn->peer_as4_support = 1;
650 if (conn->want_as4_support)
651 conn->advertised_as = get_u32(opt + 2);
652 break;
653
654 /* We can safely ignore all other capabilities */
655 }
656 len -= 2 + cl;
657 opt += 2 + cl;
658 }
659 return;
660
661 err:
662 bgp_error(conn, 2, 0, NULL, 0);
663 return;
664 }
665
666 static int
667 bgp_parse_options(struct bgp_conn *conn, byte *opt, int len)
668 {
669 struct bgp_proto *p = conn->bgp;
670 int ol;
671
672 while (len > 0)
673 {
674 if (len < 2 || len < 2 + opt[1])
675 { bgp_error(conn, 2, 0, NULL, 0); return 0; }
676 #ifdef LOCAL_DEBUG
677 {
678 int i;
679 DBG("\tOption %02x:", opt[0]);
680 for(i=0; i<opt[1]; i++)
681 DBG(" %02x", opt[2+i]);
682 DBG("\n");
683 }
684 #endif
685
686 ol = opt[1];
687 switch (opt[0])
688 {
689 case 2:
690 if (conn->start_state == BSS_CONNECT_NOCAP)
691 BGP_TRACE(D_PACKETS, "Ignoring received capabilities");
692 else
693 bgp_parse_capabilities(conn, opt + 2, ol);
694 break;
695
696 default:
697 /*
698 * BGP specs don't tell us to send which option
699 * we didn't recognize, but it's common practice
700 * to do so. Also, capability negotiation with
701 * Cisco routers doesn't work without that.
702 */
703 bgp_error(conn, 2, 4, opt, ol);
704 return 0;
705 }
706 len -= 2 + ol;
707 opt += 2 + ol;
708 }
709 return 0;
710 }
711
712 static void
713 bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
714 {
715 struct bgp_conn *other;
716 struct bgp_proto *p = conn->bgp;
717 unsigned hold;
718 u16 base_as;
719 u32 id;
720
721 /* Check state */
722 if (conn->state != BS_OPENSENT)
723 { bgp_error(conn, 5, 0, NULL, 0); return; }
724
725 /* Check message contents */
726 if (len < 29 || len != 29 + pkt[28])
727 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
728 if (pkt[19] != BGP_VERSION)
729 { bgp_error(conn, 2, 1, pkt+19, 1); return; } /* RFC 1771 says 16 bits, draft-09 tells to use 8 */
730 conn->advertised_as = base_as = get_u16(pkt+20);
731 hold = get_u16(pkt+22);
732 id = get_u32(pkt+24);
733 BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%08x)", conn->advertised_as, hold, id);
734
735 if (bgp_parse_options(conn, pkt+29, pkt[28]))
736 return;
737
738 if (hold > 0 && hold < 3)
739 { bgp_error(conn, 2, 6, pkt+22, 2); return; }
740
741 if (!id || id == 0xffffffff || id == p->local_id)
742 { bgp_error(conn, 2, 3, pkt+24, -4); return; }
743
744 if ((conn->advertised_as != base_as) && (base_as != AS_TRANS))
745 log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
746
747 if (conn->advertised_as != p->remote_as)
748 {
749 if (conn->peer_as4_support)
750 {
751 u32 val = htonl(conn->advertised_as);
752 bgp_error(conn, 2, 2, (byte *) &val, 4);
753 }
754 else
755 bgp_error(conn, 2, 2, pkt+20, 2);
756
757 return;
758 }
759
760 /* Check the other connection */
761 other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
762 switch (other->state)
763 {
764 case BS_IDLE:
765 case BS_CONNECT:
766 case BS_ACTIVE:
767 case BS_OPENSENT:
768 case BS_CLOSE:
769 break;
770 case BS_OPENCONFIRM:
771 if ((p->local_id < id) == (conn == &p->incoming_conn))
772 {
773 /* Should close the other connection */
774 BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
775 bgp_error(other, 6, 7, NULL, 0);
776 break;
777 }
778 /* Fall thru */
779 case BS_ESTABLISHED:
780 /* Should close this connection */
781 BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
782 bgp_error(conn, 6, 7, NULL, 0);
783 return;
784 default:
785 bug("bgp_rx_open: Unknown state");
786 }
787
788 /* Update our local variables */
789 conn->hold_time = MIN(hold, p->cf->hold_time);
790 conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
791 p->remote_id = id;
792 p->as4_session = conn->want_as4_support && conn->peer_as4_support;
793
794 DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, p->as4_session);
795
796 bgp_schedule_packet(conn, PKT_KEEPALIVE);
797 bgp_start_timer(conn->hold_timer, conn->hold_time);
798 bgp_conn_enter_openconfirm_state(conn);
799 }
800
/*
 * Decode one prefix (length byte followed by the minimal number of address
 * bytes) from position @pp with @ll bytes left, advancing both.
 *
 * The macro relies on the enclosing function providing the variables
 * `prefix` (ip_addr), `pxlen` and `err`, and a `done` label: on a prefix
 * length above BITS_PER_IP_ADDRESS it sets err=10, on a truncated prefix
 * err=1, and jumps to `done` in both cases. Bits behind the prefix length
 * are masked out of the result.
 */
#define DECODE_PREFIX(pp, ll) do {				\
  int b = *pp++;						\
  int q = (b+7) / 8;						\
  ll--;								\
  if (b > BITS_PER_IP_ADDRESS) { err=10; goto done; }		\
  if (ll < q) { err=1; goto done; }				\
  memcpy(&prefix, pp, q);					\
  pp += q;							\
  ll -= q;							\
  ipa_ntoh(prefix);						\
  prefix = ipa_and(prefix, ipa_mkmask(b));			\
  pxlen = b;							\
} while (0)
815
816 static inline int
817 bgp_set_next_hop(struct bgp_proto *p, rta *a)
818 {
819 struct eattr *nh = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
820 ip_addr *nexthop = (ip_addr *) nh->u.ptr->data;
821
822 #ifdef IPV6
823 int second = (nh->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(nexthop[1]);
824
825 /* First address should not be link-local, but may be zero in direct mode */
826 if (ipa_has_link_scope(*nexthop))
827 *nexthop = IPA_NONE;
828 #else
829 int second = 0;
830 #endif
831
832 if (p->cf->gw_mode == GW_DIRECT)
833 {
834 neighbor *ng = NULL;
835
836 if (ipa_nonzero(*nexthop))
837 ng = neigh_find(&p->p, nexthop, 0);
838 else if (second) /* GW_DIRECT -> single_hop -> p->neigh != NULL */
839 ng = neigh_find2(&p->p, nexthop + 1, p->neigh->iface, 0);
840
841 /* Fallback */
842 if (!ng)
843 ng = p->neigh;
844
845 if (ng->scope == SCOPE_HOST)
846 return 0;
847
848 a->dest = RTD_ROUTER;
849 a->gw = ng->addr;
850 a->iface = ng->iface;
851 a->hostentry = NULL;
852 a->igp_metric = 0;
853 }
854 else /* GW_RECURSIVE */
855 {
856 if (ipa_zero(*nexthop))
857 return 0;
858
859 rta_set_recursive_next_hop(p->p.table, a, p->igp_table, nexthop, nexthop + second);
860 }
861
862 return 1;
863 }
864
865 #ifndef IPV6 /* IPv4 version */
866
867 static void
868 bgp_do_rx_update(struct bgp_conn *conn,
869 byte *withdrawn, int withdrawn_len,
870 byte *nlri, int nlri_len,
871 byte *attrs, int attr_len)
872 {
873 struct bgp_proto *p = conn->bgp;
874 net *n;
875 rta *a0, *a = NULL;
876 ip_addr prefix;
877 int pxlen, err = 0;
878
879 /* Withdraw routes */
880 while (withdrawn_len)
881 {
882 DECODE_PREFIX(withdrawn, withdrawn_len);
883 DBG("Withdraw %I/%d\n", prefix, pxlen);
884 if (n = net_find(p->p.table, prefix, pxlen))
885 rte_update(p->p.table, n, &p->p, &p->p, NULL);
886 }
887
888 if (!attr_len && !nlri_len) /* shortcut */
889 return;
890
891 a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, nlri_len);
892
893 if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */
894 return;
895
896 if (a0 && nlri_len && bgp_set_next_hop(p, a0))
897 a = rta_lookup(a0);
898
899 while (nlri_len)
900 {
901 DECODE_PREFIX(nlri, nlri_len);
902 DBG("Add %I/%d\n", prefix, pxlen);
903
904 if (a)
905 {
906 rte *e = rte_get_temp(rta_clone(a));
907 e->net = net_get(p->p.table, prefix, pxlen);
908 e->pflags = 0;
909 e->u.bgp.suppressed = 0;
910 rte_update(p->p.table, e->net, &p->p, &p->p, e);
911 }
912 else
913 {
914 /* Forced withdraw as a result of soft error */
915 if (n = net_find(p->p.table, prefix, pxlen))
916 rte_update(p->p.table, n, &p->p, &p->p, NULL);
917 }
918 }
919
920 done:
921 if (a)
922 rta_free(a);
923
924 if (err)
925 bgp_error(conn, 3, err, NULL, 0);
926
927 return;
928 }
929
930 #else /* IPv6 version */
931
/*
 * Set up parsing of the multiprotocol NLRI attribute stored in the
 * protocol as <name>_start / <name>_len and open an `if` that is true only
 * for the IPv6 address family; the block following the macro invocation
 * becomes the body of that `if`.
 *
 * Uses the enclosing function's variables start, x, len, len0, af, sub and
 * err and its `done` label. A non-empty attribute shorter than the 3-byte
 * AFI/SAFI header sets err=9 and jumps to `done`; an empty one gives af=0.
 */
#define DO_NLRI(name)						\
  start = x = p->name##_start;					\
  len = len0 = p->name##_len;					\
  af = 0;							\
  if (len)							\
    {								\
      if (len < 3) { err=9; goto done; }			\
      af = get_u16(x);						\
      sub = x[2];						\
      x += 3;							\
      len -= 3;							\
      DBG("\tNLRI AF=%d sub=%d len=%d\n", af, sub, len);	\
    }								\
  if (af == BGP_AF_IPV6)
947
948 static void
949 bgp_attach_next_hop(rta *a0, byte *x)
950 {
951 ip_addr *nh = (ip_addr *) bgp_attach_attr_wa(&a0->eattrs, bgp_linpool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
952 memcpy(nh, x+1, 16);
953 ipa_ntoh(nh[0]);
954
955 /* We store received link local address in the other part of BA_NEXT_HOP eattr. */
956 if (*x == 32)
957 {
958 memcpy(nh+1, x+17, 16);
959 ipa_ntoh(nh[1]);
960 }
961 else
962 nh[1] = IPA_NONE;
963 }
964
965
966 static void
967 bgp_do_rx_update(struct bgp_conn *conn,
968 byte *withdrawn, int withdrawn_len,
969 byte *nlri, int nlri_len,
970 byte *attrs, int attr_len)
971 {
972 struct bgp_proto *p = conn->bgp;
973 byte *start, *x;
974 int len, len0;
975 unsigned af, sub;
976 net *n;
977 rta *a0, *a = NULL;
978 ip_addr prefix;
979 int pxlen, err = 0;
980
981 p->mp_reach_len = 0;
982 p->mp_unreach_len = 0;
983 a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, 0);
984
985 if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */
986 return;
987
988 DO_NLRI(mp_unreach)
989 {
990 while (len)
991 {
992 DECODE_PREFIX(x, len);
993 DBG("Withdraw %I/%d\n", prefix, pxlen);
994 if (n = net_find(p->p.table, prefix, pxlen))
995 rte_update(p->p.table, n, &p->p, &p->p, NULL);
996 }
997 }
998
999 DO_NLRI(mp_reach)
1000 {
1001 /* Create fake NEXT_HOP attribute */
1002 if (len < 1 || (*x != 16 && *x != 32) || len < *x + 2)
1003 { err = 9; goto done; }
1004
1005 if (a0)
1006 bgp_attach_next_hop(a0, x);
1007
1008 /* Also ignore one reserved byte */
1009 len -= *x + 2;
1010 x += *x + 2;
1011
1012 if (a0 && bgp_set_next_hop(p, a0))
1013 a = rta_lookup(a0);
1014
1015 while (len)
1016 {
1017 DECODE_PREFIX(x, len);
1018 DBG("Add %I/%d\n", prefix, pxlen);
1019
1020 if (a)
1021 {
1022 rte *e = rte_get_temp(rta_clone(a));
1023 e->net = net_get(p->p.table, prefix, pxlen);
1024 e->pflags = 0;
1025 e->u.bgp.suppressed = 0;
1026 rte_update(p->p.table, e->net, &p->p, &p->p, e);
1027 }
1028 else
1029 {
1030 /* Forced withdraw as a result of soft error */
1031 if (n = net_find(p->p.table, prefix, pxlen))
1032 rte_update(p->p.table, n, &p->p, &p->p, NULL);
1033 }
1034 }
1035 }
1036
1037 done:
1038 if (a)
1039 rta_free(a);
1040
1041 if (err) /* Use subcode 9, not err */
1042 bgp_error(conn, 3, 9, NULL, 0);
1043
1044 return;
1045 }
1046
1047 #endif
1048
1049 static void
1050 bgp_rx_update(struct bgp_conn *conn, byte *pkt, int len)
1051 {
1052 struct bgp_proto *p = conn->bgp;
1053 byte *withdrawn, *attrs, *nlri;
1054 int withdrawn_len, attr_len, nlri_len;
1055
1056 BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
1057
1058 /* Workaround for some BGP implementations that skip initial KEEPALIVE */
1059 if (conn->state == BS_OPENCONFIRM)
1060 bgp_conn_enter_established_state(conn);
1061
1062 if (conn->state != BS_ESTABLISHED)
1063 { bgp_error(conn, 5, 0, NULL, 0); return; }
1064 bgp_start_timer(conn->hold_timer, conn->hold_time);
1065
1066 /* Find parts of the packet and check sizes */
1067 if (len < 23)
1068 {
1069 bgp_error(conn, 1, 2, pkt+16, 2);
1070 return;
1071 }
1072 withdrawn = pkt + 21;
1073 withdrawn_len = get_u16(pkt + 19);
1074 if (withdrawn_len + 23 > len)
1075 goto malformed;
1076 attrs = withdrawn + withdrawn_len + 2;
1077 attr_len = get_u16(attrs - 2);
1078 if (withdrawn_len + attr_len + 23 > len)
1079 goto malformed;
1080 nlri = attrs + attr_len;
1081 nlri_len = len - withdrawn_len - attr_len - 23;
1082 if (!attr_len && nlri_len)
1083 goto malformed;
1084 DBG("Sizes: withdrawn=%d, attrs=%d, NLRI=%d\n", withdrawn_len, attr_len, nlri_len);
1085
1086 lp_flush(bgp_linpool);
1087
1088 bgp_do_rx_update(conn, withdrawn, withdrawn_len, nlri, nlri_len, attrs, attr_len);
1089 return;
1090
1091 malformed:
1092 bgp_error(conn, 3, 1, NULL, 0);
1093 }
1094
1095 static struct {
1096 byte major, minor;
1097 byte *msg;
1098 } bgp_msg_table[] = {
1099 { 1, 0, "Invalid message header" },
1100 { 1, 1, "Connection not synchronized" },
1101 { 1, 2, "Bad message length" },
1102 { 1, 3, "Bad message type" },
1103 { 2, 0, "Invalid OPEN message" },
1104 { 2, 1, "Unsupported version number" },
1105 { 2, 2, "Bad peer AS" },
1106 { 2, 3, "Bad BGP identifier" },
1107 { 2, 4, "Unsupported optional parameter" },
1108 { 2, 5, "Authentication failure" },
1109 { 2, 6, "Unacceptable hold time" },
1110 { 2, 7, "Required capability missing" }, /* [RFC3392] */
1111 { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
1112 { 3, 0, "Invalid UPDATE message" },
1113 { 3, 1, "Malformed attribute list" },
1114 { 3, 2, "Unrecognized well-known attribute" },
1115 { 3, 3, "Missing mandatory attribute" },
1116 { 3, 4, "Invalid attribute flags" },
1117 { 3, 5, "Invalid attribute length" },
1118 { 3, 6, "Invalid ORIGIN attribute" },
1119 { 3, 7, "AS routing loop" }, /* Deprecated */
1120 { 3, 8, "Invalid NEXT_HOP attribute" },
1121 { 3, 9, "Optional attribute error" },
1122 { 3, 10, "Invalid network field" },
1123 { 3, 11, "Malformed AS_PATH" },
1124 { 4, 0, "Hold timer expired" },
1125 { 5, 0, "Finite state machine error" },
1126 { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
1127 { 6, 1, "Maximum number of prefixes reached" },
1128 { 6, 2, "Administrative shutdown" },
1129 { 6, 3, "Peer de-configured" },
1130 { 6, 4, "Administrative reset" },
1131 { 6, 5, "Connection rejected" },
1132 { 6, 6, "Other configuration change" },
1133 { 6, 7, "Connection collision resolution" },
1134 { 6, 8, "Out of Resources" }
1135 };
1136
1137 /**
1138 * bgp_error_dsc - return BGP error description
1139 * @code: BGP error code
1140 * @subcode: BGP error subcode
1141 *
1142 * bgp_error_dsc() returns error description for BGP errors
1143 * which might be static string or given temporary buffer.
1144 */
1145 const char *
1146 bgp_error_dsc(unsigned code, unsigned subcode)
1147 {
1148 static char buff[32];
1149 unsigned i;
1150 for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
1151 if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
1152 {
1153 return bgp_msg_table[i].msg;
1154 }
1155
1156 bsprintf(buff, "Unknown error %d.%d", code, subcode);
1157 return buff;
1158 }
1159
1160 void
1161 bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len)
1162 {
1163 const byte *name;
1164 byte *t, argbuf[36];
1165 unsigned i;
1166
1167 /* Don't report Cease messages generated by myself */
1168 if (code == 6 && class == BE_BGP_TX)
1169 return;
1170
1171 name = bgp_error_dsc(code, subcode);
1172 t = argbuf;
1173 if (len)
1174 {
1175 *t++ = ':';
1176 *t++ = ' ';
1177
1178 if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
1179 {
1180 /* Bad peer AS - we would like to print the AS */
1181 t += bsprintf(t, "%d", (len == 2) ? get_u16(data) : get_u32(data));
1182 goto done;
1183 }
1184 if (len > 16)
1185 len = 16;
1186 for (i=0; i<len; i++)
1187 t += bsprintf(t, "%02x", data[i]);
1188 }
1189 done:
1190 *t = 0;
1191 log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, name, argbuf);
1192 }
1193
1194 static void
1195 bgp_rx_notification(struct bgp_conn *conn, byte *pkt, int len)
1196 {
1197 struct bgp_proto *p = conn->bgp;
1198 if (len < 21)
1199 {
1200 bgp_error(conn, 1, 2, pkt+16, 2);
1201 return;
1202 }
1203
1204 unsigned code = pkt[19];
1205 unsigned subcode = pkt[20];
1206 int err = (code != 6);
1207
1208 bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
1209 bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
1210
1211 #ifndef IPV6
1212 if ((code == 2) && ((subcode == 4) || (subcode == 7))
1213 /* Error related to capability:
1214 * 4 - Peer does not support capabilities at all.
1215 * 7 - Peer request some capability. Strange unless it is IPv6 only peer.
1216 */
1217 && (p->cf->capabilities == 2)
1218 /* Capabilities are not explicitly enabled or disabled, therefore heuristic is used */
1219 && (conn->start_state == BSS_CONNECT)
1220 /* Failed connection attempt have used capabilities */
1221 && (p->cf->remote_as <= 0xFFFF))
1222 /* Not possible with disabled capabilities */
1223 {
1224 /* We try connect without capabilities */
1225 log(L_WARN "%s: Capability related error received, retry with capabilities disabled", p->p.name);
1226 p->start_state = BSS_CONNECT_NOCAP;
1227 err = 0;
1228 }
1229 #endif
1230
1231 bgp_conn_enter_close_state(conn);
1232 bgp_schedule_packet(conn, PKT_SCHEDULE_CLOSE);
1233
1234 if (err)
1235 {
1236 bgp_update_startup_delay(p);
1237 bgp_stop(p, 0);
1238 }
1239 }
1240
1241 static void
1242 bgp_rx_keepalive(struct bgp_conn *conn)
1243 {
1244 struct bgp_proto *p = conn->bgp;
1245
1246 BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
1247 bgp_start_timer(conn->hold_timer, conn->hold_time);
1248 switch (conn->state)
1249 {
1250 case BS_OPENCONFIRM:
1251 bgp_conn_enter_established_state(conn);
1252 break;
1253 case BS_ESTABLISHED:
1254 break;
1255 default:
1256 bgp_error(conn, 5, 0, NULL, 0);
1257 }
1258 }
1259
1260 static void
1261 bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, int len)
1262 {
1263 struct bgp_proto *p = conn->bgp;
1264
1265 BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
1266
1267 if (conn->state != BS_ESTABLISHED)
1268 { bgp_error(conn, 5, 0, NULL, 0); return; }
1269
1270 if (!p->cf->enable_refresh)
1271 { bgp_error(conn, 1, 3, pkt+18, 1); return; }
1272
1273 if (len != (BGP_HEADER_LENGTH + 4))
1274 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
1275
1276 /* FIXME - we ignore AFI/SAFI values, as we support
1277 just one value and even an error code for an invalid
1278 request is not defined */
1279
1280 proto_request_feeding(&p->p);
1281 }
1282
1283
1284 /**
1285 * bgp_rx_packet - handle a received packet
1286 * @conn: BGP connection
1287 * @pkt: start of the packet
1288 * @len: packet size
1289 *
1290 * bgp_rx_packet() takes a newly received packet and calls the corresponding
1291 * packet handler according to the packet type.
1292 */
1293 static void
1294 bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
1295 {
1296 byte type = pkt[18];
1297
1298 DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
1299
1300 if (conn->bgp->p.mrtdump & MD_MESSAGES)
1301 mrt_dump_bgp_packet(conn, pkt, len);
1302
1303 switch (type)
1304 {
1305 case PKT_OPEN: return bgp_rx_open(conn, pkt, len);
1306 case PKT_UPDATE: return bgp_rx_update(conn, pkt, len);
1307 case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len);
1308 case PKT_KEEPALIVE: return bgp_rx_keepalive(conn);
1309 case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len);
1310 default: bgp_error(conn, 1, 3, pkt+18, 1);
1311 }
1312 }
1313
1314 /**
1315 * bgp_rx - handle received data
1316 * @sk: socket
1317 * @size: amount of data received
1318 *
1319 * bgp_rx() is called by the socket layer whenever new data arrive from
1320 * the underlying TCP connection. It assembles the data fragments to packets,
1321 * checks their headers and framing and passes complete packets to
1322 * bgp_rx_packet().
1323 */
1324 int
1325 bgp_rx(sock *sk, int size)
1326 {
1327 struct bgp_conn *conn = sk->data;
1328 byte *pkt_start = sk->rbuf;
1329 byte *end = pkt_start + size;
1330 unsigned i, len;
1331
1332 DBG("BGP: RX hook: Got %d bytes\n", size);
1333 while (end >= pkt_start + BGP_HEADER_LENGTH)
1334 {
1335 if ((conn->state == BS_CLOSE) || (conn->sk != sk))
1336 return 0;
1337 for(i=0; i<16; i++)
1338 if (pkt_start[i] != 0xff)
1339 {
1340 bgp_error(conn, 1, 1, NULL, 0);
1341 break;
1342 }
1343 len = get_u16(pkt_start+16);
1344 if (len < BGP_HEADER_LENGTH || len > BGP_MAX_PACKET_LENGTH)
1345 {
1346 bgp_error(conn, 1, 2, pkt_start+16, 2);
1347 break;
1348 }
1349 if (end < pkt_start + len)
1350 break;
1351 bgp_rx_packet(conn, pkt_start, len);
1352 pkt_start += len;
1353 }
1354 if (pkt_start != sk->rbuf)
1355 {
1356 memmove(sk->rbuf, pkt_start, end - pkt_start);
1357 sk->rpos = sk->rbuf + (end - pkt_start);
1358 }
1359 return 0;
1360 }