]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/packets.c
Fixes a new bug in BGP route ordering.
[thirdparty/bird.git] / proto / bgp / packets.c
1 /*
2 * BIRD -- BGP Packet Processing
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 *
6 * Can be freely distributed and used under the terms of the GNU GPL.
7 */
8
9 #undef LOCAL_DEBUG
10
11 #include "nest/bird.h"
12 #include "nest/iface.h"
13 #include "nest/protocol.h"
14 #include "nest/route.h"
15 #include "nest/attrs.h"
16 #include "nest/mrtdump.h"
17 #include "conf/conf.h"
18 #include "lib/unaligned.h"
19 #include "lib/socket.h"
20
21 #include "nest/cli.h"
22
23 #include "bgp.h"
24
25 static struct rate_limit rl_rcv_update, rl_snd_update;
26
27 /*
28 * MRT Dump format is not semantically specified.
29 * We will use these values in appropriate fields:
30 *
31 * Local AS, Remote AS - configured AS numbers for given BGP instance.
32 * Local IP, Remote IP - IP addresses of the TCP connection (0 if no connection)
33 *
34 * We dump two kinds of MRT messages: STATE_CHANGE (for BGP state
35 * changes) and MESSAGE (for received BGP messages).
36 *
37 * STATE_CHANGE uses always AS4 variant, but MESSAGE uses AS4 variant
38 * only when AS4 session is established and even in that case MESSAGE
39 * does not use AS4 variant for initial OPEN message. This strange
40 * behavior is here for compatibility with Quagga and Bgpdump.
41 */
42
43 static byte *
44 mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4)
45 {
46 struct bgp_proto *p = conn->bgp;
47
48 if (as4)
49 {
50 put_u32(buf+0, p->remote_as);
51 put_u32(buf+4, p->local_as);
52 buf+=8;
53 }
54 else
55 {
56 put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS);
57 put_u16(buf+2, (p->local_as <= 0xFFFF) ? p->local_as : AS_TRANS);
58 buf+=4;
59 }
60
61 put_u16(buf+0, p->neigh ? p->neigh->iface->index : 0);
62 put_u16(buf+2, BGP_AF);
63 buf+=4;
64 buf = ipa_put_addr(buf, conn->sk ? conn->sk->daddr : IPA_NONE);
65 buf = ipa_put_addr(buf, conn->sk ? conn->sk->saddr : IPA_NONE);
66
67 return buf;
68 }
69
70 static void
71 mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
72 {
73 byte buf[BGP_MAX_PACKET_LENGTH + 128];
74 byte *bp = buf + MRTDUMP_HDR_LENGTH;
75 int as4 = conn->bgp->as4_session;
76
77 bp = mrt_put_bgp4_hdr(bp, conn, as4);
78 memcpy(bp, pkt, len);
79 bp += len;
80 mrt_dump_message(&conn->bgp->p, BGP4MP, as4 ? BGP4MP_MESSAGE_AS4 : BGP4MP_MESSAGE,
81 buf, bp-buf);
82 }
83
84 static inline u16
85 convert_state(unsigned state)
86 {
87 /* Convert state from our BS_* values to values used in MRTDump */
88 return (state == BS_CLOSE) ? 1 : state + 1;
89 }
90
91 void
92 mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new)
93 {
94 byte buf[128];
95 byte *bp = buf + MRTDUMP_HDR_LENGTH;
96
97 bp = mrt_put_bgp4_hdr(bp, conn, 1);
98 put_u16(bp+0, convert_state(old));
99 put_u16(bp+2, convert_state(new));
100 bp += 4;
101 mrt_dump_message(&conn->bgp->p, BGP4MP, BGP4MP_STATE_CHANGE_AS4, buf, bp-buf);
102 }
103
104 static byte *
105 bgp_create_notification(struct bgp_conn *conn, byte *buf)
106 {
107 struct bgp_proto *p = conn->bgp;
108
109 BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);
110 buf[0] = conn->notify_code;
111 buf[1] = conn->notify_subcode;
112 memcpy(buf+2, conn->notify_data, conn->notify_size);
113 return buf + 2 + conn->notify_size;
114 }
115
116 #ifdef IPV6
117 static byte *
118 bgp_put_cap_ipv6(struct bgp_conn *conn UNUSED, byte *buf)
119 {
120 *buf++ = 1; /* Capability 1: Multiprotocol extensions */
121 *buf++ = 4; /* Capability data length */
122 *buf++ = 0; /* We support AF IPv6 */
123 *buf++ = BGP_AF_IPV6;
124 *buf++ = 0; /* RFU */
125 *buf++ = 1; /* and SAFI 1 */
126 return buf;
127 }
128
129 #else
130
131 static byte *
132 bgp_put_cap_ipv4(struct bgp_conn *conn UNUSED, byte *buf)
133 {
134 *buf++ = 1; /* Capability 1: Multiprotocol extensions */
135 *buf++ = 4; /* Capability data length */
136 *buf++ = 0; /* We support AF IPv4 */
137 *buf++ = BGP_AF_IPV4;
138 *buf++ = 0; /* RFU */
139 *buf++ = 1; /* and SAFI 1 */
140 return buf;
141 }
142 #endif
143
144 static byte *
145 bgp_put_cap_rr(struct bgp_conn *conn UNUSED, byte *buf)
146 {
147 *buf++ = 2; /* Capability 2: Support for route refresh */
148 *buf++ = 0; /* Capability data length */
149 return buf;
150 }
151
152 static byte *
153 bgp_put_cap_as4(struct bgp_conn *conn, byte *buf)
154 {
155 *buf++ = 65; /* Capability 65: Support for 4-octet AS number */
156 *buf++ = 4; /* Capability data length */
157 put_u32(buf, conn->bgp->local_as);
158 return buf + 4;
159 }
160
161 static byte *
162 bgp_create_open(struct bgp_conn *conn, byte *buf)
163 {
164 struct bgp_proto *p = conn->bgp;
165 byte *cap;
166 int cap_len;
167
168 BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
169 BGP_VERSION, p->local_as, p->cf->hold_time, p->local_id);
170 buf[0] = BGP_VERSION;
171 put_u16(buf+1, (p->local_as < 0xFFFF) ? p->local_as : AS_TRANS);
172 put_u16(buf+3, p->cf->hold_time);
173 put_u32(buf+5, p->local_id);
174
175 if (conn->start_state == BSS_CONNECT_NOCAP)
176 {
177 BGP_TRACE(D_PACKETS, "Skipping capabilities");
178 buf[9] = 0;
179 return buf + 10;
180 }
181
182 /* Skipped 3 B for length field and Capabilities parameter header */
183 cap = buf + 12;
184
185 #ifndef IPV6
186 if (p->cf->advertise_ipv4)
187 cap = bgp_put_cap_ipv4(conn, cap);
188 #endif
189
190 #ifdef IPV6
191 cap = bgp_put_cap_ipv6(conn, cap);
192 #endif
193
194 if (p->cf->enable_refresh)
195 cap = bgp_put_cap_rr(conn, cap);
196
197 if (conn->want_as4_support)
198 cap = bgp_put_cap_as4(conn, cap);
199
200 cap_len = cap - buf - 12;
201 if (cap_len > 0)
202 {
203 buf[9] = cap_len + 2; /* Optional params len */
204 buf[10] = 2; /* Option: Capability list */
205 buf[11] = cap_len; /* Option length */
206 return cap;
207 }
208 else
209 {
210 buf[9] = 0; /* No optional parameters */
211 return buf + 10;
212 }
213 }
214
215 static unsigned int
216 bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, unsigned int remains)
217 {
218 byte *start = w;
219 ip_addr a;
220 int bytes;
221
222 while (!EMPTY_LIST(buck->prefixes) && remains >= (1+sizeof(ip_addr)))
223 {
224 struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes));
225 DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen);
226 *w++ = px->n.pxlen;
227 bytes = (px->n.pxlen + 7) / 8;
228 a = px->n.prefix;
229 ipa_hton(a);
230 memcpy(w, &a, bytes);
231 w += bytes;
232 remains -= bytes + 1;
233 rem_node(&px->bucket_node);
234 fib_delete(&p->prefix_fib, px);
235 }
236 return w - start;
237 }
238
239 static void
240 bgp_flush_prefixes(struct bgp_proto *p, struct bgp_bucket *buck)
241 {
242 while (!EMPTY_LIST(buck->prefixes))
243 {
244 struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes));
245 log(L_ERR "%s: - route %I/%d skipped", p->p.name, px->n.prefix, px->n.pxlen);
246 rem_node(&px->bucket_node);
247 fib_delete(&p->prefix_fib, px);
248 }
249 }
250
251 #ifndef IPV6 /* IPv4 version */
252
253 static byte *
254 bgp_create_update(struct bgp_conn *conn, byte *buf)
255 {
256 struct bgp_proto *p = conn->bgp;
257 struct bgp_bucket *buck;
258 int remains = BGP_MAX_PACKET_LENGTH - BGP_HEADER_LENGTH - 4;
259 byte *w;
260 int wd_size = 0;
261 int r_size = 0;
262 int a_size = 0;
263
264 w = buf+2;
265 if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
266 {
267 DBG("Withdrawn routes:\n");
268 wd_size = bgp_encode_prefixes(p, w, buck, remains);
269 w += wd_size;
270 remains -= wd_size;
271 }
272 put_u16(buf, wd_size);
273
274 if (remains >= 3072)
275 {
276 while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next)
277 {
278 if (EMPTY_LIST(buck->prefixes))
279 {
280 DBG("Deleting empty bucket %p\n", buck);
281 rem_node(&buck->send_node);
282 bgp_free_bucket(p, buck);
283 continue;
284 }
285
286 DBG("Processing bucket %p\n", buck);
287 a_size = bgp_encode_attrs(p, w+2, buck->eattrs, 2048);
288
289 if (a_size < 0)
290 {
291 log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name);
292 bgp_flush_prefixes(p, buck);
293 rem_node(&buck->send_node);
294 bgp_free_bucket(p, buck);
295 continue;
296 }
297
298 put_u16(w, a_size);
299 w += a_size + 2;
300 r_size = bgp_encode_prefixes(p, w, buck, remains - a_size);
301 w += r_size;
302 break;
303 }
304 }
305 if (!a_size) /* Attributes not already encoded */
306 {
307 put_u16(w, 0);
308 w += 2;
309 }
310 if (wd_size || r_size)
311 {
312 BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
313 return w;
314 }
315 else
316 return NULL;
317 }
318
319 #else /* IPv6 version */
320
321 static inline int
322 same_iface(struct bgp_proto *p, ip_addr *ip)
323 {
324 neighbor *n = neigh_find(&p->p, ip, 0);
325 return n && p->neigh && n->iface == p->neigh->iface;
326 }
327
328 static byte *
329 bgp_create_update(struct bgp_conn *conn, byte *buf)
330 {
331 struct bgp_proto *p = conn->bgp;
332 struct bgp_bucket *buck;
333 int size, second, rem_stored;
334 int remains = BGP_MAX_PACKET_LENGTH - BGP_HEADER_LENGTH - 4;
335 byte *w, *w_stored, *tmp, *tstart;
336 ip_addr *ipp, ip, ip_ll;
337 ea_list *ea;
338 eattr *nh;
339
340 put_u16(buf, 0);
341 w = buf+4;
342
343 if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
344 {
345 DBG("Withdrawn routes:\n");
346 tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_UNREACH_NLRI, remains-8);
347 *tmp++ = 0;
348 *tmp++ = BGP_AF_IPV6;
349 *tmp++ = 1;
350 ea->attrs[0].u.ptr->length = 3 + bgp_encode_prefixes(p, tmp, buck, remains-11);
351 size = bgp_encode_attrs(p, w, ea, remains);
352 ASSERT(size >= 0);
353 w += size;
354 remains -= size;
355 }
356
357 if (remains >= 3072)
358 {
359 while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next)
360 {
361 if (EMPTY_LIST(buck->prefixes))
362 {
363 DBG("Deleting empty bucket %p\n", buck);
364 rem_node(&buck->send_node);
365 bgp_free_bucket(p, buck);
366 continue;
367 }
368
369 DBG("Processing bucket %p\n", buck);
370 rem_stored = remains;
371 w_stored = w;
372
373 size = bgp_encode_attrs(p, w, buck->eattrs, 2048);
374 if (size < 0)
375 {
376 log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name);
377 bgp_flush_prefixes(p, buck);
378 rem_node(&buck->send_node);
379 bgp_free_bucket(p, buck);
380 continue;
381 }
382 w += size;
383 remains -= size;
384
385 /* We have two addresses here in NEXT_HOP eattr. Really.
386 Unless NEXT_HOP was modified by filter */
387 nh = ea_find(buck->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
388 ASSERT(nh);
389 second = (nh->u.ptr->length == NEXT_HOP_LENGTH);
390 ipp = (ip_addr *) nh->u.ptr->data;
391 ip = ipp[0];
392 ip_ll = IPA_NONE;
393
394 if (ipa_equal(ip, p->source_addr))
395 ip_ll = p->local_link;
396 else
397 {
398 /* If we send a route with 'third party' next hop destinated
399 * in the same interface, we should also send a link local
400 * next hop address. We use the received one (stored in the
401 * other part of BA_NEXT_HOP eattr). If we didn't received
402 * it (for example it is a static route), we can't use
403 * 'third party' next hop and we have to use local IP address
404 * as next hop. Sending original next hop address without
405 * link local address seems to be a natural way to solve that
406 * problem, but it is contrary to RFC 2545 and Quagga does not
407 * accept such routes.
408 *
409 * There are two cases, either we have global IP, or
410 * IPA_NONE if the neighbor is link-local. For IPA_NONE,
411 * we suppose it is on the same iface, see bgp_update_attrs().
412 */
413
414 if (ipa_zero(ip) || same_iface(p, &ip))
415 {
416 if (second && ipa_nonzero(ipp[1]))
417 ip_ll = ipp[1];
418 else
419 {
420 switch (p->cf->missing_lladdr)
421 {
422 case MLL_SELF:
423 ip = p->source_addr;
424 ip_ll = p->local_link;
425 break;
426 case MLL_DROP:
427 log(L_ERR "%s: Missing link-local next hop address, skipping corresponding routes", p->p.name);
428 w = w_stored;
429 remains = rem_stored;
430 bgp_flush_prefixes(p, buck);
431 rem_node(&buck->send_node);
432 bgp_free_bucket(p, buck);
433 continue;
434 case MLL_IGNORE:
435 break;
436 }
437 }
438 }
439 }
440
441 tstart = tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_REACH_NLRI, remains-8);
442 *tmp++ = 0;
443 *tmp++ = BGP_AF_IPV6;
444 *tmp++ = 1;
445
446 if (ipa_has_link_scope(ip))
447 ip = IPA_NONE;
448
449 if (ipa_nonzero(ip_ll))
450 {
451 *tmp++ = 32;
452 ipa_hton(ip);
453 memcpy(tmp, &ip, 16);
454 ipa_hton(ip_ll);
455 memcpy(tmp+16, &ip_ll, 16);
456 tmp += 32;
457 }
458 else
459 {
460 *tmp++ = 16;
461 ipa_hton(ip);
462 memcpy(tmp, &ip, 16);
463 tmp += 16;
464 }
465
466 *tmp++ = 0; /* No SNPA information */
467 tmp += bgp_encode_prefixes(p, tmp, buck, remains - (8+3+32+1));
468 ea->attrs[0].u.ptr->length = tmp - tstart;
469 size = bgp_encode_attrs(p, w, ea, remains);
470 ASSERT(size >= 0);
471 w += size;
472 break;
473 }
474 }
475
476 size = w - (buf+4);
477 put_u16(buf+2, size);
478 lp_flush(bgp_linpool);
479 if (size)
480 {
481 BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
482 return w;
483 }
484 else
485 return NULL;
486 }
487
488 #endif
489
490 static byte *
491 bgp_create_route_refresh(struct bgp_conn *conn, byte *buf)
492 {
493 struct bgp_proto *p = conn->bgp;
494 BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");
495
496 *buf++ = 0;
497 *buf++ = BGP_AF;
498 *buf++ = 0; /* RFU */
499 *buf++ = 1; /* and SAFI 1 */
500 return buf;
501 }
502
503 static void
504 bgp_create_header(byte *buf, unsigned int len, unsigned int type)
505 {
506 memset(buf, 0xff, 16); /* Marker */
507 put_u16(buf+16, len);
508 buf[18] = type;
509 }
510
511 /**
512 * bgp_fire_tx - transmit packets
513 * @conn: connection
514 *
515 * Whenever the transmit buffers of the underlying TCP connection
516 * are free and we have any packets queued for sending, the socket functions
517 * call bgp_fire_tx() which takes care of selecting the highest priority packet
518 * queued (Notification > Keepalive > Open > Update), assembling its header
519 * and body and sending it to the connection.
520 */
521 static int
522 bgp_fire_tx(struct bgp_conn *conn)
523 {
524 struct bgp_proto *p = conn->bgp;
525 unsigned int s = conn->packets_to_send;
526 sock *sk = conn->sk;
527 byte *buf, *pkt, *end;
528 int type;
529
530 if (!sk)
531 {
532 conn->packets_to_send = 0;
533 return 0;
534 }
535 buf = sk->tbuf;
536 pkt = buf + BGP_HEADER_LENGTH;
537
538 if (s & (1 << PKT_SCHEDULE_CLOSE))
539 {
540 /* We can finally close connection and enter idle state */
541 bgp_conn_enter_idle_state(conn);
542 return 0;
543 }
544 if (s & (1 << PKT_NOTIFICATION))
545 {
546 s = 1 << PKT_SCHEDULE_CLOSE;
547 type = PKT_NOTIFICATION;
548 end = bgp_create_notification(conn, pkt);
549 }
550 else if (s & (1 << PKT_KEEPALIVE))
551 {
552 s &= ~(1 << PKT_KEEPALIVE);
553 type = PKT_KEEPALIVE;
554 end = pkt; /* Keepalives carry no data */
555 BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
556 bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
557 }
558 else if (s & (1 << PKT_OPEN))
559 {
560 s &= ~(1 << PKT_OPEN);
561 type = PKT_OPEN;
562 end = bgp_create_open(conn, pkt);
563 }
564 else if (s & (1 << PKT_ROUTE_REFRESH))
565 {
566 s &= ~(1 << PKT_ROUTE_REFRESH);
567 type = PKT_ROUTE_REFRESH;
568 end = bgp_create_route_refresh(conn, pkt);
569 }
570 else if (s & (1 << PKT_UPDATE))
571 {
572 end = bgp_create_update(conn, pkt);
573 type = PKT_UPDATE;
574 if (!end)
575 {
576 conn->packets_to_send = 0;
577 return 0;
578 }
579 }
580 else
581 return 0;
582 conn->packets_to_send = s;
583 bgp_create_header(buf, end - buf, type);
584 return sk_send(sk, end - buf);
585 }
586
587 /**
588 * bgp_schedule_packet - schedule a packet for transmission
589 * @conn: connection
590 * @type: packet type
591 *
592 * Schedule a packet of type @type to be sent as soon as possible.
593 */
594 void
595 bgp_schedule_packet(struct bgp_conn *conn, int type)
596 {
597 DBG("BGP: Scheduling packet type %d\n", type);
598 conn->packets_to_send |= 1 << type;
599 if (conn->sk && conn->sk->tpos == conn->sk->tbuf)
600 ev_schedule(conn->tx_ev);
601 }
602
/*
 * Event hook: @vconn is the connection. Keep calling bgp_fire_tx() for as
 * long as it returns a positive value.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;

  DBG("BGP: kicking TX\n");
  for (;;)
    {
      if (bgp_fire_tx(conn) <= 0)
	break;
    }
}
612
613 void
614 bgp_tx(sock *sk)
615 {
616 struct bgp_conn *conn = sk->data;
617
618 DBG("BGP: TX hook\n");
619 while (bgp_fire_tx(conn) > 0)
620 ;
621 }
622
623 /* Capability negotiation as per RFC 2842 */
624
625 void
626 bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
627 {
628 // struct bgp_proto *p = conn->bgp;
629 int cl;
630
631 while (len > 0)
632 {
633 if (len < 2 || len < 2 + opt[1])
634 goto err;
635
636 cl = opt[1];
637
638 switch (opt[0])
639 {
640 case 2: /* Route refresh capability, RFC 2918 */
641 if (cl != 0)
642 goto err;
643 conn->peer_refresh_support = 1;
644 break;
645
646 case 65: /* AS4 capability, RFC 4893 */
647 if (cl != 4)
648 goto err;
649 conn->peer_as4_support = 1;
650 if (conn->want_as4_support)
651 conn->advertised_as = get_u32(opt + 2);
652 break;
653
654 /* We can safely ignore all other capabilities */
655 }
656 len -= 2 + cl;
657 opt += 2 + cl;
658 }
659 return;
660
661 err:
662 bgp_error(conn, 2, 0, NULL, 0);
663 return;
664 }
665
666 static int
667 bgp_parse_options(struct bgp_conn *conn, byte *opt, int len)
668 {
669 struct bgp_proto *p = conn->bgp;
670 int ol;
671
672 while (len > 0)
673 {
674 if (len < 2 || len < 2 + opt[1])
675 { bgp_error(conn, 2, 0, NULL, 0); return 0; }
676 #ifdef LOCAL_DEBUG
677 {
678 int i;
679 DBG("\tOption %02x:", opt[0]);
680 for(i=0; i<opt[1]; i++)
681 DBG(" %02x", opt[2+i]);
682 DBG("\n");
683 }
684 #endif
685
686 ol = opt[1];
687 switch (opt[0])
688 {
689 case 2:
690 if (conn->start_state == BSS_CONNECT_NOCAP)
691 BGP_TRACE(D_PACKETS, "Ignoring received capabilities");
692 else
693 bgp_parse_capabilities(conn, opt + 2, ol);
694 break;
695
696 default:
697 /*
698 * BGP specs don't tell us to send which option
699 * we didn't recognize, but it's common practice
700 * to do so. Also, capability negotiation with
701 * Cisco routers doesn't work without that.
702 */
703 bgp_error(conn, 2, 4, opt, ol);
704 return 0;
705 }
706 len -= 2 + ol;
707 opt += 2 + ol;
708 }
709 return 0;
710 }
711
712 static void
713 bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
714 {
715 struct bgp_conn *other;
716 struct bgp_proto *p = conn->bgp;
717 unsigned hold;
718 u16 base_as;
719 u32 id;
720
721 /* Check state */
722 if (conn->state != BS_OPENSENT)
723 { bgp_error(conn, 5, 0, NULL, 0); return; }
724
725 /* Check message contents */
726 if (len < 29 || len != 29 + pkt[28])
727 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
728 if (pkt[19] != BGP_VERSION)
729 { bgp_error(conn, 2, 1, pkt+19, 1); return; } /* RFC 1771 says 16 bits, draft-09 tells to use 8 */
730 conn->advertised_as = base_as = get_u16(pkt+20);
731 hold = get_u16(pkt+22);
732 id = get_u32(pkt+24);
733 BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%08x)", conn->advertised_as, hold, id);
734
735 if (bgp_parse_options(conn, pkt+29, pkt[28]))
736 return;
737
738 if (hold > 0 && hold < 3)
739 { bgp_error(conn, 2, 6, pkt+22, 2); return; }
740
741 if (!id || id == 0xffffffff || id == p->local_id)
742 { bgp_error(conn, 2, 3, pkt+24, -4); return; }
743
744 if ((conn->advertised_as != base_as) && (base_as != AS_TRANS))
745 log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
746
747 if (conn->advertised_as != p->remote_as)
748 {
749 if (conn->peer_as4_support)
750 {
751 u32 val = htonl(conn->advertised_as);
752 bgp_error(conn, 2, 2, (byte *) &val, 4);
753 }
754 else
755 bgp_error(conn, 2, 2, pkt+20, 2);
756
757 return;
758 }
759
760 /* Check the other connection */
761 other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
762 switch (other->state)
763 {
764 case BS_IDLE:
765 case BS_CONNECT:
766 case BS_ACTIVE:
767 case BS_OPENSENT:
768 case BS_CLOSE:
769 break;
770 case BS_OPENCONFIRM:
771 if ((p->local_id < id) == (conn == &p->incoming_conn))
772 {
773 /* Should close the other connection */
774 BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
775 bgp_error(other, 6, 7, NULL, 0);
776 break;
777 }
778 /* Fall thru */
779 case BS_ESTABLISHED:
780 /* Should close this connection */
781 BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
782 bgp_error(conn, 6, 7, NULL, 0);
783 return;
784 default:
785 bug("bgp_rx_open: Unknown state");
786 }
787
788 /* Update our local variables */
789 conn->hold_time = MIN(hold, p->cf->hold_time);
790 conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
791 p->remote_id = id;
792 p->as4_session = conn->want_as4_support && conn->peer_as4_support;
793
794 DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, p->as4_session);
795
796 bgp_schedule_packet(conn, PKT_KEEPALIVE);
797 bgp_start_timer(conn->hold_timer, conn->hold_time);
798 bgp_conn_enter_openconfirm_state(conn);
799 }
800
/*
 * DECODE_PREFIX(pp, ll) - read one encoded prefix from the buffer
 *
 * Consumes a length byte and the following address bytes from pointer pp,
 * decreasing the remaining length ll accordingly. The result is stored in
 * the caller's variables 'prefix' (masked to the prefix length) and 'pxlen'.
 * On failure the caller's 'err' is set (10 for a prefix length above
 * BITS_PER_IP_ADDRESS, 1 for a truncated prefix) and control jumps to the
 * caller's 'done' label.
 */
#define DECODE_PREFIX(pp, ll) do {					\
  int px_bits = *pp++;		/* Prefix length in bits */		\
  int px_bytes = (px_bits + 7) / 8;					\
  ll--;									\
  if (px_bits > BITS_PER_IP_ADDRESS) { err=10; goto done; }		\
  if (ll < px_bytes) { err=1; goto done; }				\
  memcpy(&prefix, pp, px_bytes);					\
  pp += px_bytes;							\
  ll -= px_bytes;							\
  ipa_ntoh(prefix);							\
  prefix = ipa_and(prefix, ipa_mkmask(px_bits));			\
  pxlen = px_bits;							\
} while (0)
815
816 static inline int
817 bgp_set_next_hop(struct bgp_proto *p, rta *a)
818 {
819 struct eattr *nh = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
820 ip_addr *nexthop = (ip_addr *) nh->u.ptr->data;
821
822 #ifdef IPV6
823 int second = (nh->u.ptr->length == NEXT_HOP_LENGTH);
824
825 /* First address should not be link-local, but may be zero in direct mode */
826 if (ipa_has_link_scope(*nexthop))
827 *nexthop = IPA_NONE;
828 #else
829 int second = 0;
830 #endif
831
832 if (p->cf->gw_mode == GW_DIRECT)
833 {
834 neighbor *ng = NULL;
835
836 if (ipa_nonzero(*nexthop))
837 ng = neigh_find(&p->p, nexthop, 0);
838 else if (second) /* GW_DIRECT -> single_hop -> p->neigh != NULL */
839 ng = neigh_find2(&p->p, nexthop + 1, p->neigh->iface, 0);
840
841 /* Fallback */
842 if (!ng)
843 ng = p->neigh;
844
845 if (ng->scope == SCOPE_HOST)
846 return 0;
847
848 a->dest = RTD_ROUTER;
849 a->gw = ng->addr;
850 a->iface = ng->iface;
851 a->hostentry = NULL;
852 a->igp_metric = 0;
853 }
854 else /* GW_RECURSIVE */
855 {
856 if (ipa_zero(*nexthop))
857 return 0;
858
859 rta_set_recursive_next_hop(p->p.table, a, p->igp_table, nexthop, nexthop + second);
860 }
861
862 return 1;
863 }
864
865 #ifndef IPV6 /* IPv4 version */
866
867 static void
868 bgp_do_rx_update(struct bgp_conn *conn,
869 byte *withdrawn, int withdrawn_len,
870 byte *nlri, int nlri_len,
871 byte *attrs, int attr_len)
872 {
873 struct bgp_proto *p = conn->bgp;
874 net *n;
875 rta *a0, *a = NULL;
876 ip_addr prefix;
877 int pxlen, err = 0;
878
879 /* Withdraw routes */
880 while (withdrawn_len)
881 {
882 DECODE_PREFIX(withdrawn, withdrawn_len);
883 DBG("Withdraw %I/%d\n", prefix, pxlen);
884 if (n = net_find(p->p.table, prefix, pxlen))
885 rte_update(p->p.table, n, &p->p, &p->p, NULL);
886 }
887
888 if (!attr_len && !nlri_len) /* shortcut */
889 return;
890
891 a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, nlri_len);
892
893 if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */
894 return;
895
896 if (a0 && nlri_len && bgp_set_next_hop(p, a0))
897 a = rta_lookup(a0);
898
899 while (nlri_len)
900 {
901 DECODE_PREFIX(nlri, nlri_len);
902 DBG("Add %I/%d\n", prefix, pxlen);
903
904 if (a)
905 {
906 rte *e = rte_get_temp(rta_clone(a));
907 e->net = net_get(p->p.table, prefix, pxlen);
908 e->pflags = 0;
909 e->u.bgp.suppressed = 0;
910 rte_update(p->p.table, e->net, &p->p, &p->p, e);
911 }
912 else
913 {
914 /* Forced withdraw as a result of soft error */
915 if (n = net_find(p->p.table, prefix, pxlen))
916 rte_update(p->p.table, n, &p->p, &p->p, NULL);
917 }
918
919 if (bgp_apply_limits(p) < 0)
920 goto done;
921 }
922
923 done:
924 if (a)
925 rta_free(a);
926
927 if (err)
928 bgp_error(conn, 3, err, NULL, 0);
929
930 return;
931 }
932
933 #else /* IPv6 version */
934
/*
 * DO_NLRI(name) - open the processing of one multiprotocol NLRI attribute
 *
 * Loads the attribute data recorded in p->name_start / p->name_len into the
 * caller's variables start, x, len and len0. If the attribute is present,
 * its AFI and SAFI are read into af and sub and skipped; an attribute
 * shorter than 3 bytes sets err to 9 and jumps to the caller's 'done' label.
 * The macro ends with an open 'if', so the statement that follows it runs
 * only when the address family is IPv6.
 */
#define DO_NLRI(name)					\
  start = x = p->name##_start;				\
  len = len0 = p->name##_len;				\
  if (!len)						\
    af = 0;						\
  else							\
    {							\
      if (len < 3) { err=9; goto done; }		\
      af = get_u16(x);					\
      sub = x[2];					\
      x += 3;						\
      len -= 3;						\
      DBG("\tNLRI AF=%d sub=%d len=%d\n", af, sub, len);\
    }							\
  if (af == BGP_AF_IPV6)
950
951 static void
952 bgp_attach_next_hop(rta *a0, byte *x)
953 {
954 ip_addr *nh = (ip_addr *) bgp_attach_attr_wa(&a0->eattrs, bgp_linpool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
955 memcpy(nh, x+1, 16);
956 ipa_ntoh(nh[0]);
957
958 /* We store received link local address in the other part of BA_NEXT_HOP eattr. */
959 if (*x == 32)
960 {
961 memcpy(nh+1, x+17, 16);
962 ipa_ntoh(nh[1]);
963 }
964 else
965 nh[1] = IPA_NONE;
966 }
967
968
969 static void
970 bgp_do_rx_update(struct bgp_conn *conn,
971 byte *withdrawn, int withdrawn_len,
972 byte *nlri, int nlri_len,
973 byte *attrs, int attr_len)
974 {
975 struct bgp_proto *p = conn->bgp;
976 byte *start, *x;
977 int len, len0;
978 unsigned af, sub;
979 net *n;
980 rta *a0, *a = NULL;
981 ip_addr prefix;
982 int pxlen, err = 0;
983
984 p->mp_reach_len = 0;
985 p->mp_unreach_len = 0;
986 a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, 0);
987
988 if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */
989 return;
990
991 DO_NLRI(mp_unreach)
992 {
993 while (len)
994 {
995 DECODE_PREFIX(x, len);
996 DBG("Withdraw %I/%d\n", prefix, pxlen);
997 if (n = net_find(p->p.table, prefix, pxlen))
998 rte_update(p->p.table, n, &p->p, &p->p, NULL);
999 }
1000 }
1001
1002 DO_NLRI(mp_reach)
1003 {
1004 /* Create fake NEXT_HOP attribute */
1005 if (len < 1 || (*x != 16 && *x != 32) || len < *x + 2)
1006 { err = 9; goto done; }
1007
1008 if (a0)
1009 bgp_attach_next_hop(a0, x);
1010
1011 /* Also ignore one reserved byte */
1012 len -= *x + 2;
1013 x += *x + 2;
1014
1015 if (a0 && bgp_set_next_hop(p, a0))
1016 a = rta_lookup(a0);
1017
1018 while (len)
1019 {
1020 DECODE_PREFIX(x, len);
1021 DBG("Add %I/%d\n", prefix, pxlen);
1022
1023 if (a)
1024 {
1025 rte *e = rte_get_temp(rta_clone(a));
1026 e->net = net_get(p->p.table, prefix, pxlen);
1027 e->pflags = 0;
1028 e->u.bgp.suppressed = 0;
1029 rte_update(p->p.table, e->net, &p->p, &p->p, e);
1030 }
1031 else
1032 {
1033 /* Forced withdraw as a result of soft error */
1034 if (n = net_find(p->p.table, prefix, pxlen))
1035 rte_update(p->p.table, n, &p->p, &p->p, NULL);
1036 }
1037
1038 if (bgp_apply_limits(p) < 0)
1039 goto done;
1040 }
1041 }
1042
1043 done:
1044 if (a)
1045 rta_free(a);
1046
1047 if (err) /* Use subcode 9, not err */
1048 bgp_error(conn, 3, 9, NULL, 0);
1049
1050 return;
1051 }
1052
1053 #endif
1054
1055 static void
1056 bgp_rx_update(struct bgp_conn *conn, byte *pkt, int len)
1057 {
1058 struct bgp_proto *p = conn->bgp;
1059 byte *withdrawn, *attrs, *nlri;
1060 int withdrawn_len, attr_len, nlri_len;
1061
1062 BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
1063
1064 /* Workaround for some BGP implementations that skip initial KEEPALIVE */
1065 if (conn->state == BS_OPENCONFIRM)
1066 bgp_conn_enter_established_state(conn);
1067
1068 if (conn->state != BS_ESTABLISHED)
1069 { bgp_error(conn, 5, 0, NULL, 0); return; }
1070 bgp_start_timer(conn->hold_timer, conn->hold_time);
1071
1072 /* Find parts of the packet and check sizes */
1073 if (len < 23)
1074 {
1075 bgp_error(conn, 1, 2, pkt+16, 2);
1076 return;
1077 }
1078 withdrawn = pkt + 21;
1079 withdrawn_len = get_u16(pkt + 19);
1080 if (withdrawn_len + 23 > len)
1081 goto malformed;
1082 attrs = withdrawn + withdrawn_len + 2;
1083 attr_len = get_u16(attrs - 2);
1084 if (withdrawn_len + attr_len + 23 > len)
1085 goto malformed;
1086 nlri = attrs + attr_len;
1087 nlri_len = len - withdrawn_len - attr_len - 23;
1088 if (!attr_len && nlri_len)
1089 goto malformed;
1090 DBG("Sizes: withdrawn=%d, attrs=%d, NLRI=%d\n", withdrawn_len, attr_len, nlri_len);
1091
1092 lp_flush(bgp_linpool);
1093
1094 bgp_do_rx_update(conn, withdrawn, withdrawn_len, nlri, nlri_len, attrs, attr_len);
1095 return;
1096
1097 malformed:
1098 bgp_error(conn, 3, 1, NULL, 0);
1099 }
1100
1101 static struct {
1102 byte major, minor;
1103 byte *msg;
1104 } bgp_msg_table[] = {
1105 { 1, 0, "Invalid message header" },
1106 { 1, 1, "Connection not synchronized" },
1107 { 1, 2, "Bad message length" },
1108 { 1, 3, "Bad message type" },
1109 { 2, 0, "Invalid OPEN message" },
1110 { 2, 1, "Unsupported version number" },
1111 { 2, 2, "Bad peer AS" },
1112 { 2, 3, "Bad BGP identifier" },
1113 { 2, 4, "Unsupported optional parameter" },
1114 { 2, 5, "Authentication failure" },
1115 { 2, 6, "Unacceptable hold time" },
1116 { 2, 7, "Required capability missing" }, /* [RFC3392] */
1117 { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
1118 { 3, 0, "Invalid UPDATE message" },
1119 { 3, 1, "Malformed attribute list" },
1120 { 3, 2, "Unrecognized well-known attribute" },
1121 { 3, 3, "Missing mandatory attribute" },
1122 { 3, 4, "Invalid attribute flags" },
1123 { 3, 5, "Invalid attribute length" },
1124 { 3, 6, "Invalid ORIGIN attribute" },
1125 { 3, 7, "AS routing loop" }, /* Deprecated */
1126 { 3, 8, "Invalid NEXT_HOP attribute" },
1127 { 3, 9, "Optional attribute error" },
1128 { 3, 10, "Invalid network field" },
1129 { 3, 11, "Malformed AS_PATH" },
1130 { 4, 0, "Hold timer expired" },
1131 { 5, 0, "Finite state machine error" },
1132 { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
1133 { 6, 1, "Maximum number of prefixes reached" },
1134 { 6, 2, "Administrative shutdown" },
1135 { 6, 3, "Peer de-configured" },
1136 { 6, 4, "Administrative reset" },
1137 { 6, 5, "Connection rejected" },
1138 { 6, 6, "Other configuration change" },
1139 { 6, 7, "Connection collision resolution" },
1140 { 6, 8, "Out of Resources" }
1141 };
1142
1143 /**
1144 * bgp_error_dsc - return BGP error description
1145 * @code: BGP error code
1146 * @subcode: BGP error subcode
1147 *
1148 * bgp_error_dsc() returns error description for BGP errors
1149 * which might be static string or given temporary buffer.
1150 */
1151 const char *
1152 bgp_error_dsc(unsigned code, unsigned subcode)
1153 {
1154 static char buff[32];
1155 unsigned i;
1156 for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
1157 if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
1158 {
1159 return bgp_msg_table[i].msg;
1160 }
1161
1162 bsprintf(buff, "Unknown error %d.%d", code, subcode);
1163 return buff;
1164 }
1165
1166 void
1167 bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len)
1168 {
1169 const byte *name;
1170 byte *t, argbuf[36];
1171 unsigned i;
1172
1173 /* Don't report Cease messages generated by myself */
1174 if (code == 6 && class == BE_BGP_TX)
1175 return;
1176
1177 name = bgp_error_dsc(code, subcode);
1178 t = argbuf;
1179 if (len)
1180 {
1181 *t++ = ':';
1182 *t++ = ' ';
1183
1184 if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
1185 {
1186 /* Bad peer AS - we would like to print the AS */
1187 t += bsprintf(t, "%d", (len == 2) ? get_u16(data) : get_u32(data));
1188 goto done;
1189 }
1190 if (len > 16)
1191 len = 16;
1192 for (i=0; i<len; i++)
1193 t += bsprintf(t, "%02x", data[i]);
1194 }
1195 done:
1196 *t = 0;
1197 log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, name, argbuf);
1198 }
1199
1200 static void
1201 bgp_rx_notification(struct bgp_conn *conn, byte *pkt, int len)
1202 {
1203 struct bgp_proto *p = conn->bgp;
1204 if (len < 21)
1205 {
1206 bgp_error(conn, 1, 2, pkt+16, 2);
1207 return;
1208 }
1209
1210 unsigned code = pkt[19];
1211 unsigned subcode = pkt[20];
1212 int err = (code != 6);
1213
1214 bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
1215 bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
1216
1217 #ifndef IPV6
1218 if ((code == 2) && ((subcode == 4) || (subcode == 7))
1219 /* Error related to capability:
1220 * 4 - Peer does not support capabilities at all.
1221 * 7 - Peer request some capability. Strange unless it is IPv6 only peer.
1222 */
1223 && (p->cf->capabilities == 2)
1224 /* Capabilities are not explicitly enabled or disabled, therefore heuristic is used */
1225 && (conn->start_state == BSS_CONNECT)
1226 /* Failed connection attempt have used capabilities */
1227 && (p->cf->remote_as <= 0xFFFF))
1228 /* Not possible with disabled capabilities */
1229 {
1230 /* We try connect without capabilities */
1231 log(L_WARN "%s: Capability related error received, retry with capabilities disabled", p->p.name);
1232 p->start_state = BSS_CONNECT_NOCAP;
1233 err = 0;
1234 }
1235 #endif
1236
1237 bgp_conn_enter_close_state(conn);
1238 bgp_schedule_packet(conn, PKT_SCHEDULE_CLOSE);
1239
1240 if (err)
1241 {
1242 bgp_update_startup_delay(p);
1243 bgp_stop(p, 0);
1244 }
1245 }
1246
1247 static void
1248 bgp_rx_keepalive(struct bgp_conn *conn)
1249 {
1250 struct bgp_proto *p = conn->bgp;
1251
1252 BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
1253 bgp_start_timer(conn->hold_timer, conn->hold_time);
1254 switch (conn->state)
1255 {
1256 case BS_OPENCONFIRM:
1257 bgp_conn_enter_established_state(conn);
1258 break;
1259 case BS_ESTABLISHED:
1260 break;
1261 default:
1262 bgp_error(conn, 5, 0, NULL, 0);
1263 }
1264 }
1265
1266 static void
1267 bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, int len)
1268 {
1269 struct bgp_proto *p = conn->bgp;
1270
1271 BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
1272
1273 if (conn->state != BS_ESTABLISHED)
1274 { bgp_error(conn, 5, 0, NULL, 0); return; }
1275
1276 if (!p->cf->enable_refresh)
1277 { bgp_error(conn, 1, 3, pkt+18, 1); return; }
1278
1279 if (len != (BGP_HEADER_LENGTH + 4))
1280 { bgp_error(conn, 1, 2, pkt+16, 2); return; }
1281
1282 /* FIXME - we ignore AFI/SAFI values, as we support
1283 just one value and even an error code for an invalid
1284 request is not defined */
1285
1286 proto_request_feeding(&p->p);
1287 }
1288
1289
1290 /**
1291 * bgp_rx_packet - handle a received packet
1292 * @conn: BGP connection
1293 * @pkt: start of the packet
1294 * @len: packet size
1295 *
1296 * bgp_rx_packet() takes a newly received packet and calls the corresponding
1297 * packet handler according to the packet type.
1298 */
1299 static void
1300 bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
1301 {
1302 byte type = pkt[18];
1303
1304 DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
1305
1306 if (conn->bgp->p.mrtdump & MD_MESSAGES)
1307 mrt_dump_bgp_packet(conn, pkt, len);
1308
1309 switch (type)
1310 {
1311 case PKT_OPEN: return bgp_rx_open(conn, pkt, len);
1312 case PKT_UPDATE: return bgp_rx_update(conn, pkt, len);
1313 case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len);
1314 case PKT_KEEPALIVE: return bgp_rx_keepalive(conn);
1315 case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len);
1316 default: bgp_error(conn, 1, 3, pkt+18, 1);
1317 }
1318 }
1319
1320 /**
1321 * bgp_rx - handle received data
1322 * @sk: socket
1323 * @size: amount of data received
1324 *
1325 * bgp_rx() is called by the socket layer whenever new data arrive from
1326 * the underlying TCP connection. It assembles the data fragments to packets,
1327 * checks their headers and framing and passes complete packets to
1328 * bgp_rx_packet().
1329 */
1330 int
1331 bgp_rx(sock *sk, int size)
1332 {
1333 struct bgp_conn *conn = sk->data;
1334 byte *pkt_start = sk->rbuf;
1335 byte *end = pkt_start + size;
1336 unsigned i, len;
1337
1338 DBG("BGP: RX hook: Got %d bytes\n", size);
1339 while (end >= pkt_start + BGP_HEADER_LENGTH)
1340 {
1341 if ((conn->state == BS_CLOSE) || (conn->sk != sk))
1342 return 0;
1343 for(i=0; i<16; i++)
1344 if (pkt_start[i] != 0xff)
1345 {
1346 bgp_error(conn, 1, 1, NULL, 0);
1347 break;
1348 }
1349 len = get_u16(pkt_start+16);
1350 if (len < BGP_HEADER_LENGTH || len > BGP_MAX_PACKET_LENGTH)
1351 {
1352 bgp_error(conn, 1, 2, pkt_start+16, 2);
1353 break;
1354 }
1355 if (end < pkt_start + len)
1356 break;
1357 bgp_rx_packet(conn, pkt_start, len);
1358 pkt_start += len;
1359 }
1360 if (pkt_start != sk->rbuf)
1361 {
1362 memmove(sk->rbuf, pkt_start, end - pkt_start);
1363 sk->rpos = sk->rbuf + (end - pkt_start);
1364 }
1365 return 0;
1366 }