]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/attrs.c
Merge branch 'master' into add-path
[thirdparty/bird.git] / proto / bgp / attrs.c
1 /*
2 * BIRD -- BGP Attributes
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 *
6 * Can be freely distributed and used under the terms of the GNU GPL.
7 */
8
9 #undef LOCAL_DEBUG
10
11 #include <stdlib.h>
12
13 #include "nest/bird.h"
14 #include "nest/iface.h"
15 #include "nest/protocol.h"
16 #include "nest/route.h"
17 #include "nest/attrs.h"
18 #include "conf/conf.h"
19 #include "lib/resource.h"
20 #include "lib/string.h"
21 #include "lib/unaligned.h"
22
23 #include "bgp.h"
24
25 /*
26 * UPDATE message error handling
27 *
28 * All checks from RFC 4271 6.3 are done as specified with these exceptions:
29 * - The semantic check of an IP address from NEXT_HOP attribute is missing.
30 * - Checks of some optional attribute values are missing.
31 * - Syntactic and semantic checks of NLRIs (done in DECODE_PREFIX())
32 * are probably inadequate.
33 *
34 * Loop detection based on AS_PATH causes updates to be withdrawn. RFC
35 * 4271 does not explicitly specify the behavior in that case.
36 *
37 * Loop detection related to route reflection (based on ORIGINATOR_ID
38 * and CLUSTER_LIST) causes updates to be withdrawn. RFC 4456 8
39 * specifies that such updates should be ignored, but that is generally
40 * a bad idea.
41 *
42 * Error checking of optional transitive attributes is done according to
43 * draft-ietf-idr-optional-transitive-03, but errors are handled always
44 * as withdraws.
45 *
46 * Unexpected AS_CONFED_* segments in AS_PATH are logged and removed,
47 * but unknown segments cause a session drop with Malformed AS_PATH
48 * error (see validate_path()). The behavior in such case is not
49 * explicitly specified by RFC 4271. RFC 5065 specifies that
50 * inconsistent AS_CONFED_* segments should cause a session drop, but
51 * implementations that pass invalid AS_CONFED_* segments are
52 * widespread.
53 *
54 * Error handling of AS4_* attributes is done as specified by
55 * draft-ietf-idr-rfc4893bis-03. There are several possible
56 * inconsistencies between AGGREGATOR and AS4_AGGREGATOR that are not
57 * handled by that draft, these are logged and ignored (see
58 * bgp_reconstruct_4b_attrs()).
59 */
60
61 static byte bgp_mandatory_attrs[] = { BA_ORIGIN, BA_AS_PATH
62 #ifndef IPV6
63 ,BA_NEXT_HOP
64 #endif
65 };
66
/* Static description of one BGP path attribute; see bgp_attr_table[]. */
struct attr_desc {
  char *name;			/* Attribute name; NULL means the code is not known (see ATTR_KNOWN) */
  int expected_length;		/* Fixed payload length; negative when the length comes from the data itself */
  int expected_flags;		/* BAF_* flags stamped on the attribute by bgp_set_attr() / bgp_get_bucket() */
  int type;			/* EAF_TYPE_* used for the corresponding eattr */
  int allow_in_ebgp;		/* When zero, bgp_get_bucket() drops the attribute on non-internal sessions */
  /* Optional value check: 0 means acceptable; other results (positive codes,
   * IGNORE, WITHDRAW) are interpreted by the caller, which is not in this section */
  int (*validate)(struct bgp_proto *p, byte *attr, int len);
  /* Optional formatter writing a textual form of the attribute into buf */
  void (*format)(eattr *ea, byte *buf, int buflen);
};
76
/*
 * Negative results of attr_desc.validate hooks, distinct from 0 (valid) and
 * from the positive codes some hooks return. The names suggest "skip this
 * attribute" and "treat the update as a withdraw" -- TODO confirm against the
 * decoder, which is outside this section.
 */
#define IGNORE -1
#define WITHDRAW -2
79
80 static int
81 bgp_check_origin(struct bgp_proto *p UNUSED, byte *a, int len UNUSED)
82 {
83 if (*a > 2)
84 return 6;
85 return 0;
86 }
87
88 static void
89 bgp_format_origin(eattr *a, byte *buf, int buflen UNUSED)
90 {
91 static char *bgp_origin_names[] = { "IGP", "EGP", "Incomplete" };
92
93 bsprintf(buf, bgp_origin_names[a->u.data]);
94 }
95
96 static int
97 path_segment_contains(byte *p, int bs, u32 asn)
98 {
99 int i;
100 int len = p[1];
101 p += 2;
102
103 for(i=0; i<len; i++)
104 {
105 u32 asn2 = (bs == 4) ? get_u32(p) : get_u16(p);
106 if (asn2 == asn)
107 return 1;
108 p += bs;
109 }
110
111 return 0;
112 }
113
114 /* Validates path attribute, removes AS_CONFED_* segments, and also returns path length */
115 static int
116 validate_path(struct bgp_proto *p, int as_path, int bs, byte *idata, unsigned int *ilength)
117 {
118 int res = 0;
119 u8 *a, *dst;
120 int len, plen, copy;
121
122 dst = a = idata;
123 len = *ilength;
124
125 while (len)
126 {
127 if (len < 2)
128 return -1;
129
130 plen = 2 + bs * a[1];
131 if (len < plen)
132 return -1;
133
134 switch (a[0])
135 {
136 case AS_PATH_SET:
137 copy = 1;
138 res++;
139 break;
140
141 case AS_PATH_SEQUENCE:
142 copy = 1;
143 res += a[1];
144 break;
145
146 case AS_PATH_CONFED_SEQUENCE:
147 case AS_PATH_CONFED_SET:
148 if (as_path && path_segment_contains(a, bs, p->remote_as))
149 {
150 log(L_WARN "%s: AS_CONFED_* segment with peer ASN found, misconfigured confederation?", p->p.name);
151 return -1;
152 }
153
154 log(L_WARN "%s: %s_PATH attribute contains AS_CONFED_* segment, skipping segment",
155 p->p.name, as_path ? "AS" : "AS4");
156 copy = 0;
157 break;
158
159 default:
160 return -1;
161 }
162
163 if (copy)
164 {
165 if (dst != a)
166 memmove(dst, a, plen);
167 dst += plen;
168 }
169
170 len -= plen;
171 a += plen;
172 }
173
174 *ilength = dst - idata;
175 return res;
176 }
177
178 static inline int
179 validate_as_path(struct bgp_proto *p, byte *a, int *len)
180 {
181 return validate_path(p, 1, p->as4_session ? 4 : 2, a, len);
182 }
183
184 static inline int
185 validate_as4_path(struct bgp_proto *p, struct adata *path)
186 {
187 return validate_path(p, 0, 4, path->data, &path->length);
188 }
189
190 static int
191 bgp_check_next_hop(struct bgp_proto *p UNUSED, byte *a, int len)
192 {
193 #ifdef IPV6
194 return IGNORE;
195 #else
196 ip_addr addr;
197
198 memcpy(&addr, a, len);
199 ipa_ntoh(addr);
200 if (ipa_classify(addr) & IADDR_HOST)
201 return 0;
202 else
203 return 8;
204 #endif
205 }
206
207 static void
208 bgp_format_next_hop(eattr *a, byte *buf, int buflen UNUSED)
209 {
210 ip_addr *ipp = (ip_addr *) a->u.ptr->data;
211 #ifdef IPV6
212 /* in IPv6, we might have two addresses in NEXT HOP */
213 if ((a->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(ipp[1]))
214 {
215 bsprintf(buf, "%I %I", ipp[0], ipp[1]);
216 return;
217 }
218 #endif
219
220 bsprintf(buf, "%I", ipp[0]);
221 }
222
223 static int
224 bgp_check_aggregator(struct bgp_proto *p, byte *a UNUSED, int len)
225 {
226 int exp_len = p->as4_session ? 8 : 6;
227
228 return (len == exp_len) ? 0 : WITHDRAW;
229 }
230
231 static void
232 bgp_format_aggregator(eattr *a, byte *buf, int buflen UNUSED)
233 {
234 struct adata *ad = a->u.ptr;
235 byte *data = ad->data;
236 u32 as;
237
238 as = get_u32(data);
239 data += 4;
240
241 bsprintf(buf, "%d.%d.%d.%d AS%u", data[0], data[1], data[2], data[3], as);
242 }
243
244 static int
245 bgp_check_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
246 {
247 return ((len % 4) == 0) ? 0 : WITHDRAW;
248 }
249
250 static int
251 bgp_check_cluster_list(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
252 {
253 return ((len % 4) == 0) ? 0 : 5;
254 }
255
256 static void
257 bgp_format_cluster_list(eattr *a, byte *buf, int buflen)
258 {
259 /* Truncates cluster lists larger than buflen, probably not a problem */
260 int_set_format(a->u.ptr, 0, -1, buf, buflen);
261 }
262
263 static int
264 bgp_check_reach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED)
265 {
266 #ifdef IPV6
267 p->mp_reach_start = a;
268 p->mp_reach_len = len;
269 #endif
270 return IGNORE;
271 }
272
273 static int
274 bgp_check_unreach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED)
275 {
276 #ifdef IPV6
277 p->mp_unreach_start = a;
278 p->mp_unreach_len = len;
279 #endif
280 return IGNORE;
281 }
282
283 static int
284 bgp_check_ext_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
285 {
286 return ((len % 8) == 0) ? 0 : WITHDRAW;
287 }
288
289
/*
 * Table of known attributes, indexed by attribute code (see ATTR_KNOWN and
 * bgp_set_attr()). Columns follow struct attr_desc: name, expected_length,
 * expected_flags, type, allow_in_ebgp, validate, format. Entries with a NULL
 * name are holes for codes that are not handled.
 */
static struct attr_desc bgp_attr_table[] = {
  { NULL, -1, 0, 0, 0,								/* Undefined */
    NULL, NULL },
  { "origin", 1, BAF_TRANSITIVE, EAF_TYPE_INT, 1,				/* BA_ORIGIN */
    bgp_check_origin, bgp_format_origin },
  { "as_path", -1, BAF_TRANSITIVE, EAF_TYPE_AS_PATH, 1,				/* BA_AS_PATH */
    NULL, NULL }, /* is checked by validate_as_path() as a special case */
  { "next_hop", 4, BAF_TRANSITIVE, EAF_TYPE_IP_ADDRESS, 1,			/* BA_NEXT_HOP */
    bgp_check_next_hop, bgp_format_next_hop },
  { "med", 4, BAF_OPTIONAL, EAF_TYPE_INT, 1,					/* BA_MULTI_EXIT_DISC */
    NULL, NULL },
  { "local_pref", 4, BAF_TRANSITIVE, EAF_TYPE_INT, 0,				/* BA_LOCAL_PREF */
    NULL, NULL },
  { "atomic_aggr", 0, BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,			/* BA_ATOMIC_AGGR */
    NULL, NULL },
  { "aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,	/* BA_AGGREGATOR */
    bgp_check_aggregator, bgp_format_aggregator },
  { "community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_INT_SET, 1,	/* BA_COMMUNITY */
    bgp_check_community, NULL },
  { "originator_id", 4, BAF_OPTIONAL, EAF_TYPE_ROUTER_ID, 0,			/* BA_ORIGINATOR_ID */
    NULL, NULL },
  { "cluster_list", -1, BAF_OPTIONAL, EAF_TYPE_INT_SET, 0,			/* BA_CLUSTER_LIST */
    bgp_check_cluster_list, bgp_format_cluster_list },
  { .name = NULL },								/* BA_DPA */
  { .name = NULL },								/* BA_ADVERTISER */
  { .name = NULL },								/* BA_RCID_PATH */
  { "mp_reach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1,			/* BA_MP_REACH_NLRI */
    bgp_check_reach_nlri, NULL },
  { "mp_unreach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1,			/* BA_MP_UNREACH_NLRI */
    bgp_check_unreach_nlri, NULL },
  { "ext_community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_EC_SET, 1,	/* BA_EXT_COMMUNITY */
    bgp_check_ext_community, NULL },
  { "as4_path", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,		/* BA_AS4_PATH */
    NULL, NULL },
  { "as4_aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,	/* BA_AS4_AGGREGATOR */
    NULL, NULL }
};
327
328 /* BA_AS4_PATH is type EAF_TYPE_OPAQUE and not type EAF_TYPE_AS_PATH.
329 * It does not matter as this attribute does not appear on routes in the routing table.
330 */
331
/* True when code indexes an entry of bgp_attr_table[] that has a name.
 * Note that code is evaluated twice. */
#define ATTR_KNOWN(code) ((code) < ARRAY_SIZE(bgp_attr_table) && bgp_attr_table[code].name)
333
334 static inline struct adata *
335 bgp_alloc_adata(struct linpool *pool, unsigned len)
336 {
337 struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len);
338 ad->length = len;
339 return ad;
340 }
341
342 static void
343 bgp_set_attr(eattr *e, unsigned attr, uintptr_t val)
344 {
345 ASSERT(ATTR_KNOWN(attr));
346 e->id = EA_CODE(EAP_BGP, attr);
347 e->type = bgp_attr_table[attr].type;
348 e->flags = bgp_attr_table[attr].expected_flags;
349 if (e->type & EAF_EMBEDDED)
350 e->u.data = val;
351 else
352 e->u.ptr = (struct adata *) val;
353 }
354
355 static byte *
356 bgp_set_attr_wa(eattr *e, struct linpool *pool, unsigned attr, unsigned len)
357 {
358 struct adata *ad = bgp_alloc_adata(pool, len);
359 bgp_set_attr(e, attr, (uintptr_t) ad);
360 return ad->data;
361 }
362
363 void
364 bgp_attach_attr(ea_list **to, struct linpool *pool, unsigned attr, uintptr_t val)
365 {
366 ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
367 a->next = *to;
368 *to = a;
369 a->flags = EALF_SORTED;
370 a->count = 1;
371 bgp_set_attr(a->attrs, attr, val);
372 }
373
374 byte *
375 bgp_attach_attr_wa(ea_list **to, struct linpool *pool, unsigned attr, unsigned len)
376 {
377 struct adata *ad = bgp_alloc_adata(pool, len);
378 bgp_attach_attr(to, pool, attr, (uintptr_t) ad);
379 return ad->data;
380 }
381
382 static int
383 bgp_encode_attr_hdr(byte *dst, unsigned int flags, unsigned code, int len)
384 {
385 int wlen;
386
387 DBG("\tAttribute %02x (%d bytes, flags %02x)\n", code, len, flags);
388
389 if (len < 256)
390 {
391 *dst++ = flags;
392 *dst++ = code;
393 *dst++ = len;
394 wlen = 3;
395 }
396 else
397 {
398 *dst++ = flags | BAF_EXT_LEN;
399 *dst++ = code;
400 put_u16(dst, len);
401 wlen = 4;
402 }
403
404 return wlen;
405 }
406
407 static void
408 aggregator_convert_to_old(struct adata *aggr, byte *dst, int *new_used)
409 {
410 byte *src = aggr->data;
411 *new_used = 0;
412
413 u32 as = get_u32(src);
414 if (as > 0xFFFF)
415 {
416 as = AS_TRANS;
417 *new_used = 1;
418 }
419 put_u16(dst, as);
420
421 /* Copy IPv4 address */
422 memcpy(dst + 2, src + 4, 4);
423 }
424
425 static void
426 aggregator_convert_to_new(struct adata *aggr, byte *dst)
427 {
428 byte *src = aggr->data;
429
430 u32 as = get_u16(src);
431 put_u32(dst, as);
432
433 /* Copy IPv4 address */
434 memcpy(dst + 4, src + 2, 4);
435 }
436
437 static int
438 bgp_get_attr_len(eattr *a)
439 {
440 int len;
441 if (ATTR_KNOWN(EA_ID(a->id)))
442 {
443 int code = EA_ID(a->id);
444 struct attr_desc *desc = &bgp_attr_table[code];
445 len = desc->expected_length;
446 if (len < 0)
447 {
448 ASSERT(!(a->type & EAF_EMBEDDED));
449 len = a->u.ptr->length;
450 }
451 }
452 else
453 {
454 ASSERT((a->type & EAF_TYPE_MASK) == EAF_TYPE_OPAQUE);
455 len = a->u.ptr->length;
456 }
457
458 return len;
459 }
460
/* Account for l bytes just written: move write pointer w forward and shrink
 * the remaining space r. The argument l is evaluated twice. */
#define ADVANCE(w, r, l) do { r -= l; w += l; } while (0)
462
463 /**
464 * bgp_encode_attrs - encode BGP attributes
465 * @p: BGP instance
466 * @w: buffer
467 * @attrs: a list of extended attributes
468 * @remains: remaining space in the buffer
469 *
470 * The bgp_encode_attrs() function takes a list of extended attributes
471 * and converts it to its BGP representation (a part of an Update message).
472 *
473 * Result: Length of the attribute block generated or -1 if not enough space.
474 */
475 unsigned int
476 bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains)
477 {
478 unsigned int i, code, type, flags;
479 byte *start = w;
480 int len, rv;
481
482 for(i=0; i<attrs->count; i++)
483 {
484 eattr *a = &attrs->attrs[i];
485 ASSERT(EA_PROTO(a->id) == EAP_BGP);
486 code = EA_ID(a->id);
487
488 #ifdef IPV6
489 /* When talking multiprotocol BGP, the NEXT_HOP attributes are used only temporarily. */
490 if (code == BA_NEXT_HOP)
491 continue;
492 #endif
493
494 /* When AS4-aware BGP speaker is talking to non-AS4-aware BGP speaker,
495 * we have to convert our 4B AS_PATH to 2B AS_PATH and send our AS_PATH
496 * as optional AS4_PATH attribute.
497 */
498 if ((code == BA_AS_PATH) && (! p->as4_session))
499 {
500 len = a->u.ptr->length;
501
502 if (remains < (len + 4))
503 goto err_no_buffer;
504
505 /* Using temporary buffer because don't know a length of created attr
506 * and therefore a length of a header. Perhaps i should better always
507 * use BAF_EXT_LEN. */
508
509 byte buf[len];
510 int new_used;
511 int nl = as_path_convert_to_old(a->u.ptr, buf, &new_used);
512
513 DBG("BGP: Encoding old AS_PATH\n");
514 rv = bgp_encode_attr_hdr(w, BAF_TRANSITIVE, BA_AS_PATH, nl);
515 ADVANCE(w, remains, rv);
516 memcpy(w, buf, nl);
517 ADVANCE(w, remains, nl);
518
519 if (! new_used)
520 continue;
521
522 if (remains < (len + 4))
523 goto err_no_buffer;
524
525 /* We should discard AS_CONFED_SEQUENCE or AS_CONFED_SET path segments
526 * here but we don't support confederations and such paths we already
527 * discarded in bgp_check_as_path().
528 */
529
530 DBG("BGP: Encoding AS4_PATH\n");
531 rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_PATH, len);
532 ADVANCE(w, remains, rv);
533 memcpy(w, a->u.ptr->data, len);
534 ADVANCE(w, remains, len);
535
536 continue;
537 }
538
539 /* The same issue with AGGREGATOR attribute */
540 if ((code == BA_AGGREGATOR) && (! p->as4_session))
541 {
542 int new_used;
543
544 len = 6;
545 if (remains < (len + 3))
546 goto err_no_buffer;
547
548 rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AGGREGATOR, len);
549 ADVANCE(w, remains, rv);
550 aggregator_convert_to_old(a->u.ptr, w, &new_used);
551 ADVANCE(w, remains, len);
552
553 if (! new_used)
554 continue;
555
556 len = 8;
557 if (remains < (len + 3))
558 goto err_no_buffer;
559
560 rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_AGGREGATOR, len);
561 ADVANCE(w, remains, rv);
562 memcpy(w, a->u.ptr->data, len);
563 ADVANCE(w, remains, len);
564
565 continue;
566 }
567
568 /* Standard path continues here ... */
569
570 type = a->type & EAF_TYPE_MASK;
571 flags = a->flags & (BAF_OPTIONAL | BAF_TRANSITIVE | BAF_PARTIAL);
572 len = bgp_get_attr_len(a);
573
574 /* Skip empty sets */
575 if (((type == EAF_TYPE_INT_SET) || (type == EAF_TYPE_EC_SET)) && (len == 0))
576 continue;
577
578 if (remains < len + 4)
579 goto err_no_buffer;
580
581 rv = bgp_encode_attr_hdr(w, flags, code, len);
582 ADVANCE(w, remains, rv);
583
584 switch (type)
585 {
586 case EAF_TYPE_INT:
587 case EAF_TYPE_ROUTER_ID:
588 if (len == 4)
589 put_u32(w, a->u.data);
590 else
591 *w = a->u.data;
592 break;
593 case EAF_TYPE_IP_ADDRESS:
594 {
595 ip_addr ip = *(ip_addr *)a->u.ptr->data;
596 ipa_hton(ip);
597 memcpy(w, &ip, len);
598 break;
599 }
600 case EAF_TYPE_INT_SET:
601 case EAF_TYPE_EC_SET:
602 {
603 u32 *z = int_set_get_data(a->u.ptr);
604 int i;
605 for(i=0; i<len; i+=4)
606 put_u32(w+i, *z++);
607 break;
608 }
609 case EAF_TYPE_OPAQUE:
610 case EAF_TYPE_AS_PATH:
611 memcpy(w, a->u.ptr->data, len);
612 break;
613 default:
614 bug("bgp_encode_attrs: unknown attribute type %02x", a->type);
615 }
616 ADVANCE(w, remains, len);
617 }
618 return w - start;
619
620 err_no_buffer:
621 return -1;
622 }
623
624 /*
625 static void
626 bgp_init_prefix(struct fib_node *N)
627 {
628 struct bgp_prefix *p = (struct bgp_prefix *) N;
629 p->bucket_node.next = NULL;
630 }
631 */
632
633 static int
634 bgp_compare_u32(const u32 *x, const u32 *y)
635 {
636 return (*x < *y) ? -1 : (*x > *y) ? 1 : 0;
637 }
638
639 static inline void
640 bgp_normalize_int_set(u32 *dest, u32 *src, unsigned cnt)
641 {
642 memcpy(dest, src, sizeof(u32) * cnt);
643 qsort(dest, cnt, sizeof(u32), (int(*)(const void *, const void *)) bgp_compare_u32);
644 }
645
646 static int
647 bgp_compare_ec(const u32 *xp, const u32 *yp)
648 {
649 u64 x = ec_get(xp, 0);
650 u64 y = ec_get(yp, 0);
651 return (x < y) ? -1 : (x > y) ? 1 : 0;
652 }
653
654 static inline void
655 bgp_normalize_ec_set(struct adata *ad, u32 *src, int internal)
656 {
657 u32 *dst = int_set_get_data(ad);
658
659 /* Remove non-transitive communities (EC_TBIT active) on external sessions */
660 if (! internal)
661 {
662 int len = int_set_get_size(ad);
663 u32 *t = dst;
664 int i;
665
666 for (i=0; i < len; i += 2)
667 {
668 if (src[i] & EC_TBIT)
669 continue;
670
671 *t++ = src[i];
672 *t++ = src[i+1];
673 }
674
675 ad->length = (t - dst) * 4;
676 }
677 else
678 memcpy(dst, src, ad->length);
679
680 qsort(dst, ad->length / 8, 8, (int(*)(const void *, const void *)) bgp_compare_ec);
681 }
682
683 static void
684 bgp_rehash_buckets(struct bgp_proto *p)
685 {
686 struct bgp_bucket **old = p->bucket_hash;
687 struct bgp_bucket **new;
688 unsigned oldn = p->hash_size;
689 unsigned i, e, mask;
690 struct bgp_bucket *b;
691
692 p->hash_size = p->hash_limit;
693 DBG("BGP: Rehashing bucket table from %d to %d\n", oldn, p->hash_size);
694 p->hash_limit *= 4;
695 if (p->hash_limit >= 65536)
696 p->hash_limit = ~0;
697 new = p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *));
698 mask = p->hash_size - 1;
699 for (i=0; i<oldn; i++)
700 while (b = old[i])
701 {
702 old[i] = b->hash_next;
703 e = b->hash & mask;
704 b->hash_next = new[e];
705 if (b->hash_next)
706 b->hash_next->hash_prev = b;
707 b->hash_prev = NULL;
708 new[e] = b;
709 }
710 mb_free(old);
711 }
712
713 static struct bgp_bucket *
714 bgp_new_bucket(struct bgp_proto *p, ea_list *new, unsigned hash)
715 {
716 struct bgp_bucket *b;
717 unsigned ea_size = sizeof(ea_list) + new->count * sizeof(eattr);
718 unsigned ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN);
719 unsigned size = sizeof(struct bgp_bucket) + ea_size;
720 unsigned i;
721 byte *dest;
722 unsigned index = hash & (p->hash_size - 1);
723
724 /* Gather total size of non-inline attributes */
725 for (i=0; i<new->count; i++)
726 {
727 eattr *a = &new->attrs[i];
728 if (!(a->type & EAF_EMBEDDED))
729 size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN);
730 }
731
732 /* Create the bucket and hash it */
733 b = mb_alloc(p->p.pool, size);
734 b->hash_next = p->bucket_hash[index];
735 if (b->hash_next)
736 b->hash_next->hash_prev = b;
737 p->bucket_hash[index] = b;
738 b->hash_prev = NULL;
739 b->hash = hash;
740 add_tail(&p->bucket_queue, &b->send_node);
741 init_list(&b->prefixes);
742 memcpy(b->eattrs, new, ea_size);
743 dest = ((byte *)b->eattrs) + ea_size_aligned;
744
745 /* Copy values of non-inline attributes */
746 for (i=0; i<new->count; i++)
747 {
748 eattr *a = &b->eattrs->attrs[i];
749 if (!(a->type & EAF_EMBEDDED))
750 {
751 struct adata *oa = a->u.ptr;
752 struct adata *na = (struct adata *) dest;
753 memcpy(na, oa, sizeof(struct adata) + oa->length);
754 a->u.ptr = na;
755 dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN);
756 }
757 }
758
759 /* If needed, rehash */
760 p->hash_count++;
761 if (p->hash_count > p->hash_limit)
762 bgp_rehash_buckets(p);
763
764 return b;
765 }
766
767 static struct bgp_bucket *
768 bgp_get_bucket(struct bgp_proto *p, net *n, ea_list *attrs, int originate)
769 {
770 ea_list *new;
771 unsigned i, cnt, hash, code;
772 eattr *a, *d;
773 u32 seen = 0;
774 struct bgp_bucket *b;
775
776 /* Merge the attribute list */
777 new = alloca(ea_scan(attrs));
778 ea_merge(attrs, new);
779 ea_sort(new);
780
781 /* Normalize attributes */
782 d = new->attrs;
783 cnt = new->count;
784 new->count = 0;
785 for(i=0; i<cnt; i++)
786 {
787 a = &new->attrs[i];
788 if (EA_PROTO(a->id) != EAP_BGP)
789 continue;
790 code = EA_ID(a->id);
791 if (ATTR_KNOWN(code))
792 {
793 if (!bgp_attr_table[code].allow_in_ebgp && !p->is_internal)
794 continue;
795 /* The flags might have been zero if the attr was added by filters */
796 a->flags = (a->flags & BAF_PARTIAL) | bgp_attr_table[code].expected_flags;
797 if (code < 32)
798 seen |= 1 << code;
799 }
800 else
801 {
802 /* Don't re-export unknown non-transitive attributes */
803 if (!(a->flags & BAF_TRANSITIVE))
804 continue;
805 }
806 *d = *a;
807 if ((d->type & EAF_ORIGINATED) && !originate && (d->flags & BAF_TRANSITIVE) && (d->flags & BAF_OPTIONAL))
808 d->flags |= BAF_PARTIAL;
809 switch (d->type & EAF_TYPE_MASK)
810 {
811 case EAF_TYPE_INT_SET:
812 {
813 struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
814 z->length = d->u.ptr->length;
815 bgp_normalize_int_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / 4);
816 d->u.ptr = z;
817 break;
818 }
819 case EAF_TYPE_EC_SET:
820 {
821 struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
822 z->length = d->u.ptr->length;
823 bgp_normalize_ec_set(z, (u32 *) d->u.ptr->data, p->is_internal);
824 d->u.ptr = z;
825 break;
826 }
827 default: ;
828 }
829 d++;
830 new->count++;
831 }
832
833 /* Hash */
834 hash = ea_hash(new);
835 for(b=p->bucket_hash[hash & (p->hash_size - 1)]; b; b=b->hash_next)
836 if (b->hash == hash && ea_same(b->eattrs, new))
837 {
838 DBG("Found bucket.\n");
839 return b;
840 }
841
842 /* Ensure that there are all mandatory attributes */
843 for(i=0; i<ARRAY_SIZE(bgp_mandatory_attrs); i++)
844 if (!(seen & (1 << bgp_mandatory_attrs[i])))
845 {
846 log(L_ERR "%s: Mandatory attribute %s missing in route %I/%d", p->p.name, bgp_attr_table[bgp_mandatory_attrs[i]].name, n->n.prefix, n->n.pxlen);
847 return NULL;
848 }
849
850 /* Check if next hop is valid */
851 a = ea_find(new, EA_CODE(EAP_BGP, BA_NEXT_HOP));
852 if (!a || ipa_equal(p->cf->remote_ip, *(ip_addr *)a->u.ptr->data))
853 {
854 log(L_ERR "%s: Invalid NEXT_HOP attribute in route %I/%d", p->p.name, n->n.prefix, n->n.pxlen);
855 return NULL;
856 }
857
858 /* Create new bucket */
859 DBG("Creating bucket.\n");
860 return bgp_new_bucket(p, new, hash);
861 }
862
863 void
864 bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck)
865 {
866 if (buck->hash_next)
867 buck->hash_next->hash_prev = buck->hash_prev;
868 if (buck->hash_prev)
869 buck->hash_prev->hash_next = buck->hash_next;
870 else
871 p->bucket_hash[buck->hash & (p->hash_size-1)] = buck->hash_next;
872 mb_free(buck);
873 }
874
875
876 /* Prefix hash table */
877
878 static inline u32 prefix_hash(ip_addr prefix, int pxlen, u32 path_id, u32 order)
879 {
880 u32 x = ipa_hash(prefix) + pxlen + path_id;
881 return (x * 2902958171u) >> (32 - order);
882 }
883
884 static inline u32 px_hash_size(struct bgp_proto *p)
885 { return 1 << p->px_hash_order; }
886
887 void
888 bgp_init_prefix_table(struct bgp_proto *p, u32 order)
889 {
890 p->px_hash_count = 0;
891 p->px_hash_order = order;
892 p->prefix_table = mb_allocz(p->p.pool, px_hash_size(p) * sizeof(struct bgp_prefix *));
893 p->prefix_slab = sl_new(p->p.pool, sizeof(struct bgp_prefix));
894 }
895
896 static void
897 bgp_rehash_prefix_table(struct bgp_proto *p, int step)
898 {
899 struct bgp_prefix **old_tab, *px, *px_next;
900 u32 old_size, hash, i;
901
902 old_tab = p->prefix_table;
903 old_size = px_hash_size(p);
904
905 p->px_hash_order += step;
906 p->prefix_table = mb_allocz(p->p.pool, px_hash_size(p) * sizeof(struct bgp_prefix *));
907
908 for (i = 0; i < old_size; i++)
909 for (px = old_tab[i]; px; px = px_next)
910 {
911 px_next = px->next;
912 hash = prefix_hash(px->n.prefix, px->n.pxlen, px->path_id, p->px_hash_order);
913 px->next = p->prefix_table[hash];
914 p->prefix_table[hash] = px;
915 }
916
917 mb_free(old_tab);
918 }
919
920 static struct bgp_prefix *
921 bgp_get_prefix(struct bgp_proto *p, ip_addr prefix, int pxlen, u32 path_id)
922 {
923 struct bgp_prefix *bp;
924 u32 hash = prefix_hash(prefix, pxlen, path_id, p->px_hash_order);
925
926 for (bp = p->prefix_table[hash]; bp; bp = bp->next)
927 if (bp->n.pxlen == pxlen && ipa_equal(bp->n.prefix, prefix) && bp->path_id == path_id)
928 return bp;
929
930 bp = sl_alloc(p->prefix_slab);
931 bp->n.prefix = prefix;
932 bp->n.pxlen = pxlen;
933 bp->path_id = path_id;
934 bp->next = p->prefix_table[hash];
935 p->prefix_table[hash] = bp;
936
937 bp->bucket_node.next = NULL;
938
939 p->px_hash_count++;
940 if ((p->px_hash_count > px_hash_size(p)) && (p->px_hash_order < 18))
941 bgp_rehash_prefix_table(p, 1);
942
943 return bp;
944 }
945
946 void
947 bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp)
948 {
949 struct bgp_prefix **bpp;
950 u32 hash = prefix_hash(bp->n.prefix, bp->n.pxlen, bp->path_id, p->px_hash_order);
951
952 for (bpp = &p->prefix_table[hash]; *bpp; *bpp = (*bpp)->next)
953 if (*bpp == bp)
954 break;
955
956 *bpp = bp->next;
957 sl_free(p->prefix_slab, bp);
958
959 p->px_hash_count--;
960 if ((p->px_hash_count < (px_hash_size(p) / 4)) && (p->px_hash_order > 10))
961 bgp_rehash_prefix_table(p, -1);
962 }
963
964
965 void
966 bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs)
967 {
968 struct bgp_proto *p = (struct bgp_proto *) P;
969 struct bgp_bucket *buck;
970 struct bgp_prefix *px;
971 rte *key;
972 u32 path_id;
973
974 DBG("BGP: Got route %I/%d %s\n", n->n.prefix, n->n.pxlen, new ? "up" : "down");
975
976 if (new)
977 {
978 key = new;
979 buck = bgp_get_bucket(p, n, attrs, new->attrs->source != RTS_BGP);
980 if (!buck) /* Inconsistent attribute list */
981 return;
982 }
983 else
984 {
985 key = old;
986 if (!(buck = p->withdraw_bucket))
987 {
988 buck = p->withdraw_bucket = mb_alloc(P->pool, sizeof(struct bgp_bucket));
989 init_list(&buck->prefixes);
990 }
991 }
992 path_id = p->add_path_tx ? key->attrs->src->global_id : 0;
993 px = bgp_get_prefix(p, n->n.prefix, n->n.pxlen, path_id);
994 if (px->bucket_node.next)
995 {
996 DBG("\tRemoving old entry.\n");
997 rem_node(&px->bucket_node);
998 }
999 add_tail(&buck->prefixes, &px->bucket_node);
1000 bgp_schedule_packet(p->conn, PKT_UPDATE);
1001 }
1002
1003 static int
1004 bgp_create_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool)
1005 {
1006 ea_list *ea = lp_alloc(pool, sizeof(ea_list) + 4*sizeof(eattr));
1007 rta *rta = e->attrs;
1008 byte *z;
1009
1010 ea->next = *attrs;
1011 *attrs = ea;
1012 ea->flags = EALF_SORTED;
1013 ea->count = 4;
1014
1015 bgp_set_attr(ea->attrs, BA_ORIGIN,
1016 ((rta->source == RTS_OSPF_EXT1) || (rta->source == RTS_OSPF_EXT2)) ? ORIGIN_INCOMPLETE : ORIGIN_IGP);
1017
1018 if (p->is_internal)
1019 bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 0);
1020 else
1021 {
1022 z = bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 6);
1023 z[0] = AS_PATH_SEQUENCE;
1024 z[1] = 1; /* 1 AS */
1025 put_u32(z+2, p->local_as);
1026 }
1027
1028 /* iBGP -> use gw, eBGP multi-hop -> use source_addr,
1029 eBGP single-hop -> use gw if on the same iface */
1030 z = bgp_set_attr_wa(ea->attrs+2, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
1031 if (p->cf->next_hop_self ||
1032 rta->dest != RTD_ROUTER ||
1033 ipa_equal(rta->gw, IPA_NONE) ||
1034 ipa_has_link_scope(rta->gw) ||
1035 (!p->is_internal && !p->cf->next_hop_keep &&
1036 (!p->neigh || (rta->iface != p->neigh->iface))))
1037 set_next_hop(z, p->source_addr);
1038 else
1039 set_next_hop(z, rta->gw);
1040
1041 bgp_set_attr(ea->attrs+3, BA_LOCAL_PREF, p->cf->default_local_pref);
1042
1043 return 0; /* Leave decision to the filters */
1044 }
1045
1046
1047 static inline int
1048 bgp_as_path_loopy(struct bgp_proto *p, rta *a)
1049 {
1050 int num = p->cf->allow_local_as + 1;
1051 eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1052 return (e && (num > 0) && as_path_contains(e->u.ptr, p->local_as, num));
1053 }
1054
1055 static inline int
1056 bgp_originator_id_loopy(struct bgp_proto *p, rta *a)
1057 {
1058 eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
1059 return (e && (e->u.data == p->local_id));
1060 }
1061
1062 static inline int
1063 bgp_cluster_list_loopy(struct bgp_proto *p, rta *a)
1064 {
1065 eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
1066 return (e && p->rr_client && int_set_contains(e->u.ptr, p->rr_cluster_id));
1067 }
1068
1069
1070 static inline void
1071 bgp_path_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 as)
1072 {
1073 eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1074 bgp_attach_attr(attrs, pool, BA_AS_PATH, (uintptr_t) as_path_prepend(pool, a->u.ptr, as));
1075 }
1076
1077 static inline void
1078 bgp_cluster_list_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 cid)
1079 {
1080 eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
1081 bgp_attach_attr(attrs, pool, BA_CLUSTER_LIST, (uintptr_t) int_set_add(pool, a ? a->u.ptr : NULL, cid));
1082 }
1083
1084 static int
1085 bgp_update_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool, int rr)
1086 {
1087 eattr *a;
1088
1089 if (!p->is_internal && !p->rs_client)
1090 {
1091 bgp_path_prepend(e, attrs, pool, p->local_as);
1092
1093 /* The MULTI_EXIT_DISC attribute received from a neighboring AS MUST NOT be
1094 * propagated to other neighboring ASes.
1095 * Perhaps it would be better to undefine it.
1096 */
1097 a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
1098 if (a)
1099 bgp_attach_attr(attrs, pool, BA_MULTI_EXIT_DISC, 0);
1100 }
1101
1102 /* iBGP -> keep next_hop, eBGP multi-hop -> use source_addr,
1103 * eBGP single-hop -> keep next_hop if on the same iface.
1104 * If the next_hop is zero (i.e. link-local), keep only if on the same iface.
1105 *
1106 * Note that same-iface-check uses iface from route, which is based on gw.
1107 */
1108 a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
1109 if (a && !p->cf->next_hop_self &&
1110 (p->cf->next_hop_keep ||
1111 (p->is_internal && ipa_nonzero(*((ip_addr *) a->u.ptr->data))) ||
1112 (p->neigh && (e->attrs->iface == p->neigh->iface))))
1113 {
1114 /* Leave the original next hop attribute, will check later where does it point */
1115 }
1116 else
1117 {
1118 /* Need to create new one */
1119 byte *b = bgp_attach_attr_wa(attrs, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
1120 set_next_hop(b, p->source_addr);
1121 }
1122
1123 if (rr)
1124 {
1125 /* Handling route reflection, RFC 4456 */
1126 struct bgp_proto *src = (struct bgp_proto *) e->attrs->src->proto;
1127
1128 a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
1129 if (!a)
1130 bgp_attach_attr(attrs, pool, BA_ORIGINATOR_ID, src->remote_id);
1131
1132 /* We attach proper cluster ID according to whether the route is entering or leaving the cluster */
1133 bgp_cluster_list_prepend(e, attrs, pool, src->rr_client ? src->rr_cluster_id : p->rr_cluster_id);
1134
1135 /* Two RR clients with different cluster ID, hmmm */
1136 if (src->rr_client && p->rr_client && (src->rr_cluster_id != p->rr_cluster_id))
1137 bgp_cluster_list_prepend(e, attrs, pool, p->rr_cluster_id);
1138 }
1139
1140 return 0; /* Leave decision to the filters */
1141 }
1142
1143 static int
1144 bgp_community_filter(struct bgp_proto *p, rte *e)
1145 {
1146 eattr *a;
1147 struct adata *d;
1148
1149 /* Check if we aren't forbidden to export the route by communities */
1150 a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY));
1151 if (a)
1152 {
1153 d = a->u.ptr;
1154 if (int_set_contains(d, BGP_COMM_NO_ADVERTISE))
1155 {
1156 DBG("\tNO_ADVERTISE\n");
1157 return 1;
1158 }
1159 if (!p->is_internal &&
1160 (int_set_contains(d, BGP_COMM_NO_EXPORT) ||
1161 int_set_contains(d, BGP_COMM_NO_EXPORT_SUBCONFED)))
1162 {
1163 DBG("\tNO_EXPORT\n");
1164 return 1;
1165 }
1166 }
1167
1168 return 0;
1169 }
1170
1171 int
1172 bgp_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool *pool)
1173 {
1174 rte *e = *new;
1175 struct bgp_proto *p = (struct bgp_proto *) P;
1176 struct bgp_proto *new_bgp = (e->attrs->src->proto->proto == &proto_bgp) ?
1177 (struct bgp_proto *) e->attrs->src->proto : NULL;
1178
1179 if (p == new_bgp) /* Poison reverse updates */
1180 return -1;
1181 if (new_bgp)
1182 {
1183 /* We should check here for cluster list loop, because the receiving BGP instance
1184 might have different cluster ID */
1185 if (bgp_cluster_list_loopy(p, e->attrs))
1186 return -1;
1187
1188 if (p->cf->interpret_communities && bgp_community_filter(p, e))
1189 return -1;
1190
1191 if (p->local_as == new_bgp->local_as && p->is_internal && new_bgp->is_internal)
1192 {
1193 /* Redistribution of internal routes with IBGP */
1194 if (p->rr_client || new_bgp->rr_client)
1195 /* Route reflection, RFC 4456 */
1196 return bgp_update_attrs(p, e, attrs, pool, 1);
1197 else
1198 return -1;
1199 }
1200 else
1201 return bgp_update_attrs(p, e, attrs, pool, 0);
1202 }
1203 else
1204 return bgp_create_attrs(p, e, attrs, pool);
1205 }
1206
1207 static inline u32
1208 bgp_get_neighbor(rte *r)
1209 {
1210 eattr *e = ea_find(r->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1211 u32 as;
1212
1213 if (e && as_path_get_first(e->u.ptr, &as))
1214 return as;
1215 else
1216 return ((struct bgp_proto *) r->attrs->src->proto)->remote_as;
1217 }
1218
1219 static inline int
1220 rte_resolvable(rte *rt)
1221 {
1222 int rd = rt->attrs->dest;
1223 return (rd == RTD_ROUTER) || (rd == RTD_DEVICE) || (rd == RTD_MULTIPATH);
1224 }
1225
1226 int
1227 bgp_rte_better(rte *new, rte *old)
1228 {
1229 struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->src->proto;
1230 struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->src->proto;
1231 eattr *x, *y;
1232 u32 n, o;
1233
1234 /* Skip suppressed routes (see bgp_rte_recalculate()) */
1235 n = new->u.bgp.suppressed;
1236 o = old->u.bgp.suppressed;
1237 if (n > o)
1238 return 0;
1239 if (n < o)
1240 return 1;
1241
1242 /* RFC 4271 9.1.2.1. Route resolvability test */
1243 n = rte_resolvable(new);
1244 o = rte_resolvable(old);
1245 if (n > o)
1246 return 1;
1247 if (n < o)
1248 return 0;
1249
1250 /* Start with local preferences */
1251 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
1252 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
1253 n = x ? x->u.data : new_bgp->cf->default_local_pref;
1254 o = y ? y->u.data : old_bgp->cf->default_local_pref;
1255 if (n > o)
1256 return 1;
1257 if (n < o)
1258 return 0;
1259
1260 /* RFC 4271 9.1.2.2. a) Use AS path lengths */
1261 if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths)
1262 {
1263 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1264 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1265 n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
1266 o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;
1267 if (n < o)
1268 return 1;
1269 if (n > o)
1270 return 0;
1271 }
1272
1273 /* RFC 4271 9.1.2.2. b) Use origins */
1274 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
1275 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
1276 n = x ? x->u.data : ORIGIN_INCOMPLETE;
1277 o = y ? y->u.data : ORIGIN_INCOMPLETE;
1278 if (n < o)
1279 return 1;
1280 if (n > o)
1281 return 0;
1282
1283 /* RFC 4271 9.1.2.2. c) Compare MED's */
1284 /* Proper RFC 4271 path selection cannot be interpreted as finding
1285 * the best path in some ordering. It is implemented partially in
1286 * bgp_rte_recalculate() when deterministic_med option is
1287 * active. Without that option, the behavior is just an
1288 * approximation, which in specific situations may lead to
1289 * persistent routing loops, because it is nondeterministic - it
1290 * depends on the order in which routes appeared. But it is also the
1291 * same behavior as used by default in Cisco routers, so it is
1292 * probably not a big issue.
1293 */
1294 if (new_bgp->cf->med_metric || old_bgp->cf->med_metric ||
1295 (bgp_get_neighbor(new) == bgp_get_neighbor(old)))
1296 {
1297 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
1298 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
1299 n = x ? x->u.data : new_bgp->cf->default_med;
1300 o = y ? y->u.data : old_bgp->cf->default_med;
1301 if (n < o)
1302 return 1;
1303 if (n > o)
1304 return 0;
1305 }
1306
1307 /* RFC 4271 9.1.2.2. d) Prefer external peers */
1308 if (new_bgp->is_internal > old_bgp->is_internal)
1309 return 0;
1310 if (new_bgp->is_internal < old_bgp->is_internal)
1311 return 1;
1312
1313 /* RFC 4271 9.1.2.2. e) Compare IGP metrics */
1314 n = new_bgp->cf->igp_metric ? new->attrs->igp_metric : 0;
1315 o = old_bgp->cf->igp_metric ? old->attrs->igp_metric : 0;
1316 if (n < o)
1317 return 1;
1318 if (n > o)
1319 return 0;
1320
1321 /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */
1322 /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighor ID */
1323 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
1324 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
1325 n = x ? x->u.data : new_bgp->remote_id;
1326 o = y ? y->u.data : old_bgp->remote_id;
1327
1328 /* RFC 5004 - prefer older routes */
1329 /* (if both are external and from different peer) */
1330 if ((new_bgp->cf->prefer_older || old_bgp->cf->prefer_older) &&
1331 !new_bgp->is_internal && n != o)
1332 return 0;
1333
1334 /* rest of RFC 4271 9.1.2.2. f) */
1335 if (n < o)
1336 return 1;
1337 if (n > o)
1338 return 0;
1339
1340 /* RFC 4456 9. b) Compare cluster list lengths */
1341 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
1342 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
1343 n = x ? int_set_get_size(x->u.ptr) : 0;
1344 o = y ? int_set_get_size(y->u.ptr) : 0;
1345 if (n < o)
1346 return 1;
1347 if (n > o)
1348 return 0;
1349
1350 /* RFC 4271 9.1.2.2. g) Compare peer IP adresses */
1351 return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0);
1352 }
1353
1354
1355 static inline int
1356 same_group(rte *r, u32 lpref, u32 lasn)
1357 {
1358 return (r->pref == lpref) && (bgp_get_neighbor(r) == lasn);
1359 }
1360
1361 static inline int
1362 use_deterministic_med(rte *r)
1363 {
1364 struct proto *P = r->attrs->src->proto;
1365 return (P->proto == &proto_bgp) && ((struct bgp_proto *) P)->cf->deterministic_med;
1366 }
1367
1368 int
1369 bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
1370 {
1371 rte *r, *s;
1372 rte *key = new ? new : old;
1373 u32 lpref = key->pref;
1374 u32 lasn = bgp_get_neighbor(key);
1375 int old_is_group_best = 0;
1376
1377 /*
1378 * Proper RFC 4271 path selection is a bit complicated, it cannot be
1379 * implemented just by rte_better(), because it is not a linear
1380 * ordering. But it can be splitted to two levels, where the lower
1381 * level chooses the best routes in each group of routes from the
1382 * same neighboring AS and higher level chooses the best route (with
1383 * a slightly different ordering) between the best-in-group routes.
1384 *
1385 * When deterministic_med is disabled, we just ignore this issue and
1386 * choose the best route by bgp_rte_better() alone. If enabled, the
1387 * lower level of the route selection is done here (for the group
1388 * to which the changed route belongs), all routes in group are
1389 * marked as suppressed, just chosen best-in-group is not.
1390 *
1391 * Global best route selection then implements higher level by
1392 * choosing between non-suppressed routes (as they are always
1393 * preferred over suppressed routes). Routes from BGP protocols
1394 * that do not set deterministic_med are just never suppressed. As
1395 * they do not participate in the lower level selection, it is OK
1396 * that this fn is not called for them.
1397 *
1398 * The idea is simple, the implementation is more problematic,
1399 * mostly because of optimizations in rte_recalculate() that
1400 * avoids full recalculation in most cases.
1401 *
1402 * We can assume that at least one of new, old is non-NULL and both
1403 * are from the same protocol with enabled deterministic_med. We
1404 * group routes by both neighbor AS (lasn) and preference (lpref),
1405 * because bgp_rte_better() does not handle preference itself.
1406 */
1407
1408 /* If new and old are from different groups, we just process that
1409 as two independent events */
1410 if (new && old && !same_group(old, lpref, lasn))
1411 {
1412 int i1, i2;
1413 i1 = bgp_rte_recalculate(table, net, NULL, old, old_best);
1414 i2 = bgp_rte_recalculate(table, net, new, NULL, old_best);
1415 return i1 || i2;
1416 }
1417
1418 /*
1419 * We could find the best-in-group and then make some shortcuts like
1420 * in rte_recalculate, but as we would have to walk through all
1421 * net->routes just to find it, it is probably not worth. So we
1422 * just have two simpler fast cases that use just the old route.
1423 * We also set suppressed flag to avoid using it in bgp_rte_better().
1424 */
1425
1426 if (new)
1427 new->u.bgp.suppressed = 1;
1428
1429 if (old)
1430 {
1431 old_is_group_best = !old->u.bgp.suppressed;
1432 old->u.bgp.suppressed = 1;
1433 int new_is_better = new && bgp_rte_better(new, old);
1434
1435 /* The first case - replace not best with worse (or remove not best) */
1436 if (!old_is_group_best && !new_is_better)
1437 return 0;
1438
1439 /* The second case - replace the best with better */
1440 if (old_is_group_best && new_is_better)
1441 {
1442 /* new is best-in-group, the see discussion below - this is
1443 a special variant of NBG && OBG. From OBG we can deduce
1444 that same_group(old_best) iff (old == old_best) */
1445 new->u.bgp.suppressed = 0;
1446 return (old == old_best);
1447 }
1448 }
1449
1450 /* The default case - find a new best-in-group route */
1451 r = new; /* new may not be in the list */
1452 for (s=net->routes; rte_is_valid(s); s=s->next)
1453 if (use_deterministic_med(s) && same_group(s, lpref, lasn))
1454 {
1455 s->u.bgp.suppressed = 1;
1456 if (!r || bgp_rte_better(s, r))
1457 r = s;
1458 }
1459
1460 /* Simple case - the last route in group disappears */
1461 if (!r)
1462 return 0;
1463
1464 /* Found best-in-group */
1465 r->u.bgp.suppressed = 0;
1466
1467 /*
1468 * There are generally two reasons why we have to force
1469 * recalculation (return 1): First, the new route may be wrongfully
1470 * chosen to be the best in the first case check in
1471 * rte_recalculate(), this may happen only if old_best is from the
1472 * same group. Second, another (different than new route)
1473 * best-in-group is chosen and that may be the proper best (although
1474 * rte_recalculate() without ignore that possibility).
1475 *
1476 * There are three possible cases according to whether the old route
1477 * was the best in group (OBG, stored in old_is_group_best) and
1478 * whether the new route is the best in group (NBG, tested by r == new).
1479 * These cases work even if old or new is NULL.
1480 *
1481 * NBG -> new is a possible candidate for the best route, so we just
1482 * check for the first reason using same_group().
1483 *
1484 * !NBG && OBG -> Second reason applies, return 1
1485 *
1486 * !NBG && !OBG -> Best in group does not change, old != old_best,
1487 * rte_better(new, old_best) is false and therefore
1488 * the first reason does not apply, return 0
1489 */
1490
1491 if (r == new)
1492 return old_best && same_group(old_best, lpref, lasn);
1493 else
1494 return old_is_group_best;
1495 }
1496
1497 static struct adata *
1498 bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool)
1499 {
1500 struct adata *newa = lp_alloc(pool, sizeof(struct adata) + 8);
1501 newa->length = 8;
1502 aggregator_convert_to_new(old, newa->data);
1503 return newa;
1504 }
1505
1506
1507 /* Take last req_as ASNs from path old2 (in 2B format), convert to 4B format
1508 * and append path old4 (in 4B format).
1509 */
1510 static struct adata *
1511 bgp_merge_as_paths(struct adata *old2, struct adata *old4, int req_as, struct linpool *pool)
1512 {
1513 byte buf[old2->length * 2];
1514
1515 int ol = as_path_convert_to_new(old2, buf, req_as);
1516 int nl = ol + (old4 ? old4->length : 0);
1517
1518 struct adata *newa = lp_alloc(pool, sizeof(struct adata) + nl);
1519 newa->length = nl;
1520 memcpy(newa->data, buf, ol);
1521 if (old4) memcpy(newa->data + ol, old4->data, old4->length);
1522
1523 return newa;
1524 }
1525
1526 static int
1527 as4_aggregator_valid(struct adata *aggr)
1528 {
1529 return aggr->length == 8;
1530 }
1531
1532
1533 /* Reconstruct 4B AS_PATH and AGGREGATOR according to RFC 4893 4.2.3 */
1534 static void
1535 bgp_reconstruct_4b_atts(struct bgp_proto *p, rta *a, struct linpool *pool)
1536 {
1537 eattr *p2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1538 eattr *p4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_PATH));
1539 eattr *a2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AGGREGATOR));
1540 eattr *a4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR));
1541 int a4_removed = 0;
1542
1543 if (a4 && !as4_aggregator_valid(a4->u.ptr))
1544 {
1545 log(L_WARN "%s: AS4_AGGREGATOR attribute is invalid, skipping attribute", p->p.name);
1546 a4 = NULL;
1547 a4_removed = 1;
1548 }
1549
1550 if (a2)
1551 {
1552 u32 a2_as = get_u16(a2->u.ptr->data);
1553
1554 if (a4)
1555 {
1556 if (a2_as != AS_TRANS)
1557 {
1558 /* Routes were aggregated by old router and therefore AS4_PATH
1559 * and AS4_AGGREGATOR is invalid
1560 *
1561 * Convert AS_PATH and AGGREGATOR to 4B format and finish.
1562 */
1563
1564 a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool);
1565 p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool);
1566
1567 return;
1568 }
1569 else
1570 {
1571 /* Common case, use AS4_AGGREGATOR attribute */
1572 a2->u.ptr = a4->u.ptr;
1573 }
1574 }
1575 else
1576 {
1577 /* Common case, use old AGGREGATOR attribute */
1578 a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool);
1579
1580 if ((a2_as == AS_TRANS) && !a4_removed)
1581 log(L_WARN "%s: AGGREGATOR attribute contain AS_TRANS, but AS4_AGGREGATOR is missing", p->p.name);
1582 }
1583 }
1584 else
1585 if (a4)
1586 log(L_WARN "%s: AS4_AGGREGATOR attribute received, but AGGREGATOR attribute is missing", p->p.name);
1587
1588 int p2_len = as_path_getlen_int(p2->u.ptr, 2);
1589 int p4_len = p4 ? validate_as4_path(p, p4->u.ptr) : -1;
1590
1591 if (p4 && (p4_len < 0))
1592 log(L_WARN "%s: AS4_PATH attribute is malformed, skipping attribute", p->p.name);
1593
1594 if ((p4_len <= 0) || (p2_len < p4_len))
1595 p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool);
1596 else
1597 p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, p4->u.ptr, p2_len - p4_len, pool);
1598 }
1599
1600 static void
1601 bgp_remove_as4_attrs(struct bgp_proto *p, rta *a)
1602 {
1603 unsigned id1 = EA_CODE(EAP_BGP, BA_AS4_PATH);
1604 unsigned id2 = EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR);
1605 ea_list **el = &(a->eattrs);
1606
1607 /* We know that ea_lists constructed in bgp_decode attrs have one attribute per ea_list struct */
1608 while (*el != NULL)
1609 {
1610 unsigned fid = (*el)->attrs[0].id;
1611
1612 if ((fid == id1) || (fid == id2))
1613 {
1614 *el = (*el)->next;
1615 if (p->as4_session)
1616 log(L_WARN "%s: Unexpected AS4_* attributes received", p->p.name);
1617 }
1618 else
1619 el = &((*el)->next);
1620 }
1621 }
1622
1623 /**
1624 * bgp_decode_attrs - check and decode BGP attributes
1625 * @conn: connection
1626 * @attr: start of attribute block
1627 * @len: length of attribute block
1628 * @pool: linear pool to make all the allocations in
1629 * @mandatory: 1 iff presence of mandatory attributes has to be checked
1630 *
1631 * This function takes a BGP attribute block (a part of an Update message), checks
1632 * its consistency and converts it to a list of BIRD route attributes represented
1633 * by a &rta.
1634 */
1635 struct rta *
1636 bgp_decode_attrs(struct bgp_conn *conn, byte *attr, unsigned int len, struct linpool *pool, int mandatory)
1637 {
1638 struct bgp_proto *bgp = conn->bgp;
1639 rta *a = lp_alloc(pool, sizeof(struct rta));
1640 unsigned int flags, code, l, i, type;
1641 int errcode;
1642 byte *z, *attr_start;
1643 byte seen[256/8];
1644 ea_list *ea;
1645 struct adata *ad;
1646 int withdraw = 0;
1647
1648 bzero(a, sizeof(rta));
1649 a->source = RTS_BGP;
1650 a->scope = SCOPE_UNIVERSE;
1651 a->cast = RTC_UNICAST;
1652 /* a->dest = RTD_ROUTER; -- set in bgp_set_next_hop() */
1653 a->from = bgp->cf->remote_ip;
1654
1655 /* Parse the attributes */
1656 bzero(seen, sizeof(seen));
1657 DBG("BGP: Parsing attributes\n");
1658 while (len)
1659 {
1660 if (len < 2)
1661 goto malformed;
1662 attr_start = attr;
1663 flags = *attr++;
1664 code = *attr++;
1665 len -= 2;
1666 if (flags & BAF_EXT_LEN)
1667 {
1668 if (len < 2)
1669 goto malformed;
1670 l = get_u16(attr);
1671 attr += 2;
1672 len -= 2;
1673 }
1674 else
1675 {
1676 if (len < 1)
1677 goto malformed;
1678 l = *attr++;
1679 len--;
1680 }
1681 if (l > len)
1682 goto malformed;
1683 len -= l;
1684 z = attr;
1685 attr += l;
1686 DBG("Attr %02x %02x %d\n", code, flags, l);
1687 if (seen[code/8] & (1 << (code%8)))
1688 goto malformed;
1689 if (ATTR_KNOWN(code))
1690 {
1691 struct attr_desc *desc = &bgp_attr_table[code];
1692 if (desc->expected_length >= 0 && desc->expected_length != (int) l)
1693 { errcode = 5; goto err; }
1694 if ((desc->expected_flags ^ flags) & (BAF_OPTIONAL | BAF_TRANSITIVE))
1695 { errcode = 4; goto err; }
1696 if (!desc->allow_in_ebgp && !bgp->is_internal)
1697 continue;
1698 if (desc->validate)
1699 {
1700 errcode = desc->validate(bgp, z, l);
1701 if (errcode > 0)
1702 goto err;
1703 if (errcode == IGNORE)
1704 continue;
1705 if (errcode <= WITHDRAW)
1706 {
1707 log(L_WARN "%s: Attribute %s is malformed, withdrawing update",
1708 bgp->p.name, desc->name);
1709 withdraw = 1;
1710 }
1711 }
1712 else if (code == BA_AS_PATH)
1713 {
1714 /* Special case as it might also trim the attribute */
1715 if (validate_as_path(bgp, z, &l) < 0)
1716 { errcode = 11; goto err; }
1717 }
1718 type = desc->type;
1719 }
1720 else /* Unknown attribute */
1721 {
1722 if (!(flags & BAF_OPTIONAL))
1723 { errcode = 2; goto err; }
1724 type = EAF_TYPE_OPAQUE;
1725 }
1726
1727 // Only OPTIONAL and TRANSITIVE attributes may have non-zero PARTIAL flag
1728 // if (!((flags & BAF_OPTIONAL) && (flags & BAF_TRANSITIVE)) && (flags & BAF_PARTIAL))
1729 // { errcode = 4; goto err; }
1730
1731 seen[code/8] |= (1 << (code%8));
1732 ea = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
1733 ea->next = a->eattrs;
1734 a->eattrs = ea;
1735 ea->flags = 0;
1736 ea->count = 1;
1737 ea->attrs[0].id = EA_CODE(EAP_BGP, code);
1738 ea->attrs[0].flags = flags;
1739 ea->attrs[0].type = type;
1740 if (type & EAF_EMBEDDED)
1741 ad = NULL;
1742 else
1743 {
1744 ad = lp_alloc(pool, sizeof(struct adata) + l);
1745 ea->attrs[0].u.ptr = ad;
1746 ad->length = l;
1747 memcpy(ad->data, z, l);
1748 }
1749 switch (type)
1750 {
1751 case EAF_TYPE_ROUTER_ID:
1752 case EAF_TYPE_INT:
1753 if (l == 1)
1754 ea->attrs[0].u.data = *z;
1755 else
1756 ea->attrs[0].u.data = get_u32(z);
1757 break;
1758 case EAF_TYPE_IP_ADDRESS:
1759 ipa_ntoh(*(ip_addr *)ad->data);
1760 break;
1761 case EAF_TYPE_INT_SET:
1762 case EAF_TYPE_EC_SET:
1763 {
1764 u32 *z = (u32 *) ad->data;
1765 for(i=0; i<ad->length/4; i++)
1766 z[i] = ntohl(z[i]);
1767 break;
1768 }
1769 }
1770 }
1771
1772 if (withdraw)
1773 goto withdraw;
1774
1775 #ifdef IPV6
1776 /* If we received MP_REACH_NLRI we should check mandatory attributes */
1777 if (bgp->mp_reach_len != 0)
1778 mandatory = 1;
1779 #endif
1780
1781 /* If there is no (reachability) NLRI, we should exit now */
1782 if (! mandatory)
1783 return a;
1784
1785 /* Check if all mandatory attributes are present */
1786 for(i=0; i < ARRAY_SIZE(bgp_mandatory_attrs); i++)
1787 {
1788 code = bgp_mandatory_attrs[i];
1789 if (!(seen[code/8] & (1 << (code%8))))
1790 {
1791 bgp_error(conn, 3, 3, &bgp_mandatory_attrs[i], 1);
1792 return NULL;
1793 }
1794 }
1795
1796 /* When receiving attributes from non-AS4-aware BGP speaker,
1797 * we have to reconstruct 4B AS_PATH and AGGREGATOR attributes
1798 */
1799 if (! bgp->as4_session)
1800 bgp_reconstruct_4b_atts(bgp, a, pool);
1801
1802 bgp_remove_as4_attrs(bgp, a);
1803
1804 /* If the AS path attribute contains our AS, reject the routes */
1805 if (bgp_as_path_loopy(bgp, a))
1806 goto withdraw;
1807
1808 /* Two checks for IBGP loops caused by route reflection, RFC 4456 */
1809 if (bgp_originator_id_loopy(bgp, a) ||
1810 bgp_cluster_list_loopy(bgp, a))
1811 goto withdraw;
1812
1813 /* If there's no local preference, define one */
1814 if (!(seen[0] & (1 << BA_LOCAL_PREF)))
1815 bgp_attach_attr(&a->eattrs, pool, BA_LOCAL_PREF, bgp->cf->default_local_pref);
1816
1817 return a;
1818
1819 withdraw:
1820 return NULL;
1821
1822 malformed:
1823 bgp_error(conn, 3, 1, NULL, 0);
1824 return NULL;
1825
1826 err:
1827 bgp_error(conn, 3, errcode, attr_start, z+l-attr_start);
1828 return NULL;
1829 }
1830
1831 int
1832 bgp_get_attr(eattr *a, byte *buf, int buflen)
1833 {
1834 unsigned int i = EA_ID(a->id);
1835 struct attr_desc *d;
1836 int len;
1837
1838 if (ATTR_KNOWN(i))
1839 {
1840 d = &bgp_attr_table[i];
1841 len = bsprintf(buf, "%s", d->name);
1842 buf += len;
1843 if (d->format)
1844 {
1845 *buf++ = ':';
1846 *buf++ = ' ';
1847 d->format(a, buf, buflen - len - 2);
1848 return GA_FULL;
1849 }
1850 return GA_NAME;
1851 }
1852 bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : "");
1853 return GA_NAME;
1854 }
1855
1856 void
1857 bgp_init_bucket_table(struct bgp_proto *p)
1858 {
1859 p->hash_size = 256;
1860 p->hash_limit = p->hash_size * 4;
1861 p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *));
1862 init_list(&p->bucket_queue);
1863 p->withdraw_bucket = NULL;
1864 // fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix);
1865 }
1866
1867 void
1868 bgp_get_route_info(rte *e, byte *buf, ea_list *attrs)
1869 {
1870 eattr *p = ea_find(attrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1871 eattr *o = ea_find(attrs, EA_CODE(EAP_BGP, BA_ORIGIN));
1872 u32 origas;
1873
1874 buf += bsprintf(buf, " (%d", e->pref);
1875
1876 if (e->u.bgp.suppressed)
1877 buf += bsprintf(buf, "-");
1878
1879 if (e->attrs->hostentry)
1880 {
1881 if (!rte_resolvable(e))
1882 buf += bsprintf(buf, "/-");
1883 else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN)
1884 buf += bsprintf(buf, "/?");
1885 else
1886 buf += bsprintf(buf, "/%d", e->attrs->igp_metric);
1887 }
1888 buf += bsprintf(buf, ") [");
1889
1890 if (p && as_path_get_last(p->u.ptr, &origas))
1891 buf += bsprintf(buf, "AS%u", origas);
1892 if (o)
1893 buf += bsprintf(buf, "%c", "ie?"[o->u.data]);
1894 strcpy(buf, "]");
1895 }