]> git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/attrs.c
Integrated IP functions.
[thirdparty/bird.git] / proto / bgp / attrs.c
1 /*
2 * BIRD -- BGP Attributes
3 *
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 *
6 * Can be freely distributed and used under the terms of the GNU GPL.
7 */
8
9 #undef LOCAL_DEBUG
10
11 #include <stdlib.h>
12
13 #include "nest/bird.h"
14 #include "nest/iface.h"
15 #include "nest/protocol.h"
16 #include "nest/route.h"
17 #include "nest/attrs.h"
18 #include "conf/conf.h"
19 #include "lib/resource.h"
20 #include "lib/string.h"
21 #include "lib/unaligned.h"
22
23 #include "bgp.h"
24
25 /*
26 * UPDATE message error handling
27 *
28 * All checks from RFC 4271 6.3 are done as specified with these exceptions:
29 * - The semantic check of an IP address from NEXT_HOP attribute is missing.
30 * - Checks of some optional attribute values are missing.
31 * - Syntactic and semantic checks of NLRIs (done in DECODE_PREFIX())
32 * are probably inadequate.
33 *
34 * Loop detection based on AS_PATH causes updates to be withdrawn. RFC
35 * 4271 does not explicitly specify the behavior in that case.
36 *
37 * Loop detection related to route reflection (based on ORIGINATOR_ID
38 * and CLUSTER_LIST) causes updates to be withdrawn. RFC 4456 8
39 * specifies that such updates should be ignored, but that is generally
40 * a bad idea.
41 *
42 * Error checking of optional transitive attributes is done according to
43 * draft-ietf-idr-optional-transitive-03, but errors are handled always
44 * as withdraws.
45 *
46 * Unexpected AS_CONFED_* segments in AS_PATH are logged and removed,
47 * but unknown segments cause a session drop with Malformed AS_PATH
48 * error (see validate_path()). The behavior in such case is not
49 * explicitly specified by RFC 4271. RFC 5065 specifies that
50 * inconsistent AS_CONFED_* segments should cause a session drop, but
51 * implementations that pass invalid AS_CONFED_* segments are
52 * widespread.
53 *
54 * Error handling of AS4_* attributes is done as specified by
55 * draft-ietf-idr-rfc4893bis-03. There are several possible
56 * inconsistencies between AGGREGATOR and AS4_AGGREGATOR that are not
57 * handled by that draft, these are logged and ignored (see
58 * bgp_reconstruct_4b_attrs()).
59 */
60
61
/* Attribute codes that bgp_get_bucket() requires on every exported route.
 * NEXT_HOP is listed only when the file is built without IPV6. */
62 static byte bgp_mandatory_attrs[] = { BA_ORIGIN, BA_AS_PATH
63 #ifndef IPV6
64 ,BA_NEXT_HOP
65 #endif
66 };
67
/* Static description of one BGP path attribute; bgp_attr_table[] holds one
 * entry per attribute code. */
68 struct attr_desc {
69 char *name; /* Attribute name; NULL marks a code that ATTR_KNOWN() rejects */
70 int expected_length; /* Fixed payload length, or negative when the length is taken from the data */
71 int expected_flags; /* BAF_* flags stored on the eattr by bgp_set_attr() */
72 int type; /* EAF_TYPE_* stored on the eattr by bgp_set_attr() */
73 int allow_in_ebgp; /* Zero: bgp_get_bucket() drops the attribute when the session is not internal */
74 int (*validate)(struct bgp_proto *p, byte *attr, int len); /* Optional value check hook (may be NULL) */
75 void (*format)(eattr *ea, byte *buf, int buflen); /* Optional text formatter hook (may be NULL) */
76 };
77
/* Special negative results returned by some of the validate hooks below.
 * NOTE(review): how the caller acts on them is not visible in this part of
 * the file -- confirm against the attribute decoder. */
78 #define IGNORE -1
79 #define WITHDRAW -2
80
81 static int
82 bgp_check_origin(struct bgp_proto *p UNUSED, byte *a, int len UNUSED)
83 {
84 if (*a > 2)
85 return 6;
86 return 0;
87 }
88
89 static void
90 bgp_format_origin(eattr *a, byte *buf, int buflen UNUSED)
91 {
92 static char *bgp_origin_names[] = { "IGP", "EGP", "Incomplete" };
93
94 bsprintf(buf, bgp_origin_names[a->u.data]);
95 }
96
97 static int
98 path_segment_contains(byte *p, int bs, u32 asn)
99 {
100 int i;
101 int len = p[1];
102 p += 2;
103
104 for(i=0; i<len; i++)
105 {
106 u32 asn2 = (bs == 4) ? get_u32(p) : get_u16(p);
107 if (asn2 == asn)
108 return 1;
109 p += bs;
110 }
111
112 return 0;
113 }
114
115 /* Validates path attribute, removes AS_CONFED_* segments, and also returns path length */
116 static int
117 validate_path(struct bgp_proto *p, int as_path, int bs, byte *idata, unsigned int *ilength)
118 {
119 int res = 0;
120 u8 *a, *dst;
121 int len, plen, copy;
122
123 dst = a = idata;
124 len = *ilength;
125
126 while (len)
127 {
128 if (len < 2)
129 return -1;
130
131 plen = 2 + bs * a[1];
132 if (len < plen)
133 return -1;
134
135 switch (a[0])
136 {
137 case AS_PATH_SET:
138 copy = 1;
139 res++;
140 break;
141
142 case AS_PATH_SEQUENCE:
143 copy = 1;
144 res += a[1];
145 break;
146
147 case AS_PATH_CONFED_SEQUENCE:
148 case AS_PATH_CONFED_SET:
149 if (as_path && path_segment_contains(a, bs, p->remote_as))
150 {
151 log(L_WARN "%s: AS_CONFED_* segment with peer ASN found, misconfigured confederation?", p->p.name);
152 return -1;
153 }
154
155 log(L_WARN "%s: %s_PATH attribute contains AS_CONFED_* segment, skipping segment",
156 p->p.name, as_path ? "AS" : "AS4");
157 copy = 0;
158 break;
159
160 default:
161 return -1;
162 }
163
164 if (copy)
165 {
166 if (dst != a)
167 memmove(dst, a, plen);
168 dst += plen;
169 }
170
171 len -= plen;
172 a += plen;
173 }
174
175 *ilength = dst - idata;
176 return res;
177 }
178
179 static inline int
180 validate_as_path(struct bgp_proto *p, byte *a, int *len)
181 {
182 return validate_path(p, 1, p->as4_session ? 4 : 2, a, len);
183 }
184
185 static inline int
186 validate_as4_path(struct bgp_proto *p, struct adata *path)
187 {
188 return validate_path(p, 0, 4, path->data, &path->length);
189 }
190
191 static int
192 bgp_check_next_hop(struct bgp_proto *p UNUSED, byte *a, int len)
193 {
194 #ifdef IPV6
195 return IGNORE;
196 #else
197 ip_addr addr;
198
199 memcpy(&addr, a, len);
200 ipa_ntoh(addr);
201 if (ipa_classify(addr) & IADDR_HOST)
202 return 0;
203 else
204 return 8;
205 #endif
206 }
207
208 static void
209 bgp_format_next_hop(eattr *a, byte *buf, int buflen UNUSED)
210 {
211 ip_addr *ipp = (ip_addr *) a->u.ptr->data;
212 #ifdef IPV6
213 /* in IPv6, we might have two addresses in NEXT HOP */
214 if ((a->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(ipp[1]))
215 {
216 bsprintf(buf, "%I %I", ipp[0], ipp[1]);
217 return;
218 }
219 #endif
220
221 bsprintf(buf, "%I", ipp[0]);
222 }
223
224 static int
225 bgp_check_aggregator(struct bgp_proto *p, byte *a UNUSED, int len)
226 {
227 int exp_len = p->as4_session ? 8 : 6;
228
229 return (len == exp_len) ? 0 : WITHDRAW;
230 }
231
232 static void
233 bgp_format_aggregator(eattr *a, byte *buf, int buflen UNUSED)
234 {
235 struct adata *ad = a->u.ptr;
236 byte *data = ad->data;
237 u32 as;
238
239 as = get_u32(data);
240 data += 4;
241
242 bsprintf(buf, "%d.%d.%d.%d AS%u", data[0], data[1], data[2], data[3], as);
243 }
244
245 static int
246 bgp_check_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
247 {
248 return ((len % 4) == 0) ? 0 : WITHDRAW;
249 }
250
251 static int
252 bgp_check_cluster_list(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
253 {
254 return ((len % 4) == 0) ? 0 : 5;
255 }
256
257 static void
258 bgp_format_cluster_list(eattr *a, byte *buf, int buflen)
259 {
260 /* Truncates cluster lists larger than buflen, probably not a problem */
261 int_set_format(a->u.ptr, 0, -1, buf, buflen);
262 }
263
264 static int
265 bgp_check_reach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED)
266 {
267 #ifdef IPV6
268 p->mp_reach_start = a;
269 p->mp_reach_len = len;
270 #endif
271 return IGNORE;
272 }
273
274 static int
275 bgp_check_unreach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED)
276 {
277 #ifdef IPV6
278 p->mp_unreach_start = a;
279 p->mp_unreach_len = len;
280 #endif
281 return IGNORE;
282 }
283
284 static int
285 bgp_check_ext_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
286 {
287 return ((len % 8) == 0) ? 0 : WITHDRAW;
288 }
289
290
/* Attribute descriptions indexed by attribute code (see ATTR_KNOWN() and the
 * bgp_attr_table[code] lookups). Columns follow struct attr_desc: name,
 * expected_length, expected_flags, type, allow_in_ebgp, then the validate and
 * format hooks. Entries with a NULL name are treated as unknown codes. */
291 static struct attr_desc bgp_attr_table[] = {
292 { NULL, -1, 0, 0, 0, /* Undefined */
293 NULL, NULL },
294 { "origin", 1, BAF_TRANSITIVE, EAF_TYPE_INT, 1, /* BA_ORIGIN */
295 bgp_check_origin, bgp_format_origin },
296 { "as_path", -1, BAF_TRANSITIVE, EAF_TYPE_AS_PATH, 1, /* BA_AS_PATH */
297 NULL, NULL }, /* is checked by validate_as_path() as a special case */
298 { "next_hop", 4, BAF_TRANSITIVE, EAF_TYPE_IP_ADDRESS, 1, /* BA_NEXT_HOP */
299 bgp_check_next_hop, bgp_format_next_hop },
300 { "med", 4, BAF_OPTIONAL, EAF_TYPE_INT, 1, /* BA_MULTI_EXIT_DISC */
301 NULL, NULL },
302 { "local_pref", 4, BAF_TRANSITIVE, EAF_TYPE_INT, 0, /* BA_LOCAL_PREF */
303 NULL, NULL },
304 { "atomic_aggr", 0, BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_ATOMIC_AGGR */
305 NULL, NULL },
306 { "aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AGGREGATOR */
307 bgp_check_aggregator, bgp_format_aggregator },
308 { "community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_INT_SET, 1, /* BA_COMMUNITY */
309 bgp_check_community, NULL },
310 { "originator_id", 4, BAF_OPTIONAL, EAF_TYPE_ROUTER_ID, 0, /* BA_ORIGINATOR_ID */
311 NULL, NULL },
312 { "cluster_list", -1, BAF_OPTIONAL, EAF_TYPE_INT_SET, 0, /* BA_CLUSTER_LIST */
313 bgp_check_cluster_list, bgp_format_cluster_list },
314 { .name = NULL }, /* BA_DPA */
315 { .name = NULL }, /* BA_ADVERTISER */
316 { .name = NULL }, /* BA_RCID_PATH */
317 { "mp_reach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1, /* BA_MP_REACH_NLRI */
318 bgp_check_reach_nlri, NULL },
319 { "mp_unreach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1, /* BA_MP_UNREACH_NLRI */
320 bgp_check_unreach_nlri, NULL },
321 { "ext_community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_EC_SET, 1, /* BA_EXT_COMMUNITY */
322 bgp_check_ext_community, NULL },
323 { "as4_path", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AS4_PATH */
324 NULL, NULL },
325 { "as4_aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AS4_AGGREGATOR */
326 NULL, NULL }
327 };
328
329 /* BA_AS4_PATH is type EAF_TYPE_OPAQUE and not type EAF_TYPE_AS_PATH.
330 * It does not matter as this attribute does not appear on routes in the routing table.
331 */
332
/* True when @code indexes an entry of bgp_attr_table[] that has a name. */
333 #define ATTR_KNOWN(code) ((code) < ARRAY_SIZE(bgp_attr_table) && bgp_attr_table[code].name)
334
335 static inline struct adata *
336 bgp_alloc_adata(struct linpool *pool, unsigned len)
337 {
338 struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len);
339 ad->length = len;
340 return ad;
341 }
342
343 static void
344 bgp_set_attr(eattr *e, unsigned attr, uintptr_t val)
345 {
346 ASSERT(ATTR_KNOWN(attr));
347 e->id = EA_CODE(EAP_BGP, attr);
348 e->type = bgp_attr_table[attr].type;
349 e->flags = bgp_attr_table[attr].expected_flags;
350 if (e->type & EAF_EMBEDDED)
351 e->u.data = val;
352 else
353 e->u.ptr = (struct adata *) val;
354 }
355
356 static byte *
357 bgp_set_attr_wa(eattr *e, struct linpool *pool, unsigned attr, unsigned len)
358 {
359 struct adata *ad = bgp_alloc_adata(pool, len);
360 bgp_set_attr(e, attr, (uintptr_t) ad);
361 return ad->data;
362 }
363
364 void
365 bgp_attach_attr(ea_list **to, struct linpool *pool, unsigned attr, uintptr_t val)
366 {
367 ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
368 a->next = *to;
369 *to = a;
370 a->flags = EALF_SORTED;
371 a->count = 1;
372 bgp_set_attr(a->attrs, attr, val);
373 }
374
375 byte *
376 bgp_attach_attr_wa(ea_list **to, struct linpool *pool, unsigned attr, unsigned len)
377 {
378 struct adata *ad = bgp_alloc_adata(pool, len);
379 bgp_attach_attr(to, pool, attr, (uintptr_t) ad);
380 return ad->data;
381 }
382
383 static int
384 bgp_encode_attr_hdr(byte *dst, unsigned int flags, unsigned code, int len)
385 {
386 int wlen;
387
388 DBG("\tAttribute %02x (%d bytes, flags %02x)\n", code, len, flags);
389
390 if (len < 256)
391 {
392 *dst++ = flags;
393 *dst++ = code;
394 *dst++ = len;
395 wlen = 3;
396 }
397 else
398 {
399 *dst++ = flags | BAF_EXT_LEN;
400 *dst++ = code;
401 put_u16(dst, len);
402 wlen = 4;
403 }
404
405 return wlen;
406 }
407
408 static void
409 aggregator_convert_to_old(struct adata *aggr, byte *dst, int *new_used)
410 {
411 byte *src = aggr->data;
412 *new_used = 0;
413
414 u32 as = get_u32(src);
415 if (as > 0xFFFF)
416 {
417 as = AS_TRANS;
418 *new_used = 1;
419 }
420 put_u16(dst, as);
421
422 /* Copy IPv4 address */
423 memcpy(dst + 2, src + 4, 4);
424 }
425
426 static void
427 aggregator_convert_to_new(struct adata *aggr, byte *dst)
428 {
429 byte *src = aggr->data;
430
431 u32 as = get_u16(src);
432 put_u32(dst, as);
433
434 /* Copy IPv4 address */
435 memcpy(dst + 4, src + 2, 4);
436 }
437
438 static int
439 bgp_get_attr_len(eattr *a)
440 {
441 int len;
442 if (ATTR_KNOWN(EA_ID(a->id)))
443 {
444 int code = EA_ID(a->id);
445 struct attr_desc *desc = &bgp_attr_table[code];
446 len = desc->expected_length;
447 if (len < 0)
448 {
449 ASSERT(!(a->type & EAF_EMBEDDED));
450 len = a->u.ptr->length;
451 }
452 }
453 else
454 {
455 ASSERT((a->type & EAF_TYPE_MASK) == EAF_TYPE_OPAQUE);
456 len = a->u.ptr->length;
457 }
458
459 return len;
460 }
461
/* Account for @l bytes written: move write pointer @w forward and reduce the
 * remaining space @r. Each argument is evaluated more than once or modified. */
462 #define ADVANCE(w, r, l) do { r -= l; w += l; } while (0)
463
464 /**
465 * bgp_encode_attrs - encode BGP attributes
466 * @p: BGP instance
467 * @w: buffer
468 * @attrs: a list of extended attributes
469 * @remains: remaining space in the buffer
470 *
471 * The bgp_encode_attrs() function takes a list of extended attributes
472 * and converts it to its BGP representation (a part of an Update message).
473 *
474 * Result: Length of the attribute block generated or -1 if not enough space.
475 */
476 unsigned int
477 bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains)
478 {
479 unsigned int i, code, type, flags;
480 byte *start = w;
481 int len, rv;
482
483 for(i=0; i<attrs->count; i++)
484 {
485 eattr *a = &attrs->attrs[i];
486 ASSERT(EA_PROTO(a->id) == EAP_BGP);
487 code = EA_ID(a->id);
488
489 #ifdef IPV6
490 /* When talking multiprotocol BGP, the NEXT_HOP attributes are used only temporarily. */
491 if (code == BA_NEXT_HOP)
492 continue;
493 #endif
494
495 /* When AS4-aware BGP speaker is talking to non-AS4-aware BGP speaker,
496 * we have to convert our 4B AS_PATH to 2B AS_PATH and send our AS_PATH
497 * as optional AS4_PATH attribute.
498 */
499 if ((code == BA_AS_PATH) && (! p->as4_session))
500 {
501 len = a->u.ptr->length;
502
503 if (remains < (len + 4))
504 goto err_no_buffer;
505
506 /* Using temporary buffer because don't know a length of created attr
507 * and therefore a length of a header. Perhaps i should better always
508 * use BAF_EXT_LEN. */
509
510 byte buf[len];
511 int new_used;
512 int nl = as_path_convert_to_old(a->u.ptr, buf, &new_used);
513
514 DBG("BGP: Encoding old AS_PATH\n");
515 rv = bgp_encode_attr_hdr(w, BAF_TRANSITIVE, BA_AS_PATH, nl);
516 ADVANCE(w, remains, rv);
517 memcpy(w, buf, nl);
518 ADVANCE(w, remains, nl);
519
520 if (! new_used)
521 continue;
522
523 if (remains < (len + 4))
524 goto err_no_buffer;
525
526 /* We should discard AS_CONFED_SEQUENCE or AS_CONFED_SET path segments
527 * here but we don't support confederations and such paths we already
528 * discarded in bgp_check_as_path().
529 */
530
531 DBG("BGP: Encoding AS4_PATH\n");
532 rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_PATH, len);
533 ADVANCE(w, remains, rv);
534 memcpy(w, a->u.ptr->data, len);
535 ADVANCE(w, remains, len);
536
537 continue;
538 }
539
540 /* The same issue with AGGREGATOR attribute */
541 if ((code == BA_AGGREGATOR) && (! p->as4_session))
542 {
543 int new_used;
544
545 len = 6;
546 if (remains < (len + 3))
547 goto err_no_buffer;
548
549 rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AGGREGATOR, len);
550 ADVANCE(w, remains, rv);
551 aggregator_convert_to_old(a->u.ptr, w, &new_used);
552 ADVANCE(w, remains, len);
553
554 if (! new_used)
555 continue;
556
557 len = 8;
558 if (remains < (len + 3))
559 goto err_no_buffer;
560
561 rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_AGGREGATOR, len);
562 ADVANCE(w, remains, rv);
563 memcpy(w, a->u.ptr->data, len);
564 ADVANCE(w, remains, len);
565
566 continue;
567 }
568
569 /* Standard path continues here ... */
570
571 type = a->type & EAF_TYPE_MASK;
572 flags = a->flags & (BAF_OPTIONAL | BAF_TRANSITIVE | BAF_PARTIAL);
573 len = bgp_get_attr_len(a);
574
575 /* Skip empty sets */
576 if (((type == EAF_TYPE_INT_SET) || (type == EAF_TYPE_EC_SET)) && (len == 0))
577 continue;
578
579 if (remains < len + 4)
580 goto err_no_buffer;
581
582 rv = bgp_encode_attr_hdr(w, flags, code, len);
583 ADVANCE(w, remains, rv);
584
585 switch (type)
586 {
587 case EAF_TYPE_INT:
588 case EAF_TYPE_ROUTER_ID:
589 if (len == 4)
590 put_u32(w, a->u.data);
591 else
592 *w = a->u.data;
593 break;
594 case EAF_TYPE_IP_ADDRESS:
595 {
596 ip_addr ip = *(ip_addr *)a->u.ptr->data;
597 ipa_hton(ip);
598 memcpy(w, &ip, len);
599 break;
600 }
601 case EAF_TYPE_INT_SET:
602 case EAF_TYPE_EC_SET:
603 {
604 u32 *z = int_set_get_data(a->u.ptr);
605 int i;
606 for(i=0; i<len; i+=4)
607 put_u32(w+i, *z++);
608 break;
609 }
610 case EAF_TYPE_OPAQUE:
611 case EAF_TYPE_AS_PATH:
612 memcpy(w, a->u.ptr->data, len);
613 break;
614 default:
615 bug("bgp_encode_attrs: unknown attribute type %02x", a->type);
616 }
617 ADVANCE(w, remains, len);
618 }
619 return w - start;
620
621 err_no_buffer:
622 return -1;
623 }
624
625 /*
626 static void
627 bgp_init_prefix(struct fib_node *N)
628 {
629 struct bgp_prefix *p = (struct bgp_prefix *) N;
630 p->bucket_node.next = NULL;
631 }
632 */
633
634 static int
635 bgp_compare_u32(const u32 *x, const u32 *y)
636 {
637 return (*x < *y) ? -1 : (*x > *y) ? 1 : 0;
638 }
639
640 static inline void
641 bgp_normalize_int_set(u32 *dest, u32 *src, unsigned cnt)
642 {
643 memcpy(dest, src, sizeof(u32) * cnt);
644 qsort(dest, cnt, sizeof(u32), (int(*)(const void *, const void *)) bgp_compare_u32);
645 }
646
647 static int
648 bgp_compare_ec(const u32 *xp, const u32 *yp)
649 {
650 u64 x = ec_get(xp, 0);
651 u64 y = ec_get(yp, 0);
652 return (x < y) ? -1 : (x > y) ? 1 : 0;
653 }
654
655 static inline void
656 bgp_normalize_ec_set(struct adata *ad, u32 *src, int internal)
657 {
658 u32 *dst = int_set_get_data(ad);
659
660 /* Remove non-transitive communities (EC_TBIT active) on external sessions */
661 if (! internal)
662 {
663 int len = int_set_get_size(ad);
664 u32 *t = dst;
665 int i;
666
667 for (i=0; i < len; i += 2)
668 {
669 if (src[i] & EC_TBIT)
670 continue;
671
672 *t++ = src[i];
673 *t++ = src[i+1];
674 }
675
676 ad->length = (t - dst) * 4;
677 }
678 else
679 memcpy(dst, src, ad->length);
680
681 qsort(dst, ad->length / 8, 8, (int(*)(const void *, const void *)) bgp_compare_ec);
682 }
683
684 static void
685 bgp_rehash_buckets(struct bgp_proto *p)
686 {
687 struct bgp_bucket **old = p->bucket_hash;
688 struct bgp_bucket **new;
689 unsigned oldn = p->hash_size;
690 unsigned i, e, mask;
691 struct bgp_bucket *b;
692
693 p->hash_size = p->hash_limit;
694 DBG("BGP: Rehashing bucket table from %d to %d\n", oldn, p->hash_size);
695 p->hash_limit *= 4;
696 if (p->hash_limit >= 65536)
697 p->hash_limit = ~0;
698 new = p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *));
699 mask = p->hash_size - 1;
700 for (i=0; i<oldn; i++)
701 while (b = old[i])
702 {
703 old[i] = b->hash_next;
704 e = b->hash & mask;
705 b->hash_next = new[e];
706 if (b->hash_next)
707 b->hash_next->hash_prev = b;
708 b->hash_prev = NULL;
709 new[e] = b;
710 }
711 mb_free(old);
712 }
713
714 static struct bgp_bucket *
715 bgp_new_bucket(struct bgp_proto *p, ea_list *new, unsigned hash)
716 {
717 struct bgp_bucket *b;
718 unsigned ea_size = sizeof(ea_list) + new->count * sizeof(eattr);
719 unsigned ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN);
720 unsigned size = sizeof(struct bgp_bucket) + ea_size;
721 unsigned i;
722 byte *dest;
723 unsigned index = hash & (p->hash_size - 1);
724
725 /* Gather total size of non-inline attributes */
726 for (i=0; i<new->count; i++)
727 {
728 eattr *a = &new->attrs[i];
729 if (!(a->type & EAF_EMBEDDED))
730 size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN);
731 }
732
733 /* Create the bucket and hash it */
734 b = mb_alloc(p->p.pool, size);
735 b->hash_next = p->bucket_hash[index];
736 if (b->hash_next)
737 b->hash_next->hash_prev = b;
738 p->bucket_hash[index] = b;
739 b->hash_prev = NULL;
740 b->hash = hash;
741 add_tail(&p->bucket_queue, &b->send_node);
742 init_list(&b->prefixes);
743 memcpy(b->eattrs, new, ea_size);
744 dest = ((byte *)b->eattrs) + ea_size_aligned;
745
746 /* Copy values of non-inline attributes */
747 for (i=0; i<new->count; i++)
748 {
749 eattr *a = &b->eattrs->attrs[i];
750 if (!(a->type & EAF_EMBEDDED))
751 {
752 struct adata *oa = a->u.ptr;
753 struct adata *na = (struct adata *) dest;
754 memcpy(na, oa, sizeof(struct adata) + oa->length);
755 a->u.ptr = na;
756 dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN);
757 }
758 }
759
760 /* If needed, rehash */
761 p->hash_count++;
762 if (p->hash_count > p->hash_limit)
763 bgp_rehash_buckets(p);
764
765 return b;
766 }
767
768 static struct bgp_bucket *
769 bgp_get_bucket(struct bgp_proto *p, net *n, ea_list *attrs, int originate)
770 {
771 ea_list *new;
772 unsigned i, cnt, hash, code;
773 eattr *a, *d;
774 u32 seen = 0;
775 struct bgp_bucket *b;
776
777 /* Merge the attribute list */
778 new = alloca(ea_scan(attrs));
779 ea_merge(attrs, new);
780 ea_sort(new);
781
782 /* Normalize attributes */
783 d = new->attrs;
784 cnt = new->count;
785 new->count = 0;
786 for(i=0; i<cnt; i++)
787 {
788 a = &new->attrs[i];
789 if (EA_PROTO(a->id) != EAP_BGP)
790 continue;
791 code = EA_ID(a->id);
792 if (ATTR_KNOWN(code))
793 {
794 if (!bgp_attr_table[code].allow_in_ebgp && !p->is_internal)
795 continue;
796 /* The flags might have been zero if the attr was added by filters */
797 a->flags = (a->flags & BAF_PARTIAL) | bgp_attr_table[code].expected_flags;
798 if (code < 32)
799 seen |= 1 << code;
800 }
801 else
802 {
803 /* Don't re-export unknown non-transitive attributes */
804 if (!(a->flags & BAF_TRANSITIVE))
805 continue;
806 }
807 *d = *a;
808 if ((d->type & EAF_ORIGINATED) && !originate && (d->flags & BAF_TRANSITIVE) && (d->flags & BAF_OPTIONAL))
809 d->flags |= BAF_PARTIAL;
810 switch (d->type & EAF_TYPE_MASK)
811 {
812 case EAF_TYPE_INT_SET:
813 {
814 struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
815 z->length = d->u.ptr->length;
816 bgp_normalize_int_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / 4);
817 d->u.ptr = z;
818 break;
819 }
820 case EAF_TYPE_EC_SET:
821 {
822 struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
823 z->length = d->u.ptr->length;
824 bgp_normalize_ec_set(z, (u32 *) d->u.ptr->data, p->is_internal);
825 d->u.ptr = z;
826 break;
827 }
828 default: ;
829 }
830 d++;
831 new->count++;
832 }
833
834 /* Hash */
835 hash = ea_hash(new);
836 for(b=p->bucket_hash[hash & (p->hash_size - 1)]; b; b=b->hash_next)
837 if (b->hash == hash && ea_same(b->eattrs, new))
838 {
839 DBG("Found bucket.\n");
840 return b;
841 }
842
843 /* Ensure that there are all mandatory attributes */
844 for(i=0; i<ARRAY_SIZE(bgp_mandatory_attrs); i++)
845 if (!(seen & (1 << bgp_mandatory_attrs[i])))
846 {
847 log(L_ERR "%s: Mandatory attribute %s missing in route %I/%d", p->p.name, bgp_attr_table[bgp_mandatory_attrs[i]].name, n->n.prefix, n->n.pxlen);
848 return NULL;
849 }
850
851 /* Check if next hop is valid */
852 a = ea_find(new, EA_CODE(EAP_BGP, BA_NEXT_HOP));
853 if (!a || ipa_equal(p->cf->remote_ip, *(ip_addr *)a->u.ptr->data))
854 {
855 log(L_ERR "%s: Invalid NEXT_HOP attribute in route %I/%d", p->p.name, n->n.prefix, n->n.pxlen);
856 return NULL;
857 }
858
859 /* Create new bucket */
860 DBG("Creating bucket.\n");
861 return bgp_new_bucket(p, new, hash);
862 }
863
864 void
865 bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck)
866 {
867 if (buck->hash_next)
868 buck->hash_next->hash_prev = buck->hash_prev;
869 if (buck->hash_prev)
870 buck->hash_prev->hash_next = buck->hash_next;
871 else
872 p->bucket_hash[buck->hash & (p->hash_size-1)] = buck->hash_next;
873 mb_free(buck);
874 }
875
876
877 /* Prefix hash table */
878
/* Parameters of the prefix hash table (id PXH) consumed by the HASH_*()
 * macros: a node is keyed by prefix, prefix length and path_id, and chained
 * through its next pointer. */
879 #define PXH_KEY(n1) n1->n.prefix, n1->n.pxlen, n1->path_id
880 #define PXH_NEXT(n) n->next
/* Key equality and hash function; both expand to unparenthesized expressions. */
881 #define PXH_EQ(p1,l1,i1,p2,l2,i2) ipa_equal(p1, p2) && l1 == l2 && i1 == i2
882 #define PXH_FN(p,l,i) ipa_hash32(p) ^ u32_hash((l << 16) ^ i)
883
/* NOTE(review): the meaning of the PXH_PARAMS values is defined by the hash
 * macros, which are not visible here -- confirm against their definition. */
884 #define PXH_REHASH bgp_pxh_rehash
885 #define PXH_PARAMS /8, *2, 2, 2, 8, 20
886
887
888 HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix)
889
890 void
891 bgp_init_prefix_table(struct bgp_proto *p, u32 order)
892 {
893 HASH_INIT(p->prefix_hash, p->p.pool, order);
894
895 p->prefix_slab = sl_new(p->p.pool, sizeof(struct bgp_prefix));
896 }
897
898 static struct bgp_prefix *
899 bgp_get_prefix(struct bgp_proto *p, ip_addr prefix, int pxlen, u32 path_id)
900 {
901 struct bgp_prefix *bp = HASH_FIND(p->prefix_hash, PXH, prefix, pxlen, path_id);
902
903 if (bp)
904 return bp;
905
906 bp = sl_alloc(p->prefix_slab);
907 bp->n.prefix = prefix;
908 bp->n.pxlen = pxlen;
909 bp->path_id = path_id;
910 bp->bucket_node.next = NULL;
911
912 HASH_INSERT2(p->prefix_hash, PXH, p->p.pool, bp);
913
914 return bp;
915 }
916
917 void
918 bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp)
919 {
920 HASH_REMOVE2(p->prefix_hash, PXH, p->p.pool, bp);
921 sl_free(p->prefix_slab, bp);
922 }
923
924
925 void
926 bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs)
927 {
928 struct bgp_proto *p = (struct bgp_proto *) P;
929 struct bgp_bucket *buck;
930 struct bgp_prefix *px;
931 rte *key;
932 u32 path_id;
933
934 DBG("BGP: Got route %I/%d %s\n", n->n.prefix, n->n.pxlen, new ? "up" : "down");
935
936 if (new)
937 {
938 key = new;
939 buck = bgp_get_bucket(p, n, attrs, new->attrs->source != RTS_BGP);
940 if (!buck) /* Inconsistent attribute list */
941 return;
942 }
943 else
944 {
945 key = old;
946 if (!(buck = p->withdraw_bucket))
947 {
948 buck = p->withdraw_bucket = mb_alloc(P->pool, sizeof(struct bgp_bucket));
949 init_list(&buck->prefixes);
950 }
951 }
952 path_id = p->add_path_tx ? key->attrs->src->global_id : 0;
953 px = bgp_get_prefix(p, n->n.prefix, n->n.pxlen, path_id);
954 if (px->bucket_node.next)
955 {
956 DBG("\tRemoving old entry.\n");
957 rem_node(&px->bucket_node);
958 }
959 add_tail(&buck->prefixes, &px->bucket_node);
960 bgp_schedule_packet(p->conn, PKT_UPDATE);
961 }
962
963 static int
964 bgp_create_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool)
965 {
966 ea_list *ea = lp_alloc(pool, sizeof(ea_list) + 4*sizeof(eattr));
967 rta *rta = e->attrs;
968 byte *z;
969
970 ea->next = *attrs;
971 *attrs = ea;
972 ea->flags = EALF_SORTED;
973 ea->count = 4;
974
975 bgp_set_attr(ea->attrs, BA_ORIGIN,
976 ((rta->source == RTS_OSPF_EXT1) || (rta->source == RTS_OSPF_EXT2)) ? ORIGIN_INCOMPLETE : ORIGIN_IGP);
977
978 if (p->is_internal)
979 bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 0);
980 else
981 {
982 z = bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 6);
983 z[0] = AS_PATH_SEQUENCE;
984 z[1] = 1; /* 1 AS */
985 put_u32(z+2, p->local_as);
986 }
987
988 /* iBGP -> use gw, eBGP multi-hop -> use source_addr,
989 eBGP single-hop -> use gw if on the same iface */
990 z = bgp_set_attr_wa(ea->attrs+2, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
991 if (p->cf->next_hop_self ||
992 rta->dest != RTD_ROUTER ||
993 ipa_equal(rta->gw, IPA_NONE) ||
994 ipa_is_link_local(rta->gw) ||
995 (!p->is_internal && !p->cf->next_hop_keep &&
996 (!p->neigh || (rta->iface != p->neigh->iface))))
997 set_next_hop(z, p->source_addr);
998 else
999 set_next_hop(z, rta->gw);
1000
1001 bgp_set_attr(ea->attrs+3, BA_LOCAL_PREF, p->cf->default_local_pref);
1002
1003 return 0; /* Leave decision to the filters */
1004 }
1005
1006
1007 static inline int
1008 bgp_as_path_loopy(struct bgp_proto *p, rta *a)
1009 {
1010 int num = p->cf->allow_local_as + 1;
1011 eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1012 return (e && (num > 0) && as_path_contains(e->u.ptr, p->local_as, num));
1013 }
1014
1015 static inline int
1016 bgp_originator_id_loopy(struct bgp_proto *p, rta *a)
1017 {
1018 eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
1019 return (e && (e->u.data == p->local_id));
1020 }
1021
1022 static inline int
1023 bgp_cluster_list_loopy(struct bgp_proto *p, rta *a)
1024 {
1025 eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
1026 return (e && p->rr_client && int_set_contains(e->u.ptr, p->rr_cluster_id));
1027 }
1028
1029
1030 static inline void
1031 bgp_path_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 as)
1032 {
1033 eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1034 bgp_attach_attr(attrs, pool, BA_AS_PATH, (uintptr_t) as_path_prepend(pool, a->u.ptr, as));
1035 }
1036
1037 static inline void
1038 bgp_cluster_list_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 cid)
1039 {
1040 eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
1041 bgp_attach_attr(attrs, pool, BA_CLUSTER_LIST, (uintptr_t) int_set_add(pool, a ? a->u.ptr : NULL, cid));
1042 }
1043
1044 static int
1045 bgp_update_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool, int rr)
1046 {
1047 eattr *a;
1048
1049 if (!p->is_internal && !p->rs_client)
1050 {
1051 bgp_path_prepend(e, attrs, pool, p->local_as);
1052
1053 /* The MULTI_EXIT_DISC attribute received from a neighboring AS MUST NOT be
1054 * propagated to other neighboring ASes.
1055 * Perhaps it would be better to undefine it.
1056 */
1057 a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
1058 if (a)
1059 bgp_attach_attr(attrs, pool, BA_MULTI_EXIT_DISC, 0);
1060 }
1061
1062 /* iBGP -> keep next_hop, eBGP multi-hop -> use source_addr,
1063 * eBGP single-hop -> keep next_hop if on the same iface.
1064 * If the next_hop is zero (i.e. link-local), keep only if on the same iface.
1065 *
1066 * Note that same-iface-check uses iface from route, which is based on gw.
1067 */
1068 a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
1069 if (a && !p->cf->next_hop_self &&
1070 (p->cf->next_hop_keep ||
1071 (p->is_internal && ipa_nonzero(*((ip_addr *) a->u.ptr->data))) ||
1072 (p->neigh && (e->attrs->iface == p->neigh->iface))))
1073 {
1074 /* Leave the original next hop attribute, will check later where does it point */
1075 }
1076 else
1077 {
1078 /* Need to create new one */
1079 byte *b = bgp_attach_attr_wa(attrs, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
1080 set_next_hop(b, p->source_addr);
1081 }
1082
1083 if (rr)
1084 {
1085 /* Handling route reflection, RFC 4456 */
1086 struct bgp_proto *src = (struct bgp_proto *) e->attrs->src->proto;
1087
1088 a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
1089 if (!a)
1090 bgp_attach_attr(attrs, pool, BA_ORIGINATOR_ID, src->remote_id);
1091
1092 /* We attach proper cluster ID according to whether the route is entering or leaving the cluster */
1093 bgp_cluster_list_prepend(e, attrs, pool, src->rr_client ? src->rr_cluster_id : p->rr_cluster_id);
1094
1095 /* Two RR clients with different cluster ID, hmmm */
1096 if (src->rr_client && p->rr_client && (src->rr_cluster_id != p->rr_cluster_id))
1097 bgp_cluster_list_prepend(e, attrs, pool, p->rr_cluster_id);
1098 }
1099
1100 return 0; /* Leave decision to the filters */
1101 }
1102
1103 static int
1104 bgp_community_filter(struct bgp_proto *p, rte *e)
1105 {
1106 eattr *a;
1107 struct adata *d;
1108
1109 /* Check if we aren't forbidden to export the route by communities */
1110 a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY));
1111 if (a)
1112 {
1113 d = a->u.ptr;
1114 if (int_set_contains(d, BGP_COMM_NO_ADVERTISE))
1115 {
1116 DBG("\tNO_ADVERTISE\n");
1117 return 1;
1118 }
1119 if (!p->is_internal &&
1120 (int_set_contains(d, BGP_COMM_NO_EXPORT) ||
1121 int_set_contains(d, BGP_COMM_NO_EXPORT_SUBCONFED)))
1122 {
1123 DBG("\tNO_EXPORT\n");
1124 return 1;
1125 }
1126 }
1127
1128 return 0;
1129 }
1130
1131 int
1132 bgp_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool *pool)
1133 {
1134 rte *e = *new;
1135 struct bgp_proto *p = (struct bgp_proto *) P;
1136 struct bgp_proto *new_bgp = (e->attrs->src->proto->proto == &proto_bgp) ?
1137 (struct bgp_proto *) e->attrs->src->proto : NULL;
1138
1139 if (p == new_bgp) /* Poison reverse updates */
1140 return -1;
1141 if (new_bgp)
1142 {
1143 /* We should check here for cluster list loop, because the receiving BGP instance
1144 might have different cluster ID */
1145 if (bgp_cluster_list_loopy(p, e->attrs))
1146 return -1;
1147
1148 if (p->cf->interpret_communities && bgp_community_filter(p, e))
1149 return -1;
1150
1151 if (p->local_as == new_bgp->local_as && p->is_internal && new_bgp->is_internal)
1152 {
1153 /* Redistribution of internal routes with IBGP */
1154 if (p->rr_client || new_bgp->rr_client)
1155 /* Route reflection, RFC 4456 */
1156 return bgp_update_attrs(p, e, attrs, pool, 1);
1157 else
1158 return -1;
1159 }
1160 else
1161 return bgp_update_attrs(p, e, attrs, pool, 0);
1162 }
1163 else
1164 return bgp_create_attrs(p, e, attrs, pool);
1165 }
1166
1167 static inline u32
1168 bgp_get_neighbor(rte *r)
1169 {
1170 eattr *e = ea_find(r->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1171 u32 as;
1172
1173 if (e && as_path_get_first(e->u.ptr, &as))
1174 return as;
1175 else
1176 return ((struct bgp_proto *) r->attrs->src->proto)->remote_as;
1177 }
1178
1179 static inline int
1180 rte_resolvable(rte *rt)
1181 {
1182 int rd = rt->attrs->dest;
1183 return (rd == RTD_ROUTER) || (rd == RTD_DEVICE) || (rd == RTD_MULTIPATH);
1184 }
1185
1186 int
1187 bgp_rte_better(rte *new, rte *old)
1188 {
1189 struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->src->proto;
1190 struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->src->proto;
1191 eattr *x, *y;
1192 u32 n, o;
1193
1194 /* Skip suppressed routes (see bgp_rte_recalculate()) */
1195 n = new->u.bgp.suppressed;
1196 o = old->u.bgp.suppressed;
1197 if (n > o)
1198 return 0;
1199 if (n < o)
1200 return 1;
1201
1202 /* RFC 4271 9.1.2.1. Route resolvability test */
1203 n = rte_resolvable(new);
1204 o = rte_resolvable(old);
1205 if (n > o)
1206 return 1;
1207 if (n < o)
1208 return 0;
1209
1210 /* Start with local preferences */
1211 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
1212 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
1213 n = x ? x->u.data : new_bgp->cf->default_local_pref;
1214 o = y ? y->u.data : old_bgp->cf->default_local_pref;
1215 if (n > o)
1216 return 1;
1217 if (n < o)
1218 return 0;
1219
1220 /* RFC 4271 9.1.2.2. a) Use AS path lengths */
1221 if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths)
1222 {
1223 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1224 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1225 n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
1226 o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;
1227 if (n < o)
1228 return 1;
1229 if (n > o)
1230 return 0;
1231 }
1232
1233 /* RFC 4271 9.1.2.2. b) Use origins */
1234 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
1235 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
1236 n = x ? x->u.data : ORIGIN_INCOMPLETE;
1237 o = y ? y->u.data : ORIGIN_INCOMPLETE;
1238 if (n < o)
1239 return 1;
1240 if (n > o)
1241 return 0;
1242
1243 /* RFC 4271 9.1.2.2. c) Compare MED's */
1244 /* Proper RFC 4271 path selection cannot be interpreted as finding
1245 * the best path in some ordering. It is implemented partially in
1246 * bgp_rte_recalculate() when deterministic_med option is
1247 * active. Without that option, the behavior is just an
1248 * approximation, which in specific situations may lead to
1249 * persistent routing loops, because it is nondeterministic - it
1250 * depends on the order in which routes appeared. But it is also the
1251 * same behavior as used by default in Cisco routers, so it is
1252 * probably not a big issue.
1253 */
1254 if (new_bgp->cf->med_metric || old_bgp->cf->med_metric ||
1255 (bgp_get_neighbor(new) == bgp_get_neighbor(old)))
1256 {
1257 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
1258 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
1259 n = x ? x->u.data : new_bgp->cf->default_med;
1260 o = y ? y->u.data : old_bgp->cf->default_med;
1261 if (n < o)
1262 return 1;
1263 if (n > o)
1264 return 0;
1265 }
1266
1267 /* RFC 4271 9.1.2.2. d) Prefer external peers */
1268 if (new_bgp->is_internal > old_bgp->is_internal)
1269 return 0;
1270 if (new_bgp->is_internal < old_bgp->is_internal)
1271 return 1;
1272
1273 /* RFC 4271 9.1.2.2. e) Compare IGP metrics */
1274 n = new_bgp->cf->igp_metric ? new->attrs->igp_metric : 0;
1275 o = old_bgp->cf->igp_metric ? old->attrs->igp_metric : 0;
1276 if (n < o)
1277 return 1;
1278 if (n > o)
1279 return 0;
1280
1281 /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */
1282 /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighor ID */
1283 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
1284 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
1285 n = x ? x->u.data : new_bgp->remote_id;
1286 o = y ? y->u.data : old_bgp->remote_id;
1287
1288 /* RFC 5004 - prefer older routes */
1289 /* (if both are external and from different peer) */
1290 if ((new_bgp->cf->prefer_older || old_bgp->cf->prefer_older) &&
1291 !new_bgp->is_internal && n != o)
1292 return 0;
1293
1294 /* rest of RFC 4271 9.1.2.2. f) */
1295 if (n < o)
1296 return 1;
1297 if (n > o)
1298 return 0;
1299
1300 /* RFC 4456 9. b) Compare cluster list lengths */
1301 x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
1302 y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
1303 n = x ? int_set_get_size(x->u.ptr) : 0;
1304 o = y ? int_set_get_size(y->u.ptr) : 0;
1305 if (n < o)
1306 return 1;
1307 if (n > o)
1308 return 0;
1309
1310 /* RFC 4271 9.1.2.2. g) Compare peer IP adresses */
1311 return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0);
1312 }
1313
1314
1315 static inline int
1316 same_group(rte *r, u32 lpref, u32 lasn)
1317 {
1318 return (r->pref == lpref) && (bgp_get_neighbor(r) == lasn);
1319 }
1320
1321 static inline int
1322 use_deterministic_med(rte *r)
1323 {
1324 struct proto *P = r->attrs->src->proto;
1325 return (P->proto == &proto_bgp) && ((struct bgp_proto *) P)->cf->deterministic_med;
1326 }
1327
1328 int
1329 bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
1330 {
1331 rte *r, *s;
1332 rte *key = new ? new : old;
1333 u32 lpref = key->pref;
1334 u32 lasn = bgp_get_neighbor(key);
1335 int old_is_group_best = 0;
1336
1337 /*
1338 * Proper RFC 4271 path selection is a bit complicated, it cannot be
1339 * implemented just by rte_better(), because it is not a linear
1340 * ordering. But it can be splitted to two levels, where the lower
1341 * level chooses the best routes in each group of routes from the
1342 * same neighboring AS and higher level chooses the best route (with
1343 * a slightly different ordering) between the best-in-group routes.
1344 *
1345 * When deterministic_med is disabled, we just ignore this issue and
1346 * choose the best route by bgp_rte_better() alone. If enabled, the
1347 * lower level of the route selection is done here (for the group
1348 * to which the changed route belongs), all routes in group are
1349 * marked as suppressed, just chosen best-in-group is not.
1350 *
1351 * Global best route selection then implements higher level by
1352 * choosing between non-suppressed routes (as they are always
1353 * preferred over suppressed routes). Routes from BGP protocols
1354 * that do not set deterministic_med are just never suppressed. As
1355 * they do not participate in the lower level selection, it is OK
1356 * that this fn is not called for them.
1357 *
1358 * The idea is simple, the implementation is more problematic,
1359 * mostly because of optimizations in rte_recalculate() that
1360 * avoids full recalculation in most cases.
1361 *
1362 * We can assume that at least one of new, old is non-NULL and both
1363 * are from the same protocol with enabled deterministic_med. We
1364 * group routes by both neighbor AS (lasn) and preference (lpref),
1365 * because bgp_rte_better() does not handle preference itself.
1366 */
1367
1368 /* If new and old are from different groups, we just process that
1369 as two independent events */
1370 if (new && old && !same_group(old, lpref, lasn))
1371 {
1372 int i1, i2;
1373 i1 = bgp_rte_recalculate(table, net, NULL, old, old_best);
1374 i2 = bgp_rte_recalculate(table, net, new, NULL, old_best);
1375 return i1 || i2;
1376 }
1377
1378 /*
1379 * We could find the best-in-group and then make some shortcuts like
1380 * in rte_recalculate, but as we would have to walk through all
1381 * net->routes just to find it, it is probably not worth. So we
1382 * just have two simpler fast cases that use just the old route.
1383 * We also set suppressed flag to avoid using it in bgp_rte_better().
1384 */
1385
1386 if (new)
1387 new->u.bgp.suppressed = 1;
1388
1389 if (old)
1390 {
1391 old_is_group_best = !old->u.bgp.suppressed;
1392 old->u.bgp.suppressed = 1;
1393 int new_is_better = new && bgp_rte_better(new, old);
1394
1395 /* The first case - replace not best with worse (or remove not best) */
1396 if (!old_is_group_best && !new_is_better)
1397 return 0;
1398
1399 /* The second case - replace the best with better */
1400 if (old_is_group_best && new_is_better)
1401 {
1402 /* new is best-in-group, the see discussion below - this is
1403 a special variant of NBG && OBG. From OBG we can deduce
1404 that same_group(old_best) iff (old == old_best) */
1405 new->u.bgp.suppressed = 0;
1406 return (old == old_best);
1407 }
1408 }
1409
1410 /* The default case - find a new best-in-group route */
1411 r = new; /* new may not be in the list */
1412 for (s=net->routes; rte_is_valid(s); s=s->next)
1413 if (use_deterministic_med(s) && same_group(s, lpref, lasn))
1414 {
1415 s->u.bgp.suppressed = 1;
1416 if (!r || bgp_rte_better(s, r))
1417 r = s;
1418 }
1419
1420 /* Simple case - the last route in group disappears */
1421 if (!r)
1422 return 0;
1423
1424 /* Found best-in-group */
1425 r->u.bgp.suppressed = 0;
1426
1427 /*
1428 * There are generally two reasons why we have to force
1429 * recalculation (return 1): First, the new route may be wrongfully
1430 * chosen to be the best in the first case check in
1431 * rte_recalculate(), this may happen only if old_best is from the
1432 * same group. Second, another (different than new route)
1433 * best-in-group is chosen and that may be the proper best (although
1434 * rte_recalculate() without ignore that possibility).
1435 *
1436 * There are three possible cases according to whether the old route
1437 * was the best in group (OBG, stored in old_is_group_best) and
1438 * whether the new route is the best in group (NBG, tested by r == new).
1439 * These cases work even if old or new is NULL.
1440 *
1441 * NBG -> new is a possible candidate for the best route, so we just
1442 * check for the first reason using same_group().
1443 *
1444 * !NBG && OBG -> Second reason applies, return 1
1445 *
1446 * !NBG && !OBG -> Best in group does not change, old != old_best,
1447 * rte_better(new, old_best) is false and therefore
1448 * the first reason does not apply, return 0
1449 */
1450
1451 if (r == new)
1452 return old_best && same_group(old_best, lpref, lasn);
1453 else
1454 return old_is_group_best;
1455 }
1456
1457 static struct adata *
1458 bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool)
1459 {
1460 struct adata *newa = lp_alloc(pool, sizeof(struct adata) + 8);
1461 newa->length = 8;
1462 aggregator_convert_to_new(old, newa->data);
1463 return newa;
1464 }
1465
1466
1467 /* Take last req_as ASNs from path old2 (in 2B format), convert to 4B format
1468 * and append path old4 (in 4B format).
1469 */
1470 static struct adata *
1471 bgp_merge_as_paths(struct adata *old2, struct adata *old4, int req_as, struct linpool *pool)
1472 {
1473 byte buf[old2->length * 2];
1474
1475 int ol = as_path_convert_to_new(old2, buf, req_as);
1476 int nl = ol + (old4 ? old4->length : 0);
1477
1478 struct adata *newa = lp_alloc(pool, sizeof(struct adata) + nl);
1479 newa->length = nl;
1480 memcpy(newa->data, buf, ol);
1481 if (old4) memcpy(newa->data + ol, old4->data, old4->length);
1482
1483 return newa;
1484 }
1485
1486 static int
1487 as4_aggregator_valid(struct adata *aggr)
1488 {
1489 return aggr->length == 8;
1490 }
1491
1492
1493 /* Reconstruct 4B AS_PATH and AGGREGATOR according to RFC 4893 4.2.3 */
1494 static void
1495 bgp_reconstruct_4b_atts(struct bgp_proto *p, rta *a, struct linpool *pool)
1496 {
1497 eattr *p2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1498 eattr *p4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_PATH));
1499 eattr *a2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AGGREGATOR));
1500 eattr *a4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR));
1501 int a4_removed = 0;
1502
1503 if (a4 && !as4_aggregator_valid(a4->u.ptr))
1504 {
1505 log(L_WARN "%s: AS4_AGGREGATOR attribute is invalid, skipping attribute", p->p.name);
1506 a4 = NULL;
1507 a4_removed = 1;
1508 }
1509
1510 if (a2)
1511 {
1512 u32 a2_as = get_u16(a2->u.ptr->data);
1513
1514 if (a4)
1515 {
1516 if (a2_as != AS_TRANS)
1517 {
1518 /* Routes were aggregated by old router and therefore AS4_PATH
1519 * and AS4_AGGREGATOR is invalid
1520 *
1521 * Convert AS_PATH and AGGREGATOR to 4B format and finish.
1522 */
1523
1524 a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool);
1525 p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool);
1526
1527 return;
1528 }
1529 else
1530 {
1531 /* Common case, use AS4_AGGREGATOR attribute */
1532 a2->u.ptr = a4->u.ptr;
1533 }
1534 }
1535 else
1536 {
1537 /* Common case, use old AGGREGATOR attribute */
1538 a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool);
1539
1540 if ((a2_as == AS_TRANS) && !a4_removed)
1541 log(L_WARN "%s: AGGREGATOR attribute contain AS_TRANS, but AS4_AGGREGATOR is missing", p->p.name);
1542 }
1543 }
1544 else
1545 if (a4)
1546 log(L_WARN "%s: AS4_AGGREGATOR attribute received, but AGGREGATOR attribute is missing", p->p.name);
1547
1548 int p2_len = as_path_getlen_int(p2->u.ptr, 2);
1549 int p4_len = p4 ? validate_as4_path(p, p4->u.ptr) : -1;
1550
1551 if (p4 && (p4_len < 0))
1552 log(L_WARN "%s: AS4_PATH attribute is malformed, skipping attribute", p->p.name);
1553
1554 if ((p4_len <= 0) || (p2_len < p4_len))
1555 p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool);
1556 else
1557 p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, p4->u.ptr, p2_len - p4_len, pool);
1558 }
1559
1560 static void
1561 bgp_remove_as4_attrs(struct bgp_proto *p, rta *a)
1562 {
1563 unsigned id1 = EA_CODE(EAP_BGP, BA_AS4_PATH);
1564 unsigned id2 = EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR);
1565 ea_list **el = &(a->eattrs);
1566
1567 /* We know that ea_lists constructed in bgp_decode attrs have one attribute per ea_list struct */
1568 while (*el != NULL)
1569 {
1570 unsigned fid = (*el)->attrs[0].id;
1571
1572 if ((fid == id1) || (fid == id2))
1573 {
1574 *el = (*el)->next;
1575 if (p->as4_session)
1576 log(L_WARN "%s: Unexpected AS4_* attributes received", p->p.name);
1577 }
1578 else
1579 el = &((*el)->next);
1580 }
1581 }
1582
1583 /**
1584 * bgp_decode_attrs - check and decode BGP attributes
1585 * @conn: connection
1586 * @attr: start of attribute block
1587 * @len: length of attribute block
1588 * @pool: linear pool to make all the allocations in
1589 * @mandatory: 1 iff presence of mandatory attributes has to be checked
1590 *
1591 * This function takes a BGP attribute block (a part of an Update message), checks
1592 * its consistency and converts it to a list of BIRD route attributes represented
1593 * by a &rta.
1594 */
1595 struct rta *
1596 bgp_decode_attrs(struct bgp_conn *conn, byte *attr, unsigned int len, struct linpool *pool, int mandatory)
1597 {
1598 struct bgp_proto *bgp = conn->bgp;
1599 rta *a = lp_alloc(pool, sizeof(struct rta));
1600 unsigned int flags, code, l, i, type;
1601 int errcode;
1602 byte *z, *attr_start;
1603 byte seen[256/8];
1604 ea_list *ea;
1605 struct adata *ad;
1606 int withdraw = 0;
1607
1608 bzero(a, sizeof(rta));
1609 a->source = RTS_BGP;
1610 a->scope = SCOPE_UNIVERSE;
1611 a->cast = RTC_UNICAST;
1612 /* a->dest = RTD_ROUTER; -- set in bgp_set_next_hop() */
1613 a->from = bgp->cf->remote_ip;
1614
1615 /* Parse the attributes */
1616 bzero(seen, sizeof(seen));
1617 DBG("BGP: Parsing attributes\n");
1618 while (len)
1619 {
1620 if (len < 2)
1621 goto malformed;
1622 attr_start = attr;
1623 flags = *attr++;
1624 code = *attr++;
1625 len -= 2;
1626 if (flags & BAF_EXT_LEN)
1627 {
1628 if (len < 2)
1629 goto malformed;
1630 l = get_u16(attr);
1631 attr += 2;
1632 len -= 2;
1633 }
1634 else
1635 {
1636 if (len < 1)
1637 goto malformed;
1638 l = *attr++;
1639 len--;
1640 }
1641 if (l > len)
1642 goto malformed;
1643 len -= l;
1644 z = attr;
1645 attr += l;
1646 DBG("Attr %02x %02x %d\n", code, flags, l);
1647 if (seen[code/8] & (1 << (code%8)))
1648 goto malformed;
1649 if (ATTR_KNOWN(code))
1650 {
1651 struct attr_desc *desc = &bgp_attr_table[code];
1652 if (desc->expected_length >= 0 && desc->expected_length != (int) l)
1653 { errcode = 5; goto err; }
1654 if ((desc->expected_flags ^ flags) & (BAF_OPTIONAL | BAF_TRANSITIVE))
1655 { errcode = 4; goto err; }
1656 if (!desc->allow_in_ebgp && !bgp->is_internal)
1657 continue;
1658 if (desc->validate)
1659 {
1660 errcode = desc->validate(bgp, z, l);
1661 if (errcode > 0)
1662 goto err;
1663 if (errcode == IGNORE)
1664 continue;
1665 if (errcode <= WITHDRAW)
1666 {
1667 log(L_WARN "%s: Attribute %s is malformed, withdrawing update",
1668 bgp->p.name, desc->name);
1669 withdraw = 1;
1670 }
1671 }
1672 else if (code == BA_AS_PATH)
1673 {
1674 /* Special case as it might also trim the attribute */
1675 if (validate_as_path(bgp, z, &l) < 0)
1676 { errcode = 11; goto err; }
1677 }
1678 type = desc->type;
1679 }
1680 else /* Unknown attribute */
1681 {
1682 if (!(flags & BAF_OPTIONAL))
1683 { errcode = 2; goto err; }
1684 type = EAF_TYPE_OPAQUE;
1685 }
1686
1687 // Only OPTIONAL and TRANSITIVE attributes may have non-zero PARTIAL flag
1688 // if (!((flags & BAF_OPTIONAL) && (flags & BAF_TRANSITIVE)) && (flags & BAF_PARTIAL))
1689 // { errcode = 4; goto err; }
1690
1691 seen[code/8] |= (1 << (code%8));
1692 ea = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
1693 ea->next = a->eattrs;
1694 a->eattrs = ea;
1695 ea->flags = 0;
1696 ea->count = 1;
1697 ea->attrs[0].id = EA_CODE(EAP_BGP, code);
1698 ea->attrs[0].flags = flags;
1699 ea->attrs[0].type = type;
1700 if (type & EAF_EMBEDDED)
1701 ad = NULL;
1702 else
1703 {
1704 ad = lp_alloc(pool, sizeof(struct adata) + l);
1705 ea->attrs[0].u.ptr = ad;
1706 ad->length = l;
1707 memcpy(ad->data, z, l);
1708 }
1709 switch (type)
1710 {
1711 case EAF_TYPE_ROUTER_ID:
1712 case EAF_TYPE_INT:
1713 if (l == 1)
1714 ea->attrs[0].u.data = *z;
1715 else
1716 ea->attrs[0].u.data = get_u32(z);
1717 break;
1718 case EAF_TYPE_IP_ADDRESS:
1719 ipa_ntoh(*(ip_addr *)ad->data);
1720 break;
1721 case EAF_TYPE_INT_SET:
1722 case EAF_TYPE_EC_SET:
1723 {
1724 u32 *z = (u32 *) ad->data;
1725 for(i=0; i<ad->length/4; i++)
1726 z[i] = ntohl(z[i]);
1727 break;
1728 }
1729 }
1730 }
1731
1732 if (withdraw)
1733 goto withdraw;
1734
1735 #ifdef IPV6
1736 /* If we received MP_REACH_NLRI we should check mandatory attributes */
1737 if (bgp->mp_reach_len != 0)
1738 mandatory = 1;
1739 #endif
1740
1741 /* If there is no (reachability) NLRI, we should exit now */
1742 if (! mandatory)
1743 return a;
1744
1745 /* Check if all mandatory attributes are present */
1746 for(i=0; i < ARRAY_SIZE(bgp_mandatory_attrs); i++)
1747 {
1748 code = bgp_mandatory_attrs[i];
1749 if (!(seen[code/8] & (1 << (code%8))))
1750 {
1751 bgp_error(conn, 3, 3, &bgp_mandatory_attrs[i], 1);
1752 return NULL;
1753 }
1754 }
1755
1756 /* When receiving attributes from non-AS4-aware BGP speaker,
1757 * we have to reconstruct 4B AS_PATH and AGGREGATOR attributes
1758 */
1759 if (! bgp->as4_session)
1760 bgp_reconstruct_4b_atts(bgp, a, pool);
1761
1762 bgp_remove_as4_attrs(bgp, a);
1763
1764 /* If the AS path attribute contains our AS, reject the routes */
1765 if (bgp_as_path_loopy(bgp, a))
1766 goto withdraw;
1767
1768 /* Two checks for IBGP loops caused by route reflection, RFC 4456 */
1769 if (bgp_originator_id_loopy(bgp, a) ||
1770 bgp_cluster_list_loopy(bgp, a))
1771 goto withdraw;
1772
1773 /* If there's no local preference, define one */
1774 if (!(seen[0] & (1 << BA_LOCAL_PREF)))
1775 bgp_attach_attr(&a->eattrs, pool, BA_LOCAL_PREF, bgp->cf->default_local_pref);
1776
1777 return a;
1778
1779 withdraw:
1780 return NULL;
1781
1782 malformed:
1783 bgp_error(conn, 3, 1, NULL, 0);
1784 return NULL;
1785
1786 err:
1787 bgp_error(conn, 3, errcode, attr_start, z+l-attr_start);
1788 return NULL;
1789 }
1790
1791 int
1792 bgp_get_attr(eattr *a, byte *buf, int buflen)
1793 {
1794 unsigned int i = EA_ID(a->id);
1795 struct attr_desc *d;
1796 int len;
1797
1798 if (ATTR_KNOWN(i))
1799 {
1800 d = &bgp_attr_table[i];
1801 len = bsprintf(buf, "%s", d->name);
1802 buf += len;
1803 if (d->format)
1804 {
1805 *buf++ = ':';
1806 *buf++ = ' ';
1807 d->format(a, buf, buflen - len - 2);
1808 return GA_FULL;
1809 }
1810 return GA_NAME;
1811 }
1812 bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : "");
1813 return GA_NAME;
1814 }
1815
1816 void
1817 bgp_init_bucket_table(struct bgp_proto *p)
1818 {
1819 p->hash_size = 256;
1820 p->hash_limit = p->hash_size * 4;
1821 p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *));
1822 init_list(&p->bucket_queue);
1823 p->withdraw_bucket = NULL;
1824 // fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix);
1825 }
1826
1827 void
1828 bgp_get_route_info(rte *e, byte *buf, ea_list *attrs)
1829 {
1830 eattr *p = ea_find(attrs, EA_CODE(EAP_BGP, BA_AS_PATH));
1831 eattr *o = ea_find(attrs, EA_CODE(EAP_BGP, BA_ORIGIN));
1832 u32 origas;
1833
1834 buf += bsprintf(buf, " (%d", e->pref);
1835
1836 if (e->u.bgp.suppressed)
1837 buf += bsprintf(buf, "-");
1838
1839 if (e->attrs->hostentry)
1840 {
1841 if (!rte_resolvable(e))
1842 buf += bsprintf(buf, "/-");
1843 else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN)
1844 buf += bsprintf(buf, "/?");
1845 else
1846 buf += bsprintf(buf, "/%d", e->attrs->igp_metric);
1847 }
1848 buf += bsprintf(buf, ") [");
1849
1850 if (p && as_path_get_last(p->u.ptr, &origas))
1851 buf += bsprintf(buf, "AS%u", origas);
1852 if (o)
1853 buf += bsprintf(buf, "%c", "ie?"[o->u.data]);
1854 strcpy(buf, "]");
1855 }