2 * BIRD -- BGP Attributes
4 * (c) 2000 Martin Mares <mj@ucw.cz>
6 * Can be freely distributed and used under the terms of the GNU GPL.
13 #include "nest/bird.h"
14 #include "nest/iface.h"
15 #include "nest/protocol.h"
16 #include "nest/route.h"
17 #include "nest/attrs.h"
18 #include "conf/conf.h"
19 #include "lib/resource.h"
20 #include "lib/string.h"
21 #include "lib/unaligned.h"
26 * UPDATE message error handling
28 * All checks from RFC 4271 6.3 are done as specified with these exceptions:
29 * - The semantic check of an IP address from NEXT_HOP attribute is missing.
30 * - Checks of some optional attribute values are missing.
31 * - Syntactic and semantic checks of NLRIs (done in DECODE_PREFIX())
32 * are probably inadequate.
34 * Loop detection based on AS_PATH causes updates to be withdrawn. RFC
35 * 4271 does not explicitly specify the behavior in that case.
37 * Loop detection related to route reflection (based on ORIGINATOR_ID
38 * and CLUSTER_LIST) causes updates to be withdrawn. RFC 4456 8
39 * specifies that such updates should be ignored, but that is generally
42 * Error checking of optional transitive attributes is done according to
43 * draft-ietf-idr-optional-transitive-03, but errors are handled always
46 * Unexpected AS_CONFED_* segments in AS_PATH are logged and removed,
47 * but unknown segments cause a session drop with Malformed AS_PATH
48 * error (see validate_path()). The behavior in such case is not
49 * explicitly specified by RFC 4271. RFC 5065 specifies that
50 * inconsistent AS_CONFED_* segments should cause a session drop, but
51 * implementations that pass invalid AS_CONFED_* segments are
54 * Error handling of AS4_* attributes is done as specified by RFC 6793. There
55 * are several possible inconsistencies between AGGREGATOR and AS4_AGGREGATOR
56 * that are not handled by that RFC, these are logged and ignored (see
57 * bgp_reconstruct_4b_attrs()).
61 static byte bgp_mandatory_attrs
[] = { BA_ORIGIN
, BA_AS_PATH
73 int (*validate
)(struct bgp_proto
*p
, byte
*attr
, int len
);
74 void (*format
)(eattr
*ea
, byte
*buf
, int buflen
);
81 bgp_check_origin(struct bgp_proto
*p UNUSED
, byte
*a
, int len UNUSED
)
89 bgp_format_origin(eattr
*a
, byte
*buf
, int buflen UNUSED
)
91 static char *bgp_origin_names
[] = { "IGP", "EGP", "Incomplete" };
93 bsprintf(buf
, bgp_origin_names
[a
->u
.data
]);
97 path_segment_contains(byte
*p
, int bs
, u32 asn
)
105 u32 asn2
= (bs
== 4) ? get_u32(p
) : get_u16(p
);
114 /* Validates path attribute, removes AS_CONFED_* segments, and also returns path length */
116 validate_path(struct bgp_proto
*p
, int as_path
, int bs
, byte
*idata
, uint
*ilength
)
130 plen
= 2 + bs
* a
[1];
136 log(L_WARN
"%s: %s_PATH attribute contains empty segment, skipping it",
137 p
->p
.name
, as_path
? "AS" : "AS4");
147 case AS_PATH_SEQUENCE
:
151 case AS_PATH_CONFED_SEQUENCE
:
152 case AS_PATH_CONFED_SET
:
153 if (as_path
&& path_segment_contains(a
, bs
, p
->remote_as
))
155 log(L_WARN
"%s: AS_CONFED_* segment with peer ASN found, misconfigured confederation?", p
->p
.name
);
159 log(L_WARN
"%s: %s_PATH attribute contains AS_CONFED_* segment, skipping segment",
160 p
->p
.name
, as_path
? "AS" : "AS4");
168 memmove(dst
, a
, plen
);
176 *ilength
= dst
- idata
;
181 validate_as_path(struct bgp_proto
*p
, byte
*a
, int *len
)
183 return validate_path(p
, 1, p
->as4_session
? 4 : 2, a
, len
);
187 validate_as4_path(struct bgp_proto
*p
, struct adata
*path
)
189 return validate_path(p
, 0, 4, path
->data
, &path
->length
);
193 bgp_check_next_hop(struct bgp_proto
*p UNUSED
, byte
*a UNUSED6
, int len UNUSED6
)
200 memcpy(&addr
, a
, len
);
202 if (ipa_classify(addr
) & IADDR_HOST
)
210 bgp_format_next_hop(eattr
*a
, byte
*buf
, int buflen UNUSED
)
212 ip_addr
*ipp
= (ip_addr
*) a
->u
.ptr
->data
;
214 /* in IPv6, we might have two addresses in NEXT HOP */
215 if ((a
->u
.ptr
->length
== NEXT_HOP_LENGTH
) && ipa_nonzero(ipp
[1]))
217 bsprintf(buf
, "%I %I", ipp
[0], ipp
[1]);
222 bsprintf(buf
, "%I", ipp
[0]);
226 bgp_check_aggregator(struct bgp_proto
*p
, byte
*a UNUSED
, int len
)
228 int exp_len
= p
->as4_session
? 8 : 6;
230 return (len
== exp_len
) ? 0 : WITHDRAW
;
234 bgp_format_aggregator(eattr
*a
, byte
*buf
, int buflen UNUSED
)
236 struct adata
*ad
= a
->u
.ptr
;
237 byte
*data
= ad
->data
;
243 bsprintf(buf
, "%d.%d.%d.%d AS%u", data
[0], data
[1], data
[2], data
[3], as
);
247 bgp_check_community(struct bgp_proto
*p UNUSED
, byte
*a UNUSED
, int len
)
249 return ((len
% 4) == 0) ? 0 : WITHDRAW
;
253 bgp_check_cluster_list(struct bgp_proto
*p UNUSED
, byte
*a UNUSED
, int len
)
255 return ((len
% 4) == 0) ? 0 : 5;
259 bgp_format_cluster_list(eattr
*a
, byte
*buf
, int buflen
)
261 /* Truncates cluster lists larger than buflen, probably not a problem */
262 int_set_format(a
->u
.ptr
, 0, -1, buf
, buflen
);
266 bgp_check_reach_nlri(struct bgp_proto
*p UNUSED
, byte
*a UNUSED
, int len UNUSED
)
269 p
->mp_reach_start
= a
;
270 p
->mp_reach_len
= len
;
276 bgp_check_unreach_nlri(struct bgp_proto
*p UNUSED
, byte
*a UNUSED
, int len UNUSED
)
279 p
->mp_unreach_start
= a
;
280 p
->mp_unreach_len
= len
;
286 bgp_check_ext_community(struct bgp_proto
*p UNUSED
, byte
*a UNUSED
, int len
)
288 return ((len
% 8) == 0) ? 0 : WITHDRAW
;
292 bgp_check_large_community(struct bgp_proto
*p UNUSED
, byte
*a UNUSED
, int len
)
294 return ((len
% 12) == 0) ? 0 : WITHDRAW
;
298 static struct attr_desc bgp_attr_table
[] = {
299 { NULL
, -1, 0, 0, 0, /* Undefined */
301 { "origin", 1, BAF_TRANSITIVE
, EAF_TYPE_INT
, 1, /* BA_ORIGIN */
302 bgp_check_origin
, bgp_format_origin
},
303 { "as_path", -1, BAF_TRANSITIVE
, EAF_TYPE_AS_PATH
, 1, /* BA_AS_PATH */
304 NULL
, NULL
}, /* is checked by validate_as_path() as a special case */
305 { "next_hop", 4, BAF_TRANSITIVE
, EAF_TYPE_IP_ADDRESS
, 1, /* BA_NEXT_HOP */
306 bgp_check_next_hop
, bgp_format_next_hop
},
307 { "med", 4, BAF_OPTIONAL
, EAF_TYPE_INT
, 1, /* BA_MULTI_EXIT_DISC */
309 { "local_pref", 4, BAF_TRANSITIVE
, EAF_TYPE_INT
, 1, /* BA_LOCAL_PREF */
311 { "atomic_aggr", 0, BAF_TRANSITIVE
, EAF_TYPE_OPAQUE
, 1, /* BA_ATOMIC_AGGR */
313 { "aggregator", -1, BAF_OPTIONAL
| BAF_TRANSITIVE
, EAF_TYPE_OPAQUE
, 1, /* BA_AGGREGATOR */
314 bgp_check_aggregator
, bgp_format_aggregator
},
315 { "community", -1, BAF_OPTIONAL
| BAF_TRANSITIVE
, EAF_TYPE_INT_SET
, 1, /* BA_COMMUNITY */
316 bgp_check_community
, NULL
},
317 { "originator_id", 4, BAF_OPTIONAL
, EAF_TYPE_ROUTER_ID
, 0, /* BA_ORIGINATOR_ID */
319 { "cluster_list", -1, BAF_OPTIONAL
, EAF_TYPE_INT_SET
, 0, /* BA_CLUSTER_LIST */
320 bgp_check_cluster_list
, bgp_format_cluster_list
},
321 { .name
= NULL
}, /* BA_DPA */
322 { .name
= NULL
}, /* BA_ADVERTISER */
323 { .name
= NULL
}, /* BA_RCID_PATH */
324 { "mp_reach_nlri", -1, BAF_OPTIONAL
, EAF_TYPE_OPAQUE
, 1, /* BA_MP_REACH_NLRI */
325 bgp_check_reach_nlri
, NULL
},
326 { "mp_unreach_nlri", -1, BAF_OPTIONAL
, EAF_TYPE_OPAQUE
, 1, /* BA_MP_UNREACH_NLRI */
327 bgp_check_unreach_nlri
, NULL
},
328 { "ext_community", -1, BAF_OPTIONAL
| BAF_TRANSITIVE
, EAF_TYPE_EC_SET
, 1, /* BA_EXT_COMMUNITY */
329 bgp_check_ext_community
, NULL
},
330 { "as4_path", -1, BAF_OPTIONAL
| BAF_TRANSITIVE
, EAF_TYPE_OPAQUE
, 1, /* BA_AS4_PATH */
332 { "as4_aggregator", -1, BAF_OPTIONAL
| BAF_TRANSITIVE
, EAF_TYPE_OPAQUE
, 1, /* BA_AS4_AGGREGATOR */
334 [BA_LARGE_COMMUNITY
] =
335 { "large_community", -1, BAF_OPTIONAL
| BAF_TRANSITIVE
, EAF_TYPE_LC_SET
, 1,
336 bgp_check_large_community
, NULL
}
339 /* BA_AS4_PATH is type EAF_TYPE_OPAQUE and not type EAF_TYPE_AS_PATH.
340 * It does not matter as this attribute does not appear on routes in the routing table.
343 #define ATTR_KNOWN(code) ((code) < ARRAY_SIZE(bgp_attr_table) && bgp_attr_table[code].name)
345 static inline struct adata
*
346 bgp_alloc_adata(struct linpool
*pool
, unsigned len
)
348 struct adata
*ad
= lp_alloc(pool
, sizeof(struct adata
) + len
);
354 bgp_set_attr(eattr
*e
, unsigned attr
, uintptr_t val
)
356 ASSERT(ATTR_KNOWN(attr
));
357 e
->id
= EA_CODE(EAP_BGP
, attr
);
358 e
->type
= bgp_attr_table
[attr
].type
;
359 e
->flags
= bgp_attr_table
[attr
].expected_flags
;
360 if (e
->type
& EAF_EMBEDDED
)
363 e
->u
.ptr
= (struct adata
*) val
;
367 bgp_set_attr_wa(eattr
*e
, struct linpool
*pool
, unsigned attr
, unsigned len
)
369 struct adata
*ad
= bgp_alloc_adata(pool
, len
);
370 bgp_set_attr(e
, attr
, (uintptr_t) ad
);
375 bgp_attach_attr(ea_list
**to
, struct linpool
*pool
, unsigned attr
, uintptr_t val
)
377 ea_list
*a
= lp_alloc(pool
, sizeof(ea_list
) + sizeof(eattr
));
380 a
->flags
= EALF_SORTED
;
382 bgp_set_attr(a
->attrs
, attr
, val
);
386 bgp_attach_attr_wa(ea_list
**to
, struct linpool
*pool
, unsigned attr
, unsigned len
)
388 struct adata
*ad
= bgp_alloc_adata(pool
, len
);
389 bgp_attach_attr(to
, pool
, attr
, (uintptr_t) ad
);
394 bgp_encode_attr_hdr(byte
*dst
, uint flags
, unsigned code
, int len
)
398 DBG("\tAttribute %02x (%d bytes, flags %02x)\n", code
, len
, flags
);
409 *dst
++ = flags
| BAF_EXT_LEN
;
419 aggregator_convert_to_old(struct adata
*aggr
, byte
*dst
, int *new_used
)
421 byte
*src
= aggr
->data
;
424 u32 as
= get_u32(src
);
432 /* Copy IPv4 address */
433 memcpy(dst
+ 2, src
+ 4, 4);
437 aggregator_convert_to_new(struct adata
*aggr
, byte
*dst
)
439 byte
*src
= aggr
->data
;
441 u32 as
= get_u16(src
);
444 /* Copy IPv4 address */
445 memcpy(dst
+ 4, src
+ 2, 4);
449 bgp_get_attr_len(eattr
*a
)
452 if (ATTR_KNOWN(EA_ID(a
->id
)))
454 int code
= EA_ID(a
->id
);
455 struct attr_desc
*desc
= &bgp_attr_table
[code
];
456 len
= desc
->expected_length
;
459 ASSERT(!(a
->type
& EAF_EMBEDDED
));
460 len
= a
->u
.ptr
->length
;
465 ASSERT((a
->type
& EAF_TYPE_MASK
) == EAF_TYPE_OPAQUE
);
466 len
= a
->u
.ptr
->length
;
473 * bgp_encode_attrs - encode BGP attributes
476 * @attrs: a list of extended attributes
477 * @remains: remaining space in the buffer
479 * The bgp_encode_attrs() function takes a list of extended attributes
480 * and converts it to its BGP representation (a part of an Update message).
482 * Result: Length of the attribute block generated or -1 if not enough space.
485 bgp_encode_attrs(struct bgp_proto
*p
, byte
*w
, ea_list
*attrs
, int remains
)
487 uint i
, code
, type
, flags
;
491 for(i
=0; i
<attrs
->count
; i
++)
493 eattr
*a
= &attrs
->attrs
[i
];
494 ASSERT(EA_PROTO(a
->id
) == EAP_BGP
);
498 /* When talking multiprotocol BGP, the NEXT_HOP attributes are used only temporarily. */
499 if (code
== BA_NEXT_HOP
)
503 /* When AS4-aware BGP speaker is talking to non-AS4-aware BGP speaker,
504 * we have to convert our 4B AS_PATH to 2B AS_PATH and send our AS_PATH
505 * as optional AS4_PATH attribute.
507 if ((code
== BA_AS_PATH
) && (! p
->as4_session
))
509 len
= a
->u
.ptr
->length
;
511 if (remains
< (len
+ 4))
514 /* Using a temporary buffer because we don't know the length of the created
515 * attribute and therefore the length of its header. Perhaps it would be
516 * better to always use BAF_EXT_LEN. */
520 int nl
= as_path_convert_to_old(a
->u
.ptr
, buf
, &new_used
);
522 DBG("BGP: Encoding old AS_PATH\n");
523 rv
= bgp_encode_attr_hdr(w
, BAF_TRANSITIVE
, BA_AS_PATH
, nl
);
524 ADVANCE(w
, remains
, rv
);
526 ADVANCE(w
, remains
, nl
);
531 if (remains
< (len
+ 4))
534 /* We should discard AS_CONFED_SEQUENCE or AS_CONFED_SET path segments
535 * here but we don't support confederations and such paths we already
536 * discarded in validate_as_path().
539 DBG("BGP: Encoding AS4_PATH\n");
540 rv
= bgp_encode_attr_hdr(w
, BAF_OPTIONAL
| BAF_TRANSITIVE
, BA_AS4_PATH
, len
);
541 ADVANCE(w
, remains
, rv
);
542 memcpy(w
, a
->u
.ptr
->data
, len
);
543 ADVANCE(w
, remains
, len
);
548 /* The same issue with AGGREGATOR attribute */
549 if ((code
== BA_AGGREGATOR
) && (! p
->as4_session
))
554 if (remains
< (len
+ 3))
557 rv
= bgp_encode_attr_hdr(w
, BAF_OPTIONAL
| BAF_TRANSITIVE
, BA_AGGREGATOR
, len
);
558 ADVANCE(w
, remains
, rv
);
559 aggregator_convert_to_old(a
->u
.ptr
, w
, &new_used
);
560 ADVANCE(w
, remains
, len
);
566 if (remains
< (len
+ 3))
569 rv
= bgp_encode_attr_hdr(w
, BAF_OPTIONAL
| BAF_TRANSITIVE
, BA_AS4_AGGREGATOR
, len
);
570 ADVANCE(w
, remains
, rv
);
571 memcpy(w
, a
->u
.ptr
->data
, len
);
572 ADVANCE(w
, remains
, len
);
577 /* Standard path continues here ... */
579 type
= a
->type
& EAF_TYPE_MASK
;
580 flags
= a
->flags
& (BAF_OPTIONAL
| BAF_TRANSITIVE
| BAF_PARTIAL
);
581 len
= bgp_get_attr_len(a
);
583 /* Skip empty sets */
584 if (((type
== EAF_TYPE_INT_SET
) || (type
== EAF_TYPE_EC_SET
) || (type
== EAF_TYPE_LC_SET
)) && (len
== 0))
587 if (remains
< len
+ 4)
590 rv
= bgp_encode_attr_hdr(w
, flags
, code
, len
);
591 ADVANCE(w
, remains
, rv
);
596 case EAF_TYPE_ROUTER_ID
:
598 put_u32(w
, a
->u
.data
);
602 case EAF_TYPE_IP_ADDRESS
:
604 ip_addr ip
= *(ip_addr
*)a
->u
.ptr
->data
;
609 case EAF_TYPE_INT_SET
:
610 case EAF_TYPE_LC_SET
:
611 case EAF_TYPE_EC_SET
:
613 u32
*z
= int_set_get_data(a
->u
.ptr
);
615 for(i
=0; i
<len
; i
+=4)
619 case EAF_TYPE_OPAQUE
:
620 case EAF_TYPE_AS_PATH
:
621 memcpy(w
, a
->u
.ptr
->data
, len
);
624 bug("bgp_encode_attrs: unknown attribute type %02x", a
->type
);
626 ADVANCE(w
, remains
, len
);
636 bgp_init_prefix(struct fib_node *N)
638 struct bgp_prefix *p = (struct bgp_prefix *) N;
639 p->bucket_node.next = NULL;
644 bgp_compare_u32(const u32
*x
, const u32
*y
)
646 return (*x
< *y
) ? -1 : (*x
> *y
) ? 1 : 0;
650 bgp_normalize_int_set(u32
*dest
, u32
*src
, unsigned cnt
)
652 memcpy(dest
, src
, sizeof(u32
) * cnt
);
653 qsort(dest
, cnt
, sizeof(u32
), (int(*)(const void *, const void *)) bgp_compare_u32
);
657 bgp_compare_ec(const u32
*xp
, const u32
*yp
)
659 u64 x
= ec_get(xp
, 0);
660 u64 y
= ec_get(yp
, 0);
661 return (x
< y
) ? -1 : (x
> y
) ? 1 : 0;
665 bgp_normalize_ec_set(struct adata
*ad
, u32
*src
, int internal
)
667 u32
*dst
= int_set_get_data(ad
);
669 /* Remove non-transitive communities (EC_TBIT active) on external sessions */
672 int len
= int_set_get_size(ad
);
676 for (i
=0; i
< len
; i
+= 2)
678 if (src
[i
] & EC_TBIT
)
685 ad
->length
= (t
- dst
) * 4;
688 memcpy(dst
, src
, ad
->length
);
690 qsort(dst
, ad
->length
/ 8, 8, (int(*)(const void *, const void *)) bgp_compare_ec
);
694 bgp_compare_lc(const u32
*x
, const u32
*y
)
697 return (x
[0] > y
[0]) ? 1 : -1;
699 return (x
[1] > y
[1]) ? 1 : -1;
701 return (x
[2] > y
[2]) ? 1 : -1;
706 bgp_normalize_lc_set(u32
*dest
, u32
*src
, unsigned cnt
)
708 memcpy(dest
, src
, LCOMM_LENGTH
* cnt
);
709 qsort(dest
, cnt
, LCOMM_LENGTH
, (int(*)(const void *, const void *)) bgp_compare_lc
);
713 bgp_rehash_buckets(struct bgp_proto
*p
)
715 struct bgp_bucket
**old
= p
->bucket_hash
;
716 struct bgp_bucket
**new;
717 unsigned oldn
= p
->hash_size
;
719 struct bgp_bucket
*b
;
721 p
->hash_size
= p
->hash_limit
;
722 DBG("BGP: Rehashing bucket table from %d to %d\n", oldn
, p
->hash_size
);
724 if (p
->hash_limit
>= 65536)
726 new = p
->bucket_hash
= mb_allocz(p
->p
.pool
, p
->hash_size
* sizeof(struct bgp_bucket
*));
727 mask
= p
->hash_size
- 1;
728 for (i
=0; i
<oldn
; i
++)
731 old
[i
] = b
->hash_next
;
733 b
->hash_next
= new[e
];
735 b
->hash_next
->hash_prev
= b
;
742 static struct bgp_bucket
*
743 bgp_new_bucket(struct bgp_proto
*p
, ea_list
*new, unsigned hash
)
745 struct bgp_bucket
*b
;
746 unsigned ea_size
= sizeof(ea_list
) + new->count
* sizeof(eattr
);
747 unsigned ea_size_aligned
= BIRD_ALIGN(ea_size
, CPU_STRUCT_ALIGN
);
748 unsigned size
= sizeof(struct bgp_bucket
) + ea_size_aligned
;
751 unsigned index
= hash
& (p
->hash_size
- 1);
753 /* Gather total size of non-inline attributes */
754 for (i
=0; i
<new->count
; i
++)
756 eattr
*a
= &new->attrs
[i
];
757 if (!(a
->type
& EAF_EMBEDDED
))
758 size
+= BIRD_ALIGN(sizeof(struct adata
) + a
->u
.ptr
->length
, CPU_STRUCT_ALIGN
);
761 /* Create the bucket and hash it */
762 b
= mb_alloc(p
->p
.pool
, size
);
763 b
->hash_next
= p
->bucket_hash
[index
];
765 b
->hash_next
->hash_prev
= b
;
766 p
->bucket_hash
[index
] = b
;
769 add_tail(&p
->bucket_queue
, &b
->send_node
);
770 init_list(&b
->prefixes
);
771 memcpy(b
->eattrs
, new, ea_size
);
772 dest
= ((byte
*)b
->eattrs
) + ea_size_aligned
;
774 /* Copy values of non-inline attributes */
775 for (i
=0; i
<new->count
; i
++)
777 eattr
*a
= &b
->eattrs
->attrs
[i
];
778 if (!(a
->type
& EAF_EMBEDDED
))
780 struct adata
*oa
= a
->u
.ptr
;
781 struct adata
*na
= (struct adata
*) dest
;
782 memcpy(na
, oa
, sizeof(struct adata
) + oa
->length
);
784 dest
+= BIRD_ALIGN(sizeof(struct adata
) + na
->length
, CPU_STRUCT_ALIGN
);
788 /* If needed, rehash */
790 if (p
->hash_count
> p
->hash_limit
)
791 bgp_rehash_buckets(p
);
796 static struct bgp_bucket
*
797 bgp_get_bucket(struct bgp_proto
*p
, net
*n
, ea_list
*attrs
, int originate
)
800 unsigned i
, cnt
, hash
, code
;
803 struct bgp_bucket
*b
;
805 /* Merge the attribute list */
806 new = alloca(ea_scan(attrs
));
807 ea_merge(attrs
, new);
810 /* Normalize attributes */
817 if (EA_PROTO(a
->id
) != EAP_BGP
)
820 if (ATTR_KNOWN(code
))
824 if (!bgp_attr_table
[code
].allow_in_ebgp
)
826 if ((code
== BA_LOCAL_PREF
) && !p
->cf
->allow_local_pref
)
829 /* The flags might have been zero if the attr was added by filters */
830 a
->flags
= (a
->flags
& BAF_PARTIAL
) | bgp_attr_table
[code
].expected_flags
;
836 /* Don't re-export unknown non-transitive attributes */
837 if (!(a
->flags
& BAF_TRANSITIVE
))
841 if ((d
->type
& EAF_ORIGINATED
) && !originate
&& (d
->flags
& BAF_TRANSITIVE
) && (d
->flags
& BAF_OPTIONAL
))
842 d
->flags
|= BAF_PARTIAL
;
843 switch (d
->type
& EAF_TYPE_MASK
)
845 case EAF_TYPE_INT_SET
:
847 struct adata
*z
= alloca(sizeof(struct adata
) + d
->u
.ptr
->length
);
848 z
->length
= d
->u
.ptr
->length
;
849 bgp_normalize_int_set((u32
*) z
->data
, (u32
*) d
->u
.ptr
->data
, z
->length
/ 4);
853 case EAF_TYPE_EC_SET
:
855 struct adata
*z
= alloca(sizeof(struct adata
) + d
->u
.ptr
->length
);
856 z
->length
= d
->u
.ptr
->length
;
857 bgp_normalize_ec_set(z
, (u32
*) d
->u
.ptr
->data
, p
->is_internal
);
861 case EAF_TYPE_LC_SET
:
863 struct adata
*z
= alloca(sizeof(struct adata
) + d
->u
.ptr
->length
);
864 z
->length
= d
->u
.ptr
->length
;
865 bgp_normalize_lc_set((u32
*) z
->data
, (u32
*) d
->u
.ptr
->data
, z
->length
/ LCOMM_LENGTH
);
877 for(b
=p
->bucket_hash
[hash
& (p
->hash_size
- 1)]; b
; b
=b
->hash_next
)
878 if (b
->hash
== hash
&& ea_same(b
->eattrs
, new))
880 DBG("Found bucket.\n");
884 /* Ensure that there are all mandatory attributes */
885 for(i
=0; i
<ARRAY_SIZE(bgp_mandatory_attrs
); i
++)
886 if (!(seen
& (1 << bgp_mandatory_attrs
[i
])))
888 log(L_ERR
"%s: Mandatory attribute %s missing in route %I/%d", p
->p
.name
, bgp_attr_table
[bgp_mandatory_attrs
[i
]].name
, n
->n
.prefix
, n
->n
.pxlen
);
892 /* Check if next hop is valid */
893 a
= ea_find(new, EA_CODE(EAP_BGP
, BA_NEXT_HOP
));
894 if (!a
|| ipa_equal(p
->cf
->remote_ip
, *(ip_addr
*)a
->u
.ptr
->data
))
896 log(L_ERR
"%s: Invalid NEXT_HOP attribute in route %I/%d", p
->p
.name
, n
->n
.prefix
, n
->n
.pxlen
);
900 /* Create new bucket */
901 DBG("Creating bucket.\n");
902 return bgp_new_bucket(p
, new, hash
);
906 bgp_free_bucket(struct bgp_proto
*p
, struct bgp_bucket
*buck
)
909 buck
->hash_next
->hash_prev
= buck
->hash_prev
;
911 buck
->hash_prev
->hash_next
= buck
->hash_next
;
913 p
->bucket_hash
[buck
->hash
& (p
->hash_size
-1)] = buck
->hash_next
;
918 /* Prefix hash table */
920 #define PXH_KEY(n1) n1->n.prefix, n1->n.pxlen, n1->path_id
921 #define PXH_NEXT(n) n->next
922 #define PXH_EQ(p1,l1,i1,p2,l2,i2) ipa_equal(p1, p2) && l1 == l2 && i1 == i2
923 #define PXH_FN(p,l,i) ipa_hash32(p) ^ u32_hash((l << 16) ^ i)
925 #define PXH_REHASH bgp_pxh_rehash
926 #define PXH_PARAMS /8, *2, 2, 2, 8, 20
929 HASH_DEFINE_REHASH_FN(PXH
, struct bgp_prefix
)
932 bgp_init_prefix_table(struct bgp_proto
*p
, u32 order
)
934 HASH_INIT(p
->prefix_hash
, p
->p
.pool
, order
);
936 p
->prefix_slab
= sl_new(p
->p
.pool
, sizeof(struct bgp_prefix
));
940 bgp_free_prefix_table(struct bgp_proto
*p
)
942 HASH_FREE(p
->prefix_hash
);
944 rfree(p
->prefix_slab
);
945 p
->prefix_slab
= NULL
;
948 static struct bgp_prefix
*
949 bgp_get_prefix(struct bgp_proto
*p
, ip_addr prefix
, int pxlen
, u32 path_id
)
951 struct bgp_prefix
*bp
= HASH_FIND(p
->prefix_hash
, PXH
, prefix
, pxlen
, path_id
);
956 bp
= sl_alloc(p
->prefix_slab
);
957 bp
->n
.prefix
= prefix
;
959 bp
->path_id
= path_id
;
960 bp
->bucket_node
.next
= NULL
;
962 HASH_INSERT2(p
->prefix_hash
, PXH
, p
->p
.pool
, bp
);
968 bgp_free_prefix(struct bgp_proto
*p
, struct bgp_prefix
*bp
)
970 HASH_REMOVE2(p
->prefix_hash
, PXH
, p
->p
.pool
, bp
);
971 sl_free(p
->prefix_slab
, bp
);
976 bgp_rt_notify(struct proto
*P
, rtable
*tbl UNUSED
, net
*n
, rte
*new, rte
*old UNUSED
, ea_list
*attrs
)
978 struct bgp_proto
*p
= (struct bgp_proto
*) P
;
979 struct bgp_bucket
*buck
;
980 struct bgp_prefix
*px
;
984 DBG("BGP: Got route %I/%d %s\n", n
->n
.prefix
, n
->n
.pxlen
, new ? "up" : "down");
989 buck
= bgp_get_bucket(p
, n
, attrs
, new->attrs
->source
!= RTS_BGP
);
990 if (!buck
) /* Inconsistent attribute list */
996 if (!(buck
= p
->withdraw_bucket
))
998 buck
= p
->withdraw_bucket
= mb_alloc(P
->pool
, sizeof(struct bgp_bucket
));
999 init_list(&buck
->prefixes
);
1002 path_id
= p
->add_path_tx
? key
->attrs
->src
->global_id
: 0;
1003 px
= bgp_get_prefix(p
, n
->n
.prefix
, n
->n
.pxlen
, path_id
);
1004 if (px
->bucket_node
.next
)
1006 DBG("\tRemoving old entry.\n");
1007 rem_node(&px
->bucket_node
);
1009 add_tail(&buck
->prefixes
, &px
->bucket_node
);
1010 bgp_schedule_packet(p
->conn
, PKT_UPDATE
);
1014 bgp_create_attrs(struct bgp_proto
*p
, rte
*e
, ea_list
**attrs
, struct linpool
*pool
)
1016 ea_list
*ea
= lp_alloc(pool
, sizeof(ea_list
) + 4*sizeof(eattr
));
1017 rta
*rta
= e
->attrs
;
1022 ea
->flags
= EALF_SORTED
;
1025 bgp_set_attr(ea
->attrs
, BA_ORIGIN
,
1026 ((rta
->source
== RTS_OSPF_EXT1
) || (rta
->source
== RTS_OSPF_EXT2
)) ? ORIGIN_INCOMPLETE
: ORIGIN_IGP
);
1029 bgp_set_attr_wa(ea
->attrs
+1, pool
, BA_AS_PATH
, 0);
1032 z
= bgp_set_attr_wa(ea
->attrs
+1, pool
, BA_AS_PATH
, 6);
1033 z
[0] = AS_PATH_SEQUENCE
;
1034 z
[1] = 1; /* 1 AS */
1035 put_u32(z
+2, p
->local_as
);
1038 /* iBGP -> use gw, eBGP multi-hop -> use source_addr,
1039 eBGP single-hop -> use gw if on the same iface */
1040 z
= bgp_set_attr_wa(ea
->attrs
+2, pool
, BA_NEXT_HOP
, NEXT_HOP_LENGTH
);
1041 if (p
->cf
->next_hop_self
||
1042 rta
->dest
!= RTD_ROUTER
||
1043 ipa_equal(rta
->gw
, IPA_NONE
) ||
1044 ipa_is_link_local(rta
->gw
) ||
1045 (!p
->is_internal
&& !p
->cf
->next_hop_keep
&&
1046 (!p
->neigh
|| (rta
->iface
!= p
->neigh
->iface
))))
1047 set_next_hop(z
, p
->source_addr
);
1049 set_next_hop(z
, rta
->gw
);
1051 bgp_set_attr(ea
->attrs
+3, BA_LOCAL_PREF
, p
->cf
->default_local_pref
);
1053 return 0; /* Leave decision to the filters */
1058 bgp_as_path_loopy(struct bgp_proto
*p
, rta
*a
)
1060 int num
= p
->cf
->allow_local_as
+ 1;
1061 eattr
*e
= ea_find(a
->eattrs
, EA_CODE(EAP_BGP
, BA_AS_PATH
));
1062 return (e
&& (num
> 0) && as_path_contains(e
->u
.ptr
, p
->local_as
, num
));
1066 bgp_originator_id_loopy(struct bgp_proto
*p
, rta
*a
)
1068 eattr
*e
= ea_find(a
->eattrs
, EA_CODE(EAP_BGP
, BA_ORIGINATOR_ID
));
1069 return (e
&& (e
->u
.data
== p
->local_id
));
1073 bgp_cluster_list_loopy(struct bgp_proto
*p
, rta
*a
)
1075 eattr
*e
= ea_find(a
->eattrs
, EA_CODE(EAP_BGP
, BA_CLUSTER_LIST
));
1076 return (e
&& p
->rr_client
&& int_set_contains(e
->u
.ptr
, p
->rr_cluster_id
));
1081 bgp_path_prepend(rte
*e
, ea_list
**attrs
, struct linpool
*pool
, u32 as
)
1083 eattr
*a
= ea_find(e
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_AS_PATH
));
1084 bgp_attach_attr(attrs
, pool
, BA_AS_PATH
, (uintptr_t) as_path_prepend(pool
, a
->u
.ptr
, as
));
1088 bgp_cluster_list_prepend(rte
*e
, ea_list
**attrs
, struct linpool
*pool
, u32 cid
)
1090 eattr
*a
= ea_find(e
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_CLUSTER_LIST
));
1091 bgp_attach_attr(attrs
, pool
, BA_CLUSTER_LIST
, (uintptr_t) int_set_prepend(pool
, a
? a
->u
.ptr
: NULL
, cid
));
1095 bgp_update_attrs(struct bgp_proto
*p
, rte
*e
, ea_list
**attrs
, struct linpool
*pool
, int rr
)
1099 if (!p
->is_internal
&& !p
->rs_client
)
1101 bgp_path_prepend(e
, attrs
, pool
, p
->local_as
);
1103 /* The MULTI_EXIT_DISC attribute received from a neighboring AS MUST NOT be
1104 * propagated to other neighboring ASes.
1105 * Perhaps it would be better to undefine it.
1107 a
= ea_find(e
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_MULTI_EXIT_DISC
));
1109 bgp_attach_attr(attrs
, pool
, BA_MULTI_EXIT_DISC
, 0);
1112 /* iBGP -> keep next_hop, eBGP multi-hop -> use source_addr,
1113 * eBGP single-hop -> keep next_hop if on the same iface.
1114 * If the next_hop is zero (i.e. link-local), keep only if on the same iface.
1116 * Note that same-iface-check uses iface from route, which is based on gw.
1118 a
= ea_find(e
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_NEXT_HOP
));
1119 if (a
&& !p
->cf
->next_hop_self
&&
1120 (p
->cf
->next_hop_keep
||
1121 (p
->is_internal
&& ipa_nonzero(*((ip_addr
*) a
->u
.ptr
->data
))) ||
1122 (p
->neigh
&& (e
->attrs
->iface
== p
->neigh
->iface
))))
1124 /* Leave the original next hop attribute; we will check later where it points */
1128 /* Need to create new one */
1129 byte
*b
= bgp_attach_attr_wa(attrs
, pool
, BA_NEXT_HOP
, NEXT_HOP_LENGTH
);
1130 set_next_hop(b
, p
->source_addr
);
1135 /* Handling route reflection, RFC 4456 */
1136 struct bgp_proto
*src
= (struct bgp_proto
*) e
->attrs
->src
->proto
;
1138 a
= ea_find(e
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_ORIGINATOR_ID
));
1140 bgp_attach_attr(attrs
, pool
, BA_ORIGINATOR_ID
, src
->remote_id
);
1142 /* We attach proper cluster ID according to whether the route is entering or leaving the cluster */
1143 bgp_cluster_list_prepend(e
, attrs
, pool
, src
->rr_client
? src
->rr_cluster_id
: p
->rr_cluster_id
);
1145 /* Two RR clients with different cluster ID, hmmm */
1146 if (src
->rr_client
&& p
->rr_client
&& (src
->rr_cluster_id
!= p
->rr_cluster_id
))
1147 bgp_cluster_list_prepend(e
, attrs
, pool
, p
->rr_cluster_id
);
1150 return 0; /* Leave decision to the filters */
1154 bgp_community_filter(struct bgp_proto
*p
, rte
*e
)
1159 /* Check if we aren't forbidden to export the route by communities */
1160 a
= ea_find(e
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_COMMUNITY
));
1164 if (int_set_contains(d
, BGP_COMM_NO_ADVERTISE
))
1166 DBG("\tNO_ADVERTISE\n");
1169 if (!p
->is_internal
&&
1170 (int_set_contains(d
, BGP_COMM_NO_EXPORT
) ||
1171 int_set_contains(d
, BGP_COMM_NO_EXPORT_SUBCONFED
)))
1173 DBG("\tNO_EXPORT\n");
1182 bgp_import_control(struct proto
*P
, rte
**new, ea_list
**attrs
, struct linpool
*pool
)
1185 struct bgp_proto
*p
= (struct bgp_proto
*) P
;
1186 struct bgp_proto
*new_bgp
= (e
->attrs
->src
->proto
->proto
== &proto_bgp
) ?
1187 (struct bgp_proto
*) e
->attrs
->src
->proto
: NULL
;
1189 if (p
== new_bgp
) /* Poison reverse updates */
1193 /* We should check here for cluster list loop, because the receiving BGP instance
1194 might have different cluster ID */
1195 if (bgp_cluster_list_loopy(p
, e
->attrs
))
1198 if (p
->cf
->interpret_communities
&& bgp_community_filter(p
, e
))
1201 if (p
->local_as
== new_bgp
->local_as
&& p
->is_internal
&& new_bgp
->is_internal
)
1203 /* Redistribution of internal routes with IBGP */
1204 if (p
->rr_client
|| new_bgp
->rr_client
)
1205 /* Route reflection, RFC 4456 */
1206 return bgp_update_attrs(p
, e
, attrs
, pool
, 1);
1211 return bgp_update_attrs(p
, e
, attrs
, pool
, 0);
1214 return bgp_create_attrs(p
, e
, attrs
, pool
);
1218 bgp_get_neighbor(rte
*r
)
1220 eattr
*e
= ea_find(r
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_AS_PATH
));
1223 if (e
&& as_path_get_first(e
->u
.ptr
, &as
))
1226 return ((struct bgp_proto
*) r
->attrs
->src
->proto
)->remote_as
;
1230 rte_resolvable(rte
*rt
)
1232 int rd
= rt
->attrs
->dest
;
1233 return (rd
== RTD_ROUTER
) || (rd
== RTD_DEVICE
) || (rd
== RTD_MULTIPATH
);
1237 bgp_rte_better(rte
*new, rte
*old
)
1239 struct bgp_proto
*new_bgp
= (struct bgp_proto
*) new->attrs
->src
->proto
;
1240 struct bgp_proto
*old_bgp
= (struct bgp_proto
*) old
->attrs
->src
->proto
;
1244 /* Skip suppressed routes (see bgp_rte_recalculate()) */
1245 n
= new->u
.bgp
.suppressed
;
1246 o
= old
->u
.bgp
.suppressed
;
1252 /* RFC 4271 9.1.2.1. Route resolvability test */
1253 n
= rte_resolvable(new);
1254 o
= rte_resolvable(old
);
1260 /* Start with local preferences */
1261 x
= ea_find(new->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_LOCAL_PREF
));
1262 y
= ea_find(old
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_LOCAL_PREF
));
1263 n
= x
? x
->u
.data
: new_bgp
->cf
->default_local_pref
;
1264 o
= y
? y
->u
.data
: old_bgp
->cf
->default_local_pref
;
1270 /* RFC 4271 9.1.2.2. a) Use AS path lengths */
1271 if (new_bgp
->cf
->compare_path_lengths
|| old_bgp
->cf
->compare_path_lengths
)
1273 x
= ea_find(new->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_AS_PATH
));
1274 y
= ea_find(old
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_AS_PATH
));
1275 n
= x
? as_path_getlen(x
->u
.ptr
) : AS_PATH_MAXLEN
;
1276 o
= y
? as_path_getlen(y
->u
.ptr
) : AS_PATH_MAXLEN
;
1283 /* RFC 4271 9.1.2.2. b) Use origins */
1284 x
= ea_find(new->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_ORIGIN
));
1285 y
= ea_find(old
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_ORIGIN
));
1286 n
= x
? x
->u
.data
: ORIGIN_INCOMPLETE
;
1287 o
= y
? y
->u
.data
: ORIGIN_INCOMPLETE
;
1293 /* RFC 4271 9.1.2.2. c) Compare MED's */
1294 /* Proper RFC 4271 path selection cannot be interpreted as finding
1295 * the best path in some ordering. It is implemented partially in
1296 * bgp_rte_recalculate() when deterministic_med option is
1297 * active. Without that option, the behavior is just an
1298 * approximation, which in specific situations may lead to
1299 * persistent routing loops, because it is nondeterministic - it
1300 * depends on the order in which routes appeared. But it is also the
1301 * same behavior as used by default in Cisco routers, so it is
1302 * probably not a big issue.
1304 if (new_bgp
->cf
->med_metric
|| old_bgp
->cf
->med_metric
||
1305 (bgp_get_neighbor(new) == bgp_get_neighbor(old
)))
1307 x
= ea_find(new->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_MULTI_EXIT_DISC
));
1308 y
= ea_find(old
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_MULTI_EXIT_DISC
));
1309 n
= x
? x
->u
.data
: new_bgp
->cf
->default_med
;
1310 o
= y
? y
->u
.data
: old_bgp
->cf
->default_med
;
1317 /* RFC 4271 9.1.2.2. d) Prefer external peers */
1318 if (new_bgp
->is_internal
> old_bgp
->is_internal
)
1320 if (new_bgp
->is_internal
< old_bgp
->is_internal
)
1323 /* RFC 4271 9.1.2.2. e) Compare IGP metrics */
1324 n
= new_bgp
->cf
->igp_metric
? new->attrs
->igp_metric
: 0;
1325 o
= old_bgp
->cf
->igp_metric
? old
->attrs
->igp_metric
: 0;
1331 /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */
1332 /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighbor ID */
1333 x
= ea_find(new->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_ORIGINATOR_ID
));
1334 y
= ea_find(old
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_ORIGINATOR_ID
));
1335 n
= x
? x
->u
.data
: new_bgp
->remote_id
;
1336 o
= y
? y
->u
.data
: old_bgp
->remote_id
;
1338 /* RFC 5004 - prefer older routes */
1339 /* (if both are external and from different peer) */
1340 if ((new_bgp
->cf
->prefer_older
|| old_bgp
->cf
->prefer_older
) &&
1341 !new_bgp
->is_internal
&& n
!= o
)
1344 /* rest of RFC 4271 9.1.2.2. f) */
1350 /* RFC 4456 9. b) Compare cluster list lengths */
1351 x
= ea_find(new->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_CLUSTER_LIST
));
1352 y
= ea_find(old
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_CLUSTER_LIST
));
1353 n
= x
? int_set_get_size(x
->u
.ptr
) : 0;
1354 o
= y
? int_set_get_size(y
->u
.ptr
) : 0;
1360 /* RFC 4271 9.1.2.2. g) Compare peer IP addresses */
1361 return (ipa_compare(new_bgp
->cf
->remote_ip
, old_bgp
->cf
->remote_ip
) < 0);
// bgp_rte_mergable - compare a primary route (pri) with a secondary route (sec)
// step by step: suppressed flag, resolvability of sec, local preference,
// AS path length, origin, MED, internal/external peer and IGP metric.
// The closing comment states that the remaining criteria are ignored.
// For each step the two values being compared are loaded into p (from pri)
// and s (from sec).
// NOTE(review): the statements executed after each test are not visible in
// this listing; presumably any mismatch means "not mergable" - confirm.
1366 bgp_rte_mergable(rte
*pri
, rte
*sec
)
// Both routes are treated as BGP routes: their source protocol is cast to
// struct bgp_proto without a check here (assumes callers guarantee this).
1368 struct bgp_proto
*pri_bgp
= (struct bgp_proto
*) pri
->attrs
->src
->proto
;
1369 struct bgp_proto
*sec_bgp
= (struct bgp_proto
*) sec
->attrs
->src
->proto
;
1373 /* Skip suppressed routes (see bgp_rte_recalculate()) */
1374 if (pri
->u
.bgp
.suppressed
!= sec
->u
.bgp
.suppressed
)
1377 /* RFC 4271 9.1.2.1. Route resolvability test */
1378 if (!rte_resolvable(sec
))
1381 /* Start with local preferences */
1382 x
= ea_find(pri
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_LOCAL_PREF
));
1383 y
= ea_find(sec
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_LOCAL_PREF
));
// A route without LOCAL_PREF uses its own protocol's default_local_pref.
1384 p
= x
? x
->u
.data
: pri_bgp
->cf
->default_local_pref
;
1385 s
= y
? y
->u
.data
: sec_bgp
->cf
->default_local_pref
;
1389 /* RFC 4271 9.1.2.2. a) Use AS path lengths */
// Path lengths are looked at when either protocol has compare_path_lengths set.
1390 if (pri_bgp
->cf
->compare_path_lengths
|| sec_bgp
->cf
->compare_path_lengths
)
1392 x
= ea_find(pri
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_AS_PATH
));
1393 y
= ea_find(sec
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_AS_PATH
));
// A missing AS_PATH is counted as AS_PATH_MAXLEN.
1394 p
= x
? as_path_getlen(x
->u
.ptr
) : AS_PATH_MAXLEN
;
1395 s
= y
? as_path_getlen(y
->u
.ptr
) : AS_PATH_MAXLEN
;
1400 // if (DELTA(p, s) > pri_bgp->cf->relax_multipath)
1404 /* RFC 4271 9.1.2.2. b) Use origins */
1405 x
= ea_find(pri
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_ORIGIN
));
1406 y
= ea_find(sec
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_ORIGIN
));
// A missing ORIGIN is counted as ORIGIN_INCOMPLETE.
1407 p
= x
? x
->u
.data
: ORIGIN_INCOMPLETE
;
1408 s
= y
? y
->u
.data
: ORIGIN_INCOMPLETE
;
1412 /* RFC 4271 9.1.2.2. c) Compare MED's */
// MED is looked at when either protocol has med_metric set, or when both
// routes yield the same bgp_get_neighbor() value.
1413 if (pri_bgp
->cf
->med_metric
|| sec_bgp
->cf
->med_metric
||
1414 (bgp_get_neighbor(pri
) == bgp_get_neighbor(sec
)))
1416 x
= ea_find(pri
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_MULTI_EXIT_DISC
));
1417 y
= ea_find(sec
->attrs
->eattrs
, EA_CODE(EAP_BGP
, BA_MULTI_EXIT_DISC
));
// A missing MED falls back to the protocol's configured default_med.
1418 p
= x
? x
->u
.data
: pri_bgp
->cf
->default_med
;
1419 s
= y
? y
->u
.data
: sec_bgp
->cf
->default_med
;
1424 /* RFC 4271 9.1.2.2. d) Prefer external peers */
1425 if (pri_bgp
->is_internal
!= sec_bgp
->is_internal
)
1428 /* RFC 4271 9.1.2.2. e) Compare IGP metrics */
// The IGP metric only counts for a protocol with igp_metric enabled; else 0.
1429 p
= pri_bgp
->cf
->igp_metric
? pri
->attrs
->igp_metric
: 0;
1430 s
= sec_bgp
->cf
->igp_metric
? sec
->attrs
->igp_metric
: 0;
1434 /* Remaining criteria are ignored */
// same_group - true iff route r has preference lpref and bgp_get_neighbor(r)
// equals lasn, i.e. r belongs to the (preference, neighbor) group identified
// by those two keys.
1442 same_group(rte
*r
, u32 lpref
, u32 lasn
)
1444 return (r
->pref
== lpref
) && (bgp_get_neighbor(r
) == lasn
);
// use_deterministic_med - true iff the source protocol of route r is a BGP
// instance (its proto field points to proto_bgp) and that instance's
// configuration has deterministic_med set.
1448 use_deterministic_med(rte
*r
)
1450 struct proto
*P
= r
->attrs
->src
->proto
;
// The cast to struct bgp_proto is only evaluated when the proto_bgp test
// succeeded, because && short-circuits.
1451 return (P
->proto
== &proto_bgp
) && ((struct bgp_proto
*) P
)->cf
->deterministic_med
;
// bgp_rte_recalculate - lower level (per-group) route selection used when
// deterministic MED is enabled; the long comment below gives the design.
//   table, net - passed through to the recursive calls; net->routes is walked
//   new, old   - route being added and route being replaced or removed;
//                the comment below states at least one of them is non-NULL
//   old_best   - compared against old and tested with same_group()
// Routes of the affected group get u.bgp.suppressed set to 1 and the chosen
// best-in-group route gets it cleared. The visible return expressions are
// (old == old_best), old_best && same_group(old_best, lpref, lasn) and
// old_is_group_best; the comment below describes returning 1 as forcing
// recalculation.
1455 bgp_rte_recalculate(rtable
*table
, net
*net
, rte
*new, rte
*old
, rte
*old_best
)
// The group key (preference, neighbor) is taken from new if present, else old.
1458 rte
*key
= new ? new : old
;
1459 u32 lpref
= key
->pref
;
1460 u32 lasn
= bgp_get_neighbor(key
);
1461 int old_is_group_best
= 0;
1464 * Proper RFC 4271 path selection is a bit complicated, it cannot be
1465 * implemented just by rte_better(), because it is not a linear
1466 * ordering. But it can be splitted to two levels, where the lower
1467 * level chooses the best routes in each group of routes from the
1468 * same neighboring AS and higher level chooses the best route (with
1469 * a slightly different ordering) between the best-in-group routes.
1471 * When deterministic_med is disabled, we just ignore this issue and
1472 * choose the best route by bgp_rte_better() alone. If enabled, the
1473 * lower level of the route selection is done here (for the group
1474 * to which the changed route belongs), all routes in group are
1475 * marked as suppressed, just chosen best-in-group is not.
1477 * Global best route selection then implements higher level by
1478 * choosing between non-suppressed routes (as they are always
1479 * preferred over suppressed routes). Routes from BGP protocols
1480 * that do not set deterministic_med are just never suppressed. As
1481 * they do not participate in the lower level selection, it is OK
1482 * that this fn is not called for them.
1484 * The idea is simple, the implementation is more problematic,
1485 * mostly because of optimizations in rte_recalculate() that
1486 * avoids full recalculation in most cases.
1488 * We can assume that at least one of new, old is non-NULL and both
1489 * are from the same protocol with enabled deterministic_med. We
1490 * group routes by both neighbor AS (lasn) and preference (lpref),
1491 * because bgp_rte_better() does not handle preference itself.
1494 /* If new and old are from different groups, we just process that
1495 as two independent events */
1496 if (new && old
&& !same_group(old
, lpref
, lasn
))
// One recursive call handles the removal of old, the other the addition of new.
// NOTE(review): how i1 and i2 are combined into the result is not visible
// in this listing - confirm.
1499 i1
= bgp_rte_recalculate(table
, net
, NULL
, old
, old_best
);
1500 i2
= bgp_rte_recalculate(table
, net
, new, NULL
, old_best
);
1505 * We could find the best-in-group and then make some shortcuts like
1506 * in rte_recalculate, but as we would have to walk through all
1507 * net->routes just to find it, it is probably not worth. So we
1508 * just have two simpler fast cases that use just the old route.
1509 * We also set suppressed flag to avoid using it in bgp_rte_better().
1513 new->u
.bgp
.suppressed
= 1;
// old counted as best-in-group iff it was not suppressed on entry; the flag
// is read before being overwritten on the next statement.
1517 old_is_group_best
= !old
->u
.bgp
.suppressed
;
1518 old
->u
.bgp
.suppressed
= 1;
1519 int new_is_better
= new && bgp_rte_better(new, old
);
1521 /* The first case - replace not best with worse (or remove not best) */
1522 if (!old_is_group_best
&& !new_is_better
)
1525 /* The second case - replace the best with better */
1526 if (old_is_group_best
&& new_is_better
)
1528 /* new is best-in-group, then see discussion below - this is
1529 a special variant of NBG && OBG. From OBG we can deduce
1530 that same_group(old_best) iff (old == old_best) */
1531 new->u
.bgp
.suppressed
= 0;
1532 return (old
== old_best
);
1536 /* The default case - find a new best-in-group route */
1537 r
= new; /* new may not be in the list */
// Walk the valid routes of the net: every member of the group is marked
// suppressed again, and r is compared against each member via bgp_rte_better().
1538 for (s
=net
->routes
; rte_is_valid(s
); s
=s
->next
)
1539 if (use_deterministic_med(s
) && same_group(s
, lpref
, lasn
))
1541 s
->u
.bgp
.suppressed
= 1;
1542 if (!r
|| bgp_rte_better(s
, r
))
1546 /* Simple case - the last route in group disappears */
1550 /* Found best-in-group */
1551 r
->u
.bgp
.suppressed
= 0;
1554 * There are generally two reasons why we have to force
1555 * recalculation (return 1): First, the new route may be wrongfully
1556 * chosen to be the best in the first case check in
1557 * rte_recalculate(), this may happen only if old_best is from the
1558 * same group. Second, another (different than new route)
1559 * best-in-group is chosen and that may be the proper best (although
1560 * rte_recalculate() without ignore that possibility).
1562 * There are three possible cases according to whether the old route
1563 * was the best in group (OBG, stored in old_is_group_best) and
1564 * whether the new route is the best in group (NBG, tested by r == new).
1565 * These cases work even if old or new is NULL.
1567 * NBG -> new is a possible candidate for the best route, so we just
1568 * check for the first reason using same_group().
1570 * !NBG && OBG -> Second reason applies, return 1
1572 * !NBG && !OBG -> Best in group does not change, old != old_best,
1573 * rte_better(new, old_best) is false and therefore
1574 * the first reason does not apply, return 0
1578 return old_best
&& same_group(old_best
, lpref
, lasn
);
1580 return old_is_group_best
;
// bgp_aggregator_convert_to_new - allocate a new adata with 8 bytes of payload
// from pool and let aggregator_convert_to_new() fill it from old.
// NOTE(review): 8 is presumably the size of a 4B-AS AGGREGATOR (the same
// length as4_aggregator_valid() requires); setting the length field and the
// return of newa are not visible in this listing - confirm.
1583 static struct adata
*
1584 bgp_aggregator_convert_to_new(struct adata
*old
, struct linpool
*pool
)
1586 struct adata
*newa
= lp_alloc(pool
, sizeof(struct adata
) + 8);
1588 aggregator_convert_to_new(old
, newa
->data
);
// bgp_merge_as_paths - build a new AS path adata in pool.
//   old2   - path in 2B format; dereferenced unconditionally, must not be NULL
//   old4   - path in 4B format to append; may be NULL (nothing is appended)
//   req_as - passed to as_path_convert_to_new() as the number of ASNs wanted
//   pool   - linpool the result is allocated from
// NOTE(review): setting the result's length field and the return statement
// are not visible in this listing - confirm.
1593 /* Take last req_as ASNs from path old2 (in 2B format), convert to 4B format
1594 * and append path old4 (in 4B format).
1596 static struct adata
*
1597 bgp_merge_as_paths(struct adata
*old2
, struct adata
*old4
, int req_as
, struct linpool
*pool
)
// Scratch buffer on the stack, twice the byte length of old2; presumably
// sufficient because a 2-byte ASN grows to 4 bytes - confirm against
// as_path_convert_to_new().
1599 byte buf
[old2
->length
* 2];
// ol is the converted length, nl the total length including old4 if present.
1601 int ol
= as_path_convert_to_new(old2
, buf
, req_as
);
1602 int nl
= ol
+ (old4
? old4
->length
: 0);
1604 struct adata
*newa
= lp_alloc(pool
, sizeof(struct adata
) + nl
);
1606 memcpy(newa
->data
, buf
, ol
);
1607 if (old4
) memcpy(newa
->data
+ ol
, old4
->data
, old4
->length
);
// as4_aggregator_valid - accept an AS4_AGGREGATOR payload only when it is
// exactly 8 bytes long; no other property of aggr is inspected.
1613 as4_aggregator_valid(struct adata
*aggr
)
1615 return aggr
->length
== 8;
// bgp_reconstruct_4b_atts - combine AS_PATH / AGGREGATOR with AS4_PATH /
// AS4_AGGREGATOR found in a->eattrs and rewrite the AS_PATH and AGGREGATOR
// attribute pointers in place with 4B-format data allocated from pool.
//   p    - BGP instance; used for log message names and validate_as4_path()
//   a    - route attributes whose extended attributes are looked up and updated
//   pool - linpool for the converted attribute data
// NOTE(review): several branch headers and assignments (for example where
// a4_removed is set) are not visible in this listing.
1619 /* Reconstruct 4B AS_PATH and AGGREGATOR according to RFC 4893 4.2.3 */
1621 bgp_reconstruct_4b_atts(struct bgp_proto
*p
, rta
*a
, struct linpool
*pool
)
// p2/p4 are AS_PATH/AS4_PATH, a2/a4 are AGGREGATOR/AS4_AGGREGATOR; each may be
// NULL when the attribute is absent (p4 and a4 are NULL-tested below).
1623 eattr
*p2
=ea_find(a
->eattrs
, EA_CODE(EAP_BGP
, BA_AS_PATH
));
1624 eattr
*p4
=ea_find(a
->eattrs
, EA_CODE(EAP_BGP
, BA_AS4_PATH
));
1625 eattr
*a2
=ea_find(a
->eattrs
, EA_CODE(EAP_BGP
, BA_AGGREGATOR
));
1626 eattr
*a4
=ea_find(a
->eattrs
, EA_CODE(EAP_BGP
, BA_AS4_AGGREGATOR
));
// An AS4_AGGREGATOR of the wrong size is only logged, not treated as an error.
1629 if (a4
&& !as4_aggregator_valid(a4
->u
.ptr
))
1631 log(L_WARN
"%s: AS4_AGGREGATOR attribute is invalid, skipping attribute", p
->p
.name
);
// The first 16 bits of the 2B AGGREGATOR data are read as its AS number.
1638 u32 a2_as
= get_u16(a2
->u
.ptr
->data
);
1642 if (a2_as
!= AS_TRANS
)
1644 /* Routes were aggregated by old router and therefore AS4_PATH
1645 * and AS4_AGGREGATOR is invalid
1647 * Convert AS_PATH and AGGREGATOR to 4B format and finish.
1650 a2
->u
.ptr
= bgp_aggregator_convert_to_new(a2
->u
.ptr
, pool
);
1651 p2
->u
.ptr
= bgp_merge_as_paths(p2
->u
.ptr
, NULL
, AS_PATH_MAXLEN
, pool
);
1657 /* Common case, use AS4_AGGREGATOR attribute */
1658 a2
->u
.ptr
= a4
->u
.ptr
;
1663 /* Common case, use old AGGREGATOR attribute */
1664 a2
->u
.ptr
= bgp_aggregator_convert_to_new(a2
->u
.ptr
, pool
);
1666 if ((a2_as
== AS_TRANS
) && !a4_removed
)
1667 log(L_WARN
"%s: AGGREGATOR attribute contain AS_TRANS, but AS4_AGGREGATOR is missing", p
->p
.name
);
1672 log(L_WARN
"%s: AS4_AGGREGATOR attribute received, but AGGREGATOR attribute is missing", p
->p
.name
);
// p4_len is -1 when AS4_PATH is absent, otherwise the validate_as4_path()
// result; a negative result with AS4_PATH present is logged as malformed.
1674 int p2_len
= as_path_getlen_int(p2
->u
.ptr
, 2);
1675 int p4_len
= p4
? validate_as4_path(p
, p4
->u
.ptr
) : -1;
1677 if (p4
&& (p4_len
< 0))
1678 log(L_WARN
"%s: AS4_PATH attribute is malformed, skipping attribute", p
->p
.name
);
// Without a usable AS4_PATH (p4_len <= 0), or when AS4_PATH is longer than
// AS_PATH, AS_PATH alone is converted; the other call below passes
// p2_len - p4_len as req_as and appends AS4_PATH.
1680 if ((p4_len
<= 0) || (p2_len
< p4_len
))
1681 p2
->u
.ptr
= bgp_merge_as_paths(p2
->u
.ptr
, NULL
, AS_PATH_MAXLEN
, pool
);
1683 p2
->u
.ptr
= bgp_merge_as_paths(p2
->u
.ptr
, p4
->u
.ptr
, p2_len
- p4_len
, pool
);
// bgp_remove_as4_attrs - walk the chain of ea_lists starting at a->eattrs and
// test the first attribute of each list against the AS4_PATH and
// AS4_AGGREGATOR codes; a warning naming the protocol is logged for
// unexpected AS4_* attributes.
// NOTE(review): the loop header and the statement that unlinks a matching
// list are not visible in this listing; the function name suggests matching
// lists are removed from the chain - confirm.
1687 bgp_remove_as4_attrs(struct bgp_proto
*p
, rta
*a
)
1689 unsigned id1
= EA_CODE(EAP_BGP
, BA_AS4_PATH
);
1690 unsigned id2
= EA_CODE(EAP_BGP
, BA_AS4_AGGREGATOR
);
// el points at the link that leads to the current list, not at the list itself.
1691 ea_list
**el
= &(a
->eattrs
);
1693 /* We know that ea_lists constructed in bgp_decode attrs have one attribute per ea_list struct */
1696 unsigned fid
= (*el
)->attrs
[0].id
;
1698 if ((fid
== id1
) || (fid
== id2
))
1702 log(L_WARN
"%s: Unexpected AS4_* attributes received", p
->p
.name
);
// Advance to the next list in the chain.
1705 el
= &((*el
)->next
);
// Reviewer summary of the code visible below:
//  - a fresh rta is allocated from pool, zeroed and given fixed source, scope
//    and cast values; its from address is the configured remote_ip
//  - seen[] is used as a bitmap indexed by attribute code (byte code/8,
//    bit code%8)
//  - known attributes are checked against bgp_attr_table (expected length,
//    expected OPTIONAL/TRANSITIVE flags, per-attribute validate hook)
//  - every accepted attribute is stored in its own one-element ea_list that
//    is linked in front of a->eattrs
//  - after parsing: mandatory attribute check, 4B AS reconstruction or AS4_*
//    handling, loop checks, and a default LOCAL_PREF
// NOTE(review): many control-flow lines (loop headers, else branches, returns,
// labels) are not visible in this listing, so the exact result on each path
// must be confirmed.
1710 * bgp_decode_attrs - check and decode BGP attributes
1712 * @attr: start of attribute block
1713 * @len: length of attribute block
1714 * @pool: linear pool to make all the allocations in
1715 * @mandatory: 1 iff presence of mandatory attributes has to be checked
1717 * This function takes a BGP attribute block (a part of an Update message), checks
1718 * its consistency and converts it to a list of BIRD route attributes represented
1722 bgp_decode_attrs(struct bgp_conn
*conn
, byte
*attr
, uint len
, struct linpool
*pool
, int mandatory
)
1724 struct bgp_proto
*bgp
= conn
->bgp
;
1725 rta
*a
= lp_alloc(pool
, sizeof(struct rta
));
1726 uint flags
, code
, l
, i
, type
;
1728 byte
*z
, *attr_start
;
1734 bzero(a
, sizeof(rta
));
1735 a
->source
= RTS_BGP
;
1736 a
->scope
= SCOPE_UNIVERSE
;
1737 a
->cast
= RTC_UNICAST
;
1738 /* a->dest = RTD_ROUTER; -- set in bgp_set_next_hop() */
1739 a
->from
= bgp
->cf
->remote_ip
;
1741 /* Parse the attributes */
1742 bzero(seen
, sizeof(seen
));
1743 DBG("BGP: Parsing attributes\n");
1752 if (flags
& BAF_EXT_LEN
)
1772 DBG("Attr %02x %02x %d\n", code
, flags
, l
);
// Test whether this attribute code was already recorded in the seen bitmap.
1773 if (seen
[code
/8] & (1 << (code
%8)))
1775 if (ATTR_KNOWN(code
))
1777 struct attr_desc
*desc
= &bgp_attr_table
[code
];
// A negative expected_length disables the fixed-length check. Error 5 and
// error 4 are presumably the RFC 4271 "Attribute Length Error" and
// "Attribute Flags Error" subcodes - confirm.
1778 if (desc
->expected_length
>= 0 && desc
->expected_length
!= (int) l
)
1779 { errcode
= 5; goto err
; }
1780 if ((desc
->expected_flags
^ flags
) & (BAF_OPTIONAL
| BAF_TRANSITIVE
))
1781 { errcode
= 4; goto err
; }
// Extra restrictions apply only on sessions that are not internal.
1782 if (!bgp
->is_internal
)
1784 if (!desc
->allow_in_ebgp
)
1786 if ((code
== BA_LOCAL_PREF
) && !bgp
->cf
->allow_local_pref
)
1791 errcode
= desc
->validate(bgp
, z
, l
);
1794 if (errcode
== IGNORE
)
1796 if (errcode
<= WITHDRAW
)
1798 log(L_WARN
"%s: Attribute %s is malformed, withdrawing update",
1799 bgp
->p
.name
, desc
->name
);
1803 else if (code
== BA_AS_PATH
)
1805 /* Special case as it might also trim the attribute */
// l is passed by address, so validate_as_path() may change the length.
1806 if (validate_as_path(bgp
, z
, &l
) < 0)
1807 { errcode
= 11; goto err
; }
1811 else /* Unknown attribute */
// An unknown attribute without the OPTIONAL flag is rejected with error 2;
// otherwise it is kept with the opaque type.
1813 if (!(flags
& BAF_OPTIONAL
))
1814 { errcode
= 2; goto err
; }
1815 type
= EAF_TYPE_OPAQUE
;
1818 // Only OPTIONAL and TRANSITIVE attributes may have non-zero PARTIAL flag
1819 // if (!((flags & BAF_OPTIONAL) && (flags & BAF_TRANSITIVE)) && (flags & BAF_PARTIAL))
1820 // { errcode = 4; goto err; }
// Record the code as seen, then store the attribute in a new one-element
// ea_list whose next pointer is the current head of a->eattrs.
1822 seen
[code
/8] |= (1 << (code
%8));
1823 ea
= lp_alloc(pool
, sizeof(ea_list
) + sizeof(eattr
));
1824 ea
->next
= a
->eattrs
;
1828 ea
->attrs
[0].id
= EA_CODE(EAP_BGP
, code
);
1829 ea
->attrs
[0].flags
= flags
;
1830 ea
->attrs
[0].type
= type
;
// Embedded types keep their value in u.data; other types get a copy of the
// raw bytes in a freshly allocated adata.
1831 if (type
& EAF_EMBEDDED
)
1835 ad
= lp_alloc(pool
, sizeof(struct adata
) + l
);
1836 ea
->attrs
[0].u
.ptr
= ad
;
1838 memcpy(ad
->data
, z
, l
);
1842 case EAF_TYPE_ROUTER_ID
:
1845 ea
->attrs
[0].u
.data
= *z
;
1847 ea
->attrs
[0].u
.data
= get_u32(z
);
1849 case EAF_TYPE_IP_ADDRESS
:
1850 ipa_ntoh(*(ip_addr
*)ad
->data
);
1852 case EAF_TYPE_INT_SET
:
1853 case EAF_TYPE_LC_SET
:
1854 case EAF_TYPE_EC_SET
:
// Note: this z is a new u32 pointer that shadows the outer byte pointer z.
1856 u32
*z
= (u32
*) ad
->data
;
1857 for(i
=0; i
<ad
->length
/4; i
++)
1868 /* If we received MP_REACH_NLRI we should check mandatory attributes */
1869 if (bgp
->mp_reach_len
!= 0)
1873 /* If there is no (reachability) NLRI, we should exit now */
1877 /* Check if all mandatory attributes are present */
1878 for(i
=0; i
< ARRAY_SIZE(bgp_mandatory_attrs
); i
++)
1880 code
= bgp_mandatory_attrs
[i
];
1881 if (!(seen
[code
/8] & (1 << (code
%8))))
// The missing attribute's code is sent as the one-byte error data.
1883 bgp_error(conn
, 3, 3, &bgp_mandatory_attrs
[i
], 1);
1888 /* When receiving attributes from non-AS4-aware BGP speaker,
1889 * we have to reconstruct 4B AS_PATH and AGGREGATOR attributes
1891 if (! bgp
->as4_session
)
1892 bgp_reconstruct_4b_atts(bgp
, a
, pool
);
1894 bgp_remove_as4_attrs(bgp
, a
);
1896 /* If the AS path attribute contains our AS, reject the routes */
1897 if (bgp_as_path_loopy(bgp
, a
))
1900 /* Two checks for IBGP loops caused by route reflection, RFC 4456 */
1901 if (bgp_originator_id_loopy(bgp
, a
) ||
1902 bgp_cluster_list_loopy(bgp
, a
))
1905 /* If there's no local preference, define one */
// Only seen[0] is tested here, which assumes BA_LOCAL_PREF is less than 8.
1906 if (!(seen
[0] & (1 << BA_LOCAL_PREF
)))
1907 bgp_attach_attr(&a
->eattrs
, pool
, BA_LOCAL_PREF
, bgp
->cf
->default_local_pref
);
// Error exits: the first reports subcode 1 with no data, the second reports
// errcode together with the bytes from attr_start up to z+l.
1915 bgp_error(conn
, 3, 1, NULL
, 0);
1919 bgp_error(conn
, 3, errcode
, attr_start
, z
+l
-attr_start
);
// bgp_get_attr - write a textual form of attribute a into buf.
// Visible behaviour: the attribute number i comes from EA_ID(a->id). One path
// prints the name from bgp_attr_table[i] and then calls the descriptor's
// format hook with buflen reduced by the name length and 2. The other path
// prints i as two hex digits, followed by " [t]" when BAF_TRANSITIVE is set.
// NOTE(review): the conditions selecting between these paths, any buf
// adjustment before the format call and the return values are not visible in
// this listing - confirm.
1924 bgp_get_attr(eattr
*a
, byte
*buf
, int buflen
)
1926 uint i
= EA_ID(a
->id
);
1927 struct attr_desc
*d
;
1932 d
= &bgp_attr_table
[i
];
1933 len
= bsprintf(buf
, "%s", d
->name
);
1939 d
->format(a
, buf
, buflen
- len
- 2);
1944 bsprintf(buf
, "%02x%s", i
, (a
->flags
& BAF_TRANSITIVE
) ? " [t]" : "");
// bgp_init_bucket_table - set up the bucket bookkeeping of BGP instance p:
// hash_limit becomes four times hash_size, bucket_hash is an array of
// hash_size bucket pointers from the protocol pool (allocated with mb_allocz,
// presumably zero-filled), bucket_queue is initialised as an empty list and
// withdraw_bucket is cleared.
// NOTE(review): where hash_size gets its initial value is not visible in
// this listing.
1949 bgp_init_bucket_table(struct bgp_proto
*p
)
1952 p
->hash_limit
= p
->hash_size
* 4;
1953 p
->bucket_hash
= mb_allocz(p
->p
.pool
, p
->hash_size
* sizeof(struct bgp_bucket
*));
1954 init_list(&p
->bucket_queue
);
1955 p
->withdraw_bucket
= NULL
;
1956 // fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix);
// bgp_free_bucket_table - tear down the bucket bookkeeping of BGP instance p:
// the bucket hash array is freed, every bucket on bucket_queue is unlinked
// through its send_node, and withdraw_bucket is freed. Both pointers are
// reset to NULL after freeing so stale values cannot be reused.
// NOTE(review): freeing of each dequeued bucket is not visible in this
// listing - confirm.
1960 bgp_free_bucket_table(struct bgp_proto
*p
)
1962 mb_free(p
->bucket_hash
);
1963 p
->bucket_hash
= NULL
;
1965 struct bgp_bucket
*b
;
// The loop always takes the first queue element, so removing the node inside
// the body is what makes it advance.
1966 WALK_LIST_FIRST(b
, p
->bucket_queue
)
1968 rem_node(&b
->send_node
);
1972 mb_free(p
->withdraw_bucket
);
1973 p
->withdraw_bucket
= NULL
;
1977 bgp_get_route_info(rte
*e
, byte
*buf
, ea_list
*attrs
)
1979 eattr
*p
= ea_find(attrs
, EA_CODE(EAP_BGP
, BA_AS_PATH
));
1980 eattr
*o
= ea_find(attrs
, EA_CODE(EAP_BGP
, BA_ORIGIN
));
1983 buf
+= bsprintf(buf
, " (%d", e
->pref
);
1985 if (e
->u
.bgp
.suppressed
)
1986 buf
+= bsprintf(buf
, "-");
1988 if (e
->attrs
->hostentry
)
1990 if (!rte_resolvable(e
))
1991 buf
+= bsprintf(buf
, "/-");
1992 else if (e
->attrs
->igp_metric
>= IGP_METRIC_UNKNOWN
)
1993 buf
+= bsprintf(buf
, "/?");
1995 buf
+= bsprintf(buf
, "/%d", e
->attrs
->igp_metric
);
1997 buf
+= bsprintf(buf
, ") [");
1999 if (p
&& as_path_get_last(p
->u
.ptr
, &origas
))
2000 buf
+= bsprintf(buf
, "AS%u", origas
);
2002 buf
+= bsprintf(buf
, "%c", "ie?"[o
->u
.data
]);