]>
git.ipfire.org Git - thirdparty/bird.git/blob - proto/bgp/packets.c
2 * BIRD -- BGP Packet Processing
4 * (c) 2000 Martin Mares <mj@ucw.cz>
5 * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6 * (c) 2008--2016 CZ.NIC z.s.p.o.
8 * Can be freely distributed and used under the terms of the GNU GPL.
15 #include "nest/bird.h"
16 #include "nest/iface.h"
17 #include "nest/protocol.h"
18 #include "nest/route.h"
19 #include "nest/attrs.h"
20 #include "nest/mrtdump.h"
21 #include "conf/conf.h"
22 #include "lib/unaligned.h"
23 #include "lib/flowspec.h"
24 #include "lib/socket.h"
31 #define BGP_RR_REQUEST 0
32 #define BGP_RR_BEGIN 1
36 static struct tbf rl_rcv_update
= TBF_DEFAULT_LOG_LIMITS
;
37 static struct tbf rl_snd_update
= TBF_DEFAULT_LOG_LIMITS
;
39 /* Table for state -> RFC 6608 FSM error subcodes */
40 static byte fsm_err_subcode
[BS_MAX
] = {
47 static struct bgp_channel
*
48 bgp_get_channel(struct bgp_proto
*p
, u32 afi
)
52 for (i
= 0; i
< p
->channel_count
; i
++)
53 if (p
->afi_map
[i
] == afi
)
54 return p
->channel_map
[i
];
60 put_af3(byte
*buf
, u32 id
)
62 put_u16(buf
, id
>> 16);
67 put_af4(byte
*buf
, u32 id
)
69 put_u16(buf
, id
>> 16);
77 return (get_u16(buf
) << 16) | buf
[2];
83 return (get_u16(buf
) << 16) | buf
[3];
87 * MRT Dump format is not semantically specified.
88 * We will use these values in appropriate fields:
90 * Local AS, Remote AS - configured AS numbers for given BGP instance.
91 * Local IP, Remote IP - IP addresses of the TCP connection (0 if no connection)
93 * We dump two kinds of MRT messages: STATE_CHANGE (for BGP state
94 * changes) and MESSAGE (for received BGP messages).
96 * STATE_CHANGE always uses the AS4 variant, but MESSAGE uses the AS4 variant
97 * only when an AS4 session is established, and even in that case MESSAGE
98 * does not use the AS4 variant for the initial OPEN message. This strange
99 * behavior is here for compatibility with Quagga and Bgpdump,
103 mrt_put_bgp4_hdr(byte
*buf
, struct bgp_conn
*conn
, int as4
)
105 struct bgp_proto
*p
= conn
->bgp
;
106 uint v4
= ipa_is_ip4(p
->cf
->remote_ip
);
110 put_u32(buf
+0, p
->remote_as
);
111 put_u32(buf
+4, p
->public_as
);
116 put_u16(buf
+0, (p
->remote_as
<= 0xFFFF) ? p
->remote_as
: AS_TRANS
);
117 put_u16(buf
+2, (p
->public_as
<= 0xFFFF) ? p
->public_as
: AS_TRANS
);
121 put_u16(buf
+0, (p
->neigh
&& p
->neigh
->iface
) ? p
->neigh
->iface
->index
: 0);
122 put_u16(buf
+2, v4
? BGP_AFI_IPV4
: BGP_AFI_IPV6
);
127 buf
= put_ip4(buf
, conn
->sk
? ipa_to_ip4(conn
->sk
->daddr
) : IP4_NONE
);
128 buf
= put_ip4(buf
, conn
->sk
? ipa_to_ip4(conn
->sk
->saddr
) : IP4_NONE
);
132 buf
= put_ip6(buf
, conn
->sk
? ipa_to_ip6(conn
->sk
->daddr
) : IP6_NONE
);
133 buf
= put_ip6(buf
, conn
->sk
? ipa_to_ip6(conn
->sk
->saddr
) : IP6_NONE
);
140 mrt_dump_bgp_packet(struct bgp_conn
*conn
, byte
*pkt
, uint len
)
142 byte
*buf
= alloca(128+len
); /* 128 is enough for MRT headers */
143 byte
*bp
= buf
+ MRTDUMP_HDR_LENGTH
;
144 int as4
= conn
->bgp
->as4_session
;
146 bp
= mrt_put_bgp4_hdr(bp
, conn
, as4
);
147 memcpy(bp
, pkt
, len
);
149 mrt_dump_message(&conn
->bgp
->p
, BGP4MP
, as4
? BGP4MP_MESSAGE_AS4
: BGP4MP_MESSAGE
,
154 convert_state(uint state
)
156 /* Convert state from our BS_* values to values used in MRTDump */
157 return (state
== BS_CLOSE
) ? 1 : state
+ 1;
161 mrt_dump_bgp_state_change(struct bgp_conn
*conn
, uint old
, uint
new)
164 byte
*bp
= buf
+ MRTDUMP_HDR_LENGTH
;
166 bp
= mrt_put_bgp4_hdr(bp
, conn
, 1);
167 put_u16(bp
+0, convert_state(old
));
168 put_u16(bp
+2, convert_state(new));
170 mrt_dump_message(&conn
->bgp
->p
, BGP4MP
, BGP4MP_STATE_CHANGE_AS4
, buf
, bp
-buf
);
174 bgp_create_notification(struct bgp_conn
*conn
, byte
*buf
)
176 struct bgp_proto
*p
= conn
->bgp
;
178 BGP_TRACE(D_PACKETS
, "Sending NOTIFICATION(code=%d.%d)", conn
->notify_code
, conn
->notify_subcode
);
179 buf
[0] = conn
->notify_code
;
180 buf
[1] = conn
->notify_subcode
;
181 memcpy(buf
+2, conn
->notify_data
, conn
->notify_size
);
182 return buf
+ 2 + conn
->notify_size
;
186 /* Capability negotiation as per RFC 5492 */
188 const struct bgp_af_caps
*
189 bgp_find_af_caps(struct bgp_caps
*caps
, u32 afi
)
191 struct bgp_af_caps
*ac
;
193 WALK_AF_CAPS(caps
, ac
)
200 static struct bgp_af_caps
*
201 bgp_get_af_caps(struct bgp_caps
*caps
, u32 afi
)
203 struct bgp_af_caps
*ac
;
205 WALK_AF_CAPS(caps
, ac
)
209 ac
= &caps
->af_data
[caps
->af_count
++];
210 memset(ac
, 0, sizeof(struct bgp_af_caps
));
217 bgp_af_caps_cmp(const void *X
, const void *Y
)
219 const struct bgp_af_caps
*x
= X
, *y
= Y
;
220 return (x
->afi
< y
->afi
) ? -1 : (x
->afi
> y
->afi
) ? 1 : 0;
225 bgp_write_capabilities(struct bgp_conn
*conn
, byte
*buf
)
227 struct bgp_proto
*p
= conn
->bgp
;
228 struct bgp_channel
*c
;
229 struct bgp_caps
*caps
;
230 struct bgp_af_caps
*ac
;
231 uint any_add_path
= 0;
234 /* Prepare bgp_caps structure */
236 int n
= list_length(&p
->p
.channels
);
237 caps
= mb_allocz(p
->p
.pool
, sizeof(struct bgp_caps
) + n
* sizeof(struct bgp_af_caps
));
238 conn
->local_caps
= caps
;
240 caps
->as4_support
= p
->cf
->enable_as4
;
241 caps
->ext_messages
= p
->cf
->enable_extended_messages
;
242 caps
->route_refresh
= p
->cf
->enable_refresh
;
243 caps
->enhanced_refresh
= p
->cf
->enable_refresh
;
245 if (caps
->as4_support
)
246 caps
->as4_number
= p
->public_as
;
251 caps
->gr_time
= p
->cf
->gr_time
;
252 caps
->gr_flags
= p
->p
.gr_recovery
? BGP_GRF_RESTART
: 0;
255 /* Allocate and fill per-AF fields */
256 WALK_LIST(c
, p
->p
.channels
)
258 ac
= &caps
->af_data
[caps
->af_count
++];
262 ac
->add_path
= c
->cf
->add_path
;
263 any_add_path
|= ac
->add_path
;
269 if (p
->p
.gr_recovery
)
270 ac
->gr_af_flags
|= BGP_GRF_FORWARDING
;
274 /* Sort capability fields by AFI/SAFI */
275 qsort(caps
->af_data
, caps
->af_count
, sizeof(struct bgp_af_caps
), bgp_af_caps_cmp
);
278 /* Create capability list in buffer */
281 * Note that max length is ~ 20+14*af_count. With max 6 channels that is
282 * 104. Option limit is 253 and buffer size is 4096, so we cannot overflow
283 * unless we add new capabilities or more AFs.
286 WALK_AF_CAPS(caps
, ac
)
289 *buf
++ = 1; /* Capability 1: Multiprotocol extensions */
290 *buf
++ = 4; /* Capability data length */
291 put_af4(buf
, ac
->afi
);
295 if (caps
->route_refresh
)
297 *buf
++ = 2; /* Capability 2: Support for route refresh */
298 *buf
++ = 0; /* Capability data length */
301 if (caps
->ext_messages
)
303 *buf
++ = 6; /* Capability 6: Support for extended messages */
304 *buf
++ = 0; /* Capability data length */
309 *buf
++ = 64; /* Capability 64: Support for graceful restart */
310 *buf
++ = 0; /* Capability data length, will be fixed later */
313 put_u16(buf
, caps
->gr_time
);
314 buf
[0] |= caps
->gr_flags
;
317 WALK_AF_CAPS(caps
, ac
)
320 put_af3(buf
, ac
->afi
);
321 buf
[3] = ac
->gr_af_flags
;
325 data
[-1] = buf
- data
;
328 if (caps
->as4_support
)
330 *buf
++ = 65; /* Capability 65: Support for 4-octet AS number */
331 *buf
++ = 4; /* Capability data length */
332 put_u32(buf
, p
->public_as
);
338 *buf
++ = 69; /* Capability 69: Support for ADD-PATH */
339 *buf
++ = 0; /* Capability data length, will be fixed later */
342 WALK_AF_CAPS(caps
, ac
)
345 put_af3(buf
, ac
->afi
);
346 buf
[3] = ac
->add_path
;
350 data
[-1] = buf
- data
;
353 if (caps
->enhanced_refresh
)
355 *buf
++ = 70; /* Capability 70: Support for enhanced route refresh */
356 *buf
++ = 0; /* Capability data length */
363 bgp_read_capabilities(struct bgp_conn
*conn
, struct bgp_caps
*caps
, byte
*pos
, int len
)
365 struct bgp_proto
*p
= conn
->bgp
;
366 struct bgp_af_caps
*ac
;
372 if (len
< 2 || len
< (2 + pos
[1]))
375 /* Capability length */
378 /* Capability type */
381 case 1: /* Multiprotocol capability, RFC 4760 */
386 ac
= bgp_get_af_caps(caps
, af
);
390 case 2: /* Route refresh capability, RFC 2918 */
394 caps
->route_refresh
= 1;
397 case 6: /* Extended message length capability, RFC draft */
401 caps
->ext_messages
= 1;
404 case 64: /* Graceful restart capability, RFC 4724 */
408 /* Only the last instance is valid */
409 WALK_AF_CAPS(caps
, ac
)
416 caps
->gr_flags
= pos
[2] & 0xf0;
417 caps
->gr_time
= get_u16(pos
+ 2) & 0x0fff;
419 for (i
= 2; i
< cl
; i
+= 4)
421 af
= get_af3(pos
+2+i
);
422 ac
= bgp_get_af_caps(caps
, af
);
424 ac
->gr_af_flags
= pos
[2+i
+3];
428 case 65: /* AS4 capability, RFC 4893 */
432 caps
->as4_support
= 1;
433 caps
->as4_number
= get_u32(pos
+ 2);
436 case 69: /* ADD-PATH capability, RFC 7911 */
440 for (i
= 0; i
< cl
; i
+= 4)
442 byte val
= pos
[2+i
+3];
443 if (!val
|| (val
> BGP_ADD_PATH_FULL
))
445 log(L_WARN
"%s: Got ADD-PATH capability with unknown value %u, ignoring",
451 for (i
= 0; i
< cl
; i
+= 4)
453 af
= get_af3(pos
+2+i
);
454 ac
= bgp_get_af_caps(caps
, af
);
455 ac
->add_path
= pos
[2+i
+3];
459 case 70: /* Enhanced route refresh capability, RFC 7313 */
463 caps
->enhanced_refresh
= 1;
466 /* We can safely ignore all other capabilities */
469 ADVANCE(pos
, len
, 2 + cl
);
474 bgp_error(conn
, 2, 0, NULL
, 0);
479 bgp_read_options(struct bgp_conn
*conn
, byte
*pos
, int len
)
481 struct bgp_proto
*p
= conn
->bgp
;
482 struct bgp_caps
*caps
;
485 /* Max number of announced AFIs is limited by max option length (255) */
486 caps
= alloca(sizeof(struct bgp_caps
) + 64 * sizeof(struct bgp_af_caps
));
487 memset(caps
, 0, sizeof(struct bgp_caps
));
491 if ((len
< 2) || (len
< (2 + pos
[1])))
492 { bgp_error(conn
, 2, 0, NULL
, 0); return -1; }
497 /* BGP capabilities, RFC 5492 */
498 if (p
->cf
->capabilities
)
499 bgp_read_capabilities(conn
, caps
, pos
+ 2, ol
);
504 bgp_error(conn
, 2, 4, pos
, ol
); /* FIXME: ol or ol+2 ? */
508 ADVANCE(pos
, len
, 2 + ol
);
511 uint n
= sizeof(struct bgp_caps
) + caps
->af_count
* sizeof(struct bgp_af_caps
);
512 conn
->remote_caps
= mb_allocz(p
->p
.pool
, n
);
513 memcpy(conn
->remote_caps
, caps
, n
);
519 bgp_create_open(struct bgp_conn
*conn
, byte
*buf
)
521 struct bgp_proto
*p
= conn
->bgp
;
523 BGP_TRACE(D_PACKETS
, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
524 BGP_VERSION
, p
->public_as
, p
->cf
->hold_time
, p
->local_id
);
526 buf
[0] = BGP_VERSION
;
527 put_u16(buf
+1, (p
->public_as
< 0xFFFF) ? p
->public_as
: AS_TRANS
);
528 put_u16(buf
+3, p
->cf
->hold_time
);
529 put_u32(buf
+5, p
->local_id
);
531 if (p
->cf
->capabilities
)
533 /* Prepare local_caps and write capabilities to buffer */
534 byte
*end
= bgp_write_capabilities(conn
, buf
+12);
535 uint len
= end
- (buf
+12);
537 buf
[9] = len
+ 2; /* Optional parameters length */
538 buf
[10] = 2; /* Option 2: Capability list */
539 buf
[11] = len
; /* Option data length */
545 /* Prepare empty local_caps */
546 conn
->local_caps
= mb_allocz(p
->p
.pool
, sizeof(struct bgp_caps
));
548 buf
[9] = 0; /* No optional parameters */
556 bgp_rx_open(struct bgp_conn
*conn
, byte
*pkt
, uint len
)
558 struct bgp_proto
*p
= conn
->bgp
;
559 struct bgp_conn
*other
;
563 if (conn
->state
!= BS_OPENSENT
)
564 { bgp_error(conn
, 5, fsm_err_subcode
[conn
->state
], NULL
, 0); return; }
566 /* Check message contents */
567 if (len
< 29 || len
!= 29 + (uint
) pkt
[28])
568 { bgp_error(conn
, 1, 2, pkt
+16, 2); return; }
570 if (pkt
[19] != BGP_VERSION
)
571 { u16 val
= BGP_VERSION
; bgp_error(conn
, 2, 1, (byte
*) &val
, 2); return; }
573 asn
= get_u16(pkt
+20);
574 hold
= get_u16(pkt
+22);
575 id
= get_u32(pkt
+24);
576 BGP_TRACE(D_PACKETS
, "Got OPEN(as=%d,hold=%d,id=%R)", asn
, hold
, id
);
578 if (bgp_read_options(conn
, pkt
+29, pkt
[28]) < 0)
581 if (hold
> 0 && hold
< 3)
582 { bgp_error(conn
, 2, 6, pkt
+22, 2); return; }
584 /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
585 if (!id
|| (p
->is_internal
&& id
== p
->local_id
))
586 { bgp_error(conn
, 2, 3, pkt
+24, -4); return; }
588 struct bgp_caps
*caps
= conn
->remote_caps
;
590 if (caps
->as4_support
)
592 u32 as4
= caps
->as4_number
;
594 if ((as4
!= asn
) && (asn
!= AS_TRANS
))
595 log(L_WARN
"%s: Peer advertised inconsistent AS numbers", p
->p
.name
);
597 if (as4
!= p
->remote_as
)
598 { as4
= htonl(as4
); bgp_error(conn
, 2, 2, (byte
*) &as4
, 4); return; }
602 if (asn
!= p
->remote_as
)
603 { bgp_error(conn
, 2, 2, pkt
+20, 2); return; }
606 /* Check the other connection */
607 other
= (conn
== &p
->outgoing_conn
) ? &p
->incoming_conn
: &p
->outgoing_conn
;
608 switch (other
->state
)
612 /* Stop outgoing connection attempts */
613 bgp_conn_enter_idle_state(other
);
623 * Description of collision detection rules in RFC 4271 is confusing and
624 * contradictory, but it is essentially:
626 * 1. Router with higher ID is dominant
627 * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
628 * 3. When both connections are in OpenConfirm state, one initiated by
629 * the dominant router is kept.
631 * The first line in the expression below evaluates whether the neighbor
632 * is dominant, the second line whether the new connection was initiated
633 * by the neighbor. If both are true (or both are false), we keep the new
634 * connection, otherwise we keep the old one.
636 if (((p
->local_id
< id
) || ((p
->local_id
== id
) && (p
->public_as
< p
->remote_as
)))
637 == (conn
== &p
->incoming_conn
))
639 /* Should close the other connection */
640 BGP_TRACE(D_EVENTS
, "Connection collision, giving up the other connection");
641 bgp_error(other
, 6, 7, NULL
, 0);
646 /* Should close this connection */
647 BGP_TRACE(D_EVENTS
, "Connection collision, giving up this connection");
648 bgp_error(conn
, 6, 7, NULL
, 0);
652 bug("bgp_rx_open: Unknown state");
655 /* Update our local variables */
656 conn
->hold_time
= MIN(hold
, p
->cf
->hold_time
);
657 conn
->keepalive_time
= p
->cf
->keepalive_time
? : conn
->hold_time
/ 3;
658 conn
->as4_session
= conn
->local_caps
->as4_support
&& caps
->as4_support
;
659 conn
->ext_messages
= conn
->local_caps
->ext_messages
&& caps
->ext_messages
;
662 DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
663 conn
->hold_time
, conn
->keepalive_time
, p
->remote_as
, p
->remote_id
, conn
->as4_session
);
665 bgp_schedule_packet(conn
, NULL
, PKT_KEEPALIVE
);
666 bgp_start_timer(conn
->hold_timer
, conn
->hold_time
);
667 bgp_conn_enter_openconfirm_state(conn
);
675 #define REPORT(msg, args...) \
676 ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })
678 #define WITHDRAW(msg, args...) \
679 ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })
681 #define BAD_NEXT_HOP "Invalid NEXT_HOP attribute"
682 #define NO_NEXT_HOP "Missing NEXT_HOP attribute"
686 bgp_apply_next_hop(struct bgp_parse_state
*s
, rta
*a
, ip_addr gw
, ip_addr ll
)
688 struct bgp_proto
*p
= s
->proto
;
689 struct bgp_channel
*c
= s
->channel
;
691 if (c
->cf
->gw_mode
== GW_DIRECT
)
693 neighbor
*nbr
= NULL
;
695 /* GW_DIRECT -> single_hop -> p->neigh != NULL */
697 nbr
= neigh_find2(&p
->p
, &gw
, NULL
, 0);
698 else if (ipa_nonzero(ll
))
699 nbr
= neigh_find2(&p
->p
, &ll
, p
->neigh
->iface
, 0);
701 if (!nbr
|| (nbr
->scope
== SCOPE_HOST
))
702 WITHDRAW(BAD_NEXT_HOP
);
704 a
->dest
= RTD_ROUTER
;
706 a
->iface
= nbr
->iface
;
710 else /* GW_RECURSIVE */
713 WITHDRAW(BAD_NEXT_HOP
);
715 rta_set_recursive_next_hop(c
->c
.table
, a
, c
->igp_table
, gw
, ll
);
720 bgp_use_next_hop(struct bgp_export_state
*s
, eattr
*a
)
722 struct bgp_proto
*p
= s
->proto
;
723 ip_addr
*nh
= (void *) a
->u
.ptr
->data
;
725 if (s
->channel
->cf
->next_hop_self
)
728 if (s
->channel
->cf
->next_hop_keep
)
731 /* Keep it when explicitly set in export filter */
732 if (a
->type
& EAF_FRESH
)
735 /* Keep it when exported to internal peers */
736 if (p
->is_interior
&& ipa_nonzero(*nh
))
739 /* Keep it when forwarded between single-hop BGPs on the same iface */
740 struct iface
*ifa
= (s
->src
&& s
->src
->neigh
) ? s
->src
->neigh
->iface
: NULL
;
741 return p
->neigh
&& (p
->neigh
->iface
== ifa
);
745 bgp_use_gateway(struct bgp_export_state
*s
)
747 struct bgp_proto
*p
= s
->proto
;
748 rta
*ra
= s
->route
->attrs
;
750 if (s
->channel
->cf
->next_hop_self
)
753 /* We need valid global gateway */
754 if ((ra
->dest
!= RTD_ROUTER
) || ipa_zero(ra
->gw
) || ipa_is_link_local(ra
->gw
))
757 /* Use it when exported to internal peers */
761 /* Use it when forwarded to single-hop BGP peer on the same iface */
762 return p
->neigh
&& (p
->neigh
->iface
== ra
->iface
);
766 bgp_update_next_hop_ip(struct bgp_export_state
*s
, eattr
*a
, ea_list
**to
)
768 if (!a
|| !bgp_use_next_hop(s
, a
))
770 if (bgp_use_gateway(s
))
772 ip_addr nh
[1] = { s
->route
->attrs
->gw
};
773 bgp_set_attr_data(to
, s
->pool
, BA_NEXT_HOP
, 0, nh
, 16);
777 ip_addr nh
[2] = { s
->channel
->next_hop_addr
, s
->channel
->link_addr
};
778 bgp_set_attr_data(to
, s
->pool
, BA_NEXT_HOP
, 0, nh
, ipa_nonzero(nh
[1]) ? 32 : 16);
782 /* Check if next hop is valid */
783 a
= bgp_find_attr(*to
, BA_NEXT_HOP
);
785 WITHDRAW(NO_NEXT_HOP
);
787 ip_addr
*nh
= (void *) a
->u
.ptr
->data
;
788 ip_addr peer
= s
->proto
->cf
->remote_ip
;
789 uint len
= a
->u
.ptr
->length
;
791 if (ipa_zero(nh
[0]) && ((len
!= 32) || ipa_zero(nh
[1])))
792 WITHDRAW(BAD_NEXT_HOP
);
794 if (ipa_equal(peer
, nh
[0]) || ((len
== 32) && ipa_equal(peer
, nh
[1])))
795 WITHDRAW(BAD_NEXT_HOP
);
799 bgp_encode_next_hop_none(struct bgp_write_state
*s UNUSED
, eattr
*a UNUSED
, byte
*buf UNUSED
, uint size UNUSED
)
806 bgp_decode_next_hop_none(struct bgp_parse_state
*s UNUSED
, byte
*data UNUSED
, uint len UNUSED
, rta
*a UNUSED
)
813 bgp_update_next_hop_none(struct bgp_export_state
*s UNUSED
, eattr
*a UNUSED
, ea_list
**to UNUSED
)
824 bgp_rte_update(struct bgp_parse_state
*s
, net_addr
*n
, u32 path_id
, rta
*a0
)
826 if (path_id
!= s
->last_id
)
828 s
->last_src
= rt_get_source(&s
->proto
->p
, path_id
);
829 s
->last_id
= path_id
;
831 rta_free(s
->cached_rta
);
832 s
->cached_rta
= NULL
;
838 rte_update2(&s
->channel
->c
, n
, NULL
, s
->last_src
);
842 /* Prepare cached route attributes */
843 if (s
->cached_rta
== NULL
)
845 a0
->src
= s
->last_src
;
847 /* Workaround for rta_lookup() breaking eattrs */
848 ea_list
*ea
= a0
->eattrs
;
849 s
->cached_rta
= rta_lookup(a0
);
853 rta
*a
= rta_clone(s
->cached_rta
);
854 rte
*e
= rte_get_temp(a
);
857 e
->u
.bgp
.suppressed
= 0;
858 rte_update2(&s
->channel
->c
, n
, e
, s
->last_src
);
864 bgp_encode_nlri_ip4(struct bgp_write_state
*s
, struct bgp_bucket
*buck
, byte
*buf
, uint size
)
868 while (!EMPTY_LIST(buck
->prefixes
) && (size
>= (5 + sizeof(ip4_addr
))))
870 struct bgp_prefix
*px
= HEAD(buck
->prefixes
);
871 struct net_addr_ip4
*net
= (void *) px
->net
;
876 put_u32(pos
, px
->path_id
);
877 ADVANCE(pos
, size
, 4);
880 ip4_addr a
= ip4_hton(net
->prefix
);
881 uint b
= (net
->pxlen
+ 7) / 8;
883 /* Encode prefix length */
885 ADVANCE(pos
, size
, 1);
887 /* Encode prefix body */
889 ADVANCE(pos
, size
, b
);
891 bgp_free_prefix(s
->channel
, px
);
898 bgp_decode_nlri_ip4(struct bgp_parse_state
*s
, byte
*pos
, uint len
, rta
*a
)
909 bgp_parse_error(s
, 1);
911 path_id
= get_u32(pos
);
912 ADVANCE(pos
, len
, 4);
915 /* Decode prefix length */
917 uint b
= (l
+ 7) / 8;
918 ADVANCE(pos
, len
, 1);
920 if (l
> IP4_MAX_PREFIX_LENGTH
)
921 bgp_parse_error(s
, 10);
924 bgp_parse_error(s
, 1);
926 /* Decode prefix body */
927 ip4_addr addr
= IP4_NONE
;
928 memcpy(&addr
, pos
, b
);
929 ADVANCE(pos
, len
, b
);
931 net
= NET_ADDR_IP4(ip4_ntoh(addr
), l
);
932 net_normalize_ip4(&net
);
934 // XXXX validate prefix
936 bgp_rte_update(s
, (net_addr
*) &net
, path_id
, a
);
941 bgp_encode_next_hop_ip4(struct bgp_write_state
*s UNUSED
, eattr
*a
, byte
*buf
, uint size UNUSED
)
943 /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
945 ASSERT(a
->u
.ptr
->length
== sizeof(ip_addr
));
947 put_ip4(buf
, ipa_to_ip4( *(ip_addr
*) a
->u
.ptr
->data
));
953 bgp_decode_next_hop_ip4(struct bgp_parse_state
*s
, byte
*data
, uint len
, rta
*a
)
956 bgp_parse_error(s
, 9);
958 ip_addr nh
= ipa_from_ip4(get_ip4(data
));
960 // XXXX validate next hop
962 bgp_set_attr_data(&(a
->eattrs
), s
->pool
, BA_NEXT_HOP
, 0, &nh
, sizeof(nh
));
963 bgp_apply_next_hop(s
, a
, nh
, IPA_NONE
);
968 bgp_encode_nlri_ip6(struct bgp_write_state
*s
, struct bgp_bucket
*buck
, byte
*buf
, uint size
)
972 while (!EMPTY_LIST(buck
->prefixes
) && (size
>= (5 + sizeof(ip6_addr
))))
974 struct bgp_prefix
*px
= HEAD(buck
->prefixes
);
975 struct net_addr_ip6
*net
= (void *) px
->net
;
980 put_u32(pos
, px
->path_id
);
981 ADVANCE(pos
, size
, 4);
984 ip6_addr a
= ip6_hton(net
->prefix
);
985 uint b
= (net
->pxlen
+ 7) / 8;
987 /* Encode prefix length */
989 ADVANCE(pos
, size
, 1);
991 /* Encode prefix body */
993 ADVANCE(pos
, size
, b
);
995 bgp_free_prefix(s
->channel
, px
);
1002 bgp_decode_nlri_ip6(struct bgp_parse_state
*s
, byte
*pos
, uint len
, rta
*a
)
1009 /* Decode path ID */
1013 bgp_parse_error(s
, 1);
1015 path_id
= get_u32(pos
);
1016 ADVANCE(pos
, len
, 4);
1019 /* Decode prefix length */
1021 uint b
= (l
+ 7) / 8;
1022 ADVANCE(pos
, len
, 1);
1024 if (l
> IP6_MAX_PREFIX_LENGTH
)
1025 bgp_parse_error(s
, 10);
1028 bgp_parse_error(s
, 1);
1030 /* Decode prefix body */
1031 ip6_addr addr
= IP6_NONE
;
1032 memcpy(&addr
, pos
, b
);
1033 ADVANCE(pos
, len
, b
);
1035 net
= NET_ADDR_IP6(ip6_ntoh(addr
), l
);
1036 net_normalize_ip6(&net
);
1038 // XXXX validate prefix
1040 bgp_rte_update(s
, (net_addr
*) &net
, path_id
, a
);
1045 bgp_encode_next_hop_ip6(struct bgp_write_state
*s UNUSED
, eattr
*a
, byte
*buf
, uint size UNUSED
)
1047 ip_addr
*nh
= (void *) a
->u
.ptr
->data
;
1048 uint len
= a
->u
.ptr
->length
;
1050 ASSERT((len
== 16) || (len
== 32));
1052 put_ip6(buf
, ipa_to_ip6(nh
[0]));
1055 put_ip6(buf
+16, ipa_to_ip6(nh
[1]));
1061 bgp_decode_next_hop_ip6(struct bgp_parse_state
*s
, byte
*data
, uint len
, rta
*a
)
1063 struct adata
*ad
= lp_alloc_adata(s
->pool
, 32);
1064 ip_addr
*nh
= (void *) ad
->data
;
1066 if ((len
!= 16) && (len
!= 32))
1067 bgp_parse_error(s
, 9);
1069 nh
[0] = ipa_from_ip6(get_ip6(data
));
1070 nh
[1] = (len
== 32) ? ipa_from_ip6(get_ip6(data
+16)) : IPA_NONE
;
1072 if (ip6_is_link_local(nh
[0]))
1078 if (!ip6_is_link_local(nh
[1]))
1081 if (ipa_zero(nh
[1]))
1084 // XXXX validate next hop
1086 bgp_set_attr_ptr(&(a
->eattrs
), s
->pool
, BA_NEXT_HOP
, 0, ad
);
1087 bgp_apply_next_hop(s
, a
, nh
[0], nh
[1]);
1092 bgp_encode_nlri_flow4(struct bgp_write_state
*s
, struct bgp_bucket
*buck
, byte
*buf
, uint size
)
1096 while (!EMPTY_LIST(buck
->prefixes
) && (size
>= 4))
1098 struct bgp_prefix
*px
= HEAD(buck
->prefixes
);
1099 struct net_addr_flow4
*net
= (void *) px
->net
;
1100 uint flen
= net
->length
- sizeof(net_addr_flow4
);
1102 /* Encode path ID */
1105 put_u32(pos
, px
->path_id
);
1106 ADVANCE(pos
, size
, 4);
1112 /* Copy whole flow data including length */
1113 memcpy(pos
, net
->data
, flen
);
1114 ADVANCE(pos
, size
, flen
);
1116 bgp_free_prefix(s
->channel
, px
);
1123 bgp_decode_nlri_flow4(struct bgp_parse_state
*s
, byte
*pos
, uint len
, rta
*a
)
1129 /* Decode path ID */
1133 bgp_parse_error(s
, 1);
1135 path_id
= get_u32(pos
);
1136 ADVANCE(pos
, len
, 4);
1140 bgp_parse_error(s
, 1);
1142 /* Decode flow length */
1143 uint hlen
= flow_hdr_length(pos
);
1144 uint dlen
= flow_read_length(pos
);
1145 uint flen
= hlen
+ dlen
;
1146 byte
*data
= pos
+ hlen
;
1149 bgp_parse_error(s
, 1);
1151 /* Validate flow data */
1152 enum flow_validated_state r
= flow4_validate(data
, dlen
);
1153 if (r
!= FLOW_ST_VALID
)
1155 log(L_REMOTE
"%s: Invalid flow route: %s", s
->proto
->p
.name
, flow_validated_state_str(r
));
1156 bgp_parse_error(s
, 1);
1159 if (data
[0] != FLOW_TYPE_DST_PREFIX
)
1161 log(L_REMOTE
"%s: No dst prefix at first pos", s
->proto
->p
.name
);
1162 bgp_parse_error(s
, 1);
1165 /* Decode dst prefix */
1166 ip4_addr px
= IP4_NONE
;
1167 uint pxlen
= data
[1];
1169 // FIXME: Use some generic function
1170 memcpy(&px
, data
, BYTES(pxlen
));
1171 px
= ip4_and(px
, ip4_mkmask(pxlen
));
1173 /* Prepare the flow */
1174 net_addr
*n
= alloca(sizeof(struct net_addr_flow4
) + flen
);
1175 net_fill_flow4(n
, px
, pxlen
, pos
, flen
);
1176 ADVANCE(pos
, len
, flen
);
1178 bgp_rte_update(s
, n
, path_id
, a
);
1184 bgp_encode_nlri_flow6(struct bgp_write_state
*s
, struct bgp_bucket
*buck
, byte
*buf
, uint size
)
1188 while (!EMPTY_LIST(buck
->prefixes
) && (size
>= 4))
1190 struct bgp_prefix
*px
= HEAD(buck
->prefixes
);
1191 struct net_addr_flow6
*net
= (void *) px
->net
;
1192 uint flen
= net
->length
- sizeof(net_addr_flow6
);
1194 /* Encode path ID */
1197 put_u32(pos
, px
->path_id
);
1198 ADVANCE(pos
, size
, 4);
1204 /* Copy whole flow data including length */
1205 memcpy(pos
, net
->data
, flen
);
1206 ADVANCE(pos
, size
, flen
);
1208 bgp_free_prefix(s
->channel
, px
);
1215 bgp_decode_nlri_flow6(struct bgp_parse_state
*s
, byte
*pos
, uint len
, rta
*a
)
1221 /* Decode path ID */
1225 bgp_parse_error(s
, 1);
1227 path_id
= get_u32(pos
);
1228 ADVANCE(pos
, len
, 4);
1232 bgp_parse_error(s
, 1);
1234 /* Decode flow length */
1235 uint hlen
= flow_hdr_length(pos
);
1236 uint dlen
= flow_read_length(pos
);
1237 uint flen
= hlen
+ dlen
;
1238 byte
*data
= pos
+ hlen
;
1241 bgp_parse_error(s
, 1);
1243 /* Validate flow data */
1244 enum flow_validated_state r
= flow6_validate(data
, dlen
);
1245 if (r
!= FLOW_ST_VALID
)
1247 log(L_REMOTE
"%s: Invalid flow route: %s", s
->proto
->p
.name
, flow_validated_state_str(r
));
1248 bgp_parse_error(s
, 1);
1251 if (data
[0] != FLOW_TYPE_DST_PREFIX
)
1253 log(L_REMOTE
"%s: No dst prefix at first pos", s
->proto
->p
.name
);
1254 bgp_parse_error(s
, 1);
1257 /* Decode dst prefix */
1258 ip6_addr px
= IP6_NONE
;
1259 uint pxlen
= data
[1];
1261 // FIXME: Use some generic function
1262 memcpy(&px
, data
, BYTES(pxlen
));
1263 px
= ip6_and(px
, ip6_mkmask(pxlen
));
1265 /* Prepare the flow */
1266 net_addr
*n
= alloca(sizeof(struct net_addr_flow6
) + flen
);
1267 net_fill_flow6(n
, px
, pxlen
, pos
, flen
);
1268 ADVANCE(pos
, len
, flen
);
1270 bgp_rte_update(s
, n
, path_id
, a
);
1275 static const struct bgp_af_desc bgp_af_table
[] = {
1280 .encode_nlri
= bgp_encode_nlri_ip4
,
1281 .decode_nlri
= bgp_decode_nlri_ip4
,
1282 .encode_next_hop
= bgp_encode_next_hop_ip4
,
1283 .decode_next_hop
= bgp_decode_next_hop_ip4
,
1284 .update_next_hop
= bgp_update_next_hop_ip
,
1287 .afi
= BGP_AF_IPV4_MC
,
1290 .encode_nlri
= bgp_encode_nlri_ip4
,
1291 .decode_nlri
= bgp_decode_nlri_ip4
,
1292 .encode_next_hop
= bgp_encode_next_hop_ip4
,
1293 .decode_next_hop
= bgp_decode_next_hop_ip4
,
1294 .update_next_hop
= bgp_update_next_hop_ip
,
1297 .afi
= BGP_AF_FLOW4
,
1300 .encode_nlri
= bgp_encode_nlri_flow4
,
1301 .decode_nlri
= bgp_decode_nlri_flow4
,
1302 .encode_next_hop
= bgp_encode_next_hop_none
,
1303 .decode_next_hop
= bgp_decode_next_hop_none
,
1304 .update_next_hop
= bgp_update_next_hop_none
,
1310 .encode_nlri
= bgp_encode_nlri_ip6
,
1311 .decode_nlri
= bgp_decode_nlri_ip6
,
1312 .encode_next_hop
= bgp_encode_next_hop_ip6
,
1313 .decode_next_hop
= bgp_decode_next_hop_ip6
,
1314 .update_next_hop
= bgp_update_next_hop_ip
,
1317 .afi
= BGP_AF_IPV6_MC
,
1320 .encode_nlri
= bgp_encode_nlri_ip6
,
1321 .decode_nlri
= bgp_decode_nlri_ip6
,
1322 .encode_next_hop
= bgp_encode_next_hop_ip6
,
1323 .decode_next_hop
= bgp_decode_next_hop_ip6
,
1324 .update_next_hop
= bgp_update_next_hop_ip
,
1327 .afi
= BGP_AF_FLOW6
,
1330 .encode_nlri
= bgp_encode_nlri_flow6
,
1331 .decode_nlri
= bgp_decode_nlri_flow6
,
1332 .encode_next_hop
= bgp_encode_next_hop_none
,
1333 .decode_next_hop
= bgp_decode_next_hop_none
,
1334 .update_next_hop
= bgp_update_next_hop_none
,
1338 const struct bgp_af_desc
*
1339 bgp_get_af_desc(u32 afi
)
1342 for (i
= 0; i
< ARRAY_SIZE(bgp_af_table
); i
++)
1343 if (bgp_af_table
[i
].afi
== afi
)
1344 return &bgp_af_table
[i
];
1350 bgp_encode_nlri(struct bgp_write_state
*s
, struct bgp_bucket
*buck
, byte
*buf
, byte
*end
)
1352 return s
->channel
->desc
->encode_nlri(s
, buck
, buf
, end
- buf
);
1356 bgp_encode_next_hop(struct bgp_write_state
*s
, eattr
*nh
, byte
*buf
)
1358 return s
->channel
->desc
->encode_next_hop(s
, nh
, buf
, 255);
1362 bgp_update_next_hop(struct bgp_export_state
*s
, eattr
*a
, ea_list
**to
)
1364 s
->channel
->desc
->update_next_hop(s
, a
, to
);
1367 #define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
1370 bgp_create_ip_reach(struct bgp_write_state
*s
, struct bgp_bucket
*buck
, byte
*buf
, byte
*end
)
1373 * 2 B Withdrawn Routes Length (zero)
1374 * --- IPv4 Withdrawn Routes NLRI (unused)
1375 * 2 B Total Path Attribute Length
1376 * var Path Attributes
1377 * var IPv4 Network Layer Reachability Information
1382 la
= bgp_encode_attrs(s
, buck
->eattrs
, buf
+4, buf
+ MAX_ATTRS_LENGTH
);
1385 /* Attribute list too long */
1386 bgp_withdraw_bucket(s
->channel
, buck
);
1393 lr
= bgp_encode_nlri(s
, buck
, buf
+4+la
, end
);
1399 bgp_create_mp_reach(struct bgp_write_state
*s
, struct bgp_bucket
*buck
, byte
*buf
, byte
*end
)
1402 * 2 B IPv4 Withdrawn Routes Length (zero)
1403 * --- IPv4 Withdrawn Routes NLRI (unused)
1404 * 2 B Total Path Attribute Length
1405 * 1 B MP_REACH_NLRI hdr - Attribute Flags
1406 * 1 B MP_REACH_NLRI hdr - Attribute Type Code
1407 * 2 B MP_REACH_NLRI hdr - Length of Attribute Data
1408 * 2 B MP_REACH_NLRI data - Address Family Identifier
1409 * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier
1410 * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address
1411 * var MP_REACH_NLRI data - Network Address of Next Hop
1412 * 1 B MP_REACH_NLRI data - Reserved (zero)
1413 * var MP_REACH_NLRI data - Network Layer Reachability Information
1414 * var Rest of Path Attributes
1415 * --- IPv4 Network Layer Reachability Information (unused)
1418 int lh
, lr
, la
; /* Lengths of next hop, NLRI and attributes */
1420 /* Beginning of MP_REACH_NLRI attribute */
1421 buf
[4] = BAF_OPTIONAL
| BAF_EXT_LEN
;
1422 buf
[5] = BA_MP_REACH_NLRI
;
1423 put_u16(buf
+6, 0); /* Will be fixed later */
1424 put_af3(buf
+8, s
->channel
->afi
);
1427 /* Encode attributes to temporary buffer */
1428 byte
*abuf
= alloca(MAX_ATTRS_LENGTH
);
1429 la
= bgp_encode_attrs(s
, buck
->eattrs
, abuf
, abuf
+ MAX_ATTRS_LENGTH
);
1432 /* Attribute list too long */
1433 bgp_withdraw_bucket(s
->channel
, buck
);
1437 /* Encode the next hop */
1438 lh
= bgp_encode_next_hop(s
, s
->mp_next_hop
, pos
+1);
1442 /* Reserved field */
1445 /* Encode the NLRI */
1446 lr
= bgp_encode_nlri(s
, buck
, pos
, end
- la
);
1449 /* End of MP_REACH_NLRI attribute, update data length */
1450 put_u16(buf
+6, pos
-buf
-8);
1452 /* Copy remaining attributes */
1453 memcpy(pos
, abuf
, la
);
1456 /* Initial UPDATE fields */
1458 put_u16(buf
+2, pos
-buf
-4);
1463 #undef MAX_ATTRS_LENGTH
1466 bgp_create_ip_unreach(struct bgp_write_state
*s
, struct bgp_bucket
*buck
, byte
*buf
, byte
*end
)
1469 * 2 B Withdrawn Routes Length
1470 * var IPv4 Withdrawn Routes NLRI
1471 * 2 B Total Path Attribute Length (zero)
1472 * --- Path Attributes (unused)
1473 * --- IPv4 Network Layer Reachability Information (unused)
1476 uint len
= bgp_encode_nlri(s
, buck
, buf
+2, end
);
1478 put_u16(buf
+0, len
);
1479 put_u16(buf
+2+len
, 0);
1485 bgp_create_mp_unreach(struct bgp_write_state
*s
, struct bgp_bucket
*buck
, byte
*buf
, byte
*end
)
1488 * 2 B Withdrawn Routes Length (zero)
1489 * --- IPv4 Withdrawn Routes NLRI (unused)
1490 * 2 B Total Path Attribute Length
1491 * 1 B MP_UNREACH_NLRI hdr - Attribute Flags
1492 * 1 B MP_UNREACH_NLRI hdr - Attribute Type Code
1493 * 2 B MP_UNREACH_NLRI hdr - Length of Attribute Data
1494 * 2 B MP_UNREACH_NLRI data - Address Family Identifier
1495 * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier
1496 * var MP_UNREACH_NLRI data - Network Layer Reachability Information
1497 * --- IPv4 Network Layer Reachability Information (unused)
1500 uint len
= bgp_encode_nlri(s
, buck
, buf
+11, end
);
1503 put_u16(buf
+2, 7+len
);
1505 /* Beginning of MP_UNREACH_NLRI attribute */
1506 buf
[4] = BAF_OPTIONAL
| BAF_EXT_LEN
;
1507 buf
[5] = BA_MP_UNREACH_NLRI
;
1508 put_u16(buf
+6, 3+len
);
1509 put_af3(buf
+8, s
->channel
->afi
);
1515 bgp_create_update(struct bgp_channel
*c
, byte
*buf
)
1517 struct bgp_proto
*p
= (void *) c
->c
.proto
;
1518 struct bgp_bucket
*buck
;
1519 byte
*end
= buf
+ (bgp_max_packet_length(p
->conn
) - BGP_HEADER_LENGTH
);
1522 /* Initialize write state */
1523 struct bgp_write_state s
= {
1526 .pool
= bgp_linpool
,
1527 .as4_session
= p
->as4_session
,
1528 .add_path
= c
->add_path_tx
,
1533 /* Try unreachable bucket */
1534 if ((buck
= c
->withdraw_bucket
) && !EMPTY_LIST(buck
->prefixes
))
1536 res
= (c
->afi
== BGP_AF_IPV4
) ?
1537 bgp_create_ip_unreach(&s
, buck
, buf
, end
):
1538 bgp_create_mp_unreach(&s
, buck
, buf
, end
);
1543 /* Try reachable buckets */
1544 if (!EMPTY_LIST(c
->bucket_queue
))
1546 buck
= HEAD(c
->bucket_queue
);
1548 /* Cleanup empty buckets */
1549 if (EMPTY_LIST(buck
->prefixes
))
1551 bgp_free_bucket(c
, buck
);
1555 res
= (c
->afi
== BGP_AF_IPV4
) ?
1556 bgp_create_ip_reach(&s
, buck
, buf
, end
):
1557 bgp_create_mp_reach(&s
, buck
, buf
, end
);
1559 if (EMPTY_LIST(buck
->prefixes
))
1560 bgp_free_bucket(c
, buck
);
1562 bgp_defer_bucket(c
, buck
);
1570 /* No more prefixes to send */
1574 BGP_TRACE_RL(&rl_snd_update
, D_PACKETS
, "Sending UPDATE");
1581 bgp_create_ip_end_mark(struct bgp_channel
*c UNUSED
, byte
*buf
)
1583 /* Empty update packet */
1590 bgp_create_mp_end_mark(struct bgp_channel
*c
, byte
*buf
)
1593 put_u16(buf
+2, 6); /* length 4--9 */
1595 /* Empty MP_UNREACH_NLRI atribute */
1596 buf
[4] = BAF_OPTIONAL
;
1597 buf
[5] = BA_MP_UNREACH_NLRI
;
1598 buf
[6] = 3; /* Length 7--9 */
1599 put_af3(buf
+7, c
->afi
);
1605 bgp_create_end_mark(struct bgp_channel
*c
, byte
*buf
)
1607 struct bgp_proto
*p
= (void *) c
->c
.proto
;
1609 BGP_TRACE(D_PACKETS
, "Sending END-OF-RIB");
1611 return (c
->afi
== BGP_AF_IPV4
) ?
1612 bgp_create_ip_end_mark(c
, buf
):
1613 bgp_create_mp_end_mark(c
, buf
);
1617 bgp_rx_end_mark(struct bgp_proto
*p
, u32 afi
)
1619 struct bgp_channel
*c
= bgp_get_channel(p
, afi
);
1621 BGP_TRACE(D_PACKETS
, "Got END-OF-RIB");
1623 /* XXXX handle unknown AF in MP_*_NLRI */
1627 if (c
->load_state
== BFS_LOADING
)
1628 c
->load_state
= BFS_NONE
;
1630 if (p
->p
.gr_recovery
)
1631 channel_graceful_restart_unlock(&c
->c
);
1634 bgp_graceful_restart_done(c
);
1638 bgp_decode_nlri(struct bgp_parse_state
*s
, u32 afi
, byte
*nlri
, uint len
, ea_list
*ea
, byte
*nh
, uint nh_len
)
1640 struct bgp_channel
*c
= bgp_get_channel(s
->proto
, afi
);
1643 /* XXXX handle unknown AF in MP_*_NLRI */
1648 s
->add_path
= c
->add_path_rx
;
1651 s
->last_src
= s
->proto
->p
.main_source
;
1654 * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
1655 * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
1656 * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
1657 * decode_next_hop hooks) by restoring a->eattrs afterwards.
1662 a
= alloca(sizeof(struct rta
));
1663 memset(a
, 0, sizeof(struct rta
));
1665 a
->source
= RTS_BGP
;
1666 a
->scope
= SCOPE_UNIVERSE
;
1667 a
->cast
= RTC_UNICAST
;
1668 a
->dest
= RTD_UNREACHABLE
;
1669 a
->from
= s
->proto
->cf
->remote_ip
;
1672 c
->desc
->decode_next_hop(s
, nh
, nh_len
, a
);
1674 /* Handle withdraw during next hop decoding */
1675 if (s
->err_withdraw
)
1679 c
->desc
->decode_nlri(s
, nlri
, len
, a
);
1681 rta_free(s
->cached_rta
);
1682 s
->cached_rta
= NULL
;
1686 bgp_rx_update(struct bgp_conn
*conn
, byte
*pkt
, uint len
)
1688 struct bgp_proto
*p
= conn
->bgp
;
1691 BGP_TRACE_RL(&rl_rcv_update
, D_PACKETS
, "Got UPDATE");
1693 /* Workaround for some BGP implementations that skip initial KEEPALIVE */
1694 if (conn
->state
== BS_OPENCONFIRM
)
1695 bgp_conn_enter_established_state(conn
);
1697 if (conn
->state
!= BS_ESTABLISHED
)
1698 { bgp_error(conn
, 5, fsm_err_subcode
[conn
->state
], NULL
, 0); return; }
1700 bgp_start_timer(conn
->hold_timer
, conn
->hold_time
);
1702 /* Initialize parse state */
1703 struct bgp_parse_state s
= {
1705 .pool
= bgp_linpool
,
1706 .as4_session
= p
->as4_session
,
1709 /* Parse error handler */
1710 if (setjmp(s
.err_jmpbuf
))
1712 bgp_error(conn
, 3, s
.err_subcode
, NULL
, 0);
1716 /* Check minimal length */
1718 { bgp_error(conn
, 1, 2, pkt
+16, 2); return; }
1720 /* Skip fixed header */
1724 * UPDATE message format
1726 * 2 B IPv4 Withdrawn Routes Length
1727 * var IPv4 Withdrawn Routes NLRI
1728 * 2 B Total Path Attribute Length
1729 * var Path Attributes
1730 * var IPv4 Reachable Routes NLRI
1733 s
.ip_unreach_len
= get_u16(pkt
+ pos
);
1734 s
.ip_unreach_nlri
= pkt
+ pos
+ 2;
1735 pos
+= 2 + s
.ip_unreach_len
;
1738 bgp_parse_error(&s
, 1);
1740 s
.attr_len
= get_u16(pkt
+ pos
);
1741 s
.attrs
= pkt
+ pos
+ 2;
1742 pos
+= 2 + s
.attr_len
;
1745 bgp_parse_error(&s
, 1);
1747 s
.ip_reach_len
= len
- pos
;
1748 s
.ip_reach_nlri
= pkt
+ pos
;
1752 ea
= bgp_decode_attrs(&s
, s
.attrs
, s
.attr_len
);
1754 /* Check for End-of-RIB marker */
1755 if (!s
.attr_len
&& !s
.ip_unreach_len
&& !s
.ip_reach_len
)
1756 { bgp_rx_end_mark(p
, BGP_AF_IPV4
); goto done
; }
1758 /* Check for MP End-of-RIB marker */
1759 if ((s
.attr_len
< 8) && !s
.ip_unreach_len
&& !s
.ip_reach_len
&&
1760 !s
.mp_reach_len
&& !s
.mp_unreach_len
&& s
.mp_unreach_af
) /* XXXX See RFC 7606 5.2 */
1761 { bgp_rx_end_mark(p
, s
.mp_unreach_af
); goto done
; }
1763 if (s
.ip_unreach_len
)
1764 bgp_decode_nlri(&s
, BGP_AF_IPV4
, s
.ip_unreach_nlri
, s
.ip_unreach_len
, NULL
, NULL
, 0);
1766 if (s
.mp_unreach_len
)
1767 bgp_decode_nlri(&s
, s
.mp_unreach_af
, s
.mp_unreach_nlri
, s
.mp_unreach_len
, NULL
, NULL
, 0);
1770 bgp_decode_nlri(&s
, BGP_AF_IPV4
, s
.ip_reach_nlri
, s
.ip_reach_len
,
1771 ea
, s
.ip_next_hop_data
, s
.ip_next_hop_len
);
1774 bgp_decode_nlri(&s
, s
.mp_reach_af
, s
.mp_reach_nlri
, s
.mp_reach_len
,
1775 ea
, s
.mp_next_hop_data
, s
.mp_next_hop_len
);
1778 rta_free(s
.cached_rta
);
1788 static inline byte
*
1789 bgp_create_route_refresh(struct bgp_channel
*c
, byte
*buf
)
1791 struct bgp_proto
*p
= (void *) c
->c
.proto
;
1793 BGP_TRACE(D_PACKETS
, "Sending ROUTE-REFRESH");
1795 /* Original route refresh request, RFC 2918 */
1796 put_af4(buf
, c
->afi
);
1797 buf
[2] = BGP_RR_REQUEST
;
1802 static inline byte
*
1803 bgp_create_begin_refresh(struct bgp_channel
*c
, byte
*buf
)
1805 struct bgp_proto
*p
= (void *) c
->c
.proto
;
1807 BGP_TRACE(D_PACKETS
, "Sending BEGIN-OF-RR");
1809 /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
1810 put_af4(buf
, c
->afi
);
1811 buf
[2] = BGP_RR_BEGIN
;
1816 static inline byte
*
1817 bgp_create_end_refresh(struct bgp_channel
*c
, byte
*buf
)
1819 struct bgp_proto
*p
= (void *) c
->c
.proto
;
1821 BGP_TRACE(D_PACKETS
, "Sending END-OF-RR");
1823 /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
1824 put_af4(buf
, c
->afi
);
1825 buf
[2] = BGP_RR_END
;
1831 bgp_rx_route_refresh(struct bgp_conn
*conn
, byte
*pkt
, uint len
)
1833 struct bgp_proto
*p
= conn
->bgp
;
1835 if (conn
->state
!= BS_ESTABLISHED
)
1836 { bgp_error(conn
, 5, fsm_err_subcode
[conn
->state
], NULL
, 0); return; }
1838 if (!conn
->local_caps
->route_refresh
)
1839 { bgp_error(conn
, 1, 3, pkt
+18, 1); return; }
1841 if (len
< (BGP_HEADER_LENGTH
+ 4))
1842 { bgp_error(conn
, 1, 2, pkt
+16, 2); return; }
1844 if (len
> (BGP_HEADER_LENGTH
+ 4))
1845 { bgp_error(conn
, 7, 1, pkt
, MIN(len
, 2048)); return; }
1847 struct bgp_channel
*c
= bgp_get_channel(p
, get_af4(pkt
+19));
1850 log(L_WARN
"%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
1851 p
->p
.name
, pkt
[21], get_u16(pkt
+19), pkt
[22]);
1855 /* RFC 7313 redefined reserved field as RR message subtype */
1856 uint subtype
= p
->enhanced_refresh
? pkt
[21] : BGP_RR_REQUEST
;
1860 case BGP_RR_REQUEST
:
1861 BGP_TRACE(D_PACKETS
, "Got ROUTE-REFRESH");
1862 channel_request_feeding(&c
->c
);
1866 BGP_TRACE(D_PACKETS
, "Got BEGIN-OF-RR");
1867 bgp_refresh_begin(c
);
1871 BGP_TRACE(D_PACKETS
, "Got END-OF-RR");
1876 log(L_WARN
"%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
1877 p
->p
.name
, subtype
);
1882 static inline struct bgp_channel
*
1883 bgp_get_channel_to_send(struct bgp_proto
*p
, struct bgp_conn
*conn
)
1885 uint i
= conn
->last_channel
;
1887 /* Try the last channel, but at most several times */
1888 if ((conn
->channels_to_send
& (1 << i
)) &&
1889 (conn
->last_channel_count
< 16))
1892 /* Find channel with non-zero channels_to_send */
1896 if (i
>= p
->channel_count
)
1899 while (! (conn
->channels_to_send
& (1 << i
)));
1901 /* Use that channel */
1902 conn
->last_channel
= i
;
1903 conn
->last_channel_count
= 0;
1906 conn
->last_channel_count
++;
1907 return p
->channel_map
[i
];
1911 bgp_send(struct bgp_conn
*conn
, uint type
, uint len
)
1913 sock
*sk
= conn
->sk
;
1914 byte
*buf
= sk
->tbuf
;
1916 memset(buf
, 0xff, 16); /* Marker */
1917 put_u16(buf
+16, len
);
1920 return sk_send(sk
, len
);
1924 * bgp_fire_tx - transmit packets
1927 * Whenever the transmit buffers of the underlying TCP connection
1928 * are free and we have any packets queued for sending, the socket functions
1929 * call bgp_fire_tx() which takes care of selecting the highest priority packet
1930 * queued (Notification > Keepalive > Open > Update), assembling its header
1931 * and body and sending it to the connection.
1934 bgp_fire_tx(struct bgp_conn
*conn
)
1936 struct bgp_proto
*p
= conn
->bgp
;
1937 struct bgp_channel
*c
;
1938 byte
*buf
, *pkt
, *end
;
1944 buf
= conn
->sk
->tbuf
;
1945 pkt
= buf
+ BGP_HEADER_LENGTH
;
1946 s
= conn
->packets_to_send
;
1948 if (s
& (1 << PKT_SCHEDULE_CLOSE
))
1950 /* We can finally close connection and enter idle state */
1951 bgp_conn_enter_idle_state(conn
);
1954 if (s
& (1 << PKT_NOTIFICATION
))
1956 conn
->packets_to_send
= 1 << PKT_SCHEDULE_CLOSE
;
1957 end
= bgp_create_notification(conn
, pkt
);
1958 return bgp_send(conn
, PKT_NOTIFICATION
, end
- buf
);
1960 else if (s
& (1 << PKT_KEEPALIVE
))
1962 conn
->packets_to_send
&= ~(1 << PKT_KEEPALIVE
);
1963 BGP_TRACE(D_PACKETS
, "Sending KEEPALIVE");
1964 bgp_start_timer(conn
->keepalive_timer
, conn
->keepalive_time
);
1965 return bgp_send(conn
, PKT_KEEPALIVE
, BGP_HEADER_LENGTH
);
1967 else if (s
& (1 << PKT_OPEN
))
1969 conn
->packets_to_send
&= ~(1 << PKT_OPEN
);
1970 end
= bgp_create_open(conn
, pkt
);
1971 return bgp_send(conn
, PKT_OPEN
, end
- buf
);
1973 else while (conn
->channels_to_send
)
1975 c
= bgp_get_channel_to_send(p
, conn
);
1976 s
= c
->packets_to_send
;
1978 if (s
& (1 << PKT_ROUTE_REFRESH
))
1980 c
->packets_to_send
&= ~(1 << PKT_ROUTE_REFRESH
);
1981 end
= bgp_create_route_refresh(c
, pkt
);
1982 return bgp_send(conn
, PKT_ROUTE_REFRESH
, end
- buf
);
1984 else if (s
& (1 << PKT_BEGIN_REFRESH
))
1986 /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
1987 c
->packets_to_send
&= ~(1 << PKT_BEGIN_REFRESH
);
1988 end
= bgp_create_begin_refresh(c
, pkt
);
1989 return bgp_send(conn
, PKT_ROUTE_REFRESH
, end
- buf
);
1991 else if (s
& (1 << PKT_UPDATE
))
1993 end
= bgp_create_update(c
, pkt
);
1995 return bgp_send(conn
, PKT_UPDATE
, end
- buf
);
1997 /* No update to send, perhaps we need to send End-of-RIB or EoRR */
1998 c
->packets_to_send
= 0;
1999 conn
->channels_to_send
&= ~(1 << c
->index
);
2001 if (c
->feed_state
== BFS_LOADED
)
2003 c
->feed_state
= BFS_NONE
;
2004 end
= bgp_create_end_mark(c
, pkt
);
2005 return bgp_send(conn
, PKT_UPDATE
, end
- buf
);
2008 else if (c
->feed_state
== BFS_REFRESHED
)
2010 c
->feed_state
= BFS_NONE
;
2011 end
= bgp_create_end_refresh(c
, pkt
);
2012 return bgp_send(conn
, PKT_ROUTE_REFRESH
, end
- buf
);
2016 bug("Channel packets_to_send: %x", s
);
2018 c
->packets_to_send
= 0;
2019 conn
->channels_to_send
&= ~(1 << c
->index
);
2026 * bgp_schedule_packet - schedule a packet for transmission
2029 * @type: packet type
2031 * Schedule a packet of type @type to be sent as soon as possible.
2034 bgp_schedule_packet(struct bgp_conn
*conn
, struct bgp_channel
*c
, int type
)
2038 DBG("BGP: Scheduling packet type %d\n", type
);
2042 if (! conn
->channels_to_send
)
2044 conn
->last_channel
= c
->index
;
2045 conn
->last_channel_count
= 0;
2048 c
->packets_to_send
|= 1 << type
;
2049 conn
->channels_to_send
|= 1 << c
->index
;
2052 conn
->packets_to_send
|= 1 << type
;
2054 if ((conn
->sk
->tpos
== conn
->sk
->tbuf
) && !ev_active(conn
->tx_ev
))
2055 ev_schedule(conn
->tx_ev
);
/*
 * Event hook: send queued packets on the connection passed as @vconn,
 * repeating as long as bgp_fire_tx() returns a positive value.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;
  int rv;

  DBG("BGP: kicking TX\n");

  do
    rv = bgp_fire_tx(conn);
  while (rv > 0);
}
2071 struct bgp_conn
*conn
= sk
->data
;
2073 DBG("BGP: TX hook\n");
2074 while (bgp_fire_tx(conn
) > 0)
2082 } bgp_msg_table
[] = {
2083 { 1, 0, "Invalid message header" },
2084 { 1, 1, "Connection not synchronized" },
2085 { 1, 2, "Bad message length" },
2086 { 1, 3, "Bad message type" },
2087 { 2, 0, "Invalid OPEN message" },
2088 { 2, 1, "Unsupported version number" },
2089 { 2, 2, "Bad peer AS" },
2090 { 2, 3, "Bad BGP identifier" },
2091 { 2, 4, "Unsupported optional parameter" },
2092 { 2, 5, "Authentication failure" },
2093 { 2, 6, "Unacceptable hold time" },
2094 { 2, 7, "Required capability missing" }, /* [RFC5492] */
2095 { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
2096 { 3, 0, "Invalid UPDATE message" },
2097 { 3, 1, "Malformed attribute list" },
2098 { 3, 2, "Unrecognized well-known attribute" },
2099 { 3, 3, "Missing mandatory attribute" },
2100 { 3, 4, "Invalid attribute flags" },
2101 { 3, 5, "Invalid attribute length" },
2102 { 3, 6, "Invalid ORIGIN attribute" },
2103 { 3, 7, "AS routing loop" }, /* Deprecated */
2104 { 3, 8, "Invalid NEXT_HOP attribute" },
2105 { 3, 9, "Optional attribute error" },
2106 { 3, 10, "Invalid network field" },
2107 { 3, 11, "Malformed AS_PATH" },
2108 { 4, 0, "Hold timer expired" },
2109 { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
2110 { 5, 1, "Unexpected message in OpenSent state" },
2111 { 5, 2, "Unexpected message in OpenConfirm state" },
2112 { 5, 3, "Unexpected message in Established state" },
2113 { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
2114 { 6, 1, "Maximum number of prefixes reached" },
2115 { 6, 2, "Administrative shutdown" },
2116 { 6, 3, "Peer de-configured" },
2117 { 6, 4, "Administrative reset" },
2118 { 6, 5, "Connection rejected" },
2119 { 6, 6, "Other configuration change" },
2120 { 6, 7, "Connection collision resolution" },
2121 { 6, 8, "Out of Resources" },
2122 { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
2123 { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
2127 * bgp_error_dsc - return BGP error description
2128 * @code: BGP error code
2129 * @subcode: BGP error subcode
2131 * bgp_error_dsc() returns error description for BGP errors
2132 * which might be static string or given temporary buffer.
2135 bgp_error_dsc(uint code
, uint subcode
)
2137 static char buff
[32];
2140 for (i
=0; i
< ARRAY_SIZE(bgp_msg_table
); i
++)
2141 if (bgp_msg_table
[i
].major
== code
&& bgp_msg_table
[i
].minor
== subcode
)
2142 return bgp_msg_table
[i
].msg
;
2144 bsprintf(buff
, "Unknown error %u.%u", code
, subcode
);
2149 bgp_log_error(struct bgp_proto
*p
, u8
class, char *msg
, uint code
, uint subcode
, byte
*data
, uint len
)
2152 byte
*t
, argbuf
[36];
2155 /* Don't report Cease messages generated by myself */
2156 if (code
== 6 && class == BE_BGP_TX
)
2159 name
= bgp_error_dsc(code
, subcode
);
2166 if ((code
== 2) && (subcode
== 2) && ((len
== 2) || (len
== 4)))
2168 /* Bad peer AS - we would like to print the AS */
2169 t
+= bsprintf(t
, "%u", (len
== 2) ? get_u16(data
) : get_u32(data
));
2174 for (i
=0; i
<len
; i
++)
2175 t
+= bsprintf(t
, "%02x", data
[i
]);
2179 log(L_REMOTE
"%s: %s: %s%s", p
->p
.name
, msg
, name
, argbuf
);
2183 bgp_rx_notification(struct bgp_conn
*conn
, byte
*pkt
, uint len
)
2185 struct bgp_proto
*p
= conn
->bgp
;
2188 { bgp_error(conn
, 1, 2, pkt
+16, 2); return; }
2190 uint code
= pkt
[19];
2191 uint subcode
= pkt
[20];
2192 int err
= (code
!= 6);
2194 bgp_log_error(p
, BE_BGP_RX
, "Received", code
, subcode
, pkt
+21, len
-21);
2195 bgp_store_error(p
, conn
, BE_BGP_RX
, (code
<< 16) | subcode
);
2197 bgp_conn_enter_close_state(conn
);
2198 bgp_schedule_packet(conn
, NULL
, PKT_SCHEDULE_CLOSE
);
2202 bgp_update_startup_delay(p
);
2208 bgp_rx_keepalive(struct bgp_conn
*conn
)
2210 struct bgp_proto
*p
= conn
->bgp
;
2212 BGP_TRACE(D_PACKETS
, "Got KEEPALIVE");
2213 bgp_start_timer(conn
->hold_timer
, conn
->hold_time
);
2215 if (conn
->state
== BS_OPENCONFIRM
)
2216 { bgp_conn_enter_established_state(conn
); return; }
2218 if (conn
->state
!= BS_ESTABLISHED
)
2219 bgp_error(conn
, 5, fsm_err_subcode
[conn
->state
], NULL
, 0);
2224 * bgp_rx_packet - handle a received packet
2225 * @conn: BGP connection
2226 * @pkt: start of the packet
2229 * bgp_rx_packet() takes a newly received packet and calls the corresponding
2230 * packet handler according to the packet type.
2233 bgp_rx_packet(struct bgp_conn
*conn
, byte
*pkt
, uint len
)
2235 byte type
= pkt
[18];
2237 DBG("BGP: Got packet %02x (%d bytes)\n", type
, len
);
2239 if (conn
->bgp
->p
.mrtdump
& MD_MESSAGES
)
2240 mrt_dump_bgp_packet(conn
, pkt
, len
);
2244 case PKT_OPEN
: return bgp_rx_open(conn
, pkt
, len
);
2245 case PKT_UPDATE
: return bgp_rx_update(conn
, pkt
, len
);
2246 case PKT_NOTIFICATION
: return bgp_rx_notification(conn
, pkt
, len
);
2247 case PKT_KEEPALIVE
: return bgp_rx_keepalive(conn
);
2248 case PKT_ROUTE_REFRESH
: return bgp_rx_route_refresh(conn
, pkt
, len
);
2249 default: bgp_error(conn
, 1, 3, pkt
+18, 1);
2254 * bgp_rx - handle received data
2256 * @size: amount of data received
2258 * bgp_rx() is called by the socket layer whenever new data arrive from
2259 * the underlying TCP connection. It assembles the data fragments to packets,
2260 * checks their headers and framing and passes complete packets to
2264 bgp_rx(sock
*sk
, uint size
)
2266 struct bgp_conn
*conn
= sk
->data
;
2267 byte
*pkt_start
= sk
->rbuf
;
2268 byte
*end
= pkt_start
+ size
;
2271 DBG("BGP: RX hook: Got %d bytes\n", size
);
2272 while (end
>= pkt_start
+ BGP_HEADER_LENGTH
)
2274 if ((conn
->state
== BS_CLOSE
) || (conn
->sk
!= sk
))
2277 if (pkt_start
[i
] != 0xff)
2279 bgp_error(conn
, 1, 1, NULL
, 0);
2282 len
= get_u16(pkt_start
+16);
2283 if ((len
< BGP_HEADER_LENGTH
) || (len
> bgp_max_packet_length(conn
)))
2285 bgp_error(conn
, 1, 2, pkt_start
+16, 2);
2288 if (end
< pkt_start
+ len
)
2290 bgp_rx_packet(conn
, pkt_start
, len
);
2293 if (pkt_start
!= sk
->rbuf
)
2295 memmove(sk
->rbuf
, pkt_start
, end
- pkt_start
);
2296 sk
->rpos
= sk
->rbuf
+ (end
- pkt_start
);