1 From foo@baz Thu Dec 14 11:45:40 CET 2017
2 From: Willem de Bruijn <willemb@google.com>
3 Date: Tue, 21 Nov 2017 10:22:25 -0500
4 Subject: net: accept UFO datagrams from tuntap and packet
6 From: Willem de Bruijn <willemb@google.com>
9 [ Upstream commit 0c19f846d582af919db66a5914a0189f9f92c936 ]
11 Tuntap and similar devices can inject GSO packets. Accept type
12 VIRTIO_NET_HDR_GSO_UDP, even though the kernel no longer generates UFO natively.
14 Processes are expected to use feature negotiation such as TUNSETOFFLOAD
15 to detect supported offload types and refrain from injecting other
16 packets. This process breaks down with live migration: guest kernels
17 do not renegotiate flags, so destination hosts need to expose all
18 features that the source host does.
20 Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677.
21 To simplify verification, this patch introduces nearly(*) no new code.
22 It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP
23 insertion and software UFO segmentation.
25 It does not reinstate protocol stack support, hardware offload
26 (NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception
27 of VIRTIO_NET_HDR_GSO_UDP packets in tuntap.
29 To support SKB_GSO_UDP reappearing in the stack, also reinstate
30 logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD
31 by squashing in commit 939912216fa8 ("net: skb_needs_check() removes
32 CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1
33 ("net: avoid skb_warn_bad_offload false positives on UFO").
35 (*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id,
36 ipv6_proxy_select_ident is changed to return a __be32 and this is
37 assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted
38 at the end of the enum to minimize code churn.
41 Booted a v4.13 guest kernel with QEMU. On a host kernel before this
42 patch `ethtool -k eth0` shows UFO disabled. After the patch, it is
43 enabled, same as on a v4.13 host kernel.
45 A UFO packet sent from the guest appears on the tap device:
51 dd if=/dev/zero of=payload.txt bs=1 count=2000
52 nc -u 192.16.1.1 8000 < payload.txt
54 Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds,
55 packets arriving fragmented:
57 ./with_tap_pair.sh ./tap_send_ufo tap0 tap1
58 (from https://github.com/wdebruij/kerneltools/tree/master/tests)
62 - simplified set_offload change (review comment)
63 - documented test procedure
65 Link: http://lkml.kernel.org/r/<CAF=yD-LuUeDuL9YWPJD9ykOZ0QCjNeznPDr6whqZ9NGMNF12Mw@mail.gmail.com>
66 Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.")
67 Reported-by: Michal Kubecek <mkubecek@suse.cz>
68 Signed-off-by: Willem de Bruijn <willemb@google.com>
69 Acked-by: Jason Wang <jasowang@redhat.com>
70 Signed-off-by: David S. Miller <davem@davemloft.net>
71 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
75 include/linux/netdev_features.h | 4 +
76 include/linux/netdevice.h | 1
77 include/linux/skbuff.h | 2
78 include/linux/virtio_net.h | 5 +-
79 include/net/ipv6.h | 2
81 net/ipv4/af_inet.c | 12 ++++-
82 net/ipv4/udp_offload.c | 49 +++++++++++++++++++++--
83 net/ipv6/output_core.c | 6 +-
84 net/ipv6/udp_offload.c | 85 ++++++++++++++++++++++++++++++++++++++--
85 net/openvswitch/datapath.c | 14 ++++++
86 net/openvswitch/flow.c | 6 ++
87 net/sched/act_csum.c | 6 ++
88 15 files changed, 181 insertions(+), 18 deletions(-)
90 --- a/drivers/net/tap.c
91 +++ b/drivers/net/tap.c
92 @@ -1080,7 +1080,7 @@ static long tap_ioctl(struct file *file,
94 /* let the user check for future flags */
95 if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
97 + TUN_F_TSO_ECN | TUN_F_UFO))
101 --- a/drivers/net/tun.c
102 +++ b/drivers/net/tun.c
103 @@ -2157,6 +2157,8 @@ static int set_offload(struct tun_struct
104 features |= NETIF_F_TSO6;
105 arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
111 /* This gives the user a way to test for new features in future by
112 --- a/include/linux/netdev_features.h
113 +++ b/include/linux/netdev_features.h
114 @@ -54,8 +54,9 @@ enum {
115 NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
116 NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */
117 NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */
118 + NETIF_F_GSO_UDP_BIT, /* ... UFO, deprecated except tuntap */
119 /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */
120 - NETIF_F_GSO_ESP_BIT,
121 + NETIF_F_GSO_UDP_BIT,
123 NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */
124 NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */
125 @@ -132,6 +133,7 @@ enum {
126 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
127 #define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP)
128 #define NETIF_F_GSO_ESP __NETIF_F(GSO_ESP)
129 +#define NETIF_F_GSO_UDP __NETIF_F(GSO_UDP)
130 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
131 #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX)
132 #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX)
133 --- a/include/linux/netdevice.h
134 +++ b/include/linux/netdevice.h
135 @@ -4101,6 +4101,7 @@ static inline bool net_gso_ok(netdev_fea
136 BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
137 BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
138 BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
139 + BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
141 return (features & feature) == feature;
143 --- a/include/linux/skbuff.h
144 +++ b/include/linux/skbuff.h
145 @@ -569,6 +569,8 @@ enum {
146 SKB_GSO_SCTP = 1 << 14,
148 SKB_GSO_ESP = 1 << 15,
150 + SKB_GSO_UDP = 1 << 16,
153 #if BITS_PER_LONG > 32
154 --- a/include/linux/virtio_net.h
155 +++ b/include/linux/virtio_net.h
156 @@ -9,7 +9,7 @@ static inline int virtio_net_hdr_to_skb(
157 const struct virtio_net_hdr *hdr,
160 - unsigned short gso_type = 0;
161 + unsigned int gso_type = 0;
163 if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
164 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
165 @@ -19,6 +19,9 @@ static inline int virtio_net_hdr_to_skb(
166 case VIRTIO_NET_HDR_GSO_TCPV6:
167 gso_type = SKB_GSO_TCPV6;
169 + case VIRTIO_NET_HDR_GSO_UDP:
170 + gso_type = SKB_GSO_UDP;
175 --- a/include/net/ipv6.h
176 +++ b/include/net/ipv6.h
177 @@ -727,7 +727,7 @@ static inline int ipv6_addr_diff(const s
178 __be32 ipv6_select_ident(struct net *net,
179 const struct in6_addr *daddr,
180 const struct in6_addr *saddr);
181 -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);
182 +__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);
184 int ip6_dst_hoplimit(struct dst_entry *dst);
188 @@ -2735,7 +2735,8 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
189 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
192 - return skb->ip_summed != CHECKSUM_PARTIAL;
193 + return skb->ip_summed != CHECKSUM_PARTIAL &&
194 + skb->ip_summed != CHECKSUM_UNNECESSARY;
196 return skb->ip_summed == CHECKSUM_NONE;
198 --- a/net/ipv4/af_inet.c
199 +++ b/net/ipv4/af_inet.c
200 @@ -1221,9 +1221,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
201 struct sk_buff *inet_gso_segment(struct sk_buff *skb,
202 netdev_features_t features)
204 - bool fixedid = false, gso_partial, encap;
205 + bool udpfrag = false, fixedid = false, gso_partial, encap;
206 struct sk_buff *segs = ERR_PTR(-EINVAL);
207 const struct net_offload *ops;
208 + unsigned int offset = 0;
212 @@ -1258,6 +1259,7 @@ struct sk_buff *inet_gso_segment(struct
213 segs = ERR_PTR(-EPROTONOSUPPORT);
215 if (!skb->encapsulation || encap) {
216 + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
217 fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
219 /* fixed ID is invalid if DF bit is not set */
220 @@ -1277,7 +1279,13 @@ struct sk_buff *inet_gso_segment(struct
223 iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
224 - if (skb_is_gso(skb)) {
226 + iph->frag_off = htons(offset >> 3);
228 + iph->frag_off |= htons(IP_MF);
229 + offset += skb->len - nhoff - ihl;
230 + tot_len = skb->len - nhoff;
231 + } else if (skb_is_gso(skb)) {
234 id += skb_shinfo(skb)->gso_segs;
235 --- a/net/ipv4/udp_offload.c
236 +++ b/net/ipv4/udp_offload.c
237 @@ -187,16 +187,57 @@ out_unlock:
239 EXPORT_SYMBOL(skb_udp_tunnel_segment);
241 -static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb,
242 - netdev_features_t features)
243 +static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
244 + netdev_features_t features)
246 struct sk_buff *segs = ERR_PTR(-EINVAL);
252 if (skb->encapsulation &&
253 (skb_shinfo(skb)->gso_type &
254 - (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)))
255 + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
256 segs = skb_udp_tunnel_segment(skb, features, false);
260 + if (!pskb_may_pull(skb, sizeof(struct udphdr)))
263 + mss = skb_shinfo(skb)->gso_size;
264 + if (unlikely(skb->len <= mss))
267 + /* Do software UFO. Complete and fill in the UDP checksum as
268 + * HW cannot do checksum of UDP packets sent as multiple
276 + csum = skb_checksum(skb, 0, skb->len, 0);
277 + uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
278 + if (uh->check == 0)
279 + uh->check = CSUM_MANGLED_0;
281 + skb->ip_summed = CHECKSUM_UNNECESSARY;
283 + /* If there is no outer header we can fake a checksum offload
284 + * due to the fact that we have already done the checksum in
285 + * software prior to segmenting the frame.
287 + if (!skb->encap_hdr_csum)
288 + features |= NETIF_F_HW_CSUM;
290 + /* Fragment the skb. IP headers of the fragments are updated in
291 + * inet_gso_segment()
293 + segs = skb_segment(skb, features);
298 @@ -330,7 +371,7 @@ static int udp4_gro_complete(struct sk_b
300 static const struct net_offload udpv4_offload = {
302 - .gso_segment = udp4_tunnel_segment,
303 + .gso_segment = udp4_ufo_fragment,
304 .gro_receive = udp4_gro_receive,
305 .gro_complete = udp4_gro_complete,
307 --- a/net/ipv6/output_core.c
308 +++ b/net/ipv6/output_core.c
309 @@ -39,7 +39,7 @@ static u32 __ipv6_select_ident(struct ne
311 * The network header must be set before calling this.
313 -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
314 +__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
316 static u32 ip6_proxy_idents_hashrnd __read_mostly;
317 struct in6_addr buf[2];
318 @@ -51,14 +51,14 @@ void ipv6_proxy_select_ident(struct net
319 offsetof(struct ipv6hdr, saddr),
325 net_get_random_once(&ip6_proxy_idents_hashrnd,
326 sizeof(ip6_proxy_idents_hashrnd));
328 id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd,
329 &addrs[1], &addrs[0]);
330 - skb_shinfo(skb)->ip6_frag_id = htonl(id);
333 EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
335 --- a/net/ipv6/udp_offload.c
336 +++ b/net/ipv6/udp_offload.c
338 #include <net/ip6_checksum.h>
339 #include "ip6_offload.h"
341 -static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb,
342 - netdev_features_t features)
343 +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
344 + netdev_features_t features)
346 struct sk_buff *segs = ERR_PTR(-EINVAL);
348 + unsigned int unfrag_ip6hlen, unfrag_len;
349 + struct frag_hdr *fptr;
350 + u8 *packet_start, *prevhdr;
352 + u8 frag_hdr_sz = sizeof(struct frag_hdr);
357 + mss = skb_shinfo(skb)->gso_size;
358 + if (unlikely(skb->len <= mss))
361 if (skb->encapsulation && skb_shinfo(skb)->gso_type &
362 (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
363 segs = skb_udp_tunnel_segment(skb, features, true);
365 + const struct ipv6hdr *ipv6h;
368 + if (!pskb_may_pull(skb, sizeof(struct udphdr)))
371 + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
372 + * do checksum of UDP packets sent as multiple IP fragments.
376 + ipv6h = ipv6_hdr(skb);
379 + csum = skb_checksum(skb, 0, skb->len, 0);
380 + uh->check = udp_v6_check(skb->len, &ipv6h->saddr,
381 + &ipv6h->daddr, csum);
382 + if (uh->check == 0)
383 + uh->check = CSUM_MANGLED_0;
385 + skb->ip_summed = CHECKSUM_UNNECESSARY;
387 + /* If there is no outer header we can fake a checksum offload
388 + * due to the fact that we have already done the checksum in
389 + * software prior to segmenting the frame.
391 + if (!skb->encap_hdr_csum)
392 + features |= NETIF_F_HW_CSUM;
394 + /* Check if there is enough headroom to insert fragment header. */
395 + tnl_hlen = skb_tnl_header_len(skb);
396 + if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) {
397 + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
401 + /* Find the unfragmentable header and shift it left by frag_hdr_sz
402 + * bytes to insert fragment header.
404 + err = ip6_find_1stfragopt(skb, &prevhdr);
406 + return ERR_PTR(err);
407 + unfrag_ip6hlen = err;
408 + nexthdr = *prevhdr;
409 + *prevhdr = NEXTHDR_FRAGMENT;
410 + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
411 + unfrag_ip6hlen + tnl_hlen;
412 + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
413 + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
415 + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
416 + skb->mac_header -= frag_hdr_sz;
417 + skb->network_header -= frag_hdr_sz;
419 + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
420 + fptr->nexthdr = nexthdr;
421 + fptr->reserved = 0;
422 + fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb);
424 + /* Fragment the skb. ipv6 header and the remaining fields of the
425 + * fragment header are updated in ipv6_gso_segment()
427 + segs = skb_segment(skb, features);
434 @@ -75,7 +154,7 @@ static int udp6_gro_complete(struct sk_b
436 static const struct net_offload udpv6_offload = {
438 - .gso_segment = udp6_tunnel_segment,
439 + .gso_segment = udp6_ufo_fragment,
440 .gro_receive = udp6_gro_receive,
441 .gro_complete = udp6_gro_complete,
443 --- a/net/openvswitch/datapath.c
444 +++ b/net/openvswitch/datapath.c
445 @@ -335,6 +335,8 @@ static int queue_gso_packets(struct data
446 const struct dp_upcall_info *upcall_info,
449 + unsigned short gso_type = skb_shinfo(skb)->gso_type;
450 + struct sw_flow_key later_key;
451 struct sk_buff *segs, *nskb;
454 @@ -345,9 +347,21 @@ static int queue_gso_packets(struct data
458 + if (gso_type & SKB_GSO_UDP) {
459 + /* The initial flow key extracted by ovs_flow_key_extract()
460 + * in this case is for a first fragment, so we need to
461 + * properly mark later fragments.
464 + later_key.ip.frag = OVS_FRAG_TYPE_LATER;
467 /* Queue all of the segments. */
470 + if (gso_type & SKB_GSO_UDP && skb != segs)
473 err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
476 --- a/net/openvswitch/flow.c
477 +++ b/net/openvswitch/flow.c
478 @@ -584,7 +584,8 @@ static int key_extract(struct sk_buff *s
479 key->ip.frag = OVS_FRAG_TYPE_LATER;
482 - if (nh->frag_off & htons(IP_MF))
483 + if (nh->frag_off & htons(IP_MF) ||
484 + skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
485 key->ip.frag = OVS_FRAG_TYPE_FIRST;
487 key->ip.frag = OVS_FRAG_TYPE_NONE;
488 @@ -700,6 +701,9 @@ static int key_extract(struct sk_buff *s
490 if (key->ip.frag == OVS_FRAG_TYPE_LATER)
492 + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
493 + key->ip.frag = OVS_FRAG_TYPE_FIRST;
495 /* Transport layer. */
496 if (key->ip.proto == NEXTHDR_TCP) {
497 if (tcphdr_ok(skb)) {
498 --- a/net/sched/act_csum.c
499 +++ b/net/sched/act_csum.c
500 @@ -229,6 +229,9 @@ static int tcf_csum_ipv4_udp(struct sk_b
501 const struct iphdr *iph;
504 + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
508 * Support both UDP and UDPLITE checksum algorithms, Don't use
509 * udph->len to get the real length without any protocol check,
510 @@ -282,6 +285,9 @@ static int tcf_csum_ipv6_udp(struct sk_b
511 const struct ipv6hdr *ip6h;
514 + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
518 * Support both UDP and UDPLITE checksum algorithms, Don't use
519 * udph->len to get the real length without any protocol check,