From: Greg Kroah-Hartman Date: Fri, 17 Jun 2016 18:18:34 +0000 (-0700) Subject: 4.4-stable patches X-Git-Tag: v3.14.73~33 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=105b4da3f5a5b3b12b740774b21c8afc273503d3;p=thirdparty%2Fkernel%2Fstable-queue.git 4.4-stable patches added patches: bpf-inode-disallow-userns-mounts.patch bpf-use-mount_nodev-not-mount_ns-to-mount-the-bpf-filesystem.patch bridge-don-t-insert-unnecessary-local-fdb-entry-on-changing-mac-address.patch geneve-relax-mtu-constraints.patch ipv6-skip-xfrm-lookup-if-dst_entry-in-socket-cache-is-valid.patch l2tp-fix-configuration-passed-to-setup_udp_tunnel_sock.patch netlink-fix-dump-skb-leak-double-free.patch sfc-on-mc-reset-clear-pio-buffer-linkage-in-txqs.patch switchdev-pass-pointer-to-fib_info-instead-of-copy.patch tcp-record-tlp-and-er-timer-stats-in-v6-stats.patch team-don-t-call-netdev_change_features-under-team-lock.patch tipc-check-nl-sock-before-parsing-nested-attributes.patch tipc-fix-nametable-publication-field-in-nl-compat.patch tuntap-correctly-wake-up-process-during-uninit.patch uapi-glibc-compat-fix-compilation-when-__use_misc-in-glibc.patch udp-prevent-skbs-lingering-in-tunnel-socket-queues.patch vxlan-accept-user-specified-mtu-value-when-create-new-vxlan-link.patch vxlan-gre-geneve-set-a-large-mtu-on-ovs-created-tunnel-devices.patch vxlan-relax-mtu-constraints.patch --- diff --git a/queue-4.4/bpf-inode-disallow-userns-mounts.patch b/queue-4.4/bpf-inode-disallow-userns-mounts.patch new file mode 100644 index 00000000000..9b804a1c7fe --- /dev/null +++ b/queue-4.4/bpf-inode-disallow-userns-mounts.patch @@ -0,0 +1,49 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Daniel Borkmann +Date: Sun, 22 May 2016 23:16:18 +0200 +Subject: bpf, inode: disallow userns mounts + +From: Daniel Borkmann + +[ Upstream commit 612bacad78ba6d0a91166fc4487af114bac172a8 ] + +Follow-up to commit e27f4a942a0e ("bpf: Use mount_nodev not mount_ns +to mount the bpf filesystem"), which removes the 
FS_USERNS_MOUNT flag. + +The original idea was to have a per mountns instance instead of a +single global fs instance, but that didn't work out and we had to +switch to mount_nodev() model. The intent of that middle ground was +that we avoid users who don't play nice to create endless instances +of bpf fs which are difficult to control and discover from an admin +point of view, but at the same time it would have allowed us to be +more flexible with regard to namespaces. + +Therefore, since we now did the switch to mount_nodev() as a fix +where individual instances are created, we also need to remove userns +mount flag along with it to avoid running into mentioned situation. +I don't expect any breakage at this early point in time with removing +the flag and we can revisit this later should the requirement for +this come up with future users. This and commit e27f4a942a0e have +been split to facilitate tracking should any of them run into the +unlikely case of causing a regression. + +Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") +Signed-off-by: Daniel Borkmann +Acked-by: Hannes Frederic Sowa +Acked-by: Alexei Starovoitov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/inode.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/kernel/bpf/inode.c ++++ b/kernel/bpf/inode.c +@@ -366,7 +366,6 @@ static struct file_system_type bpf_fs_ty + .name = "bpf", + .mount = bpf_mount, + .kill_sb = kill_litter_super, +- .fs_flags = FS_USERNS_MOUNT, + }; + + MODULE_ALIAS_FS("bpf"); diff --git a/queue-4.4/bpf-use-mount_nodev-not-mount_ns-to-mount-the-bpf-filesystem.patch b/queue-4.4/bpf-use-mount_nodev-not-mount_ns-to-mount-the-bpf-filesystem.patch new file mode 100644 index 00000000000..708df8ba3ef --- /dev/null +++ b/queue-4.4/bpf-use-mount_nodev-not-mount_ns-to-mount-the-bpf-filesystem.patch @@ -0,0 +1,52 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: "Eric W. 
Biederman" +Date: Fri, 20 May 2016 17:22:48 -0500 +Subject: bpf: Use mount_nodev not mount_ns to mount the bpf filesystem + +From: "Eric W. Biederman" + +[ Upstream commit e27f4a942a0ee4b84567a3c6cfa84f273e55cbb7 ] + +While reviewing the filesystems that set FS_USERNS_MOUNT I spotted the +bpf filesystem. Looking at the code I saw a broken usage of mount_ns +with current->nsproxy->mnt_ns. As the code does not acquire a +reference to the mount namespace it can not possibly be correct to +store the mount namespace on the superblock as it does. + +Replace mount_ns with mount_nodev so that each mount of the bpf +filesystem returns a distinct instance, and the code is not buggy. + +In discussion with Hannes Frederic Sowa it was reported that the use +of mount_ns was an attempt to have one bpf instance per mount +namespace, in an attempt to keep resources that pin resources from +hiding. That intent simply does not work, the vfs is not built to +allow that kind of behavior. Which means that the bpf filesystem +really is buggy both semantically and in its implementation as it does +not nor can it implement the original intent. + +This change is userspace visible, but my experience with similar +filesystems leads me to believe nothing will break with a model of each +mount of the bpf filesystem is distinct from all others. + +Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") +Cc: Hannes Frederic Sowa +Acked-by: Daniel Borkmann +Signed-off-by: "Eric W. Biederman" +Acked-by: Hannes Frederic Sowa +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/bpf/inode.c ++++ b/kernel/bpf/inode.c +@@ -358,7 +358,7 @@ static int bpf_fill_super(struct super_b + static struct dentry *bpf_mount(struct file_system_type *type, int flags, + const char *dev_name, void *data) + { +- return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); ++ return mount_nodev(type, flags, data, bpf_fill_super); + } + + static struct file_system_type bpf_fs_type = { diff --git a/queue-4.4/bridge-don-t-insert-unnecessary-local-fdb-entry-on-changing-mac-address.patch b/queue-4.4/bridge-don-t-insert-unnecessary-local-fdb-entry-on-changing-mac-address.patch new file mode 100644 index 00000000000..79b1dab047a --- /dev/null +++ b/queue-4.4/bridge-don-t-insert-unnecessary-local-fdb-entry-on-changing-mac-address.patch @@ -0,0 +1,34 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Toshiaki Makita +Date: Tue, 7 Jun 2016 19:14:17 +0900 +Subject: bridge: Don't insert unnecessary local fdb entry on changing mac address + +From: Toshiaki Makita + +[ Upstream commit 0b148def403153a4d1565f1640356cb78ce5109f ] + +The missing br_vlan_should_use() test caused creation of an unneeded +local fdb entry on changing mac address of a bridge device when there is +a vlan which is configured on a bridge port but not on the bridge +device. + +Fixes: 2594e9064a57 ("bridge: vlan: add per-vlan struct and move to rhashtables") +Signed-off-by: Toshiaki Makita +Acked-by: Nikolay Aleksandrov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/bridge/br_fdb.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/bridge/br_fdb.c ++++ b/net/bridge/br_fdb.c +@@ -278,6 +278,8 @@ void br_fdb_change_mac_address(struct ne + * change from under us. 
+ */ + list_for_each_entry(v, &vg->vlan_list, vlist) { ++ if (!br_vlan_should_use(v)) ++ continue; + f = __br_fdb_get(br, br->dev->dev_addr, v->vid); + if (f && f->is_local && !f->dst) + fdb_delete_local(br, NULL, f); diff --git a/queue-4.4/geneve-relax-mtu-constraints.patch b/queue-4.4/geneve-relax-mtu-constraints.patch new file mode 100644 index 00000000000..07e7d1150b2 --- /dev/null +++ b/queue-4.4/geneve-relax-mtu-constraints.patch @@ -0,0 +1,54 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: David Wragg +Date: Fri, 3 Jun 2016 18:58:14 -0400 +Subject: geneve: Relax MTU constraints + +From: David Wragg + +[ Upstream commit 55e5bfb53cff286c1c1ff49f51325dc15c7fea63 ] + +Allow the MTU of geneve devices to be set to large values, in order to +exploit underlying networks with larger frame sizes. + +GENEVE does not have a fixed encapsulation overhead (an openvswitch +rule can add variable length options), so there is no relevant maximum +MTU to enforce. A maximum of IP_MAX_MTU is used instead. +Encapsulated packets that are too big for the underlying network will +get dropped on the floor. + +Signed-off-by: David Wragg +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/geneve.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/drivers/net/geneve.c ++++ b/drivers/net/geneve.c +@@ -994,6 +994,17 @@ static netdev_tx_t geneve_xmit(struct sk + return geneve_xmit_skb(skb, dev, info); + } + ++static int geneve_change_mtu(struct net_device *dev, int new_mtu) ++{ ++ /* GENEVE overhead is not fixed, so we can't enforce a more ++ * precise max MTU. 
++ */ ++ if (new_mtu < 68 || new_mtu > IP_MAX_MTU) ++ return -EINVAL; ++ dev->mtu = new_mtu; ++ return 0; ++} ++ + static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) + { + struct ip_tunnel_info *info = skb_tunnel_info(skb); +@@ -1038,7 +1049,7 @@ static const struct net_device_ops genev + .ndo_stop = geneve_stop, + .ndo_start_xmit = geneve_xmit, + .ndo_get_stats64 = ip_tunnel_get_stats64, +- .ndo_change_mtu = eth_change_mtu, ++ .ndo_change_mtu = geneve_change_mtu, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_mac_address = eth_mac_addr, + .ndo_fill_metadata_dst = geneve_fill_metadata_dst, diff --git a/queue-4.4/ipv6-skip-xfrm-lookup-if-dst_entry-in-socket-cache-is-valid.patch b/queue-4.4/ipv6-skip-xfrm-lookup-if-dst_entry-in-socket-cache-is-valid.patch new file mode 100644 index 00000000000..2425f7d1acb --- /dev/null +++ b/queue-4.4/ipv6-skip-xfrm-lookup-if-dst_entry-in-socket-cache-is-valid.patch @@ -0,0 +1,102 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Jakub Sitnicki +Date: Wed, 8 Jun 2016 15:13:34 +0200 +Subject: ipv6: Skip XFRM lookup if dst_entry in socket cache is valid + +From: Jakub Sitnicki + +[ Upstream commit 00bc0ef5880dc7b82f9c320dead4afaad48e47be ] + +At present we perform an xfrm_lookup() for each UDPv6 message we +send. The lookup involves querying the flow cache (flow_cache_lookup) +and, in case of a cache miss, creating an XFRM bundle. + +If we miss the flow cache, we can end up creating a new bundle and +deriving the path MTU (xfrm_init_pmtu) from on an already transformed +dst_entry, which we pass from the socket cache (sk->sk_dst_cache) down +to xfrm_lookup(). This can happen only if we're caching the dst_entry +in the socket, that is when we're using a connected UDP socket. + +To put it another way, the path MTU shrinks each time we miss the flow +cache, which later on leads to incorrectly fragmented payload. 
It can +be observed with ESPv6 in transport mode: + + 1) Set up a transformation and lower the MTU to trigger fragmentation + # ip xfrm policy add dir out src ::1 dst ::1 \ + tmpl src ::1 dst ::1 proto esp spi 1 + # ip xfrm state add src ::1 dst ::1 \ + proto esp spi 1 enc 'aes' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b + # ip link set dev lo mtu 1500 + + 2) Monitor the packet flow and set up a UDP sink + # tcpdump -ni lo -ttt & + # socat udp6-listen:12345,fork /dev/null & + + 3) Send a datagram that needs fragmentation with a connected socket + # perl -e 'print "@" x 1470' | socat - udp6:[::1]:12345 + 2016/06/07 18:52:52 socat[724] E read(3, 0x555bb3d5ba00, 8192): Protocol error + 00:00:00.000000 IP6 ::1 > ::1: frag (0|1448) ESP(spi=0x00000001,seq=0x2), length 1448 + 00:00:00.000014 IP6 ::1 > ::1: frag (1448|32) + 00:00:00.000050 IP6 ::1 > ::1: ESP(spi=0x00000001,seq=0x3), length 1272 + (^ ICMPv6 Parameter Problem) + 00:00:00.000022 IP6 ::1 > ::1: ESP(spi=0x00000001,seq=0x5), length 136 + + 4) Compare it to a non-connected socket + # perl -e 'print "@" x 1500' | socat - udp6-sendto:[::1]:12345 + 00:00:40.535488 IP6 ::1 > ::1: frag (0|1448) ESP(spi=0x00000001,seq=0x6), length 1448 + 00:00:00.000010 IP6 ::1 > ::1: frag (1448|64) + +What happens in step (3) is: + + 1) when connecting the socket in __ip6_datagram_connect(), we + perform an XFRM lookup, miss the flow cache, create an XFRM + bundle, and cache the destination, + + 2) afterwards, when sending the datagram, we perform an XFRM lookup, + again, miss the flow cache (due to mismatch of flowi6_iif and + flowi6_oif, which is an issue of its own), and recreate an XFRM + bundle based on the cached (and already transformed) destination. + +To prevent the recreation of an XFRM bundle, avoid an XFRM lookup +altogether whenever we already have a destination entry cached in the +socket. This prevents the path MTU shrinkage and brings us on par with +UDPv4.
+ +The fix also benefits connected PINGv6 sockets, another user of +ip6_sk_dst_lookup_flow(), who also suffer messages being transformed +twice. + +Joint work with Hannes Frederic Sowa. + +Reported-by: Jan Tluka +Signed-off-by: Jakub Sitnicki +Acked-by: Hannes Frederic Sowa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_output.c | 11 +++-------- + 1 file changed, 3 insertions(+), 8 deletions(-) + +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -1072,17 +1072,12 @@ struct dst_entry *ip6_sk_dst_lookup_flow + const struct in6_addr *final_dst) + { + struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); +- int err; + + dst = ip6_sk_dst_check(sk, dst, fl6); ++ if (!dst) ++ dst = ip6_dst_lookup_flow(sk, fl6, final_dst); + +- err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); +- if (err) +- return ERR_PTR(err); +- if (final_dst) +- fl6->daddr = *final_dst; +- +- return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); ++ return dst; + } + EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); + diff --git a/queue-4.4/l2tp-fix-configuration-passed-to-setup_udp_tunnel_sock.patch b/queue-4.4/l2tp-fix-configuration-passed-to-setup_udp_tunnel_sock.patch new file mode 100644 index 00000000000..0f202544033 --- /dev/null +++ b/queue-4.4/l2tp-fix-configuration-passed-to-setup_udp_tunnel_sock.patch @@ -0,0 +1,91 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Guillaume Nault +Date: Wed, 8 Jun 2016 12:59:17 +0200 +Subject: l2tp: fix configuration passed to setup_udp_tunnel_sock() + +From: Guillaume Nault + +[ Upstream commit a5c5e2da8551eb69e5d5d09d51d526140b5db9fb ] + +Unused fields of udp_cfg must be all zeros. Otherwise +setup_udp_tunnel_sock() fills ->gro_receive and ->gro_complete +callbacks with garbage, eventually resulting in panic when used by +udp_gro_receive(). 
+ +[ 72.694123] BUG: unable to handle kernel paging request at ffff880033f87d78 +[ 72.695518] IP: [] 0xffff880033f87d78 +[ 72.696530] PGD 26e2067 PUD 26e3067 PMD 342ed063 PTE 8000000033f87163 +[ 72.696530] Oops: 0011 [#1] SMP KASAN +[ 72.696530] Modules linked in: l2tp_ppp l2tp_netlink l2tp_core ip6_udp_tunnel udp_tunnel pptp gre pppox ppp_generic slhc crc32c_intel ghash_clmulni_intel jitterentropy_rng sha256_generic hmac drbg ansi_cprng aesni_intel evdev aes_x86_64 ablk_helper cryptd lrw gf128mul glue_helper serio_raw acpi_cpufreq button proc\ +essor ext4 crc16 jbd2 mbcache virtio_blk virtio_net virtio_pci virtio_ring virtio +[ 72.696530] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.7.0-rc1 #1 +[ 72.696530] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Debian-1.8.2-1 04/01/2014 +[ 72.696530] task: ffff880035b59700 ti: ffff880035b70000 task.ti: ffff880035b70000 +[ 72.696530] RIP: 0010:[] [] 0xffff880033f87d78 +[ 72.696530] RSP: 0018:ffff880035f87bc0 EFLAGS: 00010246 +[ 72.696530] RAX: ffffed000698f996 RBX: ffff88003326b840 RCX: ffffffff814cc823 +[ 72.696530] RDX: ffff88003326b840 RSI: ffff880033e48038 RDI: ffff880034c7c780 +[ 72.696530] RBP: ffff880035f87c18 R08: 000000000000a506 R09: 0000000000000000 +[ 72.696530] R10: ffff880035f87b38 R11: ffff880034b9344d R12: 00000000ebfea715 +[ 72.696530] R13: 0000000000000000 R14: ffff880034c7c780 R15: 0000000000000000 +[ 72.696530] FS: 0000000000000000(0000) GS:ffff880035f80000(0000) knlGS:0000000000000000 +[ 72.696530] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 72.696530] CR2: ffff880033f87d78 CR3: 0000000033c98000 CR4: 00000000000406a0 +[ 72.696530] Stack: +[ 72.696530] ffffffff814cc834 ffff880034b93468 0000001481416818 ffff88003326b874 +[ 72.696530] ffff880034c7ccb0 ffff880033e48038 ffff88003326b840 ffff880034b93462 +[ 72.696530] ffff88003326b88a ffff88003326b88c ffff880034b93468 ffff880035f87c70 +[ 72.696530] Call Trace: +[ 72.696530] +[ 72.696530] [] ? 
udp_gro_receive+0x1c6/0x1f9 +[ 72.696530] [] udp4_gro_receive+0x2b5/0x310 +[ 72.696530] [] inet_gro_receive+0x4a3/0x4cd +[ 72.696530] [] dev_gro_receive+0x584/0x7a3 +[ 72.696530] [] ? __lock_is_held+0x29/0x64 +[ 72.696530] [] napi_gro_receive+0x124/0x21d +[ 72.696530] [] virtnet_receive+0x8df/0x8f6 [virtio_net] +[ 72.696530] [] virtnet_poll+0x1d/0x8d [virtio_net] +[ 72.696530] [] net_rx_action+0x15b/0x3b9 +[ 72.696530] [] __do_softirq+0x216/0x546 +[ 72.696530] [] irq_exit+0x49/0xb6 +[ 72.696530] [] do_IRQ+0xe2/0xfa +[ 72.696530] [] common_interrupt+0x89/0x89 +[ 72.696530] +[ 72.696530] [] ? trace_hardirqs_on_caller+0x229/0x270 +[ 72.696530] [] ? default_idle+0x1c/0x2d +[ 72.696530] [] ? default_idle+0x1a/0x2d +[ 72.696530] [] arch_cpu_idle+0xa/0xc +[ 72.696530] [] default_idle_call+0x1a/0x1c +[ 72.696530] [] cpu_startup_entry+0x15b/0x20f +[ 72.696530] [] start_secondary+0x12c/0x133 +[ 72.696530] Code: ff ff ff ff ff ff ff ff ff ff 7f ff ff ff ff ff ff ff 7f 00 7e f8 33 00 88 ff ff 6d 61 58 81 ff ff ff ff 5e de 0a 81 ff ff ff ff <00> 5c e2 34 00 88 ff ff 00 00 00 00 00 00 00 00 00 00 00 00 00 +[ 72.696530] RIP [] 0xffff880033f87d78 +[ 72.696530] RSP +[ 72.696530] CR2: ffff880033f87d78 +[ 72.696530] ---[ end trace ad7758b9a1dccf99 ]--- +[ 72.696530] Kernel panic - not syncing: Fatal exception in interrupt +[ 72.696530] Kernel Offset: disabled +[ 72.696530] ---[ end Kernel panic - not syncing: Fatal exception in interrupt + +v2: use empty initialiser instead of "{ NULL }" to avoid relying on + first field's type. + +Fixes: 38fd2af24fcf ("udp: Add socket based GRO and config") +Signed-off-by: Guillaume Nault +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/l2tp/l2tp_core.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/l2tp/l2tp_core.c ++++ b/net/l2tp/l2tp_core.c +@@ -1581,7 +1581,7 @@ int l2tp_tunnel_create(struct net *net, + /* Mark socket as an encapsulation socket. 
See net/ipv4/udp.c */ + tunnel->encap = encap; + if (encap == L2TP_ENCAPTYPE_UDP) { +- struct udp_tunnel_sock_cfg udp_cfg; ++ struct udp_tunnel_sock_cfg udp_cfg = { }; + + udp_cfg.sk_user_data = tunnel; + udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP; diff --git a/queue-4.4/netlink-fix-dump-skb-leak-double-free.patch b/queue-4.4/netlink-fix-dump-skb-leak-double-free.patch new file mode 100644 index 00000000000..44254918f67 --- /dev/null +++ b/queue-4.4/netlink-fix-dump-skb-leak-double-free.patch @@ -0,0 +1,50 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Herbert Xu +Date: Mon, 16 May 2016 17:28:16 +0800 +Subject: netlink: Fix dump skb leak/double free + +From: Herbert Xu + +[ Upstream commit 92964c79b357efd980812c4de5c1fd2ec8bb5520 ] + +When we free cb->skb after a dump, we do it after releasing the +lock. This means that a new dump could have started in the time +being and we'll end up freeing their skb instead of ours. + +This patch saves the skb and module before we unlock so we free +the right memory. + +Fixes: 16b304f3404f ("netlink: Eliminate kmalloc in netlink dump operation.") +Reported-by: Baozeng Ding +Signed-off-by: Herbert Xu +Acked-by: Cong Wang +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/netlink/af_netlink.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -2784,6 +2784,7 @@ static int netlink_dump(struct sock *sk) + struct netlink_callback *cb; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; ++ struct module *module; + int len, err = -ENOBUFS; + int alloc_min_size; + int alloc_size; +@@ -2863,9 +2864,11 @@ static int netlink_dump(struct sock *sk) + cb->done(cb); + + nlk->cb_running = false; ++ module = cb->module; ++ skb = cb->skb; + mutex_unlock(nlk->cb_mutex); +- module_put(cb->module); +- consume_skb(cb->skb); ++ module_put(module); ++ consume_skb(skb); + return 0; + + errout_skb: diff --git a/queue-4.4/series b/queue-4.4/series index 6067fd370fb..c206ed31ee2 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -1,2 +1,21 @@ scsi_lib-correctly-retry-failed-zero-length-req_type_fs-commands.patch scsi-add-qemu-cd-rom-to-vpd-inquiry-blacklist.patch +tipc-check-nl-sock-before-parsing-nested-attributes.patch +netlink-fix-dump-skb-leak-double-free.patch +tipc-fix-nametable-publication-field-in-nl-compat.patch +switchdev-pass-pointer-to-fib_info-instead-of-copy.patch +tuntap-correctly-wake-up-process-during-uninit.patch +bpf-use-mount_nodev-not-mount_ns-to-mount-the-bpf-filesystem.patch +udp-prevent-skbs-lingering-in-tunnel-socket-queues.patch +uapi-glibc-compat-fix-compilation-when-__use_misc-in-glibc.patch +bpf-inode-disallow-userns-mounts.patch +sfc-on-mc-reset-clear-pio-buffer-linkage-in-txqs.patch +team-don-t-call-netdev_change_features-under-team-lock.patch +vxlan-accept-user-specified-mtu-value-when-create-new-vxlan-link.patch +tcp-record-tlp-and-er-timer-stats-in-v6-stats.patch +bridge-don-t-insert-unnecessary-local-fdb-entry-on-changing-mac-address.patch +l2tp-fix-configuration-passed-to-setup_udp_tunnel_sock.patch +ipv6-skip-xfrm-lookup-if-dst_entry-in-socket-cache-is-valid.patch 
+vxlan-relax-mtu-constraints.patch +geneve-relax-mtu-constraints.patch +vxlan-gre-geneve-set-a-large-mtu-on-ovs-created-tunnel-devices.patch diff --git a/queue-4.4/sfc-on-mc-reset-clear-pio-buffer-linkage-in-txqs.patch b/queue-4.4/sfc-on-mc-reset-clear-pio-buffer-linkage-in-txqs.patch new file mode 100644 index 00000000000..1f20b866123 --- /dev/null +++ b/queue-4.4/sfc-on-mc-reset-clear-pio-buffer-linkage-in-txqs.patch @@ -0,0 +1,59 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Edward Cree +Date: Tue, 24 May 2016 18:53:36 +0100 +Subject: sfc: on MC reset, clear PIO buffer linkage in TXQs + +From: Edward Cree + +[ Upstream commit c0795bf64cba4d1b796fdc5b74b33772841ed1bb ] + +Otherwise, if we fail to allocate new PIO buffers, our TXQs will try to +use the old ones, which aren't there any more. + +Fixes: 183233bec810 "sfc: Allocate and link PIO buffers; map them with write-combining" +Signed-off-by: Edward Cree +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/sfc/ef10.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/drivers/net/ethernet/sfc/ef10.c ++++ b/drivers/net/ethernet/sfc/ef10.c +@@ -619,6 +619,17 @@ fail: + return rc; + } + ++static void efx_ef10_forget_old_piobufs(struct efx_nic *efx) ++{ ++ struct efx_channel *channel; ++ struct efx_tx_queue *tx_queue; ++ ++ /* All our existing PIO buffers went away */ ++ efx_for_each_channel(channel, efx) ++ efx_for_each_channel_tx_queue(tx_queue, channel) ++ tx_queue->piobuf = NULL; ++} ++ + #else /* !EFX_USE_PIO */ + + static int efx_ef10_alloc_piobufs(struct efx_nic *efx, unsigned int n) +@@ -635,6 +646,10 @@ static void efx_ef10_free_piobufs(struct + { + } + ++static void efx_ef10_forget_old_piobufs(struct efx_nic *efx) ++{ ++} ++ + #endif /* EFX_USE_PIO */ + + static void efx_ef10_remove(struct efx_nic *efx) +@@ -1018,6 +1033,7 @@ static void efx_ef10_reset_mc_allocation + nic_data->must_realloc_vis = true; + nic_data->must_restore_filters = true; 
+ nic_data->must_restore_piobufs = true; ++ efx_ef10_forget_old_piobufs(efx); + nic_data->rx_rss_context = EFX_EF10_RSS_CONTEXT_INVALID; + + /* Driver-created vswitches and vports must be re-created */ diff --git a/queue-4.4/switchdev-pass-pointer-to-fib_info-instead-of-copy.patch b/queue-4.4/switchdev-pass-pointer-to-fib_info-instead-of-copy.patch new file mode 100644 index 00000000000..c2fddab92e1 --- /dev/null +++ b/queue-4.4/switchdev-pass-pointer-to-fib_info-instead-of-copy.patch @@ -0,0 +1,95 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Jiri Pirko +Date: Tue, 17 May 2016 18:58:08 +0200 +Subject: switchdev: pass pointer to fib_info instead of copy + +From: Jiri Pirko + +[ Upstream commit da4ed55165d41b1073f9a476f1c18493e9bf8c8e ] + +The problem is that fib_info->nh is [0] so the struct fib_info +allocation size depends on number of nexthops. If we just copy fib_info, +we do not copy the nexthops info and driver accesses memory which is not +ours. + +Given the fact that fib4 does not defer operations and therefore it does +not need copy, just pass the pointer down to drivers as it was done +before. + +Fixes: 850d0cbc91 ("switchdev: remove pointers from switchdev objects") +Signed-off-by: Jiri Pirko +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/rocker/rocker.c | 4 ++-- + include/net/switchdev.h | 2 +- + net/switchdev/switchdev.c | 6 ++---- + 3 files changed, 5 insertions(+), 7 deletions(-) + +--- a/drivers/net/ethernet/rocker/rocker.c ++++ b/drivers/net/ethernet/rocker/rocker.c +@@ -4475,7 +4475,7 @@ static int rocker_port_obj_add(struct ne + fib4 = SWITCHDEV_OBJ_IPV4_FIB(obj); + err = rocker_port_fib_ipv4(rocker_port, trans, + htonl(fib4->dst), fib4->dst_len, +- &fib4->fi, fib4->tb_id, 0); ++ fib4->fi, fib4->tb_id, 0); + break; + case SWITCHDEV_OBJ_ID_PORT_FDB: + err = rocker_port_fdb_add(rocker_port, trans, +@@ -4547,7 +4547,7 @@ static int rocker_port_obj_del(struct ne + fib4 = SWITCHDEV_OBJ_IPV4_FIB(obj); + err = rocker_port_fib_ipv4(rocker_port, NULL, + htonl(fib4->dst), fib4->dst_len, +- &fib4->fi, fib4->tb_id, ++ fib4->fi, fib4->tb_id, + ROCKER_OP_FLAG_REMOVE); + break; + case SWITCHDEV_OBJ_ID_PORT_FDB: +--- a/include/net/switchdev.h ++++ b/include/net/switchdev.h +@@ -88,7 +88,7 @@ struct switchdev_obj_ipv4_fib { + struct switchdev_obj obj; + u32 dst; + int dst_len; +- struct fib_info fi; ++ struct fib_info *fi; + u8 tos; + u8 type; + u32 nlflags; +--- a/net/switchdev/switchdev.c ++++ b/net/switchdev/switchdev.c +@@ -1169,6 +1169,7 @@ int switchdev_fib_ipv4_add(u32 dst, int + .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, + .dst = dst, + .dst_len = dst_len, ++ .fi = fi, + .tos = tos, + .type = type, + .nlflags = nlflags, +@@ -1177,8 +1178,6 @@ int switchdev_fib_ipv4_add(u32 dst, int + struct net_device *dev; + int err = 0; + +- memcpy(&ipv4_fib.fi, fi, sizeof(ipv4_fib.fi)); +- + /* Don't offload route if using custom ip rules or if + * IPv4 FIB offloading has been disabled completely. 
+ */ +@@ -1222,6 +1221,7 @@ int switchdev_fib_ipv4_del(u32 dst, int + .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, + .dst = dst, + .dst_len = dst_len, ++ .fi = fi, + .tos = tos, + .type = type, + .nlflags = 0, +@@ -1230,8 +1230,6 @@ int switchdev_fib_ipv4_del(u32 dst, int + struct net_device *dev; + int err = 0; + +- memcpy(&ipv4_fib.fi, fi, sizeof(ipv4_fib.fi)); +- + if (!(fi->fib_flags & RTNH_F_OFFLOAD)) + return 0; + diff --git a/queue-4.4/tcp-record-tlp-and-er-timer-stats-in-v6-stats.patch b/queue-4.4/tcp-record-tlp-and-er-timer-stats-in-v6-stats.patch new file mode 100644 index 00000000000..d826afb89e4 --- /dev/null +++ b/queue-4.4/tcp-record-tlp-and-er-timer-stats-in-v6-stats.patch @@ -0,0 +1,35 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Yuchung Cheng +Date: Mon, 6 Jun 2016 15:07:18 -0700 +Subject: tcp: record TLP and ER timer stats in v6 stats + +From: Yuchung Cheng + +[ Upstream commit ce3cf4ec0305919fc69a972f6c2b2efd35d36abc ] + +The v6 tcp stats scan do not provide TLP and ER timer information +correctly like the v4 version . This patch fixes that. + +Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") +Fixes: eed530b6c676 ("tcp: early retransmit") +Signed-off-by: Yuchung Cheng +Signed-off-by: Neal Cardwell +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/tcp_ipv6.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -1706,7 +1706,9 @@ static void get_tcp6_sock(struct seq_fil + destp = ntohs(inet->inet_dport); + srcp = ntohs(inet->inet_sport); + +- if (icsk->icsk_pending == ICSK_TIME_RETRANS) { ++ if (icsk->icsk_pending == ICSK_TIME_RETRANS || ++ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || ++ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + timer_active = 1; + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { diff --git a/queue-4.4/team-don-t-call-netdev_change_features-under-team-lock.patch b/queue-4.4/team-don-t-call-netdev_change_features-under-team-lock.patch new file mode 100644 index 00000000000..a97bc24d53c --- /dev/null +++ b/queue-4.4/team-don-t-call-netdev_change_features-under-team-lock.patch @@ -0,0 +1,89 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Ivan Vecera +Date: Wed, 25 May 2016 21:21:52 +0200 +Subject: team: don't call netdev_change_features under team->lock + +From: Ivan Vecera + +[ Upstream commit f6988cb63a4e698d8a62a1d085d263d1fcc351ea ] + +The team_device_event() notifier calls team_compute_features() to fix +vlan_features under team->lock to protect team->port_list. The problem is +that subsequent __team_compute_features() calls netdev_change_features() +to propagate vlan_features to upper vlan devices while team->lock is still +taken. This can lead to deadlock when NETIF_F_LRO is modified on lower +devices or team device itself. + +Example: +The team0 as active backup with eth0 and eth1 NICs. Both eth0 & eth1 are +LRO capable and LRO is enabled. Thus LRO is also enabled on team0. 
+ +The command 'ethtool -K team0 lro off' now hangs due to this deadlock: + +dev_ethtool() +-> ethtool_set_features() + -> __netdev_update_features(team) + -> netdev_sync_lower_features() + -> netdev_update_features(lower_1) + -> __netdev_update_features(lower_1) + -> netdev_features_change(lower_1) + -> call_netdevice_notifiers(...) + -> team_device_event(lower_1) + -> team_compute_features(team) [TAKES team->lock] + -> netdev_change_features(team) + -> __netdev_update_features(team) + -> netdev_sync_lower_features() + -> netdev_update_features(lower_2) + -> __netdev_update_features(lower_2) + -> netdev_features_change(lower_2) + -> call_netdevice_notifiers(...) + -> team_device_event(lower_2) + -> team_compute_features(team) [DEADLOCK] + +The bug is present in team from the beginning but it appeared after the commit +fd867d5 (net/core: generic support for disabling netdev features down stack) +that adds synchronization of features with lower devices. + +Fixes: fd867d5 (net/core: generic support for disabling netdev features down stack) +Cc: Jiri Pirko +Signed-off-by: Ivan Vecera +Signed-off-by: Jiri Pirko +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/team/team.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/drivers/net/team/team.c ++++ b/drivers/net/team/team.c +@@ -969,7 +969,7 @@ static void team_port_disable(struct tea + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \ + NETIF_F_HIGHDMA | NETIF_F_LRO) + +-static void __team_compute_features(struct team *team) ++static void ___team_compute_features(struct team *team) + { + struct team_port *port; + u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL; +@@ -993,15 +993,20 @@ static void __team_compute_features(stru + team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + team->dev->priv_flags |= IFF_XMIT_DST_RELEASE; ++} + ++static void __team_compute_features(struct team *team) ++{ ++ ___team_compute_features(team); + netdev_change_features(team->dev); + } + + static void team_compute_features(struct team *team) + { + mutex_lock(&team->lock); +- __team_compute_features(team); ++ ___team_compute_features(team); + mutex_unlock(&team->lock); ++ netdev_change_features(team->dev); + } + + static int team_port_enter(struct team *team, struct team_port *port) diff --git a/queue-4.4/tipc-check-nl-sock-before-parsing-nested-attributes.patch b/queue-4.4/tipc-check-nl-sock-before-parsing-nested-attributes.patch new file mode 100644 index 00000000000..95a014bfc22 --- /dev/null +++ b/queue-4.4/tipc-check-nl-sock-before-parsing-nested-attributes.patch @@ -0,0 +1,36 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Richard Alpe +Date: Mon, 16 May 2016 11:14:54 +0200 +Subject: tipc: check nl sock before parsing nested attributes + +From: Richard Alpe + +[ Upstream commit 45e093ae2830cd1264677d47ff9a95a71f5d9f9c ] + +Make sure the socket for which the user is listing publication exists +before parsing the socket netlink attributes. 
+ +Prior to this patch a call without any socket caused a NULL pointer +dereference in tipc_nl_publ_dump(). + +Tested-and-reported-by: Baozeng Ding +Signed-off-by: Richard Alpe +Acked-by: Jon Maloy +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/tipc/socket.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/net/tipc/socket.c ++++ b/net/tipc/socket.c +@@ -2814,6 +2814,9 @@ int tipc_nl_publ_dump(struct sk_buff *sk + if (err) + return err; + ++ if (!attrs[TIPC_NLA_SOCK]) ++ return -EINVAL; ++ + err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, + attrs[TIPC_NLA_SOCK], + tipc_nl_sock_policy); diff --git a/queue-4.4/tipc-fix-nametable-publication-field-in-nl-compat.patch b/queue-4.4/tipc-fix-nametable-publication-field-in-nl-compat.patch new file mode 100644 index 00000000000..61f25158904 --- /dev/null +++ b/queue-4.4/tipc-fix-nametable-publication-field-in-nl-compat.patch @@ -0,0 +1,32 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Richard Alpe +Date: Tue, 17 May 2016 16:57:37 +0200 +Subject: tipc: fix nametable publication field in nl compat + +From: Richard Alpe + +[ Upstream commit 03aaaa9b941e136757b55c4cf775aab6068dfd94 ] + +The publication field of the old netlink API should contain the +publication key and not the publication reference. + +Fixes: 44a8ae94fd55 (tipc: convert legacy nl name table dump to nl compat) +Signed-off-by: Richard Alpe +Acked-by: Jon Maloy +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/tipc/netlink_compat.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/tipc/netlink_compat.c ++++ b/net/tipc/netlink_compat.c +@@ -802,7 +802,7 @@ static int tipc_nl_compat_name_table_dum + goto out; + + tipc_tlv_sprintf(msg->rep, "%-10u %s", +- nla_get_u32(publ[TIPC_NLA_PUBL_REF]), ++ nla_get_u32(publ[TIPC_NLA_PUBL_KEY]), + scope_str[nla_get_u32(publ[TIPC_NLA_PUBL_SCOPE])]); + out: + tipc_tlv_sprintf(msg->rep, "\n"); diff --git a/queue-4.4/tuntap-correctly-wake-up-process-during-uninit.patch b/queue-4.4/tuntap-correctly-wake-up-process-during-uninit.patch new file mode 100644 index 00000000000..a2787e658ff --- /dev/null +++ b/queue-4.4/tuntap-correctly-wake-up-process-during-uninit.patch @@ -0,0 +1,68 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Jason Wang +Date: Thu, 19 May 2016 13:36:51 +0800 +Subject: tuntap: correctly wake up process during uninit + +From: Jason Wang + +[ Upstream commit addf8fc4acb1cf79492ac64966f07178793cb3d7 ] + +We used to check dev->reg_state against NETREG_REGISTERED after each +time we are woken up. But after commit 9e641bdcfa4e ("net-tun: +restructure tun_do_read for better sleep/wakeup efficiency"), it uses +skb_recv_datagram() which does not check dev->reg_state. This causes a +problem if we delete a tun/tap device after a process is blocked in +reading: the device will wait forever for the reference count which +was held by that process. + +Fix this by using RCV_SHUTDOWN which will be checked during +skb_recv_datagram() before trying to wake up the process during uninit. + +Fixes: 9e641bdcfa4e ("net-tun: restructure tun_do_read for better +sleep/wakeup efficiency") +Cc: Eric Dumazet +Cc: Xi Wang +Cc: Michael S. Tsirkin +Signed-off-by: Jason Wang +Acked-by: Eric Dumazet +Acked-by: Michael S. Tsirkin +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/tun.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -567,11 +567,13 @@ static void tun_detach_all(struct net_de + for (i = 0; i < n; i++) { + tfile = rtnl_dereference(tun->tfiles[i]); + BUG_ON(!tfile); ++ tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; + tfile->socket.sk->sk_data_ready(tfile->socket.sk); + RCU_INIT_POINTER(tfile->tun, NULL); + --tun->numqueues; + } + list_for_each_entry(tfile, &tun->disabled, next) { ++ tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; + tfile->socket.sk->sk_data_ready(tfile->socket.sk); + RCU_INIT_POINTER(tfile->tun, NULL); + } +@@ -627,6 +629,7 @@ static int tun_attach(struct tun_struct + goto out; + } + tfile->queue_index = tun->numqueues; ++ tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; + rcu_assign_pointer(tfile->tun, tun); + rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); + tun->numqueues++; +@@ -1408,9 +1411,6 @@ static ssize_t tun_do_read(struct tun_st + if (!iov_iter_count(to)) + return 0; + +- if (tun->dev->reg_state != NETREG_REGISTERED) +- return -EIO; +- + /* Read frames from queue */ + skb = __skb_recv_datagram(tfile->socket.sk, noblock ? 
MSG_DONTWAIT : 0, + &peeked, &off, &err); diff --git a/queue-4.4/uapi-glibc-compat-fix-compilation-when-__use_misc-in-glibc.patch b/queue-4.4/uapi-glibc-compat-fix-compilation-when-__use_misc-in-glibc.patch new file mode 100644 index 00000000000..3a656d41068 --- /dev/null +++ b/queue-4.4/uapi-glibc-compat-fix-compilation-when-__use_misc-in-glibc.patch @@ -0,0 +1,37 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Nicolas Dichtel +Date: Thu, 19 May 2016 17:26:29 +0200 +Subject: uapi glibc compat: fix compilation when !__USE_MISC in glibc + +From: Nicolas Dichtel + +[ Upstream commit f0a3fdca794d1e68ae284ef4caefe681f7c18e89 ] + +These structures are defined only if __USE_MISC is set in glibc net/if.h +headers, ie when _BSD_SOURCE or _SVID_SOURCE are defined. + +CC: Jan Engelhardt +CC: Josh Boyer +CC: Stephen Hemminger +CC: Waldemar Brodkorb +CC: Gabriel Laskar +CC: Mikko Rapeli +Fixes: 4a91cb61bb99 ("uapi glibc compat: fix compile errors when glibc net/if.h included before linux/if.h") +Signed-off-by: Nicolas Dichtel +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/uapi/linux/libc-compat.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/uapi/linux/libc-compat.h ++++ b/include/uapi/linux/libc-compat.h +@@ -52,7 +52,7 @@ + #if defined(__GLIBC__) + + /* Coordinate with glibc net/if.h header. */ +-#if defined(_NET_IF_H) ++#if defined(_NET_IF_H) && defined(__USE_MISC) + + /* GLIBC headers included first so don't define anything + * that would already be defined. 
*/ diff --git a/queue-4.4/udp-prevent-skbs-lingering-in-tunnel-socket-queues.patch b/queue-4.4/udp-prevent-skbs-lingering-in-tunnel-socket-queues.patch new file mode 100644 index 00000000000..dcadde91e81 --- /dev/null +++ b/queue-4.4/udp-prevent-skbs-lingering-in-tunnel-socket-queues.patch @@ -0,0 +1,109 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Hannes Frederic Sowa +Date: Thu, 19 May 2016 15:58:33 +0200 +Subject: udp: prevent skbs lingering in tunnel socket queues + +From: Hannes Frederic Sowa + +[ Upstream commit e5aed006be918af163eb397e45aa5ea6cefd5e01 ] + +In case we find a socket with encapsulation enabled we should call +the encap_recv function even if just a udp header without payload is +available. The callbacks are responsible for correctly verifying and +dropping the packets. + +Also, in case the header validation fails for geneve and vxlan we +shouldn't put the skb back into the socket queue, no one will pick +them up there. Instead we can simply discard them in the respective +encap_recv functions. + +Signed-off-by: Hannes Frederic Sowa +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/geneve.c | 10 +++------- + drivers/net/vxlan.c | 10 ++-------- + net/ipv4/udp.c | 2 +- + net/ipv6/udp.c | 2 +- + 4 files changed, 7 insertions(+), 17 deletions(-) + +--- a/drivers/net/geneve.c ++++ b/drivers/net/geneve.c +@@ -310,15 +310,15 @@ static int geneve_udp_encap_recv(struct + + /* Need Geneve and inner Ethernet header to be present */ + if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) +- goto error; ++ goto drop; + + /* Return packets with reserved bits set */ + geneveh = geneve_hdr(skb); + if (unlikely(geneveh->ver != GENEVE_VER)) +- goto error; ++ goto drop; + + if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) +- goto error; ++ goto drop; + + opts_len = geneveh->opt_len * 4; + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, +@@ -336,10 +336,6 @@ drop: + /* Consume bad packet */ + kfree_skb(skb); + return 0; +- +-error: +- /* Let the UDP layer deal with the skb */ +- return 1; + } + + static struct socket *geneve_create_sock(struct net *net, bool ipv6, +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -1254,7 +1254,7 @@ static int vxlan_udp_encap_recv(struct s + + /* Need Vxlan and inner Ethernet header to be present */ + if (!pskb_may_pull(skb, VXLAN_HLEN)) +- goto error; ++ goto drop; + + vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); + flags = ntohl(vxh->vx_flags); +@@ -1344,13 +1344,7 @@ drop: + bad_flags: + netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", + ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); +- +-error: +- if (tun_dst) +- dst_release((struct dst_entry *)tun_dst); +- +- /* Return non vxlan pkt */ +- return 1; ++ goto drop; + } + + static int arp_reduce(struct net_device *dev, struct sk_buff *skb) +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -1531,7 +1531,7 @@ int udp_queue_rcv_skb(struct sock *sk, s + + /* if we're overly short, let UDP handle it */ + encap_rcv = ACCESS_ONCE(up->encap_rcv); +- if (skb->len > sizeof(struct udphdr) && encap_rcv) { 
++ if (encap_rcv) { + int ret; + + /* Verify checksum before giving to encap */ +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -647,7 +647,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, + + /* if we're overly short, let UDP handle it */ + encap_rcv = ACCESS_ONCE(up->encap_rcv); +- if (skb->len > sizeof(struct udphdr) && encap_rcv) { ++ if (encap_rcv) { + int ret; + + /* Verify checksum before giving to encap */ diff --git a/queue-4.4/vxlan-accept-user-specified-mtu-value-when-create-new-vxlan-link.patch b/queue-4.4/vxlan-accept-user-specified-mtu-value-when-create-new-vxlan-link.patch new file mode 100644 index 00000000000..74d8c253447 --- /dev/null +++ b/queue-4.4/vxlan-accept-user-specified-mtu-value-when-create-new-vxlan-link.patch @@ -0,0 +1,39 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: Chen Haiquan +Date: Fri, 27 May 2016 10:49:11 +0800 +Subject: vxlan: Accept user specified MTU value when create new vxlan link + +From: Chen Haiquan + +[ Upstream commit ce577668a426c6a9e2470a09dcd07fbd6e45272a ] + +When creating a new vxlan link, for example: + ip link add vtap mtu 1440 type vxlan vni 1 dev eth0 + +The argument "mtu" has no effect, because it is not stored in conf->mtu, so +the default value is used in the vxlan_dev_configure function. + +This problem was introduced by commit 0dfbdf4102b9 (vxlan: Factor out device +configuration). + +Fixes: 0dfbdf4102b9 (vxlan: Factor out device configuration) +Signed-off-by: Chen Haiquan +Acked-by: Cong Wang +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vxlan.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -2985,6 +2985,9 @@ static int vxlan_newlink(struct net *src + if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) + conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL; + ++ if (tb[IFLA_MTU]) ++ conf.mtu = nla_get_u32(tb[IFLA_MTU]); ++ + err = vxlan_dev_configure(src_net, dev, &conf); + switch (err) { + case -ENODEV: diff --git a/queue-4.4/vxlan-gre-geneve-set-a-large-mtu-on-ovs-created-tunnel-devices.patch b/queue-4.4/vxlan-gre-geneve-set-a-large-mtu-on-ovs-created-tunnel-devices.patch new file mode 100644 index 00000000000..43fcef82d2f --- /dev/null +++ b/queue-4.4/vxlan-gre-geneve-set-a-large-mtu-on-ovs-created-tunnel-devices.patch @@ -0,0 +1,171 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: David Wragg +Date: Fri, 3 Jun 2016 18:58:15 -0400 +Subject: vxlan, gre, geneve: Set a large MTU on ovs-created tunnel devices + +From: David Wragg + +[ Upstream commit 7e059158d57b79159eaf1f504825d19866ef2c42 ] + +Prior to 4.3, openvswitch tunnel vports (vxlan, gre and geneve) could +transmit vxlan packets of any size, constrained only by the ability to +send out the resulting packets. 4.3 introduced netdevs corresponding +to tunnel vports. These netdevs have an MTU, which limits the size of +a packet that can be successfully encapsulated. The default MTU +values are low (1500 or less), which is awkwardly small in the context +of physical networks supporting jumbo frames, and leads to a +conspicuous change in behaviour for userspace. + +Instead, set the MTU on openvswitch-created netdevs to be the relevant +maximum (i.e. the maximum IP packet size minus any relevant overhead), +effectively restoring the behaviour prior to 4.3. + +Signed-off-by: David Wragg +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/geneve.c | 18 ++++++++++++++---- + drivers/net/vxlan.c | 11 ++++++++--- + include/net/ip_tunnels.h | 1 + + net/ipv4/ip_gre.c | 8 ++++++++ + net/ipv4/ip_tunnel.c | 20 +++++++++++++++++--- + net/openvswitch/vport-vxlan.c | 2 ++ + 6 files changed, 50 insertions(+), 10 deletions(-) + +--- a/drivers/net/geneve.c ++++ b/drivers/net/geneve.c +@@ -1356,11 +1356,21 @@ struct net_device *geneve_dev_create_fb( + + err = geneve_configure(net, dev, &geneve_remote_unspec, + 0, 0, 0, htons(dst_port), true); +- if (err) { +- free_netdev(dev); +- return ERR_PTR(err); +- } ++ if (err) ++ goto err; ++ ++ /* openvswitch users expect packet sizes to be unrestricted, ++ * so set the largest MTU we can. ++ */ ++ err = geneve_change_mtu(dev, IP_MAX_MTU); ++ if (err) ++ goto err; ++ + return dev; ++ ++ err: ++ free_netdev(dev); ++ return ERR_PTR(err); + } + EXPORT_SYMBOL_GPL(geneve_dev_create_fb); + +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -2776,6 +2776,7 @@ static int vxlan_dev_configure(struct ne + int err; + bool use_ipv6 = false; + __be16 default_port = vxlan->cfg.dst_port; ++ struct net_device *lowerdev = NULL; + + vxlan->net = src_net; + +@@ -2796,9 +2797,7 @@ static int vxlan_dev_configure(struct ne + } + + if (conf->remote_ifindex) { +- struct net_device *lowerdev +- = __dev_get_by_index(src_net, conf->remote_ifindex); +- ++ lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex); + dst->remote_ifindex = conf->remote_ifindex; + + if (!lowerdev) { +@@ -2822,6 +2821,12 @@ static int vxlan_dev_configure(struct ne + needed_headroom = lowerdev->hard_header_len; + } + ++ if (conf->mtu) { ++ err = __vxlan_change_mtu(dev, lowerdev, dst, conf->mtu, false); ++ if (err) ++ return err; ++ } ++ + if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA) + needed_headroom += VXLAN6_HEADROOM; + else +--- a/include/net/ip_tunnels.h ++++ b/include/net/ip_tunnels.h +@@ -230,6 +230,7 @@ void ip_tunnel_xmit(struct sk_buff 
*skb, + int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); + int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, + u8 *protocol, struct flowi4 *fl4); ++int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); + int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); + + struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, +--- a/net/ipv4/ip_gre.c ++++ b/net/ipv4/ip_gre.c +@@ -1247,6 +1247,14 @@ struct net_device *gretap_fb_dev_create( + err = ipgre_newlink(net, dev, tb, NULL); + if (err < 0) + goto out; ++ ++ /* openvswitch users expect packet sizes to be unrestricted, ++ * so set the largest MTU we can. ++ */ ++ err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); ++ if (err) ++ goto out; ++ + return dev; + out: + free_netdev(dev); +--- a/net/ipv4/ip_tunnel.c ++++ b/net/ipv4/ip_tunnel.c +@@ -948,17 +948,31 @@ done: + } + EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); + +-int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) ++int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) + { + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); ++ int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; + +- if (new_mtu < 68 || +- new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen) ++ if (new_mtu < 68) + return -EINVAL; ++ ++ if (new_mtu > max_mtu) { ++ if (strict) ++ return -EINVAL; ++ ++ new_mtu = max_mtu; ++ } ++ + dev->mtu = new_mtu; + return 0; + } ++EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); ++ ++int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) ++{ ++ return __ip_tunnel_change_mtu(dev, new_mtu, true); ++} + EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); + + static void ip_tunnel_dev_free(struct net_device *dev) +--- a/net/openvswitch/vport-vxlan.c ++++ b/net/openvswitch/vport-vxlan.c +@@ -91,6 +91,8 @@ static struct vport *vxlan_tnl_create(co + struct vxlan_config conf = { + .no_share = true, + .flags = 
VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX, ++ /* Don't restrict the packets that can be sent by MTU */ ++ .mtu = IP_MAX_MTU, + }; + + if (!options) { diff --git a/queue-4.4/vxlan-relax-mtu-constraints.patch b/queue-4.4/vxlan-relax-mtu-constraints.patch new file mode 100644 index 00000000000..97fb47f0ad1 --- /dev/null +++ b/queue-4.4/vxlan-relax-mtu-constraints.patch @@ -0,0 +1,85 @@ +From foo@baz Fri Jun 17 11:18:18 PDT 2016 +From: David Wragg +Date: Fri, 3 Jun 2016 18:58:13 -0400 +Subject: vxlan: Relax MTU constraints + +From: David Wragg + +[ Upstream commit 72564b59ffc438ea103b0727a921aaddce766728 ] + +Allow the MTU of vxlan devices without an underlying device to be set +to larger values (up to a maximum based on IP packet limits and vxlan +overhead). + +Previously, their MTUs could not be set to higher than the +conventional ethernet value of 1500. This is a very arbitrary value +in the context of vxlan, and prevented vxlan devices from being able +to take advantage of jumbo frames etc. + +The default MTU remains 1500, for compatibility. + +Signed-off-by: David Wragg +Acked-by: Roopa Prabhu +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vxlan.c | 36 +++++++++++++++++++++++++----------- + 1 file changed, 25 insertions(+), 11 deletions(-) + +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -2364,29 +2364,43 @@ static void vxlan_set_multicast_list(str + { + } + +-static int vxlan_change_mtu(struct net_device *dev, int new_mtu) ++static int __vxlan_change_mtu(struct net_device *dev, ++ struct net_device *lowerdev, ++ struct vxlan_rdst *dst, int new_mtu, bool strict) + { +- struct vxlan_dev *vxlan = netdev_priv(dev); +- struct vxlan_rdst *dst = &vxlan->default_dst; +- struct net_device *lowerdev; +- int max_mtu; ++ int max_mtu = IP_MAX_MTU; + +- lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); +- if (lowerdev == NULL) +- return eth_change_mtu(dev, new_mtu); ++ if (lowerdev) ++ max_mtu = lowerdev->mtu; + + if (dst->remote_ip.sa.sa_family == AF_INET6) +- max_mtu = lowerdev->mtu - VXLAN6_HEADROOM; ++ max_mtu -= VXLAN6_HEADROOM; + else +- max_mtu = lowerdev->mtu - VXLAN_HEADROOM; ++ max_mtu -= VXLAN_HEADROOM; + +- if (new_mtu < 68 || new_mtu > max_mtu) ++ if (new_mtu < 68) + return -EINVAL; + ++ if (new_mtu > max_mtu) { ++ if (strict) ++ return -EINVAL; ++ ++ new_mtu = max_mtu; ++ } ++ + dev->mtu = new_mtu; + return 0; + } + ++static int vxlan_change_mtu(struct net_device *dev, int new_mtu) ++{ ++ struct vxlan_dev *vxlan = netdev_priv(dev); ++ struct vxlan_rdst *dst = &vxlan->default_dst; ++ struct net_device *lowerdev = __dev_get_by_index(vxlan->net, ++ dst->remote_ifindex); ++ return __vxlan_change_mtu(dev, lowerdev, dst, new_mtu, true); ++} ++ + static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb, + struct ip_tunnel_info *info, + __be16 sport, __be16 dport)