From: Greg Kroah-Hartman Date: Tue, 17 Dec 2019 19:21:37 +0000 (+0100) Subject: 5.3-stable patches X-Git-Tag: v4.19.90~3 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b515428244534eb4af3dbc5af2cb892b4c7195e1;p=thirdparty%2Fkernel%2Fstable-queue.git 5.3-stable patches added patches: fixed-updating-of-ethertype-in-function-skb_mpls_pop.patch gre-refetch-erspan-header-from-skb-data-after-pskb_may_pull.patch hsr-fix-a-null-pointer-dereference-in-hsr_dev_xmit.patch inet-protect-against-too-small-mtu-values.patch mqprio-fix-out-of-bounds-access-in-mqprio_dump.patch net-bridge-deny-dev_set_mac_address-when-unregistering.patch net-dsa-fix-flow-dissection-on-tx-path.patch net-ethernet-ti-cpsw-fix-extra-rx-interrupt.patch net-fixed-updating-of-ethertype-in-skb_mpls_push.patch net-ipv6-add-net-argument-to-ip6_dst_lookup_flow.patch net-ipv6_stub-use-ip6_dst_lookup_flow-instead-of-ip6_dst_lookup.patch net-mlx5e-fix-txq-indices-to-be-sequential.patch net-mlx5e-query-global-pause-state-before-setting-prio2buffer.patch net-sched-fix-dump-qlen-for-sch_mq-sch_mqprio-with-nolock-subqueues.patch net-sysfs-call-dev_hold-always-in-netdev_queue_add_kobject.patch net-thunderx-start-phy-before-starting-autonegotiation.patch net-tls-fix-return-values-to-avoid-enotsupp.patch net_sched-validate-tca_kind-attribute-in-tc_chain_tmplt_add.patch openvswitch-support-asymmetric-conntrack.patch page_pool-do-not-release-pool-until-inflight-0.patch tcp-fix-rejected-syncookies-due-to-stale-timestamps.patch tcp-md5-fix-potential-overestimation-of-tcp-option-space.patch tcp-protect-accesses-to-.ts_recent_stamp-with-read-write-_once.patch tcp-tighten-acceptance-of-acks-not-matching-a-child-socket.patch tipc-fix-ordering-of-tipc-module-init-and-exit-routine.patch xdp-obtain-the-mem_id-mutex-before-trying-to-remove-an-entry.patch --- diff --git a/queue-5.3/fixed-updating-of-ethertype-in-function-skb_mpls_pop.patch b/queue-5.3/fixed-updating-of-ethertype-in-function-skb_mpls_pop.patch new file mode 100644 index 00000000000..0922a6aa0ee --- /dev/null +++ b/queue-5.3/fixed-updating-of-ethertype-in-function-skb_mpls_pop.patch @@ -0,0 +1,105 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Martin Varghese +Date: Mon, 2 Dec 2019 10:49:51 +0530 +Subject: Fixed updating of ethertype in function skb_mpls_pop + +From: Martin Varghese + +[ Upstream commit 040b5cfbcefa263ccf2c118c4938308606bb7ed8 ] + +The skb_mpls_pop was not updating ethertype of an ethernet packet if the +packet was originally received from a non ARPHRD_ETHER device. + +In the below OVS data path flow, since the device corresponding to port 7 +is an l3 device (ARPHRD_NONE) the skb_mpls_pop function does not update +the ethertype of the packet even though the previous push_eth action had +added an ethernet header to the packet. + +recirc_id(0),in_port(7),eth_type(0x8847), +mpls(label=12/0xfffff,tc=0/0,ttl=0/0x0,bos=1/1), +actions:push_eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00), +pop_mpls(eth_type=0x800),4 + +Fixes: ed246cee09b9 ("net: core: move pop MPLS functionality from OvS to core helper") +Signed-off-by: Martin Varghese +Acked-by: Pravin B Shelar +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/skbuff.h | 3 ++- + net/core/skbuff.c | 6 ++++-- + net/openvswitch/actions.c | 3 ++- + net/sched/act_mpls.c | 4 +++- + 4 files changed, 11 insertions(+), 5 deletions(-) + +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -3483,7 +3483,8 @@ int skb_vlan_pop(struct sk_buff *skb); + int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); + int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, + int mac_len); +-int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len); ++int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, ++ bool ethernet); + int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); + int skb_mpls_dec_ttl(struct sk_buff *skb); + struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -5517,12 +5517,14 @@ EXPORT_SYMBOL_GPL(skb_mpls_push); + * @skb: buffer + * @next_proto: ethertype of header after popped MPLS header + * @mac_len: length of the MAC header ++ * @ethernet: flag to indicate if ethernet header is present in packet + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +-int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len) ++int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, ++ bool ethernet) + { + int err; + +@@ -5541,7 +5543,7 @@ int skb_mpls_pop(struct sk_buff *skb, __ + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + +- if (skb->dev && skb->dev->type == ARPHRD_ETHER) { ++ if (ethernet) { + struct ethhdr *hdr; + + /* use mpls_hdr() to get ethertype to account for VLANs. */ +--- a/net/openvswitch/actions.c ++++ b/net/openvswitch/actions.c +@@ -179,7 +179,8 @@ static int pop_mpls(struct sk_buff *skb, + { + int err; + +- err = skb_mpls_pop(skb, ethertype, skb->mac_len); ++ err = skb_mpls_pop(skb, ethertype, skb->mac_len, ++ ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET); + if (err) + return err; + +--- a/net/sched/act_mpls.c ++++ b/net/sched/act_mpls.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + /* Copyright (C) 2019 Netronome Systems, Inc. */ + ++#include + #include + #include + #include +@@ -76,7 +77,8 @@ static int tcf_mpls_act(struct sk_buff * + + switch (p->tcfm_action) { + case TCA_MPLS_ACT_POP: +- if (skb_mpls_pop(skb, p->tcfm_proto, mac_len)) ++ if (skb_mpls_pop(skb, p->tcfm_proto, mac_len, ++ skb->dev && skb->dev->type == ARPHRD_ETHER)) + goto drop; + break; + case TCA_MPLS_ACT_PUSH: diff --git a/queue-5.3/gre-refetch-erspan-header-from-skb-data-after-pskb_may_pull.patch b/queue-5.3/gre-refetch-erspan-header-from-skb-data-after-pskb_may_pull.patch new file mode 100644 index 00000000000..8172872b7d0 --- /dev/null +++ b/queue-5.3/gre-refetch-erspan-header-from-skb-data-after-pskb_may_pull.patch @@ -0,0 +1,41 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Cong Wang +Date: Thu, 5 Dec 2019 19:39:02 -0800 +Subject: gre: refetch erspan header from skb->data after pskb_may_pull() + +From: Cong Wang + +[ Upstream commit 0e4940928c26527ce8f97237fef4c8a91cd34207 ] + +After pskb_may_pull() we should always refetch the header +pointers from the skb->data in case it got reallocated. + +In gre_parse_header(), the erspan header is still fetched +from the 'options' pointer which is fetched before +pskb_may_pull(). + +Found this during code review of a KMSAN bug report. 
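+
+A minimal sketch of the rule (illustrative only, not the exact mainline
+code; 'off' and 'len' are hypothetical offsets):
+
+	u8 *p = skb->data + off;	/* points into the old buffer */
+
+	if (!pskb_may_pull(skb, off + len))
+		return -EINVAL;		/* may reallocate skb->head */
+
+	p = skb->data + off;		/* refetch: the old 'p' may now
+					 * point into freed memory
+					 */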
+
+Fixes: cb73ee40b1b3 ("net: ip_gre: use erspan key field for tunnel lookup")
+Cc: Lorenzo Bianconi
+Signed-off-by: Cong Wang
+Acked-by: Lorenzo Bianconi
+Acked-by: William Tu
+Reviewed-by: Simon Horman
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv4/gre_demux.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv4/gre_demux.c
++++ b/net/ipv4/gre_demux.c
+@@ -127,7 +127,7 @@ int gre_parse_header(struct sk_buff *skb
+ if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr)))
+ return -EINVAL;
+
+- ershdr = (struct erspan_base_hdr *)options;
++ ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len);
+ tpi->key = cpu_to_be32(get_session_id(ershdr));
+ }
+
diff --git a/queue-5.3/hsr-fix-a-null-pointer-dereference-in-hsr_dev_xmit.patch b/queue-5.3/hsr-fix-a-null-pointer-dereference-in-hsr_dev_xmit.patch
new file mode 100644
index 00000000000..c2b92cabb33
--- /dev/null
+++ b/queue-5.3/hsr-fix-a-null-pointer-dereference-in-hsr_dev_xmit.patch
@@ -0,0 +1,97 @@
+From foo@baz Tue 17 Dec 2019 08:14:58 PM CET
+From: Taehee Yoo
+Date: Thu, 5 Dec 2019 07:23:39 +0000
+Subject: hsr: fix a NULL pointer dereference in hsr_dev_xmit()
+
+From: Taehee Yoo
+
+[ Upstream commit df95467b6d2bfce49667ee4b71c67249b01957f7 ]
+
+hsr_dev_xmit() calls hsr_port_get_hsr() to find the master node, and
+that returns NULL if the master node does not exist in the list.
+But hsr_dev_xmit() doesn't check the return pointer, so a NULL
+dereference could occur.
+
+Test commands:
+ ip netns add nst
+ ip link add veth0 type veth peer name veth1
+ ip link add veth2 type veth peer name veth3
+ ip link set veth1 netns nst
+ ip link set veth3 netns nst
+ ip link set veth0 up
+ ip link set veth2 up
+ ip link add hsr0 type hsr slave1 veth0 slave2 veth2
+ ip a a 192.168.100.1/24 dev hsr0
+ ip link set hsr0 up
+ ip netns exec nst ip link set veth1 up
+ ip netns exec nst ip link set veth3 up
+ ip netns exec nst ip link add hsr1 type hsr slave1 veth1 slave2 veth3
+ ip netns exec nst ip a a 192.168.100.2/24 dev hsr1
+ ip netns exec nst ip link set hsr1 up
+ hping3 192.168.100.2 -2 --flood &
+ modprobe -rv hsr
+
+Splat looks like:
+[ 217.351122][ T1635] kasan: CONFIG_KASAN_INLINE enabled
+[ 217.352969][ T1635] kasan: GPF could be caused by NULL-ptr deref or user memory access
+[ 217.354297][ T1635] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
+[ 217.355507][ T1635] CPU: 1 PID: 1635 Comm: hping3 Not tainted 5.4.0+ #192
+[ 217.356472][ T1635] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+[ 217.357804][ T1635] RIP: 0010:hsr_dev_xmit+0x34/0x90 [hsr]
+[ 217.373010][ T1635] Code: 48 8d be 00 0c 00 00 be 04 00 00 00 48 83 ec 08 e8 21 be ff ff 48 8d 78 10 48 ba 00 b
+[ 217.376919][ T1635] RSP: 0018:ffff8880cd8af058 EFLAGS: 00010202
+[ 217.377571][ T1635] RAX: 0000000000000000 RBX: ffff8880acde6840 RCX: 0000000000000002
+[ 217.379465][ T1635] RDX: dffffc0000000000 RSI: 0000000000000004 RDI: 0000000000000010
+[ 217.380274][ T1635] RBP: ffff8880acde6840 R08: ffffed101b440d5d R09: 0000000000000001
+[ 217.381078][ T1635] R10: 0000000000000001 R11: ffffed101b440d5c R12: ffff8880bffcc000
+[ 217.382023][ T1635] R13: ffff8880bffcc088 R14: 0000000000000000 R15: ffff8880ca675c00
+[ 217.383094][ T1635] FS: 00007f060d9d1740(0000) GS:ffff8880da000000(0000) knlGS:0000000000000000
+[ 217.384289][ T1635] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 217.385009][ T1635] CR2: 00007faf15381dd0 CR3: 00000000d523c001 CR4: 00000000000606e0
+[ 217.385940][ T1635] 
Call Trace: +[ 217.386544][ T1635] dev_hard_start_xmit+0x160/0x740 +[ 217.387114][ T1635] __dev_queue_xmit+0x1961/0x2e10 +[ 217.388118][ T1635] ? check_object+0xaf/0x260 +[ 217.391466][ T1635] ? __alloc_skb+0xb9/0x500 +[ 217.392017][ T1635] ? init_object+0x6b/0x80 +[ 217.392629][ T1635] ? netdev_core_pick_tx+0x2e0/0x2e0 +[ 217.393175][ T1635] ? __alloc_skb+0xb9/0x500 +[ 217.393727][ T1635] ? rcu_read_lock_sched_held+0x90/0xc0 +[ 217.394331][ T1635] ? rcu_read_lock_bh_held+0xa0/0xa0 +[ 217.395013][ T1635] ? kasan_unpoison_shadow+0x30/0x40 +[ 217.395668][ T1635] ? __kasan_kmalloc.constprop.4+0xa0/0xd0 +[ 217.396280][ T1635] ? __kmalloc_node_track_caller+0x3a8/0x3f0 +[ 217.399007][ T1635] ? __kasan_kmalloc.constprop.4+0xa0/0xd0 +[ 217.400093][ T1635] ? __kmalloc_reserve.isra.46+0x2e/0xb0 +[ 217.401118][ T1635] ? memset+0x1f/0x40 +[ 217.402529][ T1635] ? __alloc_skb+0x317/0x500 +[ 217.404915][ T1635] ? arp_xmit+0xca/0x2c0 +[ ... ] + +Fixes: 311633b60406 ("hsr: switch ->dellink() to ->ndo_uninit()") +Acked-by: Cong Wang +Signed-off-by: Taehee Yoo +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/hsr/hsr_device.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/net/hsr/hsr_device.c ++++ b/net/hsr/hsr_device.c +@@ -227,8 +227,13 @@ static int hsr_dev_xmit(struct sk_buff * + struct hsr_port *master; + + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); +- skb->dev = master->dev; +- hsr_forward_skb(skb, master); ++ if (master) { ++ skb->dev = master->dev; ++ hsr_forward_skb(skb, master); ++ } else { ++ atomic_long_inc(&dev->tx_dropped); ++ dev_kfree_skb_any(skb); ++ } + return NETDEV_TX_OK; + } + diff --git a/queue-5.3/inet-protect-against-too-small-mtu-values.patch b/queue-5.3/inet-protect-against-too-small-mtu-values.patch new file mode 100644 index 00000000000..b3f3c26f500 --- /dev/null +++ b/queue-5.3/inet-protect-against-too-small-mtu-values.patch @@ -0,0 +1,177 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Eric Dumazet +Date: Thu, 5 Dec 2019 20:43:46 -0800 +Subject: inet: protect against too small mtu values. + +From: Eric Dumazet + +[ Upstream commit 501a90c945103e8627406763dac418f20f3837b2 ] + +syzbot was once again able to crash a host by setting a very small mtu +on loopback device. + +Let's make inetdev_valid_mtu() available in include/net/ip.h, +and use it in ip_setup_cork(), so that we protect both ip_append_page() +and __ip_append_data() + +Also add a READ_ONCE() when the device mtu is read. + +Pairs this lockless read with one WRITE_ONCE() in __dev_set_mtu(), +even if other code paths might write over this field. + +Add a big comment in include/linux/netdevice.h about dev->mtu +needing READ_ONCE()/WRITE_ONCE() annotations. + +Hopefully we will add the missing ones in followup patches. + +[1] + +refcount_t: saturated; leaking memory. +WARNING: CPU: 0 PID: 9464 at lib/refcount.c:22 refcount_warn_saturate+0x138/0x1f0 lib/refcount.c:22 +Kernel panic - not syncing: panic_on_warn set ... 
+CPU: 0 PID: 9464 Comm: syz-executor850 Not tainted 5.4.0-syzkaller #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 +Call Trace: + __dump_stack lib/dump_stack.c:77 [inline] + dump_stack+0x197/0x210 lib/dump_stack.c:118 + panic+0x2e3/0x75c kernel/panic.c:221 + __warn.cold+0x2f/0x3e kernel/panic.c:582 + report_bug+0x289/0x300 lib/bug.c:195 + fixup_bug arch/x86/kernel/traps.c:174 [inline] + fixup_bug arch/x86/kernel/traps.c:169 [inline] + do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:267 + do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:286 + invalid_op+0x23/0x30 arch/x86/entry/entry_64.S:1027 +RIP: 0010:refcount_warn_saturate+0x138/0x1f0 lib/refcount.c:22 +Code: 06 31 ff 89 de e8 c8 f5 e6 fd 84 db 0f 85 6f ff ff ff e8 7b f4 e6 fd 48 c7 c7 e0 71 4f 88 c6 05 56 a6 a4 06 01 e8 c7 a8 b7 fd <0f> 0b e9 50 ff ff ff e8 5c f4 e6 fd 0f b6 1d 3d a6 a4 06 31 ff 89 +RSP: 0018:ffff88809689f550 EFLAGS: 00010286 +RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 +RDX: 0000000000000000 RSI: ffffffff815e4336 RDI: ffffed1012d13e9c +RBP: ffff88809689f560 R08: ffff88809c50a3c0 R09: fffffbfff15d31b1 +R10: fffffbfff15d31b0 R11: ffffffff8ae98d87 R12: 0000000000000001 +R13: 0000000000040100 R14: ffff888099041104 R15: ffff888218d96e40 + refcount_add include/linux/refcount.h:193 [inline] + skb_set_owner_w+0x2b6/0x410 net/core/sock.c:1999 + sock_wmalloc+0xf1/0x120 net/core/sock.c:2096 + ip_append_page+0x7ef/0x1190 net/ipv4/ip_output.c:1383 + udp_sendpage+0x1c7/0x480 net/ipv4/udp.c:1276 + inet_sendpage+0xdb/0x150 net/ipv4/af_inet.c:821 + kernel_sendpage+0x92/0xf0 net/socket.c:3794 + sock_sendpage+0x8b/0xc0 net/socket.c:936 + pipe_to_sendpage+0x2da/0x3c0 fs/splice.c:458 + splice_from_pipe_feed fs/splice.c:512 [inline] + __splice_from_pipe+0x3ee/0x7c0 fs/splice.c:636 + splice_from_pipe+0x108/0x170 fs/splice.c:671 + generic_splice_sendpage+0x3c/0x50 fs/splice.c:842 + do_splice_from fs/splice.c:861 [inline] + direct_splice_actor+0x123/0x190 fs/splice.c:1035 + splice_direct_to_actor+0x3b4/0xa30 fs/splice.c:990 + do_splice_direct+0x1da/0x2a0 fs/splice.c:1078 + do_sendfile+0x597/0xd00 fs/read_write.c:1464 + __do_sys_sendfile64 fs/read_write.c:1525 [inline] + __se_sys_sendfile64 fs/read_write.c:1511 [inline] + __x64_sys_sendfile64+0x1dd/0x220 fs/read_write.c:1511 + do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 + entry_SYSCALL_64_after_hwframe+0x49/0xbe +RIP: 0033:0x441409 +Code: e8 ac e8 ff ff 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 +RSP: 002b:00007fffb64c4f78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028 +RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000441409 +RDX: 0000000000000000 RSI: 0000000000000006 RDI: 0000000000000005 +RBP: 0000000000073b8a R08: 0000000000000010 R09: 0000000000000010 +R10: 0000000000010001 R11: 0000000000000246 R12: 0000000000402180 +R13: 0000000000402210 R14: 0000000000000000 R15: 0000000000000000 +Kernel Offset: disabled +Rebooting in 86400 seconds.. + +Fixes: 1470ddf7f8ce ("inet: Remove explicit write references to sk/inet in ip_append_data") +Signed-off-by: Eric Dumazet +Reported-by: syzbot +Signed-off-by: David S. 
Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/linux/netdevice.h | 5 +++++
+ include/net/ip.h | 5 +++++
+ net/core/dev.c | 3 ++-
+ net/ipv4/devinet.c | 5 -----
+ net/ipv4/ip_output.c | 13 ++++++++-----
+ 5 files changed, 20 insertions(+), 11 deletions(-)
+
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -1848,6 +1848,11 @@ struct net_device {
+ unsigned char if_port;
+ unsigned char dma;
+
++ /* Note : dev->mtu is often read without holding a lock.
++ * Writers usually hold RTNL.
++ * It is recommended to use READ_ONCE() to annotate the reads,
++ * and to use WRITE_ONCE() to annotate the writes.
++ */
+ unsigned int mtu;
+ unsigned int min_mtu;
+ unsigned int max_mtu;
+--- a/include/net/ip.h
++++ b/include/net/ip.h
+@@ -759,4 +759,9 @@ int ip_misc_proc_init(void);
+ int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family,
+ struct netlink_ext_ack *extack);
+
++static inline bool inetdev_valid_mtu(unsigned int mtu)
++{
++ return likely(mtu >= IPV4_MIN_MTU);
++}
++
+ #endif /* _IP_H */
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -7662,7 +7662,8 @@ int __dev_set_mtu(struct net_device *dev
+ if (ops->ndo_change_mtu)
+ return ops->ndo_change_mtu(dev, new_mtu);
+
+- dev->mtu = new_mtu;
++ /* Pairs with all the lockless reads of dev->mtu in the stack */
++ WRITE_ONCE(dev->mtu, new_mtu);
+ return 0;
+ }
+ EXPORT_SYMBOL(__dev_set_mtu);
+--- a/net/ipv4/devinet.c
++++ b/net/ipv4/devinet.c
+@@ -1496,11 +1496,6 @@ skip:
+ }
+ }
+
+-static bool inetdev_valid_mtu(unsigned int mtu)
+-{
+- return mtu >= IPV4_MIN_MTU;
+-}
+-
+ static void inetdev_send_gratuitous_arp(struct net_device *dev,
+ struct in_device *in_dev)
+
+--- a/net/ipv4/ip_output.c
++++ b/net/ipv4/ip_output.c
+@@ -1258,15 +1258,18 @@ static int ip_setup_cork(struct sock *sk
+ cork->addr = ipc->addr;
+ }
+
+- /*
+- * We steal reference to this route, caller should not release it
+- */
+- *rtp = NULL;
+ cork->fragsize = ip_sk_use_pmtu(sk) ?
+- dst_mtu(&rt->dst) : rt->dst.dev->mtu;
++ dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
++
++ if (!inetdev_valid_mtu(cork->fragsize))
++ return -ENETUNREACH;
+
+ cork->gso_size = ipc->gso_size;
++
+ cork->dst = &rt->dst;
++ /* We stole this route, caller should not release it. */
++ *rtp = NULL;
++
+ cork->length = 0;
+ cork->ttl = ipc->ttl;
+ cork->tos = ipc->tos;
diff --git a/queue-5.3/mqprio-fix-out-of-bounds-access-in-mqprio_dump.patch b/queue-5.3/mqprio-fix-out-of-bounds-access-in-mqprio_dump.patch
new file mode 100644
index 00000000000..6f014817f3c
--- /dev/null
+++ b/queue-5.3/mqprio-fix-out-of-bounds-access-in-mqprio_dump.patch
@@ -0,0 +1,42 @@
+From foo@baz Tue 17 Dec 2019 08:14:58 PM CET
+From: Vladyslav Tarasiuk
+Date: Fri, 6 Dec 2019 13:51:05 +0000
+Subject: mqprio: Fix out-of-bounds access in mqprio_dump
+
+From: Vladyslav Tarasiuk
+
+[ Upstream commit 9f104c7736904ac72385bbb48669e0c923ca879b ]
+
+When a user runs a command like
+ tc qdisc add dev eth1 root mqprio
+a KASAN stack-out-of-bounds warning is emitted.
+Currently, the NLA_ALIGN macro used in mqprio_dump provides too large a
+buffer size as an argument for nla_put and memcpy down the call stack.
+The flow looks like this:
+1. nla_put expects the exact object size as an argument;
+2. later it provides this size to memcpy;
+3. to calculate the correct padding for the SKB, nla_put applies the
+ NLA_ALIGN macro itself.
+
+Therefore, NLA_ALIGN should not be applied to the nla_put parameter.
+Otherwise it will lead to an out-of-bounds memory access in memcpy.
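+
+A sketch of the difference (illustrative only; 'opt' stands for the
+on-stack struct tc_mqprio_qopt used in mqprio_dump):
+
+	/* nla_put() copies exactly the given length from 'opt' and
+	 * handles the alignment padding internally, so passing an
+	 * aligned-up length reads up to 3 bytes past the object:
+	 */
+	nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt);  /* bad  */
+	nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt);             /* good */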
+ +Fixes: 4e8b86c06269 ("mqprio: Introduce new hardware offload mode and shaper in mqprio") +Signed-off-by: Vladyslav Tarasiuk +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/sch_mqprio.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/sched/sch_mqprio.c ++++ b/net/sched/sch_mqprio.c +@@ -433,7 +433,7 @@ static int mqprio_dump(struct Qdisc *sch + opt.offset[tc] = dev->tc_to_txq[tc].offset; + } + +- if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt)) ++ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_MODE) && diff --git a/queue-5.3/net-bridge-deny-dev_set_mac_address-when-unregistering.patch b/queue-5.3/net-bridge-deny-dev_set_mac_address-when-unregistering.patch new file mode 100644 index 00000000000..e67dd9f32cc --- /dev/null +++ b/queue-5.3/net-bridge-deny-dev_set_mac_address-when-unregistering.patch @@ -0,0 +1,76 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Nikolay Aleksandrov +Date: Tue, 3 Dec 2019 16:48:06 +0200 +Subject: net: bridge: deny dev_set_mac_address() when unregistering + +From: Nikolay Aleksandrov + +[ Upstream commit c4b4c421857dc7b1cf0dccbd738472360ff2cd70 ] + +We have an interesting memory leak in the bridge when it is being +unregistered and is a slave to a master device which would change the +mac of its slaves on unregister (e.g. bond, team). This is a very +unusual setup but we do end up leaking 1 fdb entry because +dev_set_mac_address() would cause the bridge to insert the new mac address +into its table after all fdbs are flushed, i.e. after dellink() on the +bridge has finished and we call NETDEV_UNREGISTER the bond/team would +release it and will call dev_set_mac_address() to restore its original +address and that in turn will add an fdb in the bridge. +One fix is to check for the bridge dev's reg_state in its +ndo_set_mac_address callback and return an error if the bridge is not in +NETREG_REGISTERED. + +Easy steps to reproduce: + 1. add bond in mode != A/B + 2. add any slave to the bond + 3. add bridge dev as a slave to the bond + 4. destroy the bridge device + +Trace: + unreferenced object 0xffff888035c4d080 (size 128): + comm "ip", pid 4068, jiffies 4296209429 (age 1413.753s) + hex dump (first 32 bytes): + 41 1d c9 36 80 88 ff ff 00 00 00 00 00 00 00 00 A..6............ + d2 19 c9 5e 3f d7 00 00 00 00 00 00 00 00 00 00 ...^?........... 
+ backtrace:
+ [<00000000ddb525dc>] kmem_cache_alloc+0x155/0x26f
+ [<00000000633ff1e0>] fdb_create+0x21/0x486 [bridge]
+ [<0000000092b17e9c>] fdb_insert+0x91/0xdc [bridge]
+ [<00000000f2a0f0ff>] br_fdb_change_mac_address+0xb3/0x175 [bridge]
+ [<000000001de02dbd>] br_stp_change_bridge_id+0xf/0xff [bridge]
+ [<00000000ac0e32b1>] br_set_mac_address+0x76/0x99 [bridge]
+ [<000000006846a77f>] dev_set_mac_address+0x63/0x9b
+ [<00000000d30738fc>] __bond_release_one+0x3f6/0x455 [bonding]
+ [<00000000fc7ec01d>] bond_netdev_event+0x2f2/0x400 [bonding]
+ [<00000000305d7795>] notifier_call_chain+0x38/0x56
+ [<0000000028885d4a>] call_netdevice_notifiers+0x1e/0x23
+ [<000000008279477b>] rollback_registered_many+0x353/0x6a4
+ [<0000000018ef753a>] unregister_netdevice_many+0x17/0x6f
+ [<00000000ba854b7a>] rtnl_delete_link+0x3c/0x43
+ [<00000000adf8618d>] rtnl_dellink+0x1dc/0x20a
+ [<000000009b6395fd>] rtnetlink_rcv_msg+0x23d/0x268
+
+Fixes: 43598813386f ("bridge: add local MAC address to forwarding table (v2)")
+Reported-by: syzbot+2add91c08eb181fea1bf@syzkaller.appspotmail.com
+Signed-off-by: Nikolay Aleksandrov
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/bridge/br_device.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/net/bridge/br_device.c
++++ b/net/bridge/br_device.c
+@@ -253,6 +253,12 @@ static int br_set_mac_address(struct net
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
++ /* dev_set_mac_addr() can be called by a master device on bridge's
++ * NETDEV_UNREGISTER, but since it's being destroyed do nothing
++ */
++ if (dev->reg_state != NETREG_REGISTERED)
++ return -EBUSY;
++
+ spin_lock_bh(&br->lock);
+ if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) {
+ /* Mac address will be changed in br_stp_change_bridge_id(). */
diff --git a/queue-5.3/net-dsa-fix-flow-dissection-on-tx-path.patch b/queue-5.3/net-dsa-fix-flow-dissection-on-tx-path.patch
new file mode 100644
index 00000000000..5c754746cd9
--- /dev/null
+++ b/queue-5.3/net-dsa-fix-flow-dissection-on-tx-path.patch
@@ -0,0 +1,114 @@
+From foo@baz Tue 17 Dec 2019 08:14:58 PM CET
+From: Alexander Lobakin
+Date: Thu, 5 Dec 2019 13:02:35 +0300
+Subject: net: dsa: fix flow dissection on Tx path
+
+From: Alexander Lobakin
+
+[ Upstream commit 8bef0af09a5415df761b04fa487a6c34acae74bc ]
+
+Commit 43e665287f93 ("net-next: dsa: fix flow dissection") added an
+ability to override protocol and network offset during flow dissection
+for DSA-enabled devices (i.e. controllers shipped as switch CPU ports)
+in order to fix skb hashing for RPS on the Rx path.
+
+However, skb_hash() and the added part of code can be invoked not only
+on Rx, but also on the Tx path if we have a multi-queued device and:
+ - the kernel is running on a UP system, or
+ - XPS is not configured.
+
+The call stack in these two cases will be like: dev_queue_xmit() ->
+__dev_queue_xmit() -> netdev_core_pick_tx() -> netdev_pick_tx() ->
+skb_tx_hash() -> skb_get_hash().
+
+The problem is that skbs queued for Tx have both the network offset and
+the correct protocol already set up even after a CPU tag is inserted by
+the DSA tagger, so calling tag_ops->flow_dissect() on this path actually
+only breaks flow dissection and hashing.
+
+This can be observed by adding debug prints just before and right after
+the tag_ops->flow_dissect() call to the related block of code:
+
+Before the patch:
+
+Rx path (RPS):
+
+[ 19.240001] Rx: proto: 0x00f8, nhoff: 0 /* ETH_P_XDSA */
+[ 19.244271] tag_ops->flow_dissect()
+[ 19.247811] Rx: proto: 0x0800, nhoff: 8 /* ETH_P_IP */
+
+[ 19.215435] Rx: proto: 0x00f8, nhoff: 0 /* ETH_P_XDSA */
+[ 19.219746] tag_ops->flow_dissect()
+[ 19.223241] Rx: proto: 0x0806, nhoff: 8 /* ETH_P_ARP */
+
+[ 18.654057] Rx: proto: 0x00f8, nhoff: 0 /* ETH_P_XDSA */
+[ 18.658332] tag_ops->flow_dissect()
+[ 18.661826] Rx: proto: 0x8100, nhoff: 8 /* ETH_P_8021Q */
+
+Tx path (UP system):
+
+[ 18.759560] Tx: proto: 0x0800, nhoff: 26 /* ETH_P_IP */
+[ 18.763933] tag_ops->flow_dissect()
+[ 18.767485] Tx: proto: 0x920b, nhoff: 34 /* junk */
+
+[ 22.800020] Tx: proto: 0x0806, nhoff: 26 /* ETH_P_ARP */
+[ 22.804392] tag_ops->flow_dissect()
+[ 22.807921] Tx: proto: 0x920b, nhoff: 34 /* junk */
+
+[ 16.898342] Tx: proto: 0x86dd, nhoff: 26 /* ETH_P_IPV6 */
+[ 16.902705] tag_ops->flow_dissect()
+[ 16.906227] Tx: proto: 0x920b, nhoff: 34 /* junk */
+
+After:
+
+Rx path (RPS):
+
+[ 16.520993] Rx: proto: 0x00f8, nhoff: 0 /* ETH_P_XDSA */
+[ 16.525260] tag_ops->flow_dissect()
+[ 16.528808] Rx: proto: 0x0800, nhoff: 8 /* ETH_P_IP */
+
+[ 15.484807] Rx: proto: 0x00f8, nhoff: 0 /* ETH_P_XDSA */
+[ 15.490417] tag_ops->flow_dissect()
+[ 15.495223] Rx: proto: 0x0806, nhoff: 8 /* ETH_P_ARP */
+
+[ 17.134621] Rx: proto: 0x00f8, nhoff: 0 /* ETH_P_XDSA */
+[ 17.138895] tag_ops->flow_dissect()
+[ 17.142388] Rx: proto: 0x8100, nhoff: 8 /* ETH_P_8021Q */
+
+Tx path (UP system):
+
+[ 15.499558] Tx: proto: 0x0800, nhoff: 26 /* ETH_P_IP */
+
+[ 20.664689] Tx: proto: 0x0806, nhoff: 26 /* ETH_P_ARP */
+
+[ 18.565782] Tx: proto: 0x86dd, nhoff: 26 /* ETH_P_IPV6 */
+
+In order to fix that, we can add the check 'proto == htons(ETH_P_XDSA)'
+to prevent the code from calling tag_ops->flow_dissect() on Tx.
+I also decided to initialize the 'offset' variable so tagger callbacks
+can now safely leave it untouched without provoking chaos.
+
+Fixes: 43e665287f93 ("net-next: dsa: fix flow dissection")
+Signed-off-by: Alexander Lobakin
+Reviewed-by: Florian Fainelli
+Signed-off-by: David S. 
Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/core/flow_dissector.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/net/core/flow_dissector.c
++++ b/net/core/flow_dissector.c
+@@ -853,9 +853,10 @@ bool __skb_flow_dissect(const struct net
+ nhoff = skb_network_offset(skb);
+ hlen = skb_headlen(skb);
+ #if IS_ENABLED(CONFIG_NET_DSA)
+- if (unlikely(skb->dev && netdev_uses_dsa(skb->dev))) {
++ if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) &&
++ proto == htons(ETH_P_XDSA))) {
+ const struct dsa_device_ops *ops;
+- int offset;
++ int offset = 0;
+
+ ops = skb->dev->dsa_ptr->tag_ops;
+ if (ops->flow_dissect &&
diff --git a/queue-5.3/net-ethernet-ti-cpsw-fix-extra-rx-interrupt.patch b/queue-5.3/net-ethernet-ti-cpsw-fix-extra-rx-interrupt.patch
new file mode 100644
index 00000000000..1f4e79a92b0
--- /dev/null
+++ b/queue-5.3/net-ethernet-ti-cpsw-fix-extra-rx-interrupt.patch
@@ -0,0 +1,36 @@
+From foo@baz Tue 17 Dec 2019 08:14:58 PM CET
+From: Grygorii Strashko
+Date: Fri, 6 Dec 2019 14:28:20 +0200
+Subject: net: ethernet: ti: cpsw: fix extra rx interrupt
+
+From: Grygorii Strashko
+
+[ Upstream commit 51302f77bedab8768b761ed1899c08f89af9e4e2 ]
+
+Now the RX interrupt is triggered twice every time, because in
+cpsw_rx_interrupt() it is acked first and then disabled. So there will
+always be a pending interrupt when the RX interrupt is enabled again in
+the NAPI handler.
+
+Fix it by first disabling the IRQ and then doing the ack.
+
+Fixes: 870915feabdc ("drivers: net: cpsw: remove disable_irq/enable_irq as irq can be masked from cpsw itself")
+Signed-off-by: Grygorii Strashko
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/net/ethernet/ti/cpsw.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/ti/cpsw.c
++++ b/drivers/net/ethernet/ti/cpsw.c
+@@ -890,8 +890,8 @@ static irqreturn_t cpsw_rx_interrupt(int
+ {
+ struct cpsw_common *cpsw = dev_id;
+
+- cpdma_ctlr_eoi(cpsw->dma, CPDMA_EOI_RX);
+ writel(0, &cpsw->wr_regs->rx_en);
++ cpdma_ctlr_eoi(cpsw->dma, CPDMA_EOI_RX);
+
+ if (cpsw->quirk_irq) {
+ disable_irq_nosync(cpsw->irqs_table[0]);
diff --git a/queue-5.3/net-fixed-updating-of-ethertype-in-skb_mpls_push.patch b/queue-5.3/net-fixed-updating-of-ethertype-in-skb_mpls_push.patch
new file mode 100644
index 00000000000..b7faa4a8b68
--- /dev/null
+++ b/queue-5.3/net-fixed-updating-of-ethertype-in-skb_mpls_push.patch
@@ -0,0 +1,87 @@
+From foo@baz Tue 17 Dec 2019 08:14:58 PM CET
+From: Martin Varghese
+Date: Thu, 5 Dec 2019 05:57:22 +0530
+Subject: net: Fixed updating of ethertype in skb_mpls_push()
+
+From: Martin Varghese
+
+[ Upstream commit d04ac224b1688f005a84f764cfe29844f8e9da08 ]
+
+skb_mpls_push() was not updating the ethertype of an ethernet packet if
+the packet was originally received from a non-ARPHRD_ETHER device.
+
+In the below OVS data path flow, since the device corresponding to
+port 7 is an l3 device (ARPHRD_NONE), the skb_mpls_push function does
+not update the ethertype of the packet even though the previous
+push_eth action had added an ethernet header to the packet.
+
+recirc_id(0),in_port(7),eth_type(0x0800),ipv4(tos=0/0xfc,ttl=64,frag=no),
+actions:push_eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00),
+push_mpls(label=13,tc=0,ttl=64,bos=1,eth_type=0x8847),4
+
+Fixes: 8822e270d697 ("net: core: move push MPLS functionality from OvS to core helper")
+Signed-off-by: Martin Varghese
+Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/skbuff.h | 2 +- + net/core/skbuff.c | 4 ++-- + net/openvswitch/actions.c | 3 ++- + net/sched/act_mpls.c | 3 ++- + 4 files changed, 7 insertions(+), 5 deletions(-) + +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -3482,7 +3482,7 @@ int __skb_vlan_pop(struct sk_buff *skb, + int skb_vlan_pop(struct sk_buff *skb); + int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); + int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, +- int mac_len); ++ int mac_len, bool ethernet); + int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, + bool ethernet); + int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -5472,7 +5472,7 @@ static void skb_mod_eth_type(struct sk_b + * Returns 0 on success, -errno otherwise. + */ + int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, +- int mac_len) ++ int mac_len, bool ethernet) + { + struct mpls_shim_hdr *lse; + int err; +@@ -5503,7 +5503,7 @@ int skb_mpls_push(struct sk_buff *skb, _ + lse->label_stack_entry = mpls_lse; + skb_postpush_rcsum(skb, lse, MPLS_HLEN); + +- if (skb->dev && skb->dev->type == ARPHRD_ETHER) ++ if (ethernet) + skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); + skb->protocol = mpls_proto; + +--- a/net/openvswitch/actions.c ++++ b/net/openvswitch/actions.c +@@ -166,7 +166,8 @@ static int push_mpls(struct sk_buff *skb + int err; + + err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype, +- skb->mac_len); ++ skb->mac_len, ++ ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET); + if (err) + return err; + +--- a/net/sched/act_mpls.c ++++ b/net/sched/act_mpls.c +@@ -83,7 +83,8 @@ static int tcf_mpls_act(struct sk_buff * + break; + case TCA_MPLS_ACT_PUSH: + new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); +- if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len)) ++ if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len, ++ skb->dev && skb->dev->type == ARPHRD_ETHER)) + goto drop; + break; + case TCA_MPLS_ACT_MODIFY: diff --git a/queue-5.3/net-ipv6-add-net-argument-to-ip6_dst_lookup_flow.patch b/queue-5.3/net-ipv6-add-net-argument-to-ip6_dst_lookup_flow.patch new file mode 100644 index 00000000000..2d71e3a3c4e --- /dev/null +++ b/queue-5.3/net-ipv6-add-net-argument-to-ip6_dst_lookup_flow.patch @@ -0,0 +1,220 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Sabrina Dubroca +Date: Wed, 4 Dec 2019 15:35:52 +0100 +Subject: net: ipv6: add net argument to ip6_dst_lookup_flow + +From: Sabrina Dubroca + +[ Upstream commit c4e85f73afb6384123e5ef1bba3315b2e3ad031e ] + +This will be used in the conversion of ipv6_stub to ip6_dst_lookup_flow, +as some modules currently pass a net argument without a socket to +ip6_dst_lookup. This is equivalent to commit 343d60aada5a ("ipv6: change +ipv6_stub_impl.ipv6_dst_lookup to take net argument"). + +Signed-off-by: Sabrina Dubroca +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/ipv6.h | 2 +- + net/dccp/ipv6.c | 6 +++--- + net/ipv6/af_inet6.c | 2 +- + net/ipv6/datagram.c | 2 +- + net/ipv6/inet6_connection_sock.c | 4 ++-- + net/ipv6/ip6_output.c | 8 ++++---- + net/ipv6/raw.c | 2 +- + net/ipv6/syncookies.c | 2 +- + net/ipv6/tcp_ipv6.c | 4 ++-- + net/l2tp/l2tp_ip6.c | 2 +- + net/sctp/ipv6.c | 4 ++-- + 11 files changed, 19 insertions(+), 19 deletions(-) + +--- a/include/net/ipv6.h ++++ b/include/net/ipv6.h +@@ -1017,7 +1017,7 @@ static inline struct sk_buff *ip6_finish + + int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, + struct flowi6 *fl6); +-struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, ++struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst); + struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst, +--- a/net/dccp/ipv6.c ++++ b/net/dccp/ipv6.c +@@ -210,7 +210,7 @@ static int dccp_v6_send_response(const s + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); + +- dst = ip6_dst_lookup_flow(sk, &fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; +@@ -281,7 +281,7 @@ static void dccp_v6_ctl_send_reset(const + security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6)); + + /* sk = NULL, but it is safe for now. RST socket required. */ +- dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); ++ dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); + if (!IS_ERR(dst)) { + skb_dst_set(skb, dst); + ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0); +@@ -911,7 +911,7 @@ static int dccp_v6_connect(struct sock * + opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); + +- dst = ip6_dst_lookup_flow(sk, &fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto failure; +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ -765,7 +765,7 @@ int inet6_sk_rebuild_header(struct sock + &final); + rcu_read_unlock(); + +- dst = ip6_dst_lookup_flow(sk, &fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + if (IS_ERR(dst)) { + sk->sk_route_caps = 0; + sk->sk_err_soft = -PTR_ERR(dst); +--- a/net/ipv6/datagram.c ++++ b/net/ipv6/datagram.c +@@ -85,7 +85,7 @@ int ip6_datagram_dst_update(struct sock + final_p = fl6_update_dst(&fl6, opt, &final); + rcu_read_unlock(); + +- dst = ip6_dst_lookup_flow(sk, &fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto out; +--- a/net/ipv6/inet6_connection_sock.c ++++ b/net/ipv6/inet6_connection_sock.c +@@ -48,7 +48,7 @@ struct dst_entry *inet6_csk_route_req(co + fl6->flowi6_uid = sk->sk_uid; + security_req_classify_flow(req, flowi6_to_flowi(fl6)); + +- dst = ip6_dst_lookup_flow(sk, fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + if (IS_ERR(dst)) + return NULL; + +@@ -103,7 +103,7 @@ static struct dst_entry *inet6_csk_route + + dst = __inet6_csk_dst_check(sk, np->dst_cookie); + if (!dst) { +- dst = ip6_dst_lookup_flow(sk, fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + + if (!IS_ERR(dst)) + ip6_dst_store(sk, dst, NULL, NULL); +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -1144,19 
+1144,19 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup); + * It returns a valid dst pointer on success, or a pointer encoded + * error code. + */ +-struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, ++struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst) + { + struct dst_entry *dst = NULL; + int err; + +- err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); ++ err = ip6_dst_lookup_tail(net, sk, &dst, fl6); + if (err) + return ERR_PTR(err); + if (final_dst) + fl6->daddr = *final_dst; + +- return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); ++ return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); + } + EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); + +@@ -1188,7 +1188,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow + if (dst) + return dst; + +- dst = ip6_dst_lookup_flow(sk, fl6, final_dst); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); + if (connected && !IS_ERR(dst)) + ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); + +--- a/net/ipv6/raw.c ++++ b/net/ipv6/raw.c +@@ -923,7 +923,7 @@ static int rawv6_sendmsg(struct sock *sk + + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + +- dst = ip6_dst_lookup_flow(sk, &fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto out; +--- a/net/ipv6/syncookies.c ++++ b/net/ipv6/syncookies.c +@@ -235,7 +235,7 @@ struct sock *cookie_v6_check(struct sock + fl6.flowi6_uid = sk->sk_uid; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + +- dst = ip6_dst_lookup_flow(sk, &fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + if (IS_ERR(dst)) + goto out_free; + } +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -275,7 +275,7 @@ static int tcp_v6_connect(struct sock *s + + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + +- dst = ip6_dst_lookup_flow(sk, &fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto failure; +@@ -904,7 +904,7 @@ static void tcp_v6_send_response(const s + * Underlying function will use this to retrieve the network + * namespace + */ +- dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); ++ dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); + if (!IS_ERR(dst)) { + skb_dst_set(buff, dst); + ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass); +--- a/net/l2tp/l2tp_ip6.c ++++ b/net/l2tp/l2tp_ip6.c +@@ -615,7 +615,7 @@ static int l2tp_ip6_sendmsg(struct sock + + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + +- dst = ip6_dst_lookup_flow(sk, &fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto out; +--- a/net/sctp/ipv6.c ++++ b/net/sctp/ipv6.c +@@ -275,7 +275,7 @@ static void sctp_v6_get_dst(struct sctp_ + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); + +- dst = ip6_dst_lookup_flow(sk, fl6, final_p); ++ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + if (!asoc || saddr) + goto out; + +@@ -328,7 +328,7 @@ static void sctp_v6_get_dst(struct sctp_ + fl6->saddr = laddr->a.v6.sin6_addr; + fl6->fl6_sport = laddr->a.v6.sin6_port; + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); +- bdst = ip6_dst_lookup_flow(sk, fl6, final_p); ++ bdst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + + if (IS_ERR(bdst)) + continue; diff 
--git a/queue-5.3/net-ipv6_stub-use-ip6_dst_lookup_flow-instead-of-ip6_dst_lookup.patch b/queue-5.3/net-ipv6_stub-use-ip6_dst_lookup_flow-instead-of-ip6_dst_lookup.patch new file mode 100644 index 00000000000..c4e1f4c564c --- /dev/null +++ b/queue-5.3/net-ipv6_stub-use-ip6_dst_lookup_flow-instead-of-ip6_dst_lookup.patch @@ -0,0 +1,243 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Sabrina Dubroca +Date: Wed, 4 Dec 2019 15:35:53 +0100 +Subject: net: ipv6_stub: use ip6_dst_lookup_flow instead of ip6_dst_lookup + +From: Sabrina Dubroca + +[ Upstream commit 6c8991f41546c3c472503dff1ea9daaddf9331c2 ] + +ipv6_stub uses the ip6_dst_lookup function to allow other modules to +perform IPv6 lookups. However, this function skips the XFRM layer +entirely. + +All users of ipv6_stub->ip6_dst_lookup use ip_route_output_flow (via the +ip_route_output_key and ip_route_output helpers) for their IPv4 lookups, +which calls xfrm_lookup_route(). This patch fixes this inconsistent +behavior by switching the stub to ip6_dst_lookup_flow, which also calls +xfrm_lookup_route(). + +This requires some changes in all the callers, as these two functions +take different arguments and have different return types. + +Fixes: 5f81bd2e5d80 ("ipv6: export a stub for IPv6 symbols used by vxlan") +Reported-by: Xiumei Mu +Signed-off-by: Sabrina Dubroca +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/infiniband/core/addr.c | 7 +++---- + drivers/infiniband/sw/rxe/rxe_net.c | 8 +++++--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 8 ++++---- + drivers/net/geneve.c | 4 +++- + drivers/net/vxlan.c | 8 +++----- + include/net/ipv6_stubs.h | 6 ++++-- + net/core/lwt_bpf.c | 4 +--- + net/ipv6/addrconf_core.c | 11 ++++++----- + net/ipv6/af_inet6.c | 2 +- + net/mpls/af_mpls.c | 7 +++---- + net/tipc/udp_media.c | 9 ++++++--- + 11 files changed, 39 insertions(+), 35 deletions(-) + +--- a/drivers/infiniband/core/addr.c ++++ b/drivers/infiniband/core/addr.c +@@ -421,16 +421,15 @@ static int addr6_resolve(struct sockaddr + (const struct sockaddr_in6 *)dst_sock; + struct flowi6 fl6; + struct dst_entry *dst; +- int ret; + + memset(&fl6, 0, sizeof fl6); + fl6.daddr = dst_in->sin6_addr; + fl6.saddr = src_in->sin6_addr; + fl6.flowi6_oif = addr->bound_dev_if; + +- ret = ipv6_stub->ipv6_dst_lookup(addr->net, NULL, &dst, &fl6); +- if (ret < 0) +- return ret; ++ dst = ipv6_stub->ipv6_dst_lookup_flow(addr->net, NULL, &fl6, NULL); ++ if (IS_ERR(dst)) ++ return PTR_ERR(dst); + + if (ipv6_addr_any(&src_in->sin6_addr)) + src_in->sin6_addr = fl6.saddr; +--- a/drivers/infiniband/sw/rxe/rxe_net.c ++++ b/drivers/infiniband/sw/rxe/rxe_net.c +@@ -117,10 +117,12 @@ static struct dst_entry *rxe_find_route6 + memcpy(&fl6.daddr, daddr, sizeof(*daddr)); + fl6.flowi6_proto = IPPROTO_UDP; + +- if (unlikely(ipv6_stub->ipv6_dst_lookup(sock_net(recv_sockets.sk6->sk), +- recv_sockets.sk6->sk, &ndst, &fl6))) { ++ ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk), ++ recv_sockets.sk6->sk, &fl6, ++ NULL); ++ if (unlikely(IS_ERR(ndst))) { + pr_err_ratelimited("no route to %pI6\n", daddr); +- goto put; ++ return NULL; + } + + if (unlikely(ndst->error)) { +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +@@ -137,10 +137,10 @@ static int mlx5e_route_lookup_ipv6(struc + #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) + int ret; + +- ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst, +- fl6); +- if (ret < 0) +- return ret; ++ 
dst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(mirred_dev), NULL, fl6, ++ NULL); ++ if (IS_ERR(dst)) ++ return PTR_ERR(dst); + + if (!(*out_ttl)) + *out_ttl = ip6_dst_hoplimit(dst); +--- a/drivers/net/geneve.c ++++ b/drivers/net/geneve.c +@@ -853,7 +853,9 @@ static struct dst_entry *geneve_get_v6_d + if (dst) + return dst; + } +- if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) { ++ dst = ipv6_stub->ipv6_dst_lookup_flow(geneve->net, gs6->sock->sk, fl6, ++ NULL); ++ if (IS_ERR(dst)) { + netdev_dbg(dev, "no route to %pI6\n", &fl6->daddr); + return ERR_PTR(-ENETUNREACH); + } +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -2276,7 +2276,6 @@ static struct dst_entry *vxlan6_get_rout + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); + struct dst_entry *ndst; + struct flowi6 fl6; +- int err; + + if (!sock6) + return ERR_PTR(-EIO); +@@ -2299,10 +2298,9 @@ static struct dst_entry *vxlan6_get_rout + fl6.fl6_dport = dport; + fl6.fl6_sport = sport; + +- err = ipv6_stub->ipv6_dst_lookup(vxlan->net, +- sock6->sock->sk, +- &ndst, &fl6); +- if (unlikely(err < 0)) { ++ ndst = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk, ++ &fl6, NULL); ++ if (unlikely(IS_ERR(ndst))) { + netdev_dbg(dev, "no route to %pI6\n", daddr); + return ERR_PTR(-ENETUNREACH); + } +--- a/include/net/ipv6_stubs.h ++++ b/include/net/ipv6_stubs.h +@@ -24,8 +24,10 @@ struct ipv6_stub { + const struct in6_addr *addr); + int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, + const struct in6_addr *addr); +- int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, +- struct dst_entry **dst, struct flowi6 *fl6); ++ struct dst_entry *(*ipv6_dst_lookup_flow)(struct net *net, ++ const struct sock *sk, ++ struct flowi6 *fl6, ++ const struct in6_addr *final_dst); + int (*ipv6_route_input)(struct sk_buff *skb); + + struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); +--- a/net/core/lwt_bpf.c ++++ b/net/core/lwt_bpf.c +@@ -230,9 +230,7 @@ static int bpf_lwt_xmit_reroute(struct s + fl6.daddr = iph6->daddr; + fl6.saddr = iph6->saddr; + +- err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); +- if (unlikely(err)) +- goto err; ++ dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto err; +--- a/net/ipv6/addrconf_core.c ++++ b/net/ipv6/addrconf_core.c +@@ -128,11 +128,12 @@ int inet6addr_validator_notifier_call_ch + } + EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain); + +-static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1, +- struct dst_entry **u2, +- struct flowi6 *u3) ++static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net, ++ const struct sock *sk, ++ struct flowi6 *fl6, ++ const struct in6_addr *final_dst) + { +- return -EAFNOSUPPORT; ++ return ERR_PTR(-EAFNOSUPPORT); + } + + static int eafnosupport_ipv6_route_input(struct sk_buff *skb) +@@ -189,7 +190,7 @@ static int eafnosupport_ip6_del_rt(struc + } + + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { +- .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, ++ .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, + .ipv6_route_input = eafnosupport_ipv6_route_input, + .fib6_get_table = eafnosupport_fib6_get_table, + .fib6_table_lookup = eafnosupport_fib6_table_lookup, +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ -946,7 +946,7 @@ static int ipv6_route_input(struct sk_bu + static const struct ipv6_stub ipv6_stub_impl = { + .ipv6_sock_mc_join = ipv6_sock_mc_join, + .ipv6_sock_mc_drop = 
ipv6_sock_mc_drop,
+- .ipv6_dst_lookup = ip6_dst_lookup,
++ .ipv6_dst_lookup_flow = ip6_dst_lookup_flow,
+ .ipv6_route_input = ipv6_route_input,
+ .fib6_get_table = fib6_get_table,
+ .fib6_table_lookup = fib6_table_lookup,
+--- a/net/mpls/af_mpls.c
++++ b/net/mpls/af_mpls.c
+@@ -617,16 +617,15 @@ static struct net_device *inet6_fib_look
+ struct net_device *dev;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+- int err;
+
+ if (!ipv6_stub)
+ return ERR_PTR(-EAFNOSUPPORT);
+
+ memset(&fl6, 0, sizeof(fl6));
+ memcpy(&fl6.daddr, addr, sizeof(struct in6_addr));
+- err = ipv6_stub->ipv6_dst_lookup(net, NULL, &dst, &fl6);
+- if (err)
+- return ERR_PTR(err);
++ dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
++ if (IS_ERR(dst))
++ return ERR_CAST(dst);
+
+ dev = dst->dev;
+ dev_hold(dev);
+--- a/net/tipc/udp_media.c
++++ b/net/tipc/udp_media.c
+@@ -195,10 +195,13 @@ static int tipc_udp_xmit(struct net *net
+ .saddr = src->ipv6,
+ .flowi6_proto = IPPROTO_UDP
+ };
+- err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk,
+- &ndst, &fl6);
+- if (err)
++ ndst = ipv6_stub->ipv6_dst_lookup_flow(net,
++ ub->ubsock->sk,
++ &fl6, NULL);
++ if (IS_ERR(ndst)) {
++ err = PTR_ERR(ndst);
+ goto tx_error;
++ }
+ dst_cache_set_ip6(cache, ndst, &fl6.saddr);
+ }
+ ttl = ip6_dst_hoplimit(ndst);
diff --git a/queue-5.3/net-mlx5e-fix-txq-indices-to-be-sequential.patch b/queue-5.3/net-mlx5e-fix-txq-indices-to-be-sequential.patch
new file mode 100644
index 00000000000..15613ba40d4
--- /dev/null
+++ b/queue-5.3/net-mlx5e-fix-txq-indices-to-be-sequential.patch
@@ -0,0 +1,150 @@
+From foo@baz Tue 17 Dec 2019 08:14:58 PM CET
+From: Eran Ben Elisha
+Date: Mon, 25 Nov 2019 12:11:49 +0200
+Subject: net/mlx5e: Fix TXQ indices to be sequential
+
+From: Eran Ben Elisha
+
+[ Upstream commit c55d8b108caa2ec1ae8dddd02cb9d3a740f7c838 ]
+
+Cited patch changed the (channel index, tc) => (TXQ index) mapping to
+be a static one, in order to keep indices consistent when changing the
+number of channels or TCs.
+
+For 32 channels (OOB) and 8 TCs, the real number of TXQs is 256.
+When reducing the number of channels to 8, the real number of TXQs will
+be changed to 64.
+This indexing method is buggy:
+- For channel #0, TC 3, the TXQ index is 96.
+- Index 8 is not valid, as there is no such TXQ from the driver's
+ perspective (as it represents channel #8, TC 0, which is not valid
+ with the above configuration).
+
+As part of the driver's select queue, it calls netdev_pick_tx which
+returns an index in the range of the real number of TXQs. Depending on
+the return value, with the examples above, the driver could have
+returned an index larger than the real number of tx queues, or crashed
+the kernel as it tried to read the invalid address of an SQ which was
+not allocated.
+
+Fix that by allocating sequential TXQ indices, and holding a new mapping
+between (channel index, tc) => (real TXQ index). This mapping is updated
+as part of priv channels activation, and is used in mlx5e_select_queue
+to find the selected queue index.
+
+The existing indices mapping (channel_tc2txq) is no longer needed, as it
+is used only for statistics structures and can be calculated at run
+time. Delete its definition and its updates.
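+
+With the example above (8 channels, 8 TCs), the new scheme maps as
+follows (a sketch of the arithmetic, not the exact code):
+
+	/* real TXQ index for (channel ch, traffic class tc) */
+	txq_ix = ch + tc * num_channels;	/* ch 0, tc 3 -> 24, not 96 */
+
+so every index in [0, real_num_tx_queues) is backed by an allocated SQ.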
+ +Fixes: 8bfaf07f7806 ("net/mlx5e: Present SW stats when state is not opened") +Signed-off-by: Eran Ben Elisha +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 - + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 31 ++++++++------------- + drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 2 - + drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 - + 4 files changed, 15 insertions(+), 22 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h +@@ -792,7 +792,7 @@ struct mlx5e_xsk { + struct mlx5e_priv { + /* priv data path fields - start */ + struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC]; +- int channel_tc2txq[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC]; ++ int channel_tc2realtxq[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC]; + #ifdef CONFIG_MLX5_CORE_EN_DCB + struct mlx5e_dcbx_dp dcbx_dp; + #endif +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +@@ -1678,11 +1678,10 @@ static int mlx5e_open_sqs(struct mlx5e_c + struct mlx5e_params *params, + struct mlx5e_channel_param *cparam) + { +- struct mlx5e_priv *priv = c->priv; + int err, tc; + + for (tc = 0; tc < params->num_tc; tc++) { +- int txq_ix = c->ix + tc * priv->max_nch; ++ int txq_ix = c->ix + tc * params->num_channels; + + err = mlx5e_open_txqsq(c, c->priv->tisn[tc], txq_ix, + params, &cparam->sq, &c->sq[tc], tc); +@@ -2856,26 +2855,21 @@ static void mlx5e_netdev_set_tcs(struct + netdev_set_tc_queue(netdev, tc, nch, 0); + } + +-static void mlx5e_build_tc2txq_maps(struct mlx5e_priv *priv) ++static void mlx5e_build_txq_maps(struct mlx5e_priv *priv) + { +- int i, tc; ++ int i, ch; + +- for (i = 0; i < priv->max_nch; i++) +- for (tc = 0; tc < priv->profile->max_tc; tc++) +- priv->channel_tc2txq[i][tc] = i + tc * priv->max_nch; +-} ++ ch = priv->channels.num; + +-static void mlx5e_build_tx2sq_maps(struct mlx5e_priv *priv) +-{ +- struct mlx5e_channel *c; +- struct mlx5e_txqsq *sq; +- int i, tc; ++ for (i = 0; i < ch; i++) { ++ int tc; ++ ++ for (tc = 0; tc < priv->channels.params.num_tc; tc++) { ++ struct mlx5e_channel *c = priv->channels.c[i]; ++ struct mlx5e_txqsq *sq = &c->sq[tc]; + +- for (i = 0; i < priv->channels.num; i++) { +- c = priv->channels.c[i]; +- for (tc = 0; tc < c->num_tc; tc++) { +- sq = &c->sq[tc]; + priv->txq2sq[sq->txq_ix] = sq; ++ priv->channel_tc2realtxq[i][tc] = i + tc * ch; + } + } + } +@@ -2890,7 +2884,7 @@ void mlx5e_activate_priv_channels(struct + netif_set_real_num_tx_queues(netdev, num_txqs); + netif_set_real_num_rx_queues(netdev, num_rxqs); + +- mlx5e_build_tx2sq_maps(priv); ++ mlx5e_build_txq_maps(priv); + mlx5e_activate_channels(&priv->channels); + mlx5e_xdp_tx_enable(priv); + netif_tx_start_all_queues(priv->netdev); +@@ -4968,7 +4962,6 @@ static int mlx5e_nic_init(struct mlx5_co + if (err) + mlx5_core_err(mdev, "TLS initialization failed, %d\n", err); + mlx5e_build_nic_netdev(netdev); +- mlx5e_build_tc2txq_maps(priv); + + return 0; + } +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +@@ -1435,7 +1435,7 @@ static int mlx5e_grp_channels_fill_strin + for (j = 0; j < NUM_SQ_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + sq_stats_desc[j].format, +- priv->channel_tc2txq[i][tc]); ++ i + tc * max_nch); + + for (i = 0; i < max_nch; i++) { + for (j = 0; j < NUM_XSKSQ_STATS * is_xsk; j++) +--- 
a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +@@ -93,7 +93,7 @@ u16 mlx5e_select_queue(struct net_device + if (txq_ix >= num_channels) + txq_ix = priv->txq2sq[txq_ix]->ch_ix; + +- return priv->channel_tc2txq[txq_ix][up]; ++ return priv->channel_tc2realtxq[txq_ix][up]; + } + + static inline int mlx5e_skb_l2_header_offset(struct sk_buff *skb) diff --git a/queue-5.3/net-mlx5e-query-global-pause-state-before-setting-prio2buffer.patch b/queue-5.3/net-mlx5e-query-global-pause-state-before-setting-prio2buffer.patch new file mode 100644 index 00000000000..3e7e1bd9273 --- /dev/null +++ b/queue-5.3/net-mlx5e-query-global-pause-state-before-setting-prio2buffer.patch @@ -0,0 +1,78 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Huy Nguyen +Date: Fri, 6 Sep 2019 09:28:46 -0500 +Subject: net/mlx5e: Query global pause state before setting prio2buffer + +From: Huy Nguyen + +[ Upstream commit 73e6551699a32fac703ceea09214d6580edcf2d5 ] + +When the user changes prio2buffer mapping while global pause is +enabled, mlx5 driver incorrectly sets all active buffers +(buffer that has at least one priority mapped) to lossy. + +Solution: +If global pause is enabled, set all the active buffers to lossless +in prio2buffer command. +Also, add error message when buffer size is not enough to meet +xoff threshold. + +Fixes: 0696d60853d5 ("net/mlx5e: Receive buffer configuration") +Signed-off-by: Huy Nguyen +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c | 27 +++++++++++++-- + 1 file changed, 25 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +@@ -155,8 +155,11 @@ static int update_xoff_threshold(struct + } + + if (port_buffer->buffer[i].size < +- (xoff + max_mtu + (1 << MLX5E_BUFFER_CELL_SHIFT))) ++ (xoff + max_mtu + (1 << MLX5E_BUFFER_CELL_SHIFT))) { ++ pr_err("buffer_size[%d]=%d is not enough for lossless buffer\n", ++ i, port_buffer->buffer[i].size); + return -ENOMEM; ++ } + + port_buffer->buffer[i].xoff = port_buffer->buffer[i].size - xoff; + port_buffer->buffer[i].xon = +@@ -232,6 +235,26 @@ static int update_buffer_lossy(unsigned + return 0; + } + ++static int fill_pfc_en(struct mlx5_core_dev *mdev, u8 *pfc_en) ++{ ++ u32 g_rx_pause, g_tx_pause; ++ int err; ++ ++ err = mlx5_query_port_pause(mdev, &g_rx_pause, &g_tx_pause); ++ if (err) ++ return err; ++ ++ /* If global pause enabled, set all active buffers to lossless. ++ * Otherwise, check PFC setting. 
++ */ ++ if (g_rx_pause || g_tx_pause) ++ *pfc_en = 0xff; ++ else ++ err = mlx5_query_port_pfc(mdev, pfc_en, NULL); ++ ++ return err; ++} ++ + #define MINIMUM_MAX_MTU 9216 + int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, + u32 change, unsigned int mtu, +@@ -277,7 +300,7 @@ int mlx5e_port_manual_buffer_config(stru + + if (change & MLX5E_PORT_BUFFER_PRIO2BUFFER) { + update_prio2buffer = true; +- err = mlx5_query_port_pfc(priv->mdev, &curr_pfc_en, NULL); ++ err = fill_pfc_en(priv->mdev, &curr_pfc_en); + if (err) + return err; + diff --git a/queue-5.3/net-sched-fix-dump-qlen-for-sch_mq-sch_mqprio-with-nolock-subqueues.patch b/queue-5.3/net-sched-fix-dump-qlen-for-sch_mq-sch_mqprio-with-nolock-subqueues.patch new file mode 100644 index 00000000000..bc45a18dae6 --- /dev/null +++ b/queue-5.3/net-sched-fix-dump-qlen-for-sch_mq-sch_mqprio-with-nolock-subqueues.patch @@ -0,0 +1,42 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Dust Li +Date: Tue, 3 Dec 2019 11:17:40 +0800 +Subject: net: sched: fix dump qlen for sch_mq/sch_mqprio with NOLOCK subqueues + +From: Dust Li + +[ Upstream commit 2f23cd42e19c22c24ff0e221089b7b6123b117c5 ] + +sch->q.len hasn't been set if the subqueue is a NOLOCK qdisc + in mq_dump() and mqprio_dump(). + +Fixes: ce679e8df7ed ("net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mqprio") +Signed-off-by: Dust Li +Signed-off-by: Tony Lu +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/sch_mq.c | 1 + + net/sched/sch_mqprio.c | 1 + + 2 files changed, 2 insertions(+) + +--- a/net/sched/sch_mq.c ++++ b/net/sched/sch_mq.c +@@ -153,6 +153,7 @@ static int mq_dump(struct Qdisc *sch, st + __gnet_stats_copy_queue(&sch->qstats, + qdisc->cpu_qstats, + &qdisc->qstats, qlen); ++ sch->q.qlen += qlen; + } else { + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; +--- a/net/sched/sch_mqprio.c ++++ b/net/sched/sch_mqprio.c +@@ -411,6 +411,7 @@ static int mqprio_dump(struct Qdisc *sch + __gnet_stats_copy_queue(&sch->qstats, + qdisc->cpu_qstats, + &qdisc->qstats, qlen); ++ sch->q.qlen += qlen; + } else { + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; diff --git a/queue-5.3/net-sysfs-call-dev_hold-always-in-netdev_queue_add_kobject.patch b/queue-5.3/net-sysfs-call-dev_hold-always-in-netdev_queue_add_kobject.patch new file mode 100644 index 00000000000..b58326740ee --- /dev/null +++ b/queue-5.3/net-sysfs-call-dev_hold-always-in-netdev_queue_add_kobject.patch @@ -0,0 +1,46 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Jouni Hogander +Date: Thu, 5 Dec 2019 15:57:07 +0200 +Subject: net-sysfs: Call dev_hold always in netdev_queue_add_kobject + +From: Jouni Hogander + +[ Upstream commit e0b60903b434a7ee21ba8d8659f207ed84101e89 ] + +Dev_hold has to be called always in netdev_queue_add_kobject. +Otherwise usage count drops below 0 in case of failure in +kobject_init_and_add. + +Fixes: b8eb718348b8 ("net-sysfs: Fix reference count leak in rx|netdev_queue_add_kobject") +Reported-by: Hulk Robot +Cc: Tetsuo Handa +Cc: David Miller +Cc: Lukas Bulwahn +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/net-sysfs.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/net/core/net-sysfs.c ++++ b/net/core/net-sysfs.c +@@ -1457,14 +1457,17 @@ static int netdev_queue_add_kobject(stru + struct kobject *kobj = &queue->kobj; + int error = 0; + ++ /* Kobject_put later will trigger netdev_queue_release call ++ * which decreases dev refcount: Take that reference here ++ */ ++ dev_hold(queue->dev); ++ + kobj->kset = dev->queues_kset; + error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, + "tx-%u", index); + if (error) + return error; + +- dev_hold(queue->dev); +- + #ifdef CONFIG_BQL + error = sysfs_create_group(kobj, &dql_group); + if (error) { diff --git a/queue-5.3/net-thunderx-start-phy-before-starting-autonegotiation.patch b/queue-5.3/net-thunderx-start-phy-before-starting-autonegotiation.patch new file mode 100644 index 00000000000..ee81effcf7b --- /dev/null +++ b/queue-5.3/net-thunderx-start-phy-before-starting-autonegotiation.patch @@ -0,0 +1,39 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Mian Yousaf Kaukab +Date: Thu, 5 Dec 2019 10:41:16 +0100 +Subject: net: thunderx: start phy before starting autonegotiation + +From: Mian Yousaf Kaukab + +[ Upstream commit a350d2e7adbb57181d33e3aa6f0565632747feaa ] + +Since commit 2b3e88ea6528 ("net: phy: improve phy state checking") +phy_start_aneg() expects phy state to be >= PHY_UP. Call phy_start() +before calling phy_start_aneg() during probe so that autonegotiation +is initiated. + +As phy_start() takes care of calling phy_start_aneg(), drop the explicit +call to phy_start_aneg(). + +Network fails without this patch on Octeon TX. + +Fixes: 2b3e88ea6528 ("net: phy: improve phy state checking") +Signed-off-by: Mian Yousaf Kaukab +Reviewed-by: Andrew Lunn +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c ++++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +@@ -1115,7 +1115,7 @@ static int bgx_lmac_enable(struct bgx *b + phy_interface_mode(lmac->lmac_type))) + return -ENODEV; + +- phy_start_aneg(lmac->phydev); ++ phy_start(lmac->phydev); + return 0; + } + diff --git a/queue-5.3/net-tls-fix-return-values-to-avoid-enotsupp.patch b/queue-5.3/net-tls-fix-return-values-to-avoid-enotsupp.patch new file mode 100644 index 00000000000..f3e8bcb9afc --- /dev/null +++ b/queue-5.3/net-tls-fix-return-values-to-avoid-enotsupp.patch @@ -0,0 +1,147 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Valentin Vidic +Date: Thu, 5 Dec 2019 07:41:18 +0100 +Subject: net/tls: Fix return values to avoid ENOTSUPP + +From: Valentin Vidic + +[ Upstream commit 4a5cdc604b9cf645e6fa24d8d9f055955c3c8516 ] + +ENOTSUPP is not available in userspace, for example: + + setsockopt failed, 524, Unknown error 524 + +Signed-off-by: Valentin Vidic +Acked-by: Jakub Kicinski +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/tls/tls_device.c | 8 ++++---- + net/tls/tls_main.c | 4 ++-- + net/tls/tls_sw.c | 8 ++++---- + tools/testing/selftests/net/tls.c | 8 ++------ + 4 files changed, 12 insertions(+), 16 deletions(-) + +--- a/net/tls/tls_device.c ++++ b/net/tls/tls_device.c +@@ -385,7 +385,7 @@ static int tls_push_data(struct sock *sk + + if (flags & + ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + + if (sk->sk_err) + return -sk->sk_err; +@@ -519,7 +519,7 @@ int tls_device_sendpage(struct sock *sk, + lock_sock(sk); + + if (flags & MSG_OOB) { +- rc = -ENOTSUPP; ++ rc = -EOPNOTSUPP; + goto out; + } + +@@ -961,7 +961,7 @@ int tls_set_device_offload(struct sock * + } + + if (!(netdev->features & NETIF_F_HW_TLS_TX)) { +- rc = -ENOTSUPP; ++ rc = -EOPNOTSUPP; + goto release_netdev; + } + +@@ -1034,7 +1034,7 @@ int tls_set_device_offload_rx(struct soc + } + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { +- rc = -ENOTSUPP; ++ rc = -EOPNOTSUPP; + goto release_netdev; + } + +--- a/net/tls/tls_main.c ++++ b/net/tls/tls_main.c +@@ -473,7 +473,7 @@ static int do_tls_setsockopt_conf(struct + /* check version */ + if (crypto_info->version != TLS_1_2_VERSION && + crypto_info->version != TLS_1_3_VERSION) { +- rc = -ENOTSUPP; ++ rc = -EINVAL; + goto err_crypto_info; + } + +@@ -782,7 +782,7 @@ static int tls_init(struct sock *sk) + * share the ulp context. + */ + if (sk->sk_state != TCP_ESTABLISHED) +- return -ENOTSUPP; ++ return -ENOTCONN; + + tls_build_proto(sk); + +--- a/net/tls/tls_sw.c ++++ b/net/tls/tls_sw.c +@@ -900,7 +900,7 @@ int tls_sw_sendmsg(struct sock *sk, stru + int ret = 0; + + if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + + mutex_lock(&tls_ctx->tx_lock); + lock_sock(sk); +@@ -1215,7 +1215,7 @@ int tls_sw_sendpage_locked(struct sock * + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY | + MSG_NO_SHARED_FRAGS)) +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + + return tls_sw_do_sendpage(sk, page, offset, size, flags); + } +@@ -1228,7 +1228,7 @@ int tls_sw_sendpage(struct sock *sk, str + + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) +- return -ENOTSUPP; ++ return -EOPNOTSUPP; + + mutex_lock(&tls_ctx->tx_lock); + lock_sock(sk); +@@ -1928,7 +1928,7 @@ ssize_t tls_sw_splice_read(struct socket + + /* splice does not support reading control messages */ + if (ctx->control != TLS_RECORD_TYPE_DATA) { +- err = -ENOTSUPP; ++ err = -EINVAL; + goto splice_read_end; + } + +--- a/tools/testing/selftests/net/tls.c ++++ b/tools/testing/selftests/net/tls.c +@@ -25,10 +25,6 @@ + #define TLS_PAYLOAD_MAX_LEN 16384 + #define SOL_TLS 282 + +-#ifndef ENOTSUPP +-#define ENOTSUPP 524 +-#endif +- + FIXTURE(tls_basic) + { + int fd, cfd; +@@ -1205,11 +1201,11 @@ TEST(non_established) { + /* TLS ULP not supported */ + if (errno == ENOENT) + return; +- EXPECT_EQ(errno, ENOTSUPP); ++ EXPECT_EQ(errno, ENOTCONN); + + ret = setsockopt(sfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")); + EXPECT_EQ(ret, -1); +- EXPECT_EQ(errno, ENOTSUPP); ++ EXPECT_EQ(errno, ENOTCONN); + + ret = getsockname(sfd, &addr, &len); + ASSERT_EQ(ret, 0); diff --git a/queue-5.3/net_sched-validate-tca_kind-attribute-in-tc_chain_tmplt_add.patch b/queue-5.3/net_sched-validate-tca_kind-attribute-in-tc_chain_tmplt_add.patch new file mode 100644 index 00000000000..3e2739b3be6 --- /dev/null +++ 
b/queue-5.3/net_sched-validate-tca_kind-attribute-in-tc_chain_tmplt_add.patch @@ -0,0 +1,114 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Eric Dumazet +Date: Sat, 7 Dec 2019 11:34:45 -0800 +Subject: net_sched: validate TCA_KIND attribute in tc_chain_tmplt_add() + +From: Eric Dumazet + +[ Upstream commit 2dd5616ecdcebdf5a8d007af64e040d4e9214efe ] + +Use the new tcf_proto_check_kind() helper to make sure user +provided value is well formed. + +BUG: KMSAN: uninit-value in string_nocheck lib/vsprintf.c:606 [inline] +BUG: KMSAN: uninit-value in string+0x4be/0x600 lib/vsprintf.c:668 +CPU: 0 PID: 12358 Comm: syz-executor.1 Not tainted 5.4.0-rc8-syzkaller #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 +Call Trace: + __dump_stack lib/dump_stack.c:77 [inline] + dump_stack+0x1c9/0x220 lib/dump_stack.c:118 + kmsan_report+0x128/0x220 mm/kmsan/kmsan_report.c:108 + __msan_warning+0x64/0xc0 mm/kmsan/kmsan_instr.c:245 + string_nocheck lib/vsprintf.c:606 [inline] + string+0x4be/0x600 lib/vsprintf.c:668 + vsnprintf+0x218f/0x3210 lib/vsprintf.c:2510 + __request_module+0x2b1/0x11c0 kernel/kmod.c:143 + tcf_proto_lookup_ops+0x171/0x700 net/sched/cls_api.c:139 + tc_chain_tmplt_add net/sched/cls_api.c:2730 [inline] + tc_ctl_chain+0x1904/0x38a0 net/sched/cls_api.c:2850 + rtnetlink_rcv_msg+0x115a/0x1580 net/core/rtnetlink.c:5224 + netlink_rcv_skb+0x431/0x620 net/netlink/af_netlink.c:2477 + rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:5242 + netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] + netlink_unicast+0xf3e/0x1020 net/netlink/af_netlink.c:1328 + netlink_sendmsg+0x110f/0x1330 net/netlink/af_netlink.c:1917 + sock_sendmsg_nosec net/socket.c:637 [inline] + sock_sendmsg net/socket.c:657 [inline] + ___sys_sendmsg+0x14ff/0x1590 net/socket.c:2311 + __sys_sendmsg net/socket.c:2356 [inline] + __do_sys_sendmsg net/socket.c:2365 [inline] + __se_sys_sendmsg+0x305/0x460 net/socket.c:2363 + __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2363 + do_syscall_64+0xb6/0x160 arch/x86/entry/common.c:291 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 +RIP: 0033:0x45a649 +Code: ad b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 +RSP: 002b:00007f0790795c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e +RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 000000000045a649 +RDX: 0000000000000000 RSI: 0000000020000300 RDI: 0000000000000006 +RBP: 000000000075bfc8 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000246 R12: 00007f07907966d4 +R13: 00000000004c8db5 R14: 00000000004df630 R15: 00000000ffffffff + +Uninit was created at: + kmsan_save_stack_with_flags mm/kmsan/kmsan.c:149 [inline] + kmsan_internal_poison_shadow+0x5c/0x110 mm/kmsan/kmsan.c:132 + kmsan_slab_alloc+0x97/0x100 mm/kmsan/kmsan_hooks.c:86 + slab_alloc_node mm/slub.c:2773 [inline] + __kmalloc_node_track_caller+0xe27/0x11a0 mm/slub.c:4381 + __kmalloc_reserve net/core/skbuff.c:141 [inline] + __alloc_skb+0x306/0xa10 net/core/skbuff.c:209 + alloc_skb include/linux/skbuff.h:1049 [inline] + netlink_alloc_large_skb net/netlink/af_netlink.c:1174 [inline] + netlink_sendmsg+0x783/0x1330 net/netlink/af_netlink.c:1892 + sock_sendmsg_nosec net/socket.c:637 [inline] + sock_sendmsg net/socket.c:657 [inline] + ___sys_sendmsg+0x14ff/0x1590 net/socket.c:2311 + __sys_sendmsg net/socket.c:2356 [inline] + __do_sys_sendmsg net/socket.c:2365 [inline] + 
__se_sys_sendmsg+0x305/0x460 net/socket.c:2363 + __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2363 + do_syscall_64+0xb6/0x160 arch/x86/entry/common.c:291 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Fixes: 6f96c3c6904c ("net_sched: fix backward compatibility for TCA_KIND") +Signed-off-by: Eric Dumazet +Reported-by: syzbot +Acked-by: Cong Wang +Cc: Marcelo Ricardo Leitner +Cc: Jamal Hadi Salim +Cc: Jiri Pirko +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/cls_api.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c +index 5c1769999a92..758ca7e5304c 100644 +--- a/net/sched/cls_api.c ++++ b/net/sched/cls_api.c +@@ -2854,13 +2854,19 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, + struct netlink_ext_ack *extack) + { + const struct tcf_proto_ops *ops; ++ char name[IFNAMSIZ]; + void *tmplt_priv; + + /* If kind is not set, user did not specify template. */ + if (!tca[TCA_KIND]) + return 0; + +- ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), true, extack); ++ if (tcf_proto_check_kind(tca[TCA_KIND], name)) { ++ NL_SET_ERR_MSG(extack, "Specified TC chain template name too long"); ++ return -EINVAL; ++ } ++ ++ ops = tcf_proto_lookup_ops(name, true, extack); + if (IS_ERR(ops)) + return PTR_ERR(ops); + if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { +-- +2.20.1 + diff --git a/queue-5.3/openvswitch-support-asymmetric-conntrack.patch b/queue-5.3/openvswitch-support-asymmetric-conntrack.patch new file mode 100644 index 00000000000..15671c076e2 --- /dev/null +++ b/queue-5.3/openvswitch-support-asymmetric-conntrack.patch @@ -0,0 +1,46 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Aaron Conole +Date: Tue, 3 Dec 2019 16:34:13 -0500 +Subject: openvswitch: support asymmetric conntrack + +From: Aaron Conole + +[ Upstream commit 5d50aa83e2c8e91ced2cca77c198b468ca9210f4 ] + +The openvswitch module shares a common conntrack and NAT infrastructure +exposed via netfilter. It's possible that a packet needs both SNAT and +DNAT manipulation, due to e.g. tuple collision. Netfilter can support +this because it runs through the NAT table twice - once on ingress and +again after egress. The openvswitch module doesn't have such capability. + +Like netfilter hook infrastructure, we should run through NAT twice to +keep the symmetry. + +Fixes: 05752523e565 ("openvswitch: Interface with NAT.") +Signed-off-by: Aaron Conole +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/openvswitch/conntrack.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +--- a/net/openvswitch/conntrack.c ++++ b/net/openvswitch/conntrack.c +@@ -903,6 +903,17 @@ static int ovs_ct_nat(struct net *net, s + } + err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype); + ++ if (err == NF_ACCEPT && ++ ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) { ++ if (maniptype == NF_NAT_MANIP_SRC) ++ maniptype = NF_NAT_MANIP_DST; ++ else ++ maniptype = NF_NAT_MANIP_SRC; ++ ++ err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, ++ maniptype); ++ } ++ + /* Mark NAT done if successful and update the flow key. 
*/ + if (err == NF_ACCEPT) + ovs_nat_update_key(key, skb, maniptype); diff --git a/queue-5.3/page_pool-do-not-release-pool-until-inflight-0.patch b/queue-5.3/page_pool-do-not-release-pool-until-inflight-0.patch new file mode 100644 index 00000000000..37f486e9e57 --- /dev/null +++ b/queue-5.3/page_pool-do-not-release-pool-until-inflight-0.patch @@ -0,0 +1,565 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Jonathan Lemon +Date: Thu, 14 Nov 2019 14:13:00 -0800 +Subject: page_pool: do not release pool until inflight == 0. + +From: Jonathan Lemon + +[ Upstream commit c3f812cea0d7006469d1cf33a4a9f0a12bb4b3a3 ] + +The page pool keeps track of the number of pages in flight, and +it isn't safe to remove the pool until all pages are returned. + +Disallow removing the pool until all pages are back, so the pool +is always available for page producers. + +Make the page pool responsible for its own delayed destruction +instead of relying on XDP, so the page pool can be used without +the xdp memory model. + +When all pages are returned, free the pool and notify xdp if the +pool is registered with the xdp memory system. Have the callback +perform a table walk since some drivers (cpsw) may share the pool +among multiple xdp_rxq_info. + +Note that the increment of pages_state_release_cnt may result in +inflight == 0, resulting in the pool being released. + +Fixes: d956a048cd3f ("xdp: force mem allocator removal and periodic warning") +Signed-off-by: Jonathan Lemon +Acked-by: Jesper Dangaard Brouer +Acked-by: Ilias Apalodimas +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 + include/net/page_pool.h | 52 ++------- + include/net/xdp_priv.h | 4 + include/trace/events/xdp.h | 19 --- + net/core/page_pool.c | 122 +++++++++++++--------- + net/core/xdp.c | 121 +++++++-------------- + 6 files changed, 139 insertions(+), 183 deletions(-) + +--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c ++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +@@ -1477,10 +1477,8 @@ static void free_dma_rx_desc_resources(s + rx_q->dma_erx, rx_q->dma_rx_phy); + + kfree(rx_q->buf_pool); +- if (rx_q->page_pool) { +- page_pool_request_shutdown(rx_q->page_pool); ++ if (rx_q->page_pool) + page_pool_destroy(rx_q->page_pool); +- } + } + } + +--- a/include/net/page_pool.h ++++ b/include/net/page_pool.h +@@ -70,7 +70,12 @@ struct page_pool_params { + struct page_pool { + struct page_pool_params p; + +- u32 pages_state_hold_cnt; ++ struct delayed_work release_dw; ++ void (*disconnect)(void *); ++ unsigned long defer_start; ++ unsigned long defer_warn; ++ ++ u32 pages_state_hold_cnt; + + /* + * Data structure for allocation side +@@ -129,25 +134,19 @@ inline enum dma_data_direction page_pool + + struct page_pool *page_pool_create(const struct page_pool_params *params); + +-void __page_pool_free(struct page_pool *pool); +-static inline void page_pool_free(struct page_pool *pool) +-{ +- /* When page_pool isn't compiled-in, net/core/xdp.c doesn't +- * allow registering MEM_TYPE_PAGE_POOL, but shield linker. 
+- */ + #ifdef CONFIG_PAGE_POOL +- __page_pool_free(pool); +-#endif +-} +- +-/* Drivers use this instead of page_pool_free */ ++void page_pool_destroy(struct page_pool *pool); ++void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); ++#else + static inline void page_pool_destroy(struct page_pool *pool) + { +- if (!pool) +- return; ++} + +- page_pool_free(pool); ++static inline void page_pool_use_xdp_mem(struct page_pool *pool, ++ void (*disconnect)(void *)) ++{ + } ++#endif + + /* Never call this directly, use helpers below */ + void __page_pool_put_page(struct page_pool *pool, +@@ -170,24 +169,6 @@ static inline void page_pool_recycle_dir + __page_pool_put_page(pool, page, true); + } + +-/* API user MUST have disconnected alloc-side (not allowed to call +- * page_pool_alloc_pages()) before calling this. The free-side can +- * still run concurrently, to handle in-flight packet-pages. +- * +- * A request to shutdown can fail (with false) if there are still +- * in-flight packet-pages. +- */ +-bool __page_pool_request_shutdown(struct page_pool *pool); +-static inline bool page_pool_request_shutdown(struct page_pool *pool) +-{ +- bool safe_to_remove = false; +- +-#ifdef CONFIG_PAGE_POOL +- safe_to_remove = __page_pool_request_shutdown(pool); +-#endif +- return safe_to_remove; +-} +- + /* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal +@@ -216,11 +197,6 @@ static inline bool is_page_pool_compiled + #endif + } + +-static inline void page_pool_get(struct page_pool *pool) +-{ +- refcount_inc(&pool->user_cnt); +-} +- + static inline bool page_pool_put(struct page_pool *pool) + { + return refcount_dec_and_test(&pool->user_cnt); +--- a/include/net/xdp_priv.h ++++ b/include/net/xdp_priv.h +@@ -12,12 +12,8 @@ struct xdp_mem_allocator { + struct page_pool *page_pool; + struct zero_copy_allocator *zc_alloc; + }; +- int disconnect_cnt; +- unsigned long defer_start; + struct rhash_head node; + struct rcu_head rcu; +- struct delayed_work defer_wq; +- unsigned long defer_warn; + }; + + #endif /* __LINUX_NET_XDP_PRIV_H__ */ +--- a/include/trace/events/xdp.h ++++ b/include/trace/events/xdp.h +@@ -316,19 +316,15 @@ __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) + + TRACE_EVENT(mem_disconnect, + +- TP_PROTO(const struct xdp_mem_allocator *xa, +- bool safe_to_remove, bool force), ++ TP_PROTO(const struct xdp_mem_allocator *xa), + +- TP_ARGS(xa, safe_to_remove, force), ++ TP_ARGS(xa), + + TP_STRUCT__entry( + __field(const struct xdp_mem_allocator *, xa) + __field(u32, mem_id) + __field(u32, mem_type) + __field(const void *, allocator) +- __field(bool, safe_to_remove) +- __field(bool, force) +- __field(int, disconnect_cnt) + ), + + TP_fast_assign( +@@ -336,19 +332,12 @@ TRACE_EVENT(mem_disconnect, + __entry->mem_id = xa->mem.id; + __entry->mem_type = xa->mem.type; + __entry->allocator = xa->allocator; +- __entry->safe_to_remove = safe_to_remove; +- __entry->force = force; +- __entry->disconnect_cnt = xa->disconnect_cnt; + ), + +- TP_printk("mem_id=%d mem_type=%s allocator=%p" +- " safe_to_remove=%s force=%s disconnect_cnt=%d", ++ TP_printk("mem_id=%d mem_type=%s allocator=%p", + __entry->mem_id, + __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), +- __entry->allocator, +- __entry->safe_to_remove ? "true" : "false", +- __entry->force ? 
"true" : "false", +- __entry->disconnect_cnt ++ __entry->allocator + ) + ); + +--- a/net/core/page_pool.c ++++ b/net/core/page_pool.c +@@ -18,6 +18,9 @@ + + #include + ++#define DEFER_TIME (msecs_to_jiffies(1000)) ++#define DEFER_WARN_INTERVAL (60 * HZ) ++ + static int page_pool_init(struct page_pool *pool, + const struct page_pool_params *params) + { +@@ -200,22 +203,14 @@ static s32 page_pool_inflight(struct pag + { + u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); + u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); +- s32 distance; +- +- distance = _distance(hold_cnt, release_cnt); +- +- trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt); +- return distance; +-} ++ s32 inflight; + +-static bool __page_pool_safe_to_destroy(struct page_pool *pool) +-{ +- s32 inflight = page_pool_inflight(pool); ++ inflight = _distance(hold_cnt, release_cnt); + +- /* The distance should not be able to become negative */ ++ trace_page_pool_inflight(pool, inflight, hold_cnt, release_cnt); + WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); + +- return (inflight == 0); ++ return inflight; + } + + /* Cleanup page_pool state from page */ +@@ -223,6 +218,7 @@ static void __page_pool_clean_page(struc + struct page *page) + { + dma_addr_t dma; ++ int count; + + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + goto skip_dma_unmap; +@@ -234,9 +230,11 @@ static void __page_pool_clean_page(struc + DMA_ATTR_SKIP_CPU_SYNC); + page->dma_addr = 0; + skip_dma_unmap: +- atomic_inc(&pool->pages_state_release_cnt); +- trace_page_pool_state_release(pool, page, +- atomic_read(&pool->pages_state_release_cnt)); ++ /* This may be the last page returned, releasing the pool, so ++ * it is not safe to reference pool afterwards. ++ */ ++ count = atomic_inc_return(&pool->pages_state_release_cnt); ++ trace_page_pool_state_release(pool, page, count); + } + + /* unmap the page and clean our state */ +@@ -345,31 +343,10 @@ static void __page_pool_empty_ring(struc + } + } + +-static void __warn_in_flight(struct page_pool *pool) ++static void page_pool_free(struct page_pool *pool) + { +- u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); +- u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); +- s32 distance; +- +- distance = _distance(hold_cnt, release_cnt); +- +- /* Drivers should fix this, but only problematic when DMA is used */ +- WARN(1, "Still in-flight pages:%d hold:%u released:%u", +- distance, hold_cnt, release_cnt); +-} +- +-void __page_pool_free(struct page_pool *pool) +-{ +- /* Only last user actually free/release resources */ +- if (!page_pool_put(pool)) +- return; +- +- WARN(pool->alloc.count, "API usage violation"); +- WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty"); +- +- /* Can happen due to forced shutdown */ +- if (!__page_pool_safe_to_destroy(pool)) +- __warn_in_flight(pool); ++ if (pool->disconnect) ++ pool->disconnect(pool); + + ptr_ring_cleanup(&pool->ring, NULL); + +@@ -378,12 +355,8 @@ void __page_pool_free(struct page_pool * + + kfree(pool); + } +-EXPORT_SYMBOL(__page_pool_free); + +-/* Request to shutdown: release pages cached by page_pool, and check +- * for in-flight pages +- */ +-bool __page_pool_request_shutdown(struct page_pool *pool) ++static void page_pool_scrub(struct page_pool *pool) + { + struct page *page; + +@@ -400,7 +373,64 @@ bool __page_pool_request_shutdown(struct + * be in-flight. 
+ */ + __page_pool_empty_ring(pool); ++} ++ ++static int page_pool_release(struct page_pool *pool) ++{ ++ int inflight; ++ ++ page_pool_scrub(pool); ++ inflight = page_pool_inflight(pool); ++ if (!inflight) ++ page_pool_free(pool); ++ ++ return inflight; ++} ++ ++static void page_pool_release_retry(struct work_struct *wq) ++{ ++ struct delayed_work *dwq = to_delayed_work(wq); ++ struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); ++ int inflight; ++ ++ inflight = page_pool_release(pool); ++ if (!inflight) ++ return; ++ ++ /* Periodic warning */ ++ if (time_after_eq(jiffies, pool->defer_warn)) { ++ int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; ++ ++ pr_warn("%s() stalled pool shutdown %d inflight %d sec\n", ++ __func__, inflight, sec); ++ pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; ++ } ++ ++ /* Still not ready to be disconnected, retry later */ ++ schedule_delayed_work(&pool->release_dw, DEFER_TIME); ++} ++ ++void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) ++{ ++ refcount_inc(&pool->user_cnt); ++ pool->disconnect = disconnect; ++} ++ ++void page_pool_destroy(struct page_pool *pool) ++{ ++ if (!pool) ++ return; ++ ++ if (!page_pool_put(pool)) ++ return; ++ ++ if (!page_pool_release(pool)) ++ return; ++ ++ pool->defer_start = jiffies; ++ pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + +- return __page_pool_safe_to_destroy(pool); ++ INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); ++ schedule_delayed_work(&pool->release_dw, DEFER_TIME); + } +-EXPORT_SYMBOL(__page_pool_request_shutdown); ++EXPORT_SYMBOL(page_pool_destroy); +--- a/net/core/xdp.c ++++ b/net/core/xdp.c +@@ -70,10 +70,6 @@ static void __xdp_mem_allocator_rcu_free + + xa = container_of(rcu, struct xdp_mem_allocator, rcu); + +- /* Allocator have indicated safe to remove before this is called */ +- if (xa->mem.type == MEM_TYPE_PAGE_POOL) +- page_pool_free(xa->page_pool); +- + /* Allow this ID to be reused */ + ida_simple_remove(&mem_id_pool, xa->mem.id); + +@@ -85,62 +81,57 @@ static void __xdp_mem_allocator_rcu_free + kfree(xa); + } + +-static bool __mem_id_disconnect(int id, bool force) ++static void mem_xa_remove(struct xdp_mem_allocator *xa) + { +- struct xdp_mem_allocator *xa; +- bool safe_to_remove = true; ++ trace_mem_disconnect(xa); + + mutex_lock(&mem_id_lock); + +- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); +- if (!xa) { +- mutex_unlock(&mem_id_lock); +- WARN(1, "Request remove non-existing id(%d), driver bug?", id); +- return true; +- } +- xa->disconnect_cnt++; +- +- /* Detects in-flight packet-pages for page_pool */ +- if (xa->mem.type == MEM_TYPE_PAGE_POOL) +- safe_to_remove = page_pool_request_shutdown(xa->page_pool); +- +- trace_mem_disconnect(xa, safe_to_remove, force); +- +- if ((safe_to_remove || force) && +- !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) ++ if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + + mutex_unlock(&mem_id_lock); +- return (safe_to_remove|force); + } + +-#define DEFER_TIME (msecs_to_jiffies(1000)) +-#define DEFER_WARN_INTERVAL (30 * HZ) +-#define DEFER_MAX_RETRIES 120 ++static void mem_allocator_disconnect(void *allocator) ++{ ++ struct xdp_mem_allocator *xa; ++ struct rhashtable_iter iter; ++ ++ rhashtable_walk_enter(mem_id_ht, &iter); ++ do { ++ rhashtable_walk_start(&iter); ++ ++ while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) { ++ if (xa->allocator == allocator) ++ 
mem_xa_remove(xa); ++ } ++ ++ rhashtable_walk_stop(&iter); ++ ++ } while (xa == ERR_PTR(-EAGAIN)); ++ rhashtable_walk_exit(&iter); ++} + +-static void mem_id_disconnect_defer_retry(struct work_struct *wq) ++static void mem_id_disconnect(int id) + { +- struct delayed_work *dwq = to_delayed_work(wq); +- struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq); +- bool force = false; ++ struct xdp_mem_allocator *xa; + +- if (xa->disconnect_cnt > DEFER_MAX_RETRIES) +- force = true; ++ mutex_lock(&mem_id_lock); + +- if (__mem_id_disconnect(xa->mem.id, force)) ++ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); ++ if (!xa) { ++ mutex_unlock(&mem_id_lock); ++ WARN(1, "Request remove non-existing id(%d), driver bug?", id); + return; ++ } + +- /* Periodic warning */ +- if (time_after_eq(jiffies, xa->defer_warn)) { +- int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ; ++ trace_mem_disconnect(xa); + +- pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n", +- __func__, xa->mem.id, xa->disconnect_cnt, sec); +- xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; +- } ++ if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) ++ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + +- /* Still not ready to be disconnected, retry later */ +- schedule_delayed_work(&xa->defer_wq, DEFER_TIME); ++ mutex_unlock(&mem_id_lock); + } + + void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +@@ -153,38 +144,21 @@ void xdp_rxq_info_unreg_mem_model(struct + return; + } + +- if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL && +- xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) { +- return; +- } +- + if (id == 0) + return; + +- if (__mem_id_disconnect(id, false)) +- return; +- +- /* Could not disconnect, defer new disconnect attempt to later */ +- mutex_lock(&mem_id_lock); ++ if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) ++ return mem_id_disconnect(id); + +- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); +- if (!xa) { +- mutex_unlock(&mem_id_lock); +- return; ++ if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { ++ rcu_read_lock(); ++ xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); ++ page_pool_destroy(xa->page_pool); ++ rcu_read_unlock(); + } +- xa->defer_start = jiffies; +- xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; +- +- INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry); +- mutex_unlock(&mem_id_lock); +- schedule_delayed_work(&xa->defer_wq, DEFER_TIME); + } + EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); + +-/* This unregister operation will also cleanup and destroy the +- * allocator. The page_pool_free() operation is first called when it's +- * safe to remove, possibly deferred to a workqueue. 
+- */ + void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) + { + /* Simplify driver cleanup code paths, allow unreg "unused" */ +@@ -371,7 +345,7 @@ int xdp_rxq_info_reg_mem_model(struct xd + } + + if (type == MEM_TYPE_PAGE_POOL) +- page_pool_get(xdp_alloc->page_pool); ++ page_pool_use_xdp_mem(allocator, mem_allocator_disconnect); + + mutex_unlock(&mem_id_lock); + +@@ -402,15 +376,8 @@ static void __xdp_return(void *data, str + /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); +- if (likely(xa)) { +- napi_direct &= !xdp_return_frame_no_direct(); +- page_pool_put_page(xa->page_pool, page, napi_direct); +- } else { +- /* Hopefully stack show who to blame for late return */ +- WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id); +- trace_mem_return_failed(mem, page); +- put_page(page); +- } ++ napi_direct &= !xdp_return_frame_no_direct(); ++ page_pool_put_page(xa->page_pool, page, napi_direct); + rcu_read_unlock(); + break; + case MEM_TYPE_PAGE_SHARED: diff --git a/queue-5.3/series b/queue-5.3/series new file mode 100644 index 00000000000..576379f0d07 --- /dev/null +++ b/queue-5.3/series @@ -0,0 +1,26 @@ +inet-protect-against-too-small-mtu-values.patch +mqprio-fix-out-of-bounds-access-in-mqprio_dump.patch +net-bridge-deny-dev_set_mac_address-when-unregistering.patch +net-dsa-fix-flow-dissection-on-tx-path.patch +net-ethernet-ti-cpsw-fix-extra-rx-interrupt.patch +net-sched-fix-dump-qlen-for-sch_mq-sch_mqprio-with-nolock-subqueues.patch +net-sysfs-call-dev_hold-always-in-netdev_queue_add_kobject.patch +net-thunderx-start-phy-before-starting-autonegotiation.patch +net-tls-fix-return-values-to-avoid-enotsupp.patch +openvswitch-support-asymmetric-conntrack.patch +tcp-md5-fix-potential-overestimation-of-tcp-option-space.patch +tipc-fix-ordering-of-tipc-module-init-and-exit-routine.patch +net-mlx5e-query-global-pause-state-before-setting-prio2buffer.patch +net-ipv6-add-net-argument-to-ip6_dst_lookup_flow.patch +net-ipv6_stub-use-ip6_dst_lookup_flow-instead-of-ip6_dst_lookup.patch +tcp-fix-rejected-syncookies-due-to-stale-timestamps.patch +tcp-tighten-acceptance-of-acks-not-matching-a-child-socket.patch +tcp-protect-accesses-to-.ts_recent_stamp-with-read-write-_once.patch +gre-refetch-erspan-header-from-skb-data-after-pskb_may_pull.patch +fixed-updating-of-ethertype-in-function-skb_mpls_pop.patch +hsr-fix-a-null-pointer-dereference-in-hsr_dev_xmit.patch +net-fixed-updating-of-ethertype-in-skb_mpls_push.patch +net-mlx5e-fix-txq-indices-to-be-sequential.patch +page_pool-do-not-release-pool-until-inflight-0.patch +xdp-obtain-the-mem_id-mutex-before-trying-to-remove-an-entry.patch +net_sched-validate-tca_kind-attribute-in-tc_chain_tmplt_add.patch diff --git a/queue-5.3/tcp-fix-rejected-syncookies-due-to-stale-timestamps.patch b/queue-5.3/tcp-fix-rejected-syncookies-due-to-stale-timestamps.patch new file mode 100644 index 00000000000..de23ae96375 --- /dev/null +++ b/queue-5.3/tcp-fix-rejected-syncookies-due-to-stale-timestamps.patch @@ -0,0 +1,117 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Guillaume Nault +Date: Fri, 6 Dec 2019 12:38:36 +0100 +Subject: tcp: fix rejected syncookies due to stale timestamps + +From: Guillaume Nault + +[ Upstream commit 04d26e7b159a396372646a480f4caa166d1b6720 ] + +If no synflood happens for a long enough period of time, then the +synflood timestamp isn't refreshed and jiffies can advance so much +that time_after32() can't accurately compare them any 
more. + +Therefore, we can end up in a situation where time_after32(now, +last_overflow + HZ) returns false, just because these two values are +too far apart. In that case, the synflood timestamp isn't updated as +it should be, which can trick tcp_synq_no_recent_overflow() into +rejecting valid syncookies. + +For example, let's consider the following scenario on a system +with HZ=1000: + + * The synflood timestamp is 0, either because that's the timestamp + of the last synflood or, more commonly, because we're working with + a freshly created socket. + + * We receive a new SYN, which triggers synflood protection. Let's say + that this happens when jiffies == 2147484649 (that is, + 'synflood timestamp' + HZ + 2^31 + 1). + + * Then tcp_synq_overflow() doesn't update the synflood timestamp, + because time_after32(2147484649, 1000) returns false. + With: + - 2147484649: the value of jiffies, aka 'now'. + - 1000: the value of 'last_overflow' + HZ. + + * A bit later, we receive the ACK completing the 3WHS. But + cookie_v[46]_check() rejects it because tcp_synq_no_recent_overflow() + says that we're not under synflood. That's because + time_after32(2147484649, 120000) returns false. + With: + - 2147484649: the value of jiffies, aka 'now'. + - 120000: the value of 'last_overflow' + TCP_SYNCOOKIE_VALID. + + Of course, in reality jiffies would have increased a bit, but this + condition will last for the next 119 seconds, which is far enough + to accommodate jiffies' growth. + +Fix this by updating the overflow timestamp whenever jiffies isn't +within the [last_overflow, last_overflow + HZ] range. That shouldn't +have any performance impact since the update still happens at most once +per second. + +Now we're guaranteed to have fresh timestamps while under synflood, so +tcp_synq_no_recent_overflow() can safely use it with time_after32() in +such situations. + +Stale timestamps can still make tcp_synq_no_recent_overflow() return +the wrong verdict when not under synflood. This will be handled in the +next patch. + +For 64-bit architectures, the problem was introduced with the +conversion of ->tw_ts_recent_stamp to a 32-bit integer by commit +cca9bab1b72c ("tcp: use monotonic timestamps for PAWS"). +The problem has always been there on 32-bit architectures. + +Fixes: cca9bab1b72c ("tcp: use monotonic timestamps for PAWS") +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Signed-off-by: Guillaume Nault +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/time.h | 13 +++++++++++++ + include/net/tcp.h | 5 +++-- + 2 files changed, 16 insertions(+), 2 deletions(-) + +--- a/include/linux/time.h ++++ b/include/linux/time.h +@@ -96,4 +96,17 @@ static inline bool itimerspec64_valid(co + */ + #define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) + #define time_before32(b, a) time_after32(a, b) ++ ++/** ++ * time_between32 - check if a 32-bit timestamp is within a given time range ++ * @t: the time which may be within [l,h] ++ * @l: the lower bound of the range ++ * @h: the higher bound of the range ++ * ++ * time_between32(t, l, h) returns true if @l <= @t <= @h. All operands are ++ * treated as 32-bit integers. ++ * ++ * Equivalent to !(time_before32(@t, @l) || time_after32(@t, @h)). 
++ */ ++#define time_between32(t, l, h) ((u32)(h) - (u32)(l) >= (u32)(t) - (u32)(l)) + #endif +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -484,14 +484,15 @@ static inline void tcp_synq_overflow(con + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (likely(reuse)) { + last_overflow = READ_ONCE(reuse->synq_overflow_ts); +- if (time_after32(now, last_overflow + HZ)) ++ if (!time_between32(now, last_overflow, ++ last_overflow + HZ)) + WRITE_ONCE(reuse->synq_overflow_ts, now); + return; + } + } + + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; +- if (time_after32(now, last_overflow + HZ)) ++ if (!time_between32(now, last_overflow, last_overflow + HZ)) + tcp_sk(sk)->rx_opt.ts_recent_stamp = now; + } + diff --git a/queue-5.3/tcp-md5-fix-potential-overestimation-of-tcp-option-space.patch b/queue-5.3/tcp-md5-fix-potential-overestimation-of-tcp-option-space.patch new file mode 100644 index 00000000000..e1df6c9dd8f --- /dev/null +++ b/queue-5.3/tcp-md5-fix-potential-overestimation-of-tcp-option-space.patch @@ -0,0 +1,46 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Eric Dumazet +Date: Thu, 5 Dec 2019 10:10:15 -0800 +Subject: tcp: md5: fix potential overestimation of TCP option space + +From: Eric Dumazet + +[ Upstream commit 9424e2e7ad93ffffa88f882c9bc5023570904b55 ] + +Back in 2008, Adam Langley fixed the corner case of packets for flows +having all of the following options: MD5 TS SACK + +Since MD5 needs 20 bytes, and TS needs 12 bytes, no SACK block +can be cooked from the remaining 8 bytes. + +tcp_established_options() correctly sets opts->num_sack_blocks +to zero, but returns 36 instead of 32. + +This means TCP cooks packets with 4 extra bytes at the end +of options, containing uninitialized bytes. + +Fixes: 33ad798c924b ("tcp: options clean up") +Signed-off-by: Eric Dumazet +Reported-by: syzbot +Acked-by: Neal Cardwell +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_output.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -755,8 +755,9 @@ static unsigned int tcp_established_opti + min_t(unsigned int, eff_sacks, + (remaining - TCPOLEN_SACK_BASE_ALIGNED) / + TCPOLEN_SACK_PERBLOCK); +- size += TCPOLEN_SACK_BASE_ALIGNED + +- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; ++ if (likely(opts->num_sack_blocks)) ++ size += TCPOLEN_SACK_BASE_ALIGNED + ++ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + } + + return size; diff --git a/queue-5.3/tcp-protect-accesses-to-.ts_recent_stamp-with-read-write-_once.patch b/queue-5.3/tcp-protect-accesses-to-.ts_recent_stamp-with-read-write-_once.patch new file mode 100644 index 00000000000..a864b2b8b6c --- /dev/null +++ b/queue-5.3/tcp-protect-accesses-to-.ts_recent_stamp-with-read-write-_once.patch @@ -0,0 +1,50 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Guillaume Nault +Date: Fri, 6 Dec 2019 12:38:49 +0100 +Subject: tcp: Protect accesses to .ts_recent_stamp with {READ,WRITE}_ONCE() + +From: Guillaume Nault + +[ Upstream commit 721c8dafad26ccfa90ff659ee19755e3377b829d ] + +Syncookies borrow the ->rx_opt.ts_recent_stamp field to store the +timestamp of the last synflood. Protect them with READ_ONCE() and +WRITE_ONCE() since reads and writes aren't serialised. + +Use of .rx_opt.ts_recent_stamp for storing the synflood timestamp was +introduced by a0f82f64e269 ("syncookies: remove last_synq_overflow from +struct tcp_sock"). 
But unprotected accesses were already there when the +timestamp was stored in .last_synq_overflow. + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Signed-off-by: Guillaume Nault +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -491,9 +491,9 @@ static inline void tcp_synq_overflow(con + } + } + +- last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; ++ last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); + if (!time_between32(now, last_overflow, last_overflow + HZ)) +- tcp_sk(sk)->rx_opt.ts_recent_stamp = now; ++ WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now); + } + + /* syncookies: no recent synqueue overflow on this listening socket? */ +@@ -514,7 +514,7 @@ static inline bool tcp_synq_no_recent_ov + } + } + +- last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; ++ last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); + + /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, + * then we're under synflood. However, we have to use diff --git a/queue-5.3/tcp-tighten-acceptance-of-acks-not-matching-a-child-socket.patch b/queue-5.3/tcp-tighten-acceptance-of-acks-not-matching-a-child-socket.patch new file mode 100644 index 00000000000..6ec62112030 --- /dev/null +++ b/queue-5.3/tcp-tighten-acceptance-of-acks-not-matching-a-child-socket.patch @@ -0,0 +1,86 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Guillaume Nault +Date: Fri, 6 Dec 2019 12:38:43 +0100 +Subject: tcp: tighten acceptance of ACKs not matching a child socket + +From: Guillaume Nault + +[ Upstream commit cb44a08f8647fd2e8db5cc9ac27cd8355fa392d8 ] + +When no synflood occurs, the synflood timestamp isn't updated. +Therefore it can be so old that time_after32() can consider it to be +in the future. + +That's a problem for tcp_synq_no_recent_overflow() as it may report +that a recent overflow occurred while, in fact, it's just that jiffies +has grown past 'last_overflow' + TCP_SYNCOOKIE_VALID + 2^31. + +Spurious detection of recent overflows leads to extra syncookie +verification in cookie_v[46]_check(). At that point, the verification +should fail and the packet be dropped. But we should have dropped the +packet earlier as we didn't even send a syncookie. + +Let's refine tcp_synq_no_recent_overflow() to report a recent overflow +only if jiffies is within the +[last_overflow, last_overflow + TCP_SYNCOOKIE_VALID] interval. This +way, no spurious recent overflow is reported when jiffies wraps and +'last_overflow' becomes in the future from the point of view of +time_after32(). + +However, if jiffies wraps and enters the +[last_overflow, last_overflow + TCP_SYNCOOKIE_VALID] interval (with +'last_overflow' being a stale synflood timestamp), then +tcp_synq_no_recent_overflow() still erroneously reports an +overflow. In such cases, we have to rely on syncookie verification +to drop the packet. We unfortunately have no way to differentiate +between a fresh and a stale syncookie timestamp. + +In practice, using last_overflow as the lower bound is problematic. +If the synflood timestamp is concurrently updated between the time +we read jiffies and the moment we store the timestamp in +'last_overflow', then 'now' becomes smaller than 'last_overflow' and +tcp_synq_no_recent_overflow() returns true, potentially dropping a +valid syncookie. 
+ +Reading jiffies after loading the timestamp could fix the problem, +but that'd require a memory barrier. Let's just accommodate +potential timestamp growth instead and extend the interval using +'last_overflow - HZ' as the lower bound. + +Signed-off-by: Guillaume Nault +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 16 +++++++++++++--- + 1 file changed, 13 insertions(+), 3 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -508,13 +508,23 @@ static inline bool tcp_synq_no_recent_ov + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (likely(reuse)) { + last_overflow = READ_ONCE(reuse->synq_overflow_ts); +- return time_after32(now, last_overflow + +- TCP_SYNCOOKIE_VALID); ++ return !time_between32(now, last_overflow - HZ, ++ last_overflow + ++ TCP_SYNCOOKIE_VALID); + } + } + + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; +- return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); ++ ++ /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, ++ * then we're under synflood. However, we have to use ++ * 'last_overflow - HZ' as lower bound. That's because a concurrent ++ * tcp_synq_overflow() could update .ts_recent_stamp after we read ++ * jiffies but before we store .ts_recent_stamp into last_overflow, ++ * which could lead to rejecting a valid syncookie. ++ */ ++ return !time_between32(now, last_overflow - HZ, ++ last_overflow + TCP_SYNCOOKIE_VALID); + } + + static inline u32 tcp_cookie_time(void) diff --git a/queue-5.3/tipc-fix-ordering-of-tipc-module-init-and-exit-routine.patch b/queue-5.3/tipc-fix-ordering-of-tipc-module-init-and-exit-routine.patch new file mode 100644 index 00000000000..d8389378270 --- /dev/null +++ b/queue-5.3/tipc-fix-ordering-of-tipc-module-init-and-exit-routine.patch @@ -0,0 +1,159 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Taehee Yoo +Date: Fri, 6 Dec 2019 05:25:48 +0000 +Subject: tipc: fix ordering of tipc module init and exit routine + +From: Taehee Yoo + +[ Upstream commit 9cf1cd8ee3ee09ef2859017df2058e2f53c5347f ] + +In order to set/get/dump, tipc uses the generic netlink +infrastructure. So, when the tipc module is inserted, the init function +calls genl_register_family(). +After genl_register_family(), set/get/dump commands are immediately +allowed and these callbacks internally use net_generic. +net_generic is allocated by register_pernet_device() but this +is called after genl_register_family() in the __init function. +So, these callbacks would use uninitialized net_generic. 
+ +Test commands: + #SHELL1 + while : + do + modprobe tipc + modprobe -rv tipc + done + + #SHELL2 + while : + do + tipc link list + done + +Splat looks like: +[ 59.616322][ T2788] kasan: CONFIG_KASAN_INLINE enabled +[ 59.617234][ T2788] kasan: GPF could be caused by NULL-ptr deref or user memory access +[ 59.618398][ T2788] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI +[ 59.619389][ T2788] CPU: 3 PID: 2788 Comm: tipc Not tainted 5.4.0+ #194 +[ 59.620231][ T2788] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 +[ 59.621428][ T2788] RIP: 0010:tipc_bcast_get_broadcast_mode+0x131/0x310 [tipc] +[ 59.622379][ T2788] Code: c7 c6 ef 8b 38 c0 65 ff 0d 84 83 c9 3f e8 d7 a5 f2 e3 48 8d bb 38 11 00 00 48 b8 00 00 00 00 +[ 59.622550][ T2780] NET: Registered protocol family 30 +[ 59.624627][ T2788] RSP: 0018:ffff88804b09f578 EFLAGS: 00010202 +[ 59.624630][ T2788] RAX: dffffc0000000000 RBX: 0000000000000011 RCX: 000000008bc66907 +[ 59.624631][ T2788] RDX: 0000000000000229 RSI: 000000004b3cf4cc RDI: 0000000000001149 +[ 59.624633][ T2788] RBP: ffff88804b09f588 R08: 0000000000000003 R09: fffffbfff4fb3df1 +[ 59.624635][ T2788] R10: fffffbfff50318f8 R11: ffff888066cadc18 R12: ffffffffa6cc2f40 +[ 59.624637][ T2788] R13: 1ffff11009613eba R14: ffff8880662e9328 R15: ffff8880662e9328 +[ 59.624639][ T2788] FS: 00007f57d8f7b740(0000) GS:ffff88806cc00000(0000) knlGS:0000000000000000 +[ 59.624645][ T2788] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 59.625875][ T2780] tipc: Started in single node mode +[ 59.626128][ T2788] CR2: 00007f57d887a8c0 CR3: 000000004b140002 CR4: 00000000000606e0 +[ 59.633991][ T2788] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 59.635195][ T2788] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 59.636478][ T2788] Call Trace: +[ 59.637025][ T2788] tipc_nl_add_bc_link+0x179/0x1470 [tipc] +[ 59.638219][ T2788] ? lock_downgrade+0x6e0/0x6e0 +[ 59.638923][ T2788] ? __tipc_nl_add_link+0xf90/0xf90 [tipc] +[ 59.639533][ T2788] ? tipc_nl_node_dump_link+0x318/0xa50 [tipc] +[ 59.640160][ T2788] ? mutex_lock_io_nested+0x1380/0x1380 +[ 59.640746][ T2788] tipc_nl_node_dump_link+0x4fd/0xa50 [tipc] +[ 59.641356][ T2788] ? tipc_nl_node_reset_link_stats+0x340/0x340 [tipc] +[ 59.642088][ T2788] ? __skb_ext_del+0x270/0x270 +[ 59.642594][ T2788] genl_lock_dumpit+0x85/0xb0 +[ 59.643050][ T2788] netlink_dump+0x49c/0xed0 +[ 59.643529][ T2788] ? __netlink_sendskb+0xc0/0xc0 +[ 59.644044][ T2788] ? __netlink_dump_start+0x190/0x800 +[ 59.644617][ T2788] ? __mutex_unlock_slowpath+0xd0/0x670 +[ 59.645177][ T2788] __netlink_dump_start+0x5a0/0x800 +[ 59.645692][ T2788] genl_rcv_msg+0xa75/0xe90 +[ 59.646144][ T2788] ? __lock_acquire+0xdfe/0x3de0 +[ 59.646692][ T2788] ? genl_family_rcv_msg_attrs_parse+0x320/0x320 +[ 59.647340][ T2788] ? genl_lock_dumpit+0xb0/0xb0 +[ 59.647821][ T2788] ? genl_unlock+0x20/0x20 +[ 59.648290][ T2788] ? genl_parallel_done+0xe0/0xe0 +[ 59.648787][ T2788] ? find_held_lock+0x39/0x1d0 +[ 59.649276][ T2788] ? genl_rcv+0x15/0x40 +[ 59.649722][ T2788] ? lock_contended+0xcd0/0xcd0 +[ 59.650296][ T2788] netlink_rcv_skb+0x121/0x350 +[ 59.650828][ T2788] ? genl_family_rcv_msg_attrs_parse+0x320/0x320 +[ 59.651491][ T2788] ? netlink_ack+0x940/0x940 +[ 59.651953][ T2788] ? lock_acquire+0x164/0x3b0 +[ 59.652449][ T2788] genl_rcv+0x24/0x40 +[ 59.652841][ T2788] netlink_unicast+0x421/0x600 +[ ... 
] + +Fixes: 7e4369057806 ("tipc: fix a slab object leak") +Fixes: a62fbccecd62 ("tipc: make subscriber server support net namespace") +Signed-off-by: Taehee Yoo +Acked-by: Jon Maloy +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/tipc/core.c | 29 +++++++++++++++-------------- + 1 file changed, 15 insertions(+), 14 deletions(-) + +--- a/net/tipc/core.c ++++ b/net/tipc/core.c +@@ -122,14 +122,6 @@ static int __init tipc_init(void) + sysctl_tipc_rmem[1] = RCVBUF_DEF; + sysctl_tipc_rmem[2] = RCVBUF_MAX; + +- err = tipc_netlink_start(); +- if (err) +- goto out_netlink; +- +- err = tipc_netlink_compat_start(); +- if (err) +- goto out_netlink_compat; +- + err = tipc_register_sysctl(); + if (err) + goto out_sysctl; +@@ -150,8 +142,21 @@ static int __init tipc_init(void) + if (err) + goto out_bearer; + ++ err = tipc_netlink_start(); ++ if (err) ++ goto out_netlink; ++ ++ err = tipc_netlink_compat_start(); ++ if (err) ++ goto out_netlink_compat; ++ + pr_info("Started in single node mode\n"); + return 0; ++ ++out_netlink_compat: ++ tipc_netlink_stop(); ++out_netlink: ++ tipc_bearer_cleanup(); + out_bearer: + unregister_pernet_device(&tipc_topsrv_net_ops); + out_pernet_topsrv: +@@ -161,22 +166,18 @@ out_socket: + out_pernet: + tipc_unregister_sysctl(); + out_sysctl: +- tipc_netlink_compat_stop(); +-out_netlink_compat: +- tipc_netlink_stop(); +-out_netlink: + pr_err("Unable to start in single node mode\n"); + return err; + } + + static void __exit tipc_exit(void) + { ++ tipc_netlink_compat_stop(); ++ tipc_netlink_stop(); + tipc_bearer_cleanup(); + unregister_pernet_device(&tipc_topsrv_net_ops); + tipc_socket_stop(); + unregister_pernet_device(&tipc_net_ops); +- tipc_netlink_stop(); +- tipc_netlink_compat_stop(); + tipc_unregister_sysctl(); + + pr_info("Deactivated\n"); diff --git a/queue-5.3/xdp-obtain-the-mem_id-mutex-before-trying-to-remove-an-entry.patch b/queue-5.3/xdp-obtain-the-mem_id-mutex-before-trying-to-remove-an-entry.patch new file mode 100644 index 00000000000..db4cc11cb6c --- /dev/null +++ b/queue-5.3/xdp-obtain-the-mem_id-mutex-before-trying-to-remove-an-entry.patch @@ -0,0 +1,60 @@ +From foo@baz Tue 17 Dec 2019 08:14:58 PM CET +From: Jonathan Lemon +Date: Tue, 3 Dec 2019 14:01:14 -0800 +Subject: xdp: obtain the mem_id mutex before trying to remove an entry. + +From: Jonathan Lemon + +[ Upstream commit 86c76c09898332143be365c702cf8d586ed4ed21 ] + +A lockdep splat was observed when trying to remove an xdp memory +model from the table since the mutex was obtained when trying to +remove the entry, but not before the table walk started: + +Fix the splat by obtaining the lock before starting the table walk. + +Fixes: c3f812cea0d7 ("page_pool: do not release pool until inflight == 0.") +Reported-by: Grygorii Strashko +Signed-off-by: Jonathan Lemon +Tested-by: Grygorii Strashko +Acked-by: Jesper Dangaard Brouer +Acked-by: Ilias Apalodimas +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/xdp.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/net/core/xdp.c ++++ b/net/core/xdp.c +@@ -85,12 +85,8 @@ static void mem_xa_remove(struct xdp_mem + { + trace_mem_disconnect(xa); + +- mutex_lock(&mem_id_lock); +- + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); +- +- mutex_unlock(&mem_id_lock); + } + + static void mem_allocator_disconnect(void *allocator) +@@ -98,6 +94,8 @@ static void mem_allocator_disconnect(voi + struct xdp_mem_allocator *xa; + struct rhashtable_iter iter; + ++ mutex_lock(&mem_id_lock); ++ + rhashtable_walk_enter(mem_id_ht, &iter); + do { + rhashtable_walk_start(&iter); +@@ -111,6 +109,8 @@ static void mem_allocator_disconnect(voi + + } while (xa == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); ++ ++ mutex_unlock(&mem_id_lock); + } + + static void mem_id_disconnect(int id)