git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.10-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 1 Mar 2021 14:54:00 +0000 (15:54 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 1 Mar 2021 14:54:00 +0000 (15:54 +0100)
added patches:
ipv6-silence-compilation-warning-for-non-ipv6-builds.patch
net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch
net-sched-fix-police-ext-initialization.patch
wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch
wireguard-selftests-test-multiple-parallel-streams.patch

queue-5.10/ipv6-silence-compilation-warning-for-non-ipv6-builds.patch [new file with mode: 0644]
queue-5.10/net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch [new file with mode: 0644]
queue-5.10/net-sched-fix-police-ext-initialization.patch [new file with mode: 0644]
queue-5.10/series
queue-5.10/wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch [new file with mode: 0644]
queue-5.10/wireguard-selftests-test-multiple-parallel-streams.patch [new file with mode: 0644]

diff --git a/queue-5.10/ipv6-silence-compilation-warning-for-non-ipv6-builds.patch b/queue-5.10/ipv6-silence-compilation-warning-for-non-ipv6-builds.patch
new file mode 100644 (file)
index 0000000..d2b5173
--- /dev/null
@@ -0,0 +1,37 @@
+From 1faba27f11c8da244e793546a1b35a9b1da8208e Mon Sep 17 00:00:00 2001
+From: Leon Romanovsky <leonro@nvidia.com>
+Date: Wed, 3 Feb 2021 15:51:09 +0200
+Subject: ipv6: silence compilation warning for non-IPV6 builds
+
+From: Leon Romanovsky <leonro@nvidia.com>
+
+commit 1faba27f11c8da244e793546a1b35a9b1da8208e upstream.
+
+The W=1 compilation of allmodconfig generates the following warning:
+
+net/ipv6/icmp.c:448:6: warning: no previous prototype for 'icmp6_send' [-Wmissing-prototypes]
+  448 | void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+      |      ^~~~~~~~~~
+
+Fix it by providing a function declaration for builds with IPv6 as a module.
+
+Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/icmpv6.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/include/linux/icmpv6.h
++++ b/include/linux/icmpv6.h
+@@ -16,9 +16,9 @@ static inline struct icmp6hdr *icmp6_hdr
+ typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+                            const struct in6_addr *force_saddr);
+-#if IS_BUILTIN(CONFIG_IPV6)
+ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+               const struct in6_addr *force_saddr);
++#if IS_BUILTIN(CONFIG_IPV6)
+ static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+ {
+       icmp6_send(skb, type, code, info, NULL);
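
The warning above is gcc's -Wmissing-prototypes: it fires whenever a global
function is defined without a previous prototype in scope, which is what
happened to icmp6_send() once its only declaration was hidden behind
IS_BUILTIN(CONFIG_IPV6) and IPv6 was built as a module. A minimal standalone
sketch of the same pattern (illustrative names, not kernel code):

    /* sketch.c - illustrative only, not kernel code.
     * Build with:  gcc -c -Wmissing-prototypes sketch.c
     * Omitting -DFOO_BUILTIN models the "IPv6 built as a module" configuration. */

    #ifdef FOO_BUILTIN
    void foo_send(int type, int code);   /* old placement: prototype visible only when built in */
    #endif

    void foo_send(int type, int code)    /* modular build: "no previous prototype for 'foo_send'" */
    {
            (void)type;
            (void)code;
    }

Moving the declaration above the conditional, as the patch does in
include/linux/icmpv6.h, gives every configuration a visible prototype and
silences the warning without changing any generated code.
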
diff --git a/queue-5.10/net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch b/queue-5.10/net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch
new file mode 100644 (file)
index 0000000..e4a4717
--- /dev/null
@@ -0,0 +1,338 @@
+From ee576c47db60432c37e54b1e2b43a8ca6d3a8dca Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 23 Feb 2021 14:18:58 +0100
+Subject: net: icmp: pass zeroed opts from icmp{,v6}_ndo_send before sending
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit ee576c47db60432c37e54b1e2b43a8ca6d3a8dca upstream.
+
+The icmp{,v6}_send functions make all sorts of use of skb->cb, casting
+it with IPCB or IP6CB, assuming the skb to have come directly from the
+inet layer. But when the packet comes from the ndo layer, especially
+when forwarded, there's no telling what might be in skb->cb at that
+point. As a result, the icmp sending code risks reading bogus memory
+contents, which can result in nasty stack overflows such as this one
+reported by a user:
+
+    panic+0x108/0x2ea
+    __stack_chk_fail+0x14/0x20
+    __icmp_send+0x5bd/0x5c0
+    icmp_ndo_send+0x148/0x160
+
+In icmp_send, skb->cb is cast with IPCB and an ip_options struct is read
+from it. The optlen parameter there is of particular note, as it can
+induce writes beyond bounds. There are quite a few ways that can happen
+in __ip_options_echo. For example:
+
+    // sptr/skb are attacker-controlled skb bytes
+    sptr = skb_network_header(skb);
+    // dptr/dopt points to stack memory allocated by __icmp_send
+    dptr = dopt->__data;
+    // sopt is the corrupt skb->cb in question
+    if (sopt->rr) {
+        optlen  = sptr[sopt->rr+1]; // corrupt skb->cb + skb->data
+        soffset = sptr[sopt->rr+2]; // corrupt skb->cb + skb->data
+       // this now writes potentially attacker-controlled data, over
+       // flowing the stack:
+        memcpy(dptr, sptr+sopt->rr, optlen);
+    }
+
+In the icmpv6_send case, the story is similar, but not as dire, as only
+IP6CB(skb)->iif and IP6CB(skb)->dsthao are used. The dsthao case is
+worse than the iif case, but it is passed to ipv6_find_tlv, which does
+a bit of bounds checking on the value.
+
+This is easy to simulate by doing a `memset(skb->cb, 0x41,
+sizeof(skb->cb));` before calling icmp{,v6}_ndo_send, and it's only by
+good fortune and the rarity of icmp sending from that context that we've
+avoided reports like this until now. For example, in KASAN:
+
+    BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0xa0e/0x12b0
+    Write of size 38 at addr ffff888006f1f80e by task ping/89
+    CPU: 2 PID: 89 Comm: ping Not tainted 5.10.0-rc7-debug+ #5
+    Call Trace:
+     dump_stack+0x9a/0xcc
+     print_address_description.constprop.0+0x1a/0x160
+     __kasan_report.cold+0x20/0x38
+     kasan_report+0x32/0x40
+     check_memory_region+0x145/0x1a0
+     memcpy+0x39/0x60
+     __ip_options_echo+0xa0e/0x12b0
+     __icmp_send+0x744/0x1700
+
+Actually, out of the 4 drivers that do this, only gtp zeroed the cb for
+the v4 case, while the rest did not. So this commit actually removes the
+gtp-specific zeroing, while putting the code where it belongs in the
+shared infrastructure of icmp{,v6}_ndo_send.
+
+This commit fixes the issue by passing an empty IPCB or IP6CB along to
+the functions that actually do the work. For icmp_send, this was
+already trivial, thanks to __icmp_send providing the plumbing function.
+For icmpv6_send, this required a tiny bit of refactoring to make it
+behave like the v4 case, after which it was straightforward.
+
+Fixes: a2b78e9b2cac ("sunvnet: generate ICMP PTMUD messages for smaller port MTUs")
+Reported-by: SinYu <liuxyon@gmail.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Link: https://lore.kernel.org/netdev/CAF=yD-LOF116aHub6RMe8vB8ZpnrrnoTdqhobEx+bvoA8AsP0w@mail.gmail.com/T/
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Link: https://lore.kernel.org/r/20210223131858.72082-1-Jason@zx2c4.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/gtp.c      |    1 -
+ include/linux/icmpv6.h |   26 ++++++++++++++++++++------
+ include/linux/ipv6.h   |    1 -
+ include/net/icmp.h     |    6 +++++-
+ net/ipv4/icmp.c        |    5 +++--
+ net/ipv6/icmp.c        |   18 +++++++++---------
+ net/ipv6/ip6_icmp.c    |   12 +++++++-----
+ 7 files changed, 44 insertions(+), 25 deletions(-)
+
+--- a/drivers/net/gtp.c
++++ b/drivers/net/gtp.c
+@@ -539,7 +539,6 @@ static int gtp_build_skb_ip4(struct sk_b
+       if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) &&
+           mtu < ntohs(iph->tot_len)) {
+               netdev_dbg(dev, "packet too big, fragmentation needed\n");
+-              memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+               icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+                             htonl(mtu));
+               goto err_rt;
+--- a/include/linux/icmpv6.h
++++ b/include/linux/icmpv6.h
+@@ -3,6 +3,7 @@
+ #define _LINUX_ICMPV6_H
+ #include <linux/skbuff.h>
++#include <linux/ipv6.h>
+ #include <uapi/linux/icmpv6.h>
+ static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb)
+@@ -15,13 +16,16 @@ static inline struct icmp6hdr *icmp6_hdr
+ #if IS_ENABLED(CONFIG_IPV6)
+ typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+-                           const struct in6_addr *force_saddr);
++                           const struct in6_addr *force_saddr,
++                           const struct inet6_skb_parm *parm);
+ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+-              const struct in6_addr *force_saddr);
++              const struct in6_addr *force_saddr,
++              const struct inet6_skb_parm *parm);
+ #if IS_BUILTIN(CONFIG_IPV6)
+-static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
++static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
++                               const struct inet6_skb_parm *parm)
+ {
+-      icmp6_send(skb, type, code, info, NULL);
++      icmp6_send(skb, type, code, info, NULL, parm);
+ }
+ static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn)
+ {
+@@ -34,18 +38,28 @@ static inline int inet6_unregister_icmp_
+       return 0;
+ }
+ #else
+-extern void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info);
++extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
++                        const struct inet6_skb_parm *parm);
+ extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn);
+ extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn);
+ #endif
++static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
++{
++      __icmpv6_send(skb, type, code, info, IP6CB(skb));
++}
++
+ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
+                              unsigned int data_len);
+ #if IS_ENABLED(CONFIG_NF_NAT)
+ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
+ #else
+-#define icmpv6_ndo_send icmpv6_send
++static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
++{
++      struct inet6_skb_parm parm = { 0 };
++      __icmpv6_send(skb_in, type, code, info, &parm);
++}
+ #endif
+ #else
+--- a/include/linux/ipv6.h
++++ b/include/linux/ipv6.h
+@@ -84,7 +84,6 @@ struct ipv6_params {
+       __s32 autoconf;
+ };
+ extern struct ipv6_params ipv6_defaults;
+-#include <linux/icmpv6.h>
+ #include <linux/tcp.h>
+ #include <linux/udp.h>
+--- a/include/net/icmp.h
++++ b/include/net/icmp.h
+@@ -46,7 +46,11 @@ static inline void icmp_send(struct sk_b
+ #if IS_ENABLED(CONFIG_NF_NAT)
+ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info);
+ #else
+-#define icmp_ndo_send icmp_send
++static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
++{
++      struct ip_options opts = { 0 };
++      __icmp_send(skb_in, type, code, info, &opts);
++}
+ #endif
+ int icmp_rcv(struct sk_buff *skb);
+--- a/net/ipv4/icmp.c
++++ b/net/ipv4/icmp.c
+@@ -775,13 +775,14 @@ EXPORT_SYMBOL(__icmp_send);
+ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+ {
+       struct sk_buff *cloned_skb = NULL;
++      struct ip_options opts = { 0 };
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *ct;
+       __be32 orig_ip;
+       ct = nf_ct_get(skb_in, &ctinfo);
+       if (!ct || !(ct->status & IPS_SRC_NAT)) {
+-              icmp_send(skb_in, type, code, info);
++              __icmp_send(skb_in, type, code, info, &opts);
+               return;
+       }
+@@ -796,7 +797,7 @@ void icmp_ndo_send(struct sk_buff *skb_i
+       orig_ip = ip_hdr(skb_in)->saddr;
+       ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
+-      icmp_send(skb_in, type, code, info);
++      __icmp_send(skb_in, type, code, info, &opts);
+       ip_hdr(skb_in)->saddr = orig_ip;
+ out:
+       consume_skb(cloned_skb);
+--- a/net/ipv6/icmp.c
++++ b/net/ipv6/icmp.c
+@@ -331,10 +331,9 @@ static int icmpv6_getfrag(void *from, ch
+ }
+ #if IS_ENABLED(CONFIG_IPV6_MIP6)
+-static void mip6_addr_swap(struct sk_buff *skb)
++static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt)
+ {
+       struct ipv6hdr *iph = ipv6_hdr(skb);
+-      struct inet6_skb_parm *opt = IP6CB(skb);
+       struct ipv6_destopt_hao *hao;
+       struct in6_addr tmp;
+       int off;
+@@ -351,7 +350,7 @@ static void mip6_addr_swap(struct sk_buf
+       }
+ }
+ #else
+-static inline void mip6_addr_swap(struct sk_buff *skb) {}
++static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {}
+ #endif
+ static struct dst_entry *icmpv6_route_lookup(struct net *net,
+@@ -446,7 +445,8 @@ static int icmp6_iif(const struct sk_buf
+  *    Send an ICMP message in response to a packet in error
+  */
+ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+-              const struct in6_addr *force_saddr)
++              const struct in6_addr *force_saddr,
++              const struct inet6_skb_parm *parm)
+ {
+       struct inet6_dev *idev = NULL;
+       struct ipv6hdr *hdr = ipv6_hdr(skb);
+@@ -542,7 +542,7 @@ void icmp6_send(struct sk_buff *skb, u8
+       if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type))
+               goto out_bh_enable;
+-      mip6_addr_swap(skb);
++      mip6_addr_swap(skb, parm);
+       sk = icmpv6_xmit_lock(net);
+       if (!sk)
+@@ -559,7 +559,7 @@ void icmp6_send(struct sk_buff *skb, u8
+               /* select a more meaningful saddr from input if */
+               struct net_device *in_netdev;
+-              in_netdev = dev_get_by_index(net, IP6CB(skb)->iif);
++              in_netdev = dev_get_by_index(net, parm->iif);
+               if (in_netdev) {
+                       ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr,
+                                          inet6_sk(sk)->srcprefs,
+@@ -640,7 +640,7 @@ EXPORT_SYMBOL(icmp6_send);
+  */
+ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
+ {
+-      icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL);
++      icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb));
+       kfree_skb(skb);
+ }
+@@ -697,10 +697,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk
+       }
+       if (type == ICMP_TIME_EXCEEDED)
+               icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
+-                         info, &temp_saddr);
++                         info, &temp_saddr, IP6CB(skb2));
+       else
+               icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
+-                         info, &temp_saddr);
++                         info, &temp_saddr, IP6CB(skb2));
+       if (rt)
+               ip6_rt_put(rt);
+--- a/net/ipv6/ip6_icmp.c
++++ b/net/ipv6/ip6_icmp.c
+@@ -33,23 +33,25 @@ int inet6_unregister_icmp_sender(ip6_icm
+ }
+ EXPORT_SYMBOL(inet6_unregister_icmp_sender);
+-void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
++void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
++                 const struct inet6_skb_parm *parm)
+ {
+       ip6_icmp_send_t *send;
+       rcu_read_lock();
+       send = rcu_dereference(ip6_icmp_send);
+       if (send)
+-              send(skb, type, code, info, NULL);
++              send(skb, type, code, info, NULL, parm);
+       rcu_read_unlock();
+ }
+-EXPORT_SYMBOL(icmpv6_send);
++EXPORT_SYMBOL(__icmpv6_send);
+ #endif
+ #if IS_ENABLED(CONFIG_NF_NAT)
+ #include <net/netfilter/nf_conntrack.h>
+ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
+ {
++      struct inet6_skb_parm parm = { 0 };
+       struct sk_buff *cloned_skb = NULL;
+       enum ip_conntrack_info ctinfo;
+       struct in6_addr orig_ip;
+@@ -57,7 +59,7 @@ void icmpv6_ndo_send(struct sk_buff *skb
+       ct = nf_ct_get(skb_in, &ctinfo);
+       if (!ct || !(ct->status & IPS_SRC_NAT)) {
+-              icmpv6_send(skb_in, type, code, info);
++              __icmpv6_send(skb_in, type, code, info, &parm);
+               return;
+       }
+@@ -72,7 +74,7 @@ void icmpv6_ndo_send(struct sk_buff *skb
+       orig_ip = ipv6_hdr(skb_in)->saddr;
+       ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6;
+-      icmpv6_send(skb_in, type, code, info);
++      __icmpv6_send(skb_in, type, code, info, &parm);
+       ipv6_hdr(skb_in)->saddr = orig_ip;
+ out:
+       consume_skb(cloned_skb);
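
The overflow described above is easy to model outside the kernel. The sketch
below is illustrative only (the struct, field names and sizes are invented;
this is not the real __ip_options_echo), but it shows why a length pulled out
of a stale skb->cb is dangerous and why handing the helpers a zeroed options
structure makes the copy disappear:

    /* cb_model.c - simplified model of the stale-skb->cb problem; not kernel code. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct fake_opts { uint8_t rr; };    /* stand-in for the options copy kept in skb->cb */

    /* How many bytes an __ip_options_echo-style loop would copy into a
     * 40-byte on-stack buffer, given the packet and the opts it trusts. */
    static size_t bytes_copied(const uint8_t *pkt, const struct fake_opts *sopt)
    {
            if (!sopt->rr)               /* zeroed opts: nothing to echo, nothing copied */
                    return 0;
            return pkt[sopt->rr + 1];    /* length taken from attacker-controlled packet bytes */
    }

    int main(void)
    {
            uint8_t pkt[64];
            struct fake_opts stale  = { .rr = 4 };  /* whatever was left behind in skb->cb */
            struct fake_opts zeroed = { 0 };        /* what the ndo helpers pass after this patch */

            memset(pkt, 0, sizeof(pkt));
            pkt[5] = 0xff;                          /* byte the packet's sender controls */

            printf("stale cb : would copy %zu bytes into a 40-byte stack buffer\n",
                   bytes_copied(pkt, &stale));      /* 255 - the reported stack overflow */
            printf("zeroed cb: would copy %zu bytes\n", bytes_copied(pkt, &zeroed));
            return 0;
    }

This mirrors what the patch does: with NF_NAT disabled, the new inline
icmp_ndo_send()/icmpv6_ndo_send() build a zeroed struct ip_options or
struct inet6_skb_parm on the stack and hand it to __icmp_send()/__icmpv6_send(),
and the NF_NAT variants in net/ipv4/icmp.c and net/ipv6/ip6_icmp.c do the same,
so nothing from a forwarded skb's control block is ever interpreted as options.
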
diff --git a/queue-5.10/net-sched-fix-police-ext-initialization.patch b/queue-5.10/net-sched-fix-police-ext-initialization.patch
new file mode 100644 (file)
index 0000000..cd4b4f0
--- /dev/null
@@ -0,0 +1,119 @@
+From 396d7f23adf9e8c436dd81a69488b5b6a865acf8 Mon Sep 17 00:00:00 2001
+From: Vlad Buslov <vladbu@nvidia.com>
+Date: Tue, 16 Feb 2021 18:22:00 +0200
+Subject: net: sched: fix police ext initialization
+
+From: Vlad Buslov <vladbu@nvidia.com>
+
+commit 396d7f23adf9e8c436dd81a69488b5b6a865acf8 upstream.
+
+When a police action is created by the cls API via the first conditional
+in tcf_exts_validate(), which calls tcf_action_init_1() directly, the
+action idr is not updated according to the latest changes in the action
+API that require the caller to commit a newly created action to the idr
+with tcf_idr_insert_many(). This results in such an action not being
+accessible through the act API and causes the crash reported by syzbot:
+
+==================================================================
+BUG: KASAN: null-ptr-deref in instrument_atomic_read include/linux/instrumented.h:71 [inline]
+BUG: KASAN: null-ptr-deref in atomic_read include/asm-generic/atomic-instrumented.h:27 [inline]
+BUG: KASAN: null-ptr-deref in __tcf_idr_release net/sched/act_api.c:178 [inline]
+BUG: KASAN: null-ptr-deref in tcf_idrinfo_destroy+0x129/0x1d0 net/sched/act_api.c:598
+Read of size 4 at addr 0000000000000010 by task kworker/u4:5/204
+
+CPU: 0 PID: 204 Comm: kworker/u4:5 Not tainted 5.11.0-rc7-syzkaller #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+Workqueue: netns cleanup_net
+Call Trace:
+ __dump_stack lib/dump_stack.c:79 [inline]
+ dump_stack+0x107/0x163 lib/dump_stack.c:120
+ __kasan_report mm/kasan/report.c:400 [inline]
+ kasan_report.cold+0x5f/0xd5 mm/kasan/report.c:413
+ check_memory_region_inline mm/kasan/generic.c:179 [inline]
+ check_memory_region+0x13d/0x180 mm/kasan/generic.c:185
+ instrument_atomic_read include/linux/instrumented.h:71 [inline]
+ atomic_read include/asm-generic/atomic-instrumented.h:27 [inline]
+ __tcf_idr_release net/sched/act_api.c:178 [inline]
+ tcf_idrinfo_destroy+0x129/0x1d0 net/sched/act_api.c:598
+ tc_action_net_exit include/net/act_api.h:151 [inline]
+ police_exit_net+0x168/0x360 net/sched/act_police.c:390
+ ops_exit_list+0x10d/0x160 net/core/net_namespace.c:190
+ cleanup_net+0x4ea/0xb10 net/core/net_namespace.c:604
+ process_one_work+0x98d/0x15f0 kernel/workqueue.c:2275
+ worker_thread+0x64c/0x1120 kernel/workqueue.c:2421
+ kthread+0x3b1/0x4a0 kernel/kthread.c:292
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296
+==================================================================
+Kernel panic - not syncing: panic_on_warn set ...
+CPU: 0 PID: 204 Comm: kworker/u4:5 Tainted: G    B             5.11.0-rc7-syzkaller #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+Workqueue: netns cleanup_net
+Call Trace:
+ __dump_stack lib/dump_stack.c:79 [inline]
+ dump_stack+0x107/0x163 lib/dump_stack.c:120
+ panic+0x306/0x73d kernel/panic.c:231
+ end_report+0x58/0x5e mm/kasan/report.c:100
+ __kasan_report mm/kasan/report.c:403 [inline]
+ kasan_report.cold+0x67/0xd5 mm/kasan/report.c:413
+ check_memory_region_inline mm/kasan/generic.c:179 [inline]
+ check_memory_region+0x13d/0x180 mm/kasan/generic.c:185
+ instrument_atomic_read include/linux/instrumented.h:71 [inline]
+ atomic_read include/asm-generic/atomic-instrumented.h:27 [inline]
+ __tcf_idr_release net/sched/act_api.c:178 [inline]
+ tcf_idrinfo_destroy+0x129/0x1d0 net/sched/act_api.c:598
+ tc_action_net_exit include/net/act_api.h:151 [inline]
+ police_exit_net+0x168/0x360 net/sched/act_police.c:390
+ ops_exit_list+0x10d/0x160 net/core/net_namespace.c:190
+ cleanup_net+0x4ea/0xb10 net/core/net_namespace.c:604
+ process_one_work+0x98d/0x15f0 kernel/workqueue.c:2275
+ worker_thread+0x64c/0x1120 kernel/workqueue.c:2421
+ kthread+0x3b1/0x4a0 kernel/kthread.c:292
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296
+Kernel Offset: disabled
+
+Fix the issue by calling tcf_idr_insert_many() after successful action
+initialization.
+
+Fixes: 0fedc63fadf0 ("net_sched: commit action insertions together")
+Reported-by: syzbot+151e3e714d34ae4ce7e8@syzkaller.appspotmail.com
+Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
+Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/act_api.h |    1 +
+ net/sched/act_api.c   |    2 +-
+ net/sched/cls_api.c   |    1 +
+ 3 files changed, 3 insertions(+), 1 deletion(-)
+
+--- a/include/net/act_api.h
++++ b/include/net/act_api.h
+@@ -166,6 +166,7 @@ int tcf_idr_create_from_flags(struct tc_
+                             struct nlattr *est, struct tc_action **a,
+                             const struct tc_action_ops *ops, int bind,
+                             u32 flags);
++void tcf_idr_insert_many(struct tc_action *actions[]);
+ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index);
+ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
+                       struct tc_action **a, int bind);
+--- a/net/sched/act_api.c
++++ b/net/sched/act_api.c
+@@ -888,7 +888,7 @@ static const struct nla_policy tcf_actio
+       [TCA_ACT_HW_STATS]      = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY),
+ };
+-static void tcf_idr_insert_many(struct tc_action *actions[])
++void tcf_idr_insert_many(struct tc_action *actions[])
+ {
+       int i;
+--- a/net/sched/cls_api.c
++++ b/net/sched/cls_api.c
+@@ -3065,6 +3065,7 @@ int tcf_exts_validate(struct net *net, s
+                       act->type = exts->type = TCA_OLD_COMPAT;
+                       exts->actions[0] = act;
+                       exts->nr_actions = 1;
++                      tcf_idr_insert_many(exts->actions);
+               } else if (exts->action && tb[exts->action]) {
+                       int err;
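
The underlying pattern here is reserve-then-publish: tcf_action_init_1()
allocates and initialises the action against a reserved idr slot, but until
tcf_idr_insert_many() commits it, the idr does not hold a usable pointer to the
action, and teardown walking the idr can hit the unpublished entry, which is the
null-ptr-deref in tcf_idrinfo_destroy() seen in the syzbot report. A rough,
generic sketch of that idea (plain C, invented names, not the tc action API):

    /* publish_model.c - illustrative reserve/commit model; not the kernel idr. */
    #include <stdlib.h>

    #define SLOTS 4

    struct act { int refcnt; };

    static struct act *table[SLOTS];     /* stand-in for the per-netns action idr */

    /* Init path: reserve a slot and build the object (tcf_action_init_1()-ish). */
    static struct act *act_create(int idx)
    {
            table[idx] = NULL;           /* slot reserved, object not yet published */
            struct act *a = calloc(1, sizeof(*a));
            a->refcnt = 1;
            return a;
    }

    /* Publish step the cls API path was missing (tcf_idr_insert_many()-ish). */
    static void act_commit(int idx, struct act *a)
    {
            table[idx] = a;
    }

    /* Namespace teardown: walk the table and release every published action. */
    static void table_destroy(void)
    {
            for (int i = 0; i < SLOTS; i++) {
                    struct act *a = table[i];
                    if (!a)              /* the real code tripped over the unpublished entry here */
                            continue;
                    if (--a->refcnt == 0)
                            free(a);
            }
    }

    int main(void)
    {
            struct act *a = act_create(0);
            act_commit(0, a);            /* the call the patch adds to tcf_exts_validate() */
            table_destroy();
            return 0;
    }
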
diff --git a/queue-5.10/series b/queue-5.10/series
index b20ed6419800554fcc64cb13e7ccefffde0c3e76..cdfb9c934be19f3c6984b2e81a9a7681fa888cca 100644 (file)
--- a/queue-5.10/series
@@ -654,3 +654,8 @@ dm-era-reinitialize-bitset-cache-before-digesting-a-new-writeset.patch
 dm-era-only-resize-metadata-in-preresume.patch
 drm-i915-reject-446-480mhz-hdmi-clock-on-glk.patch
 kgdb-fix-to-kill-breakpoints-on-initmem-after-boot.patch
+ipv6-silence-compilation-warning-for-non-ipv6-builds.patch
+net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch
+wireguard-selftests-test-multiple-parallel-streams.patch
+wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch
+net-sched-fix-police-ext-initialization.patch
diff --git a/queue-5.10/wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch b/queue-5.10/wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch
new file mode 100644 (file)
index 0000000..81ccba5
--- /dev/null
@@ -0,0 +1,562 @@
+From 8b5553ace83cced775eefd0f3f18b5c6214ccf7a Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 22 Feb 2021 17:25:48 +0100
+Subject: wireguard: queueing: get rid of per-peer ring buffers
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit 8b5553ace83cced775eefd0f3f18b5c6214ccf7a upstream.
+
+Having two ring buffers per-peer means that every peer results in two
+massive ring allocations. On an 8-core x86_64 machine, this commit
+reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which
+is a 90% reduction. Ninety percent! With some single-machine
+deployments approaching 500,000 peers, we're talking about a reduction
+from 7 gigs of memory down to 700 megs of memory.
+
+In order to get rid of these per-peer allocations, this commit switches
+to using a list-based queueing approach. Currently GSO fragments are
+chained together using the skb->next pointer (the skb_list_* singly
+linked list approach), so we form the per-peer queue around the unused
+skb->prev pointer (which sort of makes sense because the links are
+pointing backwards). Use of skb_queue_* is not possible here, because
+that is based on doubly linked lists and spinlocks. Multiple cores can
+write into the queue at any given time, because its writes occur in the
+start_xmit path or in the udp_recv path. But reads happen in a single
+workqueue item per-peer, amounting to a multi-producer, single-consumer
+paradigm.
+
+The MPSC queue is implemented locklessly and never blocks. However, it
+is not linearizable (though it is serializable), with a very tight and
+unlikely race on writes, which, when hit (some tiny fraction of the
+0.15% of partial adds on a fully loaded 16-core x86_64 system), causes
+the queue reader to terminate early. However, because every packet sent
+queues up the same workqueue item after it is fully added, the worker
+resumes again, and stopping early isn't actually a problem, since at
+that point the packet wouldn't have yet been added to the encryption
+queue. These properties allow us to avoid disabling interrupts or
+spinning. The design is based on Dmitry Vyukov's algorithm [1].
+
+Performance-wise, ordinarily list-based queues aren't preferable to
+ringbuffers, because of cache misses when following pointers around.
+However, we *already* have to follow the adjacent pointers when working
+through fragments, so there shouldn't actually be any change there. A
+potential downside is that dequeueing is a bit more complicated, but the
+ptr_ring structure used prior had a spinlock when dequeueing, so all in
+all the difference appears to be a wash.
+
+Actually, from profiling, the biggest performance hit, by far, of this
+commit winds up being atomic_add_unless(count, 1, max) and
+atomic_dec(count), which account for the majority of CPU time, according to
+perf. In that sense, the previous ring buffer was superior in that it
+could check if it was full by head==tail, which the list-based approach
+cannot do.
+
+But all in all, this enables us to get massive memory savings, allowing
+WireGuard to scale for real world deployments, without taking much of a
+performance hit.
+
+[1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue
+
+Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
+Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/wireguard/device.c   |   12 ++---
+ drivers/net/wireguard/device.h   |   15 +++---
+ drivers/net/wireguard/peer.c     |   28 ++++--------
+ drivers/net/wireguard/peer.h     |    4 -
+ drivers/net/wireguard/queueing.c |   86 +++++++++++++++++++++++++++++++--------
+ drivers/net/wireguard/queueing.h |   45 ++++++++++++++------
+ drivers/net/wireguard/receive.c  |   16 ++-----
+ drivers/net/wireguard/send.c     |   31 ++++----------
+ 8 files changed, 144 insertions(+), 93 deletions(-)
+
+--- a/drivers/net/wireguard/device.c
++++ b/drivers/net/wireguard/device.c
+@@ -235,8 +235,8 @@ static void wg_destruct(struct net_devic
+       destroy_workqueue(wg->handshake_receive_wq);
+       destroy_workqueue(wg->handshake_send_wq);
+       destroy_workqueue(wg->packet_crypt_wq);
+-      wg_packet_queue_free(&wg->decrypt_queue, true);
+-      wg_packet_queue_free(&wg->encrypt_queue, true);
++      wg_packet_queue_free(&wg->decrypt_queue);
++      wg_packet_queue_free(&wg->encrypt_queue);
+       rcu_barrier(); /* Wait for all the peers to be actually freed. */
+       wg_ratelimiter_uninit();
+       memzero_explicit(&wg->static_identity, sizeof(wg->static_identity));
+@@ -338,12 +338,12 @@ static int wg_newlink(struct net *src_ne
+               goto err_destroy_handshake_send;
+       ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker,
+-                                 true, MAX_QUEUED_PACKETS);
++                                 MAX_QUEUED_PACKETS);
+       if (ret < 0)
+               goto err_destroy_packet_crypt;
+       ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker,
+-                                 true, MAX_QUEUED_PACKETS);
++                                 MAX_QUEUED_PACKETS);
+       if (ret < 0)
+               goto err_free_encrypt_queue;
+@@ -368,9 +368,9 @@ static int wg_newlink(struct net *src_ne
+ err_uninit_ratelimiter:
+       wg_ratelimiter_uninit();
+ err_free_decrypt_queue:
+-      wg_packet_queue_free(&wg->decrypt_queue, true);
++      wg_packet_queue_free(&wg->decrypt_queue);
+ err_free_encrypt_queue:
+-      wg_packet_queue_free(&wg->encrypt_queue, true);
++      wg_packet_queue_free(&wg->encrypt_queue);
+ err_destroy_packet_crypt:
+       destroy_workqueue(wg->packet_crypt_wq);
+ err_destroy_handshake_send:
+--- a/drivers/net/wireguard/device.h
++++ b/drivers/net/wireguard/device.h
+@@ -27,13 +27,14 @@ struct multicore_worker {
+ struct crypt_queue {
+       struct ptr_ring ring;
+-      union {
+-              struct {
+-                      struct multicore_worker __percpu *worker;
+-                      int last_cpu;
+-              };
+-              struct work_struct work;
+-      };
++      struct multicore_worker __percpu *worker;
++      int last_cpu;
++};
++
++struct prev_queue {
++      struct sk_buff *head, *tail, *peeked;
++      struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff.
++      atomic_t count;
+ };
+ struct wg_device {
+--- a/drivers/net/wireguard/peer.c
++++ b/drivers/net/wireguard/peer.c
+@@ -32,27 +32,22 @@ struct wg_peer *wg_peer_create(struct wg
+       peer = kzalloc(sizeof(*peer), GFP_KERNEL);
+       if (unlikely(!peer))
+               return ERR_PTR(ret);
+-      peer->device = wg;
++      if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
++              goto err;
++      peer->device = wg;
+       wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
+                               public_key, preshared_key, peer);
+-      if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
+-              goto err_1;
+-      if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false,
+-                               MAX_QUEUED_PACKETS))
+-              goto err_2;
+-      if (wg_packet_queue_init(&peer->rx_queue, NULL, false,
+-                               MAX_QUEUED_PACKETS))
+-              goto err_3;
+-
+       peer->internal_id = atomic64_inc_return(&peer_counter);
+       peer->serial_work_cpu = nr_cpumask_bits;
+       wg_cookie_init(&peer->latest_cookie);
+       wg_timers_init(peer);
+       wg_cookie_checker_precompute_peer_keys(peer);
+       spin_lock_init(&peer->keypairs.keypair_update_lock);
+-      INIT_WORK(&peer->transmit_handshake_work,
+-                wg_packet_handshake_send_worker);
++      INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker);
++      INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker);
++      wg_prev_queue_init(&peer->tx_queue);
++      wg_prev_queue_init(&peer->rx_queue);
+       rwlock_init(&peer->endpoint_lock);
+       kref_init(&peer->refcount);
+       skb_queue_head_init(&peer->staged_packet_queue);
+@@ -68,11 +63,7 @@ struct wg_peer *wg_peer_create(struct wg
+       pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id);
+       return peer;
+-err_3:
+-      wg_packet_queue_free(&peer->tx_queue, false);
+-err_2:
+-      dst_cache_destroy(&peer->endpoint_cache);
+-err_1:
++err:
+       kfree(peer);
+       return ERR_PTR(ret);
+ }
+@@ -197,8 +188,7 @@ static void rcu_release(struct rcu_head
+       struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu);
+       dst_cache_destroy(&peer->endpoint_cache);
+-      wg_packet_queue_free(&peer->rx_queue, false);
+-      wg_packet_queue_free(&peer->tx_queue, false);
++      WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue));
+       /* The final zeroing takes care of clearing any remaining handshake key
+        * material and other potentially sensitive information.
+--- a/drivers/net/wireguard/peer.h
++++ b/drivers/net/wireguard/peer.h
+@@ -36,7 +36,7 @@ struct endpoint {
+ struct wg_peer {
+       struct wg_device *device;
+-      struct crypt_queue tx_queue, rx_queue;
++      struct prev_queue tx_queue, rx_queue;
+       struct sk_buff_head staged_packet_queue;
+       int serial_work_cpu;
+       struct noise_keypairs keypairs;
+@@ -45,7 +45,7 @@ struct wg_peer {
+       rwlock_t endpoint_lock;
+       struct noise_handshake handshake;
+       atomic64_t last_sent_handshake;
+-      struct work_struct transmit_handshake_work, clear_peer_work;
++      struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work;
+       struct cookie latest_cookie;
+       struct hlist_node pubkey_hash;
+       u64 rx_bytes, tx_bytes;
+--- a/drivers/net/wireguard/queueing.c
++++ b/drivers/net/wireguard/queueing.c
+@@ -9,8 +9,7 @@ struct multicore_worker __percpu *
+ wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr)
+ {
+       int cpu;
+-      struct multicore_worker __percpu *worker =
+-              alloc_percpu(struct multicore_worker);
++      struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker);
+       if (!worker)
+               return NULL;
+@@ -23,7 +22,7 @@ wg_packet_percpu_multicore_worker_alloc(
+ }
+ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
+-                       bool multicore, unsigned int len)
++                       unsigned int len)
+ {
+       int ret;
+@@ -31,25 +30,78 @@ int wg_packet_queue_init(struct crypt_qu
+       ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL);
+       if (ret)
+               return ret;
+-      if (function) {
+-              if (multicore) {
+-                      queue->worker = wg_packet_percpu_multicore_worker_alloc(
+-                              function, queue);
+-                      if (!queue->worker) {
+-                              ptr_ring_cleanup(&queue->ring, NULL);
+-                              return -ENOMEM;
+-                      }
+-              } else {
+-                      INIT_WORK(&queue->work, function);
+-              }
++      queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue);
++      if (!queue->worker) {
++              ptr_ring_cleanup(&queue->ring, NULL);
++              return -ENOMEM;
+       }
+       return 0;
+ }
+-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore)
++void wg_packet_queue_free(struct crypt_queue *queue)
+ {
+-      if (multicore)
+-              free_percpu(queue->worker);
++      free_percpu(queue->worker);
+       WARN_ON(!__ptr_ring_empty(&queue->ring));
+       ptr_ring_cleanup(&queue->ring, NULL);
+ }
++
++#define NEXT(skb) ((skb)->prev)
++#define STUB(queue) ((struct sk_buff *)&queue->empty)
++
++void wg_prev_queue_init(struct prev_queue *queue)
++{
++      NEXT(STUB(queue)) = NULL;
++      queue->head = queue->tail = STUB(queue);
++      queue->peeked = NULL;
++      atomic_set(&queue->count, 0);
++      BUILD_BUG_ON(
++              offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) -
++                                                      offsetof(struct prev_queue, empty) ||
++              offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) -
++                                                       offsetof(struct prev_queue, empty));
++}
++
++static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
++{
++      WRITE_ONCE(NEXT(skb), NULL);
++      WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb);
++}
++
++bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
++{
++      if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS))
++              return false;
++      __wg_prev_queue_enqueue(queue, skb);
++      return true;
++}
++
++struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue)
++{
++      struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail));
++
++      if (tail == STUB(queue)) {
++              if (!next)
++                      return NULL;
++              queue->tail = next;
++              tail = next;
++              next = smp_load_acquire(&NEXT(next));
++      }
++      if (next) {
++              queue->tail = next;
++              atomic_dec(&queue->count);
++              return tail;
++      }
++      if (tail != READ_ONCE(queue->head))
++              return NULL;
++      __wg_prev_queue_enqueue(queue, STUB(queue));
++      next = smp_load_acquire(&NEXT(tail));
++      if (next) {
++              queue->tail = next;
++              atomic_dec(&queue->count);
++              return tail;
++      }
++      return NULL;
++}
++
++#undef NEXT
++#undef STUB
+--- a/drivers/net/wireguard/queueing.h
++++ b/drivers/net/wireguard/queueing.h
+@@ -17,12 +17,13 @@ struct wg_device;
+ struct wg_peer;
+ struct multicore_worker;
+ struct crypt_queue;
++struct prev_queue;
+ struct sk_buff;
+ /* queueing.c APIs: */
+ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
+-                       bool multicore, unsigned int len);
+-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore);
++                       unsigned int len);
++void wg_packet_queue_free(struct crypt_queue *queue);
+ struct multicore_worker __percpu *
+ wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);
+@@ -135,8 +136,31 @@ static inline int wg_cpumask_next_online
+       return cpu;
+ }
++void wg_prev_queue_init(struct prev_queue *queue);
++
++/* Multi producer */
++bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb);
++
++/* Single consumer */
++struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue);
++
++/* Single consumer */
++static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue)
++{
++      if (queue->peeked)
++              return queue->peeked;
++      queue->peeked = wg_prev_queue_dequeue(queue);
++      return queue->peeked;
++}
++
++/* Single consumer */
++static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue)
++{
++      queue->peeked = NULL;
++}
++
+ static inline int wg_queue_enqueue_per_device_and_peer(
+-      struct crypt_queue *device_queue, struct crypt_queue *peer_queue,
++      struct crypt_queue *device_queue, struct prev_queue *peer_queue,
+       struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
+ {
+       int cpu;
+@@ -145,8 +169,9 @@ static inline int wg_queue_enqueue_per_d
+       /* We first queue this up for the peer ingestion, but the consumer
+        * will wait for the state to change to CRYPTED or DEAD before.
+        */
+-      if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb)))
++      if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb)))
+               return -ENOSPC;
++
+       /* Then we queue it up in the device queue, which consumes the
+        * packet as soon as it can.
+        */
+@@ -157,9 +182,7 @@ static inline int wg_queue_enqueue_per_d
+       return 0;
+ }
+-static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue,
+-                                           struct sk_buff *skb,
+-                                           enum packet_state state)
++static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state)
+ {
+       /* We take a reference, because as soon as we call atomic_set, the
+        * peer can be freed from below us.
+@@ -167,14 +190,12 @@ static inline void wg_queue_enqueue_per_
+       struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
+       atomic_set_release(&PACKET_CB(skb)->state, state);
+-      queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu,
+-                                             peer->internal_id),
+-                    peer->device->packet_crypt_wq, &queue->work);
++      queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id),
++                    peer->device->packet_crypt_wq, &peer->transmit_packet_work);
+       wg_peer_put(peer);
+ }
+-static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb,
+-                                                enum packet_state state)
++static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state)
+ {
+       /* We take a reference, because as soon as we call atomic_set, the
+        * peer can be freed from below us.
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -444,7 +444,6 @@ packet_processed:
+ int wg_packet_rx_poll(struct napi_struct *napi, int budget)
+ {
+       struct wg_peer *peer = container_of(napi, struct wg_peer, napi);
+-      struct crypt_queue *queue = &peer->rx_queue;
+       struct noise_keypair *keypair;
+       struct endpoint endpoint;
+       enum packet_state state;
+@@ -455,11 +454,10 @@ int wg_packet_rx_poll(struct napi_struct
+       if (unlikely(budget <= 0))
+               return 0;
+-      while ((skb = __ptr_ring_peek(&queue->ring)) != NULL &&
++      while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL &&
+              (state = atomic_read_acquire(&PACKET_CB(skb)->state)) !=
+                      PACKET_STATE_UNCRYPTED) {
+-              __ptr_ring_discard_one(&queue->ring);
+-              peer = PACKET_PEER(skb);
++              wg_prev_queue_drop_peeked(&peer->rx_queue);
+               keypair = PACKET_CB(skb)->keypair;
+               free = true;
+@@ -508,7 +506,7 @@ void wg_packet_decrypt_worker(struct wor
+               enum packet_state state =
+                       likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ?
+                               PACKET_STATE_CRYPTED : PACKET_STATE_DEAD;
+-              wg_queue_enqueue_per_peer_napi(skb, state);
++              wg_queue_enqueue_per_peer_rx(skb, state);
+               if (need_resched())
+                       cond_resched();
+       }
+@@ -531,12 +529,10 @@ static void wg_packet_consume_data(struc
+       if (unlikely(READ_ONCE(peer->is_dead)))
+               goto err;
+-      ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue,
+-                                                 &peer->rx_queue, skb,
+-                                                 wg->packet_crypt_wq,
+-                                                 &wg->decrypt_queue.last_cpu);
++      ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb,
++                                                 wg->packet_crypt_wq, &wg->decrypt_queue.last_cpu);
+       if (unlikely(ret == -EPIPE))
+-              wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD);
++              wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD);
+       if (likely(!ret || ret == -EPIPE)) {
+               rcu_read_unlock_bh();
+               return;
+--- a/drivers/net/wireguard/send.c
++++ b/drivers/net/wireguard/send.c
+@@ -239,8 +239,7 @@ void wg_packet_send_keepalive(struct wg_
+       wg_packet_send_staged_packets(peer);
+ }
+-static void wg_packet_create_data_done(struct sk_buff *first,
+-                                     struct wg_peer *peer)
++static void wg_packet_create_data_done(struct wg_peer *peer, struct sk_buff *first)
+ {
+       struct sk_buff *skb, *next;
+       bool is_keepalive, data_sent = false;
+@@ -262,22 +261,19 @@ static void wg_packet_create_data_done(s
+ void wg_packet_tx_worker(struct work_struct *work)
+ {
+-      struct crypt_queue *queue = container_of(work, struct crypt_queue,
+-                                               work);
++      struct wg_peer *peer = container_of(work, struct wg_peer, transmit_packet_work);
+       struct noise_keypair *keypair;
+       enum packet_state state;
+       struct sk_buff *first;
+-      struct wg_peer *peer;
+-      while ((first = __ptr_ring_peek(&queue->ring)) != NULL &&
++      while ((first = wg_prev_queue_peek(&peer->tx_queue)) != NULL &&
+              (state = atomic_read_acquire(&PACKET_CB(first)->state)) !=
+                      PACKET_STATE_UNCRYPTED) {
+-              __ptr_ring_discard_one(&queue->ring);
+-              peer = PACKET_PEER(first);
++              wg_prev_queue_drop_peeked(&peer->tx_queue);
+               keypair = PACKET_CB(first)->keypair;
+               if (likely(state == PACKET_STATE_CRYPTED))
+-                      wg_packet_create_data_done(first, peer);
++                      wg_packet_create_data_done(peer, first);
+               else
+                       kfree_skb_list(first);
+@@ -306,16 +302,14 @@ void wg_packet_encrypt_worker(struct wor
+                               break;
+                       }
+               }
+-              wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first,
+-                                        state);
++              wg_queue_enqueue_per_peer_tx(first, state);
+               if (need_resched())
+                       cond_resched();
+       }
+ }
+-static void wg_packet_create_data(struct sk_buff *first)
++static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first)
+ {
+-      struct wg_peer *peer = PACKET_PEER(first);
+       struct wg_device *wg = peer->device;
+       int ret = -EINVAL;
+@@ -323,13 +317,10 @@ static void wg_packet_create_data(struct
+       if (unlikely(READ_ONCE(peer->is_dead)))
+               goto err;
+-      ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue,
+-                                                 &peer->tx_queue, first,
+-                                                 wg->packet_crypt_wq,
+-                                                 &wg->encrypt_queue.last_cpu);
++      ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, &peer->tx_queue, first,
++                                                 wg->packet_crypt_wq, &wg->encrypt_queue.last_cpu);
+       if (unlikely(ret == -EPIPE))
+-              wg_queue_enqueue_per_peer(&peer->tx_queue, first,
+-                                        PACKET_STATE_DEAD);
++              wg_queue_enqueue_per_peer_tx(first, PACKET_STATE_DEAD);
+ err:
+       rcu_read_unlock_bh();
+       if (likely(!ret || ret == -EPIPE))
+@@ -393,7 +384,7 @@ void wg_packet_send_staged_packets(struc
+       packets.prev->next = NULL;
+       wg_peer_get(keypair->entry.peer);
+       PACKET_CB(packets.next)->keypair = keypair;
+-      wg_packet_create_data(packets.next);
++      wg_packet_create_data(peer, packets.next);
+       return;
+ out_invalid:
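
The wg_prev_queue_* routines added above are an intrusive form of Dmitry
Vyukov's MPSC node-based queue (the [1] link in the commit message), with the
node's "next" pointer living in the otherwise unused skb->prev field. A compact
userspace rendition using C11 atomics, written here only to make the structure
easier to follow (the kernel code itself uses xchg_release, smp_load_acquire
and WRITE_ONCE):

    /* mpsc_sketch.c - Vyukov-style intrusive MPSC queue, illustrative only. */
    #include <stdatomic.h>
    #include <stddef.h>

    struct node {
            _Atomic(struct node *) next;
    };

    struct mpsc_queue {
            _Atomic(struct node *) head;   /* producers append here (like queue->head) */
            struct node *tail;             /* single consumer reads here (like queue->tail) */
            struct node stub;              /* embedded stub, like the queue's "empty" member */
    };

    void mpsc_init(struct mpsc_queue *q)
    {
            atomic_store(&q->stub.next, NULL);
            atomic_store(&q->head, &q->stub);
            q->tail = &q->stub;
    }

    /* Multi-producer append.  Between the exchange and the store of prev->next
     * there is a tiny window in which the consumer sees head already moved but
     * next still NULL; like the kernel code, the consumer just gives up and
     * retries later instead of spinning. */
    void mpsc_enqueue(struct mpsc_queue *q, struct node *n)
    {
            atomic_store_explicit(&n->next, NULL, memory_order_relaxed);
            struct node *prev = atomic_exchange_explicit(&q->head, n, memory_order_acq_rel);
            atomic_store_explicit(&prev->next, n, memory_order_release);
    }

    /* Single consumer: returns NULL when empty or when it lost the race above. */
    struct node *mpsc_dequeue(struct mpsc_queue *q)
    {
            struct node *tail = q->tail;
            struct node *next = atomic_load_explicit(&tail->next, memory_order_acquire);

            if (tail == &q->stub) {              /* skip over the stub node */
                    if (!next)
                            return NULL;
                    q->tail = next;
                    tail = next;
                    next = atomic_load_explicit(&next->next, memory_order_acquire);
            }
            if (next) {
                    q->tail = next;
                    return tail;
            }
            if (tail != atomic_load_explicit(&q->head, memory_order_relaxed))
                    return NULL;                 /* a producer is mid-enqueue: bail out */
            mpsc_enqueue(q, &q->stub);           /* re-append the stub so tail can advance */
            next = atomic_load_explicit(&tail->next, memory_order_acquire);
            if (next) {
                    q->tail = next;
                    return tail;
            }
            return NULL;
    }

The kernel version layers an atomic_t count on top of this to enforce
MAX_QUEUED_PACKETS, and keeps a "peeked" slot so wg_prev_queue_peek() and
wg_prev_queue_drop_peeked() can inspect the head-of-line entry without
consuming it before the packet's state is known.
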
diff --git a/queue-5.10/wireguard-selftests-test-multiple-parallel-streams.patch b/queue-5.10/wireguard-selftests-test-multiple-parallel-streams.patch
new file mode 100644 (file)
index 0000000..9f7f6f1
--- /dev/null
@@ -0,0 +1,54 @@
+From d5a49aa6c3e264a93a7d08485d66e346be0969dd Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 22 Feb 2021 17:25:45 +0100
+Subject: wireguard: selftests: test multiple parallel streams
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit d5a49aa6c3e264a93a7d08485d66e346be0969dd upstream.
+
+In order to test ndo_start_xmit being called in parallel, explicitly add
+separate tests, which should all run on different cores. This should
+help tease out bugs associated with queueing up packets from different
+cores in parallel. Currently, it hasn't found those types of bugs, but
+given future planned work, it is a useful guard against regressions.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/wireguard/netns.sh |   15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -39,7 +39,7 @@ ip0() { pretty 0 "ip $*"; ip -n $netns0
+ ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; }
+ ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; }
+ sleep() { read -t "$1" -N 1 || true; }
+-waitiperf() { pretty "${1//*-}" "wait for iperf:5201 pid $2"; while [[ $(ss -N "$1" -tlpH 'sport = 5201') != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; }
++waitiperf() { pretty "${1//*-}" "wait for iperf:${3:-5201} pid $2"; while [[ $(ss -N "$1" -tlpH "sport = ${3:-5201}") != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; }
+ waitncatudp() { pretty "${1//*-}" "wait for udp:1111 pid $2"; while [[ $(ss -N "$1" -ulpH 'sport = 1111') != *\"ncat\",pid=$2,fd=* ]]; do sleep 0.1; done; }
+ waitiface() { pretty "${1//*-}" "wait for $2 to come up"; ip netns exec "$1" bash -c "while [[ \$(< \"/sys/class/net/$2/operstate\") != up ]]; do read -t .1 -N 0 || true; done;"; }
+@@ -141,6 +141,19 @@ tests() {
+       n2 iperf3 -s -1 -B fd00::2 &
+       waitiperf $netns2 $!
+       n1 iperf3 -Z -t 3 -b 0 -u -c fd00::2
++
++      # TCP over IPv4, in parallel
++      for max in 4 5 50; do
++              local pids=( )
++              for ((i=0; i < max; ++i)) do
++                      n2 iperf3 -p $(( 5200 + i )) -s -1 -B 192.168.241.2 &
++                      pids+=( $! ); waitiperf $netns2 $! $(( 5200 + i ))
++              done
++              for ((i=0; i < max; ++i)) do
++                      n1 iperf3 -Z -t 3 -p $(( 5200 + i )) -c 192.168.241.2 &
++              done
++              wait "${pids[@]}"
++      done
+ }
+ [[ $(ip1 link show dev wg0) =~ mtu\ ([0-9]+) ]] && orig_mtu="${BASH_REMATCH[1]}"