From 79368accd64c2ebec533a53db55b0c13dbb730ec Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 1 Mar 2021 15:53:38 +0100 Subject: [PATCH] 5.11-stable patches added patches: ipv6-silence-compilation-warning-for-non-ipv6-builds.patch net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch net-sched-fix-police-ext-initialization.patch wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch wireguard-selftests-test-multiple-parallel-streams.patch --- ...pilation-warning-for-non-ipv6-builds.patch | 37 ++ ...rom-icmp-v6-_ndo_send-before-sending.patch | 338 +++++++++++ ...-sched-fix-police-ext-initialization.patch | 119 ++++ queue-5.11/series | 5 + ...ing-get-rid-of-per-peer-ring-buffers.patch | 562 ++++++++++++++++++ ...tests-test-multiple-parallel-streams.patch | 54 ++ 6 files changed, 1115 insertions(+) create mode 100644 queue-5.11/ipv6-silence-compilation-warning-for-non-ipv6-builds.patch create mode 100644 queue-5.11/net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch create mode 100644 queue-5.11/net-sched-fix-police-ext-initialization.patch create mode 100644 queue-5.11/wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch create mode 100644 queue-5.11/wireguard-selftests-test-multiple-parallel-streams.patch diff --git a/queue-5.11/ipv6-silence-compilation-warning-for-non-ipv6-builds.patch b/queue-5.11/ipv6-silence-compilation-warning-for-non-ipv6-builds.patch new file mode 100644 index 00000000000..d2b51736d40 --- /dev/null +++ b/queue-5.11/ipv6-silence-compilation-warning-for-non-ipv6-builds.patch @@ -0,0 +1,37 @@ +From 1faba27f11c8da244e793546a1b35a9b1da8208e Mon Sep 17 00:00:00 2001 +From: Leon Romanovsky +Date: Wed, 3 Feb 2021 15:51:09 +0200 +Subject: ipv6: silence compilation warning for non-IPV6 builds + +From: Leon Romanovsky + +commit 1faba27f11c8da244e793546a1b35a9b1da8208e upstream. + +The W=1 compilation of allmodconfig generates the following warning: + +net/ipv6/icmp.c:448:6: warning: no previous prototype for 'icmp6_send' [-Wmissing-prototypes] + 448 | void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + | ^~~~~~~~~~ + +Fix it by providing function declaration for builds with ipv6 as a module. + +Signed-off-by: Leon Romanovsky +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/icmpv6.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/linux/icmpv6.h ++++ b/include/linux/icmpv6.h +@@ -16,9 +16,9 @@ static inline struct icmp6hdr *icmp6_hdr + + typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct in6_addr *force_saddr); +-#if IS_BUILTIN(CONFIG_IPV6) + void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct in6_addr *force_saddr); ++#if IS_BUILTIN(CONFIG_IPV6) + static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) + { + icmp6_send(skb, type, code, info, NULL); diff --git a/queue-5.11/net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch b/queue-5.11/net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch new file mode 100644 index 00000000000..e4a4717bd40 --- /dev/null +++ b/queue-5.11/net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch @@ -0,0 +1,338 @@ +From ee576c47db60432c37e54b1e2b43a8ca6d3a8dca Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" +Date: Tue, 23 Feb 2021 14:18:58 +0100 +Subject: net: icmp: pass zeroed opts from icmp{,v6}_ndo_send before sending + +From: Jason A. Donenfeld + +commit ee576c47db60432c37e54b1e2b43a8ca6d3a8dca upstream. + +The icmp{,v6}_send functions make all sorts of use of skb->cb, casting +it with IPCB or IP6CB, assuming the skb to have come directly from the +inet layer. But when the packet comes from the ndo layer, especially +when forwarded, there's no telling what might be in skb->cb at that +point. As a result, the icmp sending code risks reading bogus memory +contents, which can result in nasty stack overflows such as this one +reported by a user: + + panic+0x108/0x2ea + __stack_chk_fail+0x14/0x20 + __icmp_send+0x5bd/0x5c0 + icmp_ndo_send+0x148/0x160 + +In icmp_send, skb->cb is cast with IPCB and an ip_options struct is read +from it. The optlen parameter there is of particular note, as it can +induce writes beyond bounds. There are quite a few ways that can happen +in __ip_options_echo. For example: + + // sptr/skb are attacker-controlled skb bytes + sptr = skb_network_header(skb); + // dptr/dopt points to stack memory allocated by __icmp_send + dptr = dopt->__data; + // sopt is the corrupt skb->cb in question + if (sopt->rr) { + optlen = sptr[sopt->rr+1]; // corrupt skb->cb + skb->data + soffset = sptr[sopt->rr+2]; // corrupt skb->cb + skb->data + // this now writes potentially attacker-controlled data, over + // flowing the stack: + memcpy(dptr, sptr+sopt->rr, optlen); + } + +In the icmpv6_send case, the story is similar, but not as dire, as only +IP6CB(skb)->iif and IP6CB(skb)->dsthao are used. The dsthao case is +worse than the iif case, but it is passed to ipv6_find_tlv, which does +a bit of bounds checking on the value. + +This is easy to simulate by doing a `memset(skb->cb, 0x41, +sizeof(skb->cb));` before calling icmp{,v6}_ndo_send, and it's only by +good fortune and the rarity of icmp sending from that context that we've +avoided reports like this until now. For example, in KASAN: + + BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0xa0e/0x12b0 + Write of size 38 at addr ffff888006f1f80e by task ping/89 + CPU: 2 PID: 89 Comm: ping Not tainted 5.10.0-rc7-debug+ #5 + Call Trace: + dump_stack+0x9a/0xcc + print_address_description.constprop.0+0x1a/0x160 + __kasan_report.cold+0x20/0x38 + kasan_report+0x32/0x40 + check_memory_region+0x145/0x1a0 + memcpy+0x39/0x60 + __ip_options_echo+0xa0e/0x12b0 + __icmp_send+0x744/0x1700 + +Actually, out of the 4 drivers that do this, only gtp zeroed the cb for +the v4 case, while the rest did not. So this commit actually removes the +gtp-specific zeroing, while putting the code where it belongs in the +shared infrastructure of icmp{,v6}_ndo_send. + +This commit fixes the issue by passing an empty IPCB or IP6CB along to +the functions that actually do the work. For the icmp_send, this was +already trivial, thanks to __icmp_send providing the plumbing function. +For icmpv6_send, this required a tiny bit of refactoring to make it +behave like the v4 case, after which it was straight forward. + +Fixes: a2b78e9b2cac ("sunvnet: generate ICMP PTMUD messages for smaller port MTUs") +Reported-by: SinYu +Reviewed-by: Willem de Bruijn +Link: https://lore.kernel.org/netdev/CAF=yD-LOF116aHub6RMe8vB8ZpnrrnoTdqhobEx+bvoA8AsP0w@mail.gmail.com/T/ +Signed-off-by: Jason A. Donenfeld +Link: https://lore.kernel.org/r/20210223131858.72082-1-Jason@zx2c4.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/gtp.c | 1 - + include/linux/icmpv6.h | 26 ++++++++++++++++++++------ + include/linux/ipv6.h | 1 - + include/net/icmp.h | 6 +++++- + net/ipv4/icmp.c | 5 +++-- + net/ipv6/icmp.c | 18 +++++++++--------- + net/ipv6/ip6_icmp.c | 12 +++++++----- + 7 files changed, 44 insertions(+), 25 deletions(-) + +--- a/drivers/net/gtp.c ++++ b/drivers/net/gtp.c +@@ -539,7 +539,6 @@ static int gtp_build_skb_ip4(struct sk_b + if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) && + mtu < ntohs(iph->tot_len)) { + netdev_dbg(dev, "packet too big, fragmentation needed\n"); +- memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + goto err_rt; +--- a/include/linux/icmpv6.h ++++ b/include/linux/icmpv6.h +@@ -3,6 +3,7 @@ + #define _LINUX_ICMPV6_H + + #include ++#include + #include + + static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) +@@ -15,13 +16,16 @@ static inline struct icmp6hdr *icmp6_hdr + #if IS_ENABLED(CONFIG_IPV6) + + typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, +- const struct in6_addr *force_saddr); ++ const struct in6_addr *force_saddr, ++ const struct inet6_skb_parm *parm); + void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, +- const struct in6_addr *force_saddr); ++ const struct in6_addr *force_saddr, ++ const struct inet6_skb_parm *parm); + #if IS_BUILTIN(CONFIG_IPV6) +-static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) ++static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, ++ const struct inet6_skb_parm *parm) + { +- icmp6_send(skb, type, code, info, NULL); ++ icmp6_send(skb, type, code, info, NULL, parm); + } + static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn) + { +@@ -34,18 +38,28 @@ static inline int inet6_unregister_icmp_ + return 0; + } + #else +-extern void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info); ++extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, ++ const struct inet6_skb_parm *parm); + extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn); + extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); + #endif + ++static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) ++{ ++ __icmpv6_send(skb, type, code, info, IP6CB(skb)); ++} ++ + int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, + unsigned int data_len); + + #if IS_ENABLED(CONFIG_NF_NAT) + void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); + #else +-#define icmpv6_ndo_send icmpv6_send ++static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) ++{ ++ struct inet6_skb_parm parm = { 0 }; ++ __icmpv6_send(skb_in, type, code, info, &parm); ++} + #endif + + #else +--- a/include/linux/ipv6.h ++++ b/include/linux/ipv6.h +@@ -84,7 +84,6 @@ struct ipv6_params { + __s32 autoconf; + }; + extern struct ipv6_params ipv6_defaults; +-#include + #include + #include + +--- a/include/net/icmp.h ++++ b/include/net/icmp.h +@@ -46,7 +46,11 @@ static inline void icmp_send(struct sk_b + #if IS_ENABLED(CONFIG_NF_NAT) + void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); + #else +-#define icmp_ndo_send icmp_send ++static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) ++{ ++ struct ip_options opts = { 0 }; ++ __icmp_send(skb_in, type, code, info, &opts); ++} + #endif + + int icmp_rcv(struct sk_buff *skb); +--- a/net/ipv4/icmp.c ++++ b/net/ipv4/icmp.c +@@ -775,13 +775,14 @@ EXPORT_SYMBOL(__icmp_send); + void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) + { + struct sk_buff *cloned_skb = NULL; ++ struct ip_options opts = { 0 }; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + __be32 orig_ip; + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { +- icmp_send(skb_in, type, code, info); ++ __icmp_send(skb_in, type, code, info, &opts); + return; + } + +@@ -796,7 +797,7 @@ void icmp_ndo_send(struct sk_buff *skb_i + + orig_ip = ip_hdr(skb_in)->saddr; + ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; +- icmp_send(skb_in, type, code, info); ++ __icmp_send(skb_in, type, code, info, &opts); + ip_hdr(skb_in)->saddr = orig_ip; + out: + consume_skb(cloned_skb); +--- a/net/ipv6/icmp.c ++++ b/net/ipv6/icmp.c +@@ -331,10 +331,9 @@ static int icmpv6_getfrag(void *from, ch + } + + #if IS_ENABLED(CONFIG_IPV6_MIP6) +-static void mip6_addr_swap(struct sk_buff *skb) ++static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) + { + struct ipv6hdr *iph = ipv6_hdr(skb); +- struct inet6_skb_parm *opt = IP6CB(skb); + struct ipv6_destopt_hao *hao; + struct in6_addr tmp; + int off; +@@ -351,7 +350,7 @@ static void mip6_addr_swap(struct sk_buf + } + } + #else +-static inline void mip6_addr_swap(struct sk_buff *skb) {} ++static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {} + #endif + + static struct dst_entry *icmpv6_route_lookup(struct net *net, +@@ -446,7 +445,8 @@ static int icmp6_iif(const struct sk_buf + * Send an ICMP message in response to a packet in error + */ + void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, +- const struct in6_addr *force_saddr) ++ const struct in6_addr *force_saddr, ++ const struct inet6_skb_parm *parm) + { + struct inet6_dev *idev = NULL; + struct ipv6hdr *hdr = ipv6_hdr(skb); +@@ -542,7 +542,7 @@ void icmp6_send(struct sk_buff *skb, u8 + if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) + goto out_bh_enable; + +- mip6_addr_swap(skb); ++ mip6_addr_swap(skb, parm); + + sk = icmpv6_xmit_lock(net); + if (!sk) +@@ -559,7 +559,7 @@ void icmp6_send(struct sk_buff *skb, u8 + /* select a more meaningful saddr from input if */ + struct net_device *in_netdev; + +- in_netdev = dev_get_by_index(net, IP6CB(skb)->iif); ++ in_netdev = dev_get_by_index(net, parm->iif); + if (in_netdev) { + ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, + inet6_sk(sk)->srcprefs, +@@ -640,7 +640,7 @@ EXPORT_SYMBOL(icmp6_send); + */ + void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) + { +- icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL); ++ icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb)); + kfree_skb(skb); + } + +@@ -697,10 +697,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk + } + if (type == ICMP_TIME_EXCEEDED) + icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, +- info, &temp_saddr); ++ info, &temp_saddr, IP6CB(skb2)); + else + icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, +- info, &temp_saddr); ++ info, &temp_saddr, IP6CB(skb2)); + if (rt) + ip6_rt_put(rt); + +--- a/net/ipv6/ip6_icmp.c ++++ b/net/ipv6/ip6_icmp.c +@@ -33,23 +33,25 @@ int inet6_unregister_icmp_sender(ip6_icm + } + EXPORT_SYMBOL(inet6_unregister_icmp_sender); + +-void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) ++void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, ++ const struct inet6_skb_parm *parm) + { + ip6_icmp_send_t *send; + + rcu_read_lock(); + send = rcu_dereference(ip6_icmp_send); + if (send) +- send(skb, type, code, info, NULL); ++ send(skb, type, code, info, NULL, parm); + rcu_read_unlock(); + } +-EXPORT_SYMBOL(icmpv6_send); ++EXPORT_SYMBOL(__icmpv6_send); + #endif + + #if IS_ENABLED(CONFIG_NF_NAT) + #include + void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) + { ++ struct inet6_skb_parm parm = { 0 }; + struct sk_buff *cloned_skb = NULL; + enum ip_conntrack_info ctinfo; + struct in6_addr orig_ip; +@@ -57,7 +59,7 @@ void icmpv6_ndo_send(struct sk_buff *skb + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { +- icmpv6_send(skb_in, type, code, info); ++ __icmpv6_send(skb_in, type, code, info, &parm); + return; + } + +@@ -72,7 +74,7 @@ void icmpv6_ndo_send(struct sk_buff *skb + + orig_ip = ipv6_hdr(skb_in)->saddr; + ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; +- icmpv6_send(skb_in, type, code, info); ++ __icmpv6_send(skb_in, type, code, info, &parm); + ipv6_hdr(skb_in)->saddr = orig_ip; + out: + consume_skb(cloned_skb); diff --git a/queue-5.11/net-sched-fix-police-ext-initialization.patch b/queue-5.11/net-sched-fix-police-ext-initialization.patch new file mode 100644 index 00000000000..af2682ce8c6 --- /dev/null +++ b/queue-5.11/net-sched-fix-police-ext-initialization.patch @@ -0,0 +1,119 @@ +From 396d7f23adf9e8c436dd81a69488b5b6a865acf8 Mon Sep 17 00:00:00 2001 +From: Vlad Buslov +Date: Tue, 16 Feb 2021 18:22:00 +0200 +Subject: net: sched: fix police ext initialization + +From: Vlad Buslov + +commit 396d7f23adf9e8c436dd81a69488b5b6a865acf8 upstream. + +When police action is created by cls API tcf_exts_validate() first +conditional that calls tcf_action_init_1() directly, the action idr is not +updated according to latest changes in action API that require caller to +commit newly created action to idr with tcf_idr_insert_many(). This results +such action not being accessible through act API and causes crash reported +by syzbot: + +================================================================== +BUG: KASAN: null-ptr-deref in instrument_atomic_read include/linux/instrumented.h:71 [inline] +BUG: KASAN: null-ptr-deref in atomic_read include/asm-generic/atomic-instrumented.h:27 [inline] +BUG: KASAN: null-ptr-deref in __tcf_idr_release net/sched/act_api.c:178 [inline] +BUG: KASAN: null-ptr-deref in tcf_idrinfo_destroy+0x129/0x1d0 net/sched/act_api.c:598 +Read of size 4 at addr 0000000000000010 by task kworker/u4:5/204 + +CPU: 0 PID: 204 Comm: kworker/u4:5 Not tainted 5.11.0-rc7-syzkaller #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 +Workqueue: netns cleanup_net +Call Trace: + __dump_stack lib/dump_stack.c:79 [inline] + dump_stack+0x107/0x163 lib/dump_stack.c:120 + __kasan_report mm/kasan/report.c:400 [inline] + kasan_report.cold+0x5f/0xd5 mm/kasan/report.c:413 + check_memory_region_inline mm/kasan/generic.c:179 [inline] + check_memory_region+0x13d/0x180 mm/kasan/generic.c:185 + instrument_atomic_read include/linux/instrumented.h:71 [inline] + atomic_read include/asm-generic/atomic-instrumented.h:27 [inline] + __tcf_idr_release net/sched/act_api.c:178 [inline] + tcf_idrinfo_destroy+0x129/0x1d0 net/sched/act_api.c:598 + tc_action_net_exit include/net/act_api.h:151 [inline] + police_exit_net+0x168/0x360 net/sched/act_police.c:390 + ops_exit_list+0x10d/0x160 net/core/net_namespace.c:190 + cleanup_net+0x4ea/0xb10 net/core/net_namespace.c:604 + process_one_work+0x98d/0x15f0 kernel/workqueue.c:2275 + worker_thread+0x64c/0x1120 kernel/workqueue.c:2421 + kthread+0x3b1/0x4a0 kernel/kthread.c:292 + ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296 +================================================================== +Kernel panic - not syncing: panic_on_warn set ... +CPU: 0 PID: 204 Comm: kworker/u4:5 Tainted: G B 5.11.0-rc7-syzkaller #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 +Workqueue: netns cleanup_net +Call Trace: + __dump_stack lib/dump_stack.c:79 [inline] + dump_stack+0x107/0x163 lib/dump_stack.c:120 + panic+0x306/0x73d kernel/panic.c:231 + end_report+0x58/0x5e mm/kasan/report.c:100 + __kasan_report mm/kasan/report.c:403 [inline] + kasan_report.cold+0x67/0xd5 mm/kasan/report.c:413 + check_memory_region_inline mm/kasan/generic.c:179 [inline] + check_memory_region+0x13d/0x180 mm/kasan/generic.c:185 + instrument_atomic_read include/linux/instrumented.h:71 [inline] + atomic_read include/asm-generic/atomic-instrumented.h:27 [inline] + __tcf_idr_release net/sched/act_api.c:178 [inline] + tcf_idrinfo_destroy+0x129/0x1d0 net/sched/act_api.c:598 + tc_action_net_exit include/net/act_api.h:151 [inline] + police_exit_net+0x168/0x360 net/sched/act_police.c:390 + ops_exit_list+0x10d/0x160 net/core/net_namespace.c:190 + cleanup_net+0x4ea/0xb10 net/core/net_namespace.c:604 + process_one_work+0x98d/0x15f0 kernel/workqueue.c:2275 + worker_thread+0x64c/0x1120 kernel/workqueue.c:2421 + kthread+0x3b1/0x4a0 kernel/kthread.c:292 + ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296 +Kernel Offset: disabled + +Fix the issue by calling tcf_idr_insert_many() after successful action +initialization. + +Fixes: 0fedc63fadf0 ("net_sched: commit action insertions together") +Reported-by: syzbot+151e3e714d34ae4ce7e8@syzkaller.appspotmail.com +Signed-off-by: Vlad Buslov +Reviewed-by: Cong Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/act_api.h | 1 + + net/sched/act_api.c | 2 +- + net/sched/cls_api.c | 1 + + 3 files changed, 3 insertions(+), 1 deletion(-) + +--- a/include/net/act_api.h ++++ b/include/net/act_api.h +@@ -166,6 +166,7 @@ int tcf_idr_create_from_flags(struct tc_ + struct nlattr *est, struct tc_action **a, + const struct tc_action_ops *ops, int bind, + u32 flags); ++void tcf_idr_insert_many(struct tc_action *actions[]); + void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); + int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, + struct tc_action **a, int bind); +--- a/net/sched/act_api.c ++++ b/net/sched/act_api.c +@@ -908,7 +908,7 @@ static const struct nla_policy tcf_actio + [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), + }; + +-static void tcf_idr_insert_many(struct tc_action *actions[]) ++void tcf_idr_insert_many(struct tc_action *actions[]) + { + int i; + +--- a/net/sched/cls_api.c ++++ b/net/sched/cls_api.c +@@ -3053,6 +3053,7 @@ int tcf_exts_validate(struct net *net, s + act->type = exts->type = TCA_OLD_COMPAT; + exts->actions[0] = act; + exts->nr_actions = 1; ++ tcf_idr_insert_many(exts->actions); + } else if (exts->action && tb[exts->action]) { + int err; + diff --git a/queue-5.11/series b/queue-5.11/series index dac87bc45fd..3ce2ce9dc21 100644 --- a/queue-5.11/series +++ b/queue-5.11/series @@ -766,3 +766,8 @@ dm-era-reinitialize-bitset-cache-before-digesting-a-new-writeset.patch dm-era-only-resize-metadata-in-preresume.patch drm-i915-reject-446-480mhz-hdmi-clock-on-glk.patch kgdb-fix-to-kill-breakpoints-on-initmem-after-boot.patch +ipv6-silence-compilation-warning-for-non-ipv6-builds.patch +net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-before-sending.patch +wireguard-selftests-test-multiple-parallel-streams.patch +wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch +net-sched-fix-police-ext-initialization.patch diff --git a/queue-5.11/wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch b/queue-5.11/wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch new file mode 100644 index 00000000000..81ccba58c95 --- /dev/null +++ b/queue-5.11/wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch @@ -0,0 +1,562 @@ +From 8b5553ace83cced775eefd0f3f18b5c6214ccf7a Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" +Date: Mon, 22 Feb 2021 17:25:48 +0100 +Subject: wireguard: queueing: get rid of per-peer ring buffers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Jason A. Donenfeld + +commit 8b5553ace83cced775eefd0f3f18b5c6214ccf7a upstream. + +Having two ring buffers per-peer means that every peer results in two +massive ring allocations. On an 8-core x86_64 machine, this commit +reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which +is an 90% reduction. Ninety percent! With some single-machine +deployments approaching 500,000 peers, we're talking about a reduction +from 7 gigs of memory down to 700 megs of memory. + +In order to get rid of these per-peer allocations, this commit switches +to using a list-based queueing approach. Currently GSO fragments are +chained together using the skb->next pointer (the skb_list_* singly +linked list approach), so we form the per-peer queue around the unused +skb->prev pointer (which sort of makes sense because the links are +pointing backwards). Use of skb_queue_* is not possible here, because +that is based on doubly linked lists and spinlocks. Multiple cores can +write into the queue at any given time, because its writes occur in the +start_xmit path or in the udp_recv path. But reads happen in a single +workqueue item per-peer, amounting to a multi-producer, single-consumer +paradigm. + +The MPSC queue is implemented locklessly and never blocks. However, it +is not linearizable (though it is serializable), with a very tight and +unlikely race on writes, which, when hit (some tiny fraction of the +0.15% of partial adds on a fully loaded 16-core x86_64 system), causes +the queue reader to terminate early. However, because every packet sent +queues up the same workqueue item after it is fully added, the worker +resumes again, and stopping early isn't actually a problem, since at +that point the packet wouldn't have yet been added to the encryption +queue. These properties allow us to avoid disabling interrupts or +spinning. The design is based on Dmitry Vyukov's algorithm [1]. + +Performance-wise, ordinarily list-based queues aren't preferable to +ringbuffers, because of cache misses when following pointers around. +However, we *already* have to follow the adjacent pointers when working +through fragments, so there shouldn't actually be any change there. A +potential downside is that dequeueing is a bit more complicated, but the +ptr_ring structure used prior had a spinlock when dequeueing, so all and +all the difference appears to be a wash. + +Actually, from profiling, the biggest performance hit, by far, of this +commit winds up being atomic_add_unless(count, 1, max) and atomic_ +dec(count), which account for the majority of CPU time, according to +perf. In that sense, the previous ring buffer was superior in that it +could check if it was full by head==tail, which the list-based approach +cannot do. + +But all and all, this enables us to get massive memory savings, allowing +WireGuard to scale for real world deployments, without taking much of a +performance hit. + +[1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue + +Reviewed-by: Dmitry Vyukov +Reviewed-by: Toke Høiland-Jørgensen +Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") +Signed-off-by: Jason A. Donenfeld +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/wireguard/device.c | 12 ++--- + drivers/net/wireguard/device.h | 15 +++--- + drivers/net/wireguard/peer.c | 28 ++++-------- + drivers/net/wireguard/peer.h | 4 - + drivers/net/wireguard/queueing.c | 86 +++++++++++++++++++++++++++++++-------- + drivers/net/wireguard/queueing.h | 45 ++++++++++++++------ + drivers/net/wireguard/receive.c | 16 ++----- + drivers/net/wireguard/send.c | 31 ++++---------- + 8 files changed, 144 insertions(+), 93 deletions(-) + +--- a/drivers/net/wireguard/device.c ++++ b/drivers/net/wireguard/device.c +@@ -235,8 +235,8 @@ static void wg_destruct(struct net_devic + destroy_workqueue(wg->handshake_receive_wq); + destroy_workqueue(wg->handshake_send_wq); + destroy_workqueue(wg->packet_crypt_wq); +- wg_packet_queue_free(&wg->decrypt_queue, true); +- wg_packet_queue_free(&wg->encrypt_queue, true); ++ wg_packet_queue_free(&wg->decrypt_queue); ++ wg_packet_queue_free(&wg->encrypt_queue); + rcu_barrier(); /* Wait for all the peers to be actually freed. */ + wg_ratelimiter_uninit(); + memzero_explicit(&wg->static_identity, sizeof(wg->static_identity)); +@@ -338,12 +338,12 @@ static int wg_newlink(struct net *src_ne + goto err_destroy_handshake_send; + + ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker, +- true, MAX_QUEUED_PACKETS); ++ MAX_QUEUED_PACKETS); + if (ret < 0) + goto err_destroy_packet_crypt; + + ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker, +- true, MAX_QUEUED_PACKETS); ++ MAX_QUEUED_PACKETS); + if (ret < 0) + goto err_free_encrypt_queue; + +@@ -368,9 +368,9 @@ static int wg_newlink(struct net *src_ne + err_uninit_ratelimiter: + wg_ratelimiter_uninit(); + err_free_decrypt_queue: +- wg_packet_queue_free(&wg->decrypt_queue, true); ++ wg_packet_queue_free(&wg->decrypt_queue); + err_free_encrypt_queue: +- wg_packet_queue_free(&wg->encrypt_queue, true); ++ wg_packet_queue_free(&wg->encrypt_queue); + err_destroy_packet_crypt: + destroy_workqueue(wg->packet_crypt_wq); + err_destroy_handshake_send: +--- a/drivers/net/wireguard/device.h ++++ b/drivers/net/wireguard/device.h +@@ -27,13 +27,14 @@ struct multicore_worker { + + struct crypt_queue { + struct ptr_ring ring; +- union { +- struct { +- struct multicore_worker __percpu *worker; +- int last_cpu; +- }; +- struct work_struct work; +- }; ++ struct multicore_worker __percpu *worker; ++ int last_cpu; ++}; ++ ++struct prev_queue { ++ struct sk_buff *head, *tail, *peeked; ++ struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff. ++ atomic_t count; + }; + + struct wg_device { +--- a/drivers/net/wireguard/peer.c ++++ b/drivers/net/wireguard/peer.c +@@ -32,27 +32,22 @@ struct wg_peer *wg_peer_create(struct wg + peer = kzalloc(sizeof(*peer), GFP_KERNEL); + if (unlikely(!peer)) + return ERR_PTR(ret); +- peer->device = wg; ++ if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL)) ++ goto err; + ++ peer->device = wg; + wg_noise_handshake_init(&peer->handshake, &wg->static_identity, + public_key, preshared_key, peer); +- if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL)) +- goto err_1; +- if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false, +- MAX_QUEUED_PACKETS)) +- goto err_2; +- if (wg_packet_queue_init(&peer->rx_queue, NULL, false, +- MAX_QUEUED_PACKETS)) +- goto err_3; +- + peer->internal_id = atomic64_inc_return(&peer_counter); + peer->serial_work_cpu = nr_cpumask_bits; + wg_cookie_init(&peer->latest_cookie); + wg_timers_init(peer); + wg_cookie_checker_precompute_peer_keys(peer); + spin_lock_init(&peer->keypairs.keypair_update_lock); +- INIT_WORK(&peer->transmit_handshake_work, +- wg_packet_handshake_send_worker); ++ INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker); ++ INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker); ++ wg_prev_queue_init(&peer->tx_queue); ++ wg_prev_queue_init(&peer->rx_queue); + rwlock_init(&peer->endpoint_lock); + kref_init(&peer->refcount); + skb_queue_head_init(&peer->staged_packet_queue); +@@ -68,11 +63,7 @@ struct wg_peer *wg_peer_create(struct wg + pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id); + return peer; + +-err_3: +- wg_packet_queue_free(&peer->tx_queue, false); +-err_2: +- dst_cache_destroy(&peer->endpoint_cache); +-err_1: ++err: + kfree(peer); + return ERR_PTR(ret); + } +@@ -197,8 +188,7 @@ static void rcu_release(struct rcu_head + struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu); + + dst_cache_destroy(&peer->endpoint_cache); +- wg_packet_queue_free(&peer->rx_queue, false); +- wg_packet_queue_free(&peer->tx_queue, false); ++ WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue)); + + /* The final zeroing takes care of clearing any remaining handshake key + * material and other potentially sensitive information. +--- a/drivers/net/wireguard/peer.h ++++ b/drivers/net/wireguard/peer.h +@@ -36,7 +36,7 @@ struct endpoint { + + struct wg_peer { + struct wg_device *device; +- struct crypt_queue tx_queue, rx_queue; ++ struct prev_queue tx_queue, rx_queue; + struct sk_buff_head staged_packet_queue; + int serial_work_cpu; + struct noise_keypairs keypairs; +@@ -45,7 +45,7 @@ struct wg_peer { + rwlock_t endpoint_lock; + struct noise_handshake handshake; + atomic64_t last_sent_handshake; +- struct work_struct transmit_handshake_work, clear_peer_work; ++ struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work; + struct cookie latest_cookie; + struct hlist_node pubkey_hash; + u64 rx_bytes, tx_bytes; +--- a/drivers/net/wireguard/queueing.c ++++ b/drivers/net/wireguard/queueing.c +@@ -9,8 +9,7 @@ struct multicore_worker __percpu * + wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr) + { + int cpu; +- struct multicore_worker __percpu *worker = +- alloc_percpu(struct multicore_worker); ++ struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker); + + if (!worker) + return NULL; +@@ -23,7 +22,7 @@ wg_packet_percpu_multicore_worker_alloc( + } + + int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, +- bool multicore, unsigned int len) ++ unsigned int len) + { + int ret; + +@@ -31,25 +30,78 @@ int wg_packet_queue_init(struct crypt_qu + ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL); + if (ret) + return ret; +- if (function) { +- if (multicore) { +- queue->worker = wg_packet_percpu_multicore_worker_alloc( +- function, queue); +- if (!queue->worker) { +- ptr_ring_cleanup(&queue->ring, NULL); +- return -ENOMEM; +- } +- } else { +- INIT_WORK(&queue->work, function); +- } ++ queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue); ++ if (!queue->worker) { ++ ptr_ring_cleanup(&queue->ring, NULL); ++ return -ENOMEM; + } + return 0; + } + +-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore) ++void wg_packet_queue_free(struct crypt_queue *queue) + { +- if (multicore) +- free_percpu(queue->worker); ++ free_percpu(queue->worker); + WARN_ON(!__ptr_ring_empty(&queue->ring)); + ptr_ring_cleanup(&queue->ring, NULL); + } ++ ++#define NEXT(skb) ((skb)->prev) ++#define STUB(queue) ((struct sk_buff *)&queue->empty) ++ ++void wg_prev_queue_init(struct prev_queue *queue) ++{ ++ NEXT(STUB(queue)) = NULL; ++ queue->head = queue->tail = STUB(queue); ++ queue->peeked = NULL; ++ atomic_set(&queue->count, 0); ++ BUILD_BUG_ON( ++ offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) - ++ offsetof(struct prev_queue, empty) || ++ offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) - ++ offsetof(struct prev_queue, empty)); ++} ++ ++static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) ++{ ++ WRITE_ONCE(NEXT(skb), NULL); ++ WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb); ++} ++ ++bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) ++{ ++ if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS)) ++ return false; ++ __wg_prev_queue_enqueue(queue, skb); ++ return true; ++} ++ ++struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue) ++{ ++ struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail)); ++ ++ if (tail == STUB(queue)) { ++ if (!next) ++ return NULL; ++ queue->tail = next; ++ tail = next; ++ next = smp_load_acquire(&NEXT(next)); ++ } ++ if (next) { ++ queue->tail = next; ++ atomic_dec(&queue->count); ++ return tail; ++ } ++ if (tail != READ_ONCE(queue->head)) ++ return NULL; ++ __wg_prev_queue_enqueue(queue, STUB(queue)); ++ next = smp_load_acquire(&NEXT(tail)); ++ if (next) { ++ queue->tail = next; ++ atomic_dec(&queue->count); ++ return tail; ++ } ++ return NULL; ++} ++ ++#undef NEXT ++#undef STUB +--- a/drivers/net/wireguard/queueing.h ++++ b/drivers/net/wireguard/queueing.h +@@ -17,12 +17,13 @@ struct wg_device; + struct wg_peer; + struct multicore_worker; + struct crypt_queue; ++struct prev_queue; + struct sk_buff; + + /* queueing.c APIs: */ + int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, +- bool multicore, unsigned int len); +-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore); ++ unsigned int len); ++void wg_packet_queue_free(struct crypt_queue *queue); + struct multicore_worker __percpu * + wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr); + +@@ -135,8 +136,31 @@ static inline int wg_cpumask_next_online + return cpu; + } + ++void wg_prev_queue_init(struct prev_queue *queue); ++ ++/* Multi producer */ ++bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb); ++ ++/* Single consumer */ ++struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue); ++ ++/* Single consumer */ ++static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue) ++{ ++ if (queue->peeked) ++ return queue->peeked; ++ queue->peeked = wg_prev_queue_dequeue(queue); ++ return queue->peeked; ++} ++ ++/* Single consumer */ ++static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue) ++{ ++ queue->peeked = NULL; ++} ++ + static inline int wg_queue_enqueue_per_device_and_peer( +- struct crypt_queue *device_queue, struct crypt_queue *peer_queue, ++ struct crypt_queue *device_queue, struct prev_queue *peer_queue, + struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu) + { + int cpu; +@@ -145,8 +169,9 @@ static inline int wg_queue_enqueue_per_d + /* We first queue this up for the peer ingestion, but the consumer + * will wait for the state to change to CRYPTED or DEAD before. + */ +- if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb))) ++ if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb))) + return -ENOSPC; ++ + /* Then we queue it up in the device queue, which consumes the + * packet as soon as it can. + */ +@@ -157,9 +182,7 @@ static inline int wg_queue_enqueue_per_d + return 0; + } + +-static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue, +- struct sk_buff *skb, +- enum packet_state state) ++static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state) + { + /* We take a reference, because as soon as we call atomic_set, the + * peer can be freed from below us. +@@ -167,14 +190,12 @@ static inline void wg_queue_enqueue_per_ + struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); + + atomic_set_release(&PACKET_CB(skb)->state, state); +- queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, +- peer->internal_id), +- peer->device->packet_crypt_wq, &queue->work); ++ queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id), ++ peer->device->packet_crypt_wq, &peer->transmit_packet_work); + wg_peer_put(peer); + } + +-static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb, +- enum packet_state state) ++static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state) + { + /* We take a reference, because as soon as we call atomic_set, the + * peer can be freed from below us. +--- a/drivers/net/wireguard/receive.c ++++ b/drivers/net/wireguard/receive.c +@@ -444,7 +444,6 @@ packet_processed: + int wg_packet_rx_poll(struct napi_struct *napi, int budget) + { + struct wg_peer *peer = container_of(napi, struct wg_peer, napi); +- struct crypt_queue *queue = &peer->rx_queue; + struct noise_keypair *keypair; + struct endpoint endpoint; + enum packet_state state; +@@ -455,11 +454,10 @@ int wg_packet_rx_poll(struct napi_struct + if (unlikely(budget <= 0)) + return 0; + +- while ((skb = __ptr_ring_peek(&queue->ring)) != NULL && ++ while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL && + (state = atomic_read_acquire(&PACKET_CB(skb)->state)) != + PACKET_STATE_UNCRYPTED) { +- __ptr_ring_discard_one(&queue->ring); +- peer = PACKET_PEER(skb); ++ wg_prev_queue_drop_peeked(&peer->rx_queue); + keypair = PACKET_CB(skb)->keypair; + free = true; + +@@ -508,7 +506,7 @@ void wg_packet_decrypt_worker(struct wor + enum packet_state state = + likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ? + PACKET_STATE_CRYPTED : PACKET_STATE_DEAD; +- wg_queue_enqueue_per_peer_napi(skb, state); ++ wg_queue_enqueue_per_peer_rx(skb, state); + if (need_resched()) + cond_resched(); + } +@@ -531,12 +529,10 @@ static void wg_packet_consume_data(struc + if (unlikely(READ_ONCE(peer->is_dead))) + goto err; + +- ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, +- &peer->rx_queue, skb, +- wg->packet_crypt_wq, +- &wg->decrypt_queue.last_cpu); ++ ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb, ++ wg->packet_crypt_wq, &wg->decrypt_queue.last_cpu); + if (unlikely(ret == -EPIPE)) +- wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD); ++ wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD); + if (likely(!ret || ret == -EPIPE)) { + rcu_read_unlock_bh(); + return; +--- a/drivers/net/wireguard/send.c ++++ b/drivers/net/wireguard/send.c +@@ -239,8 +239,7 @@ void wg_packet_send_keepalive(struct wg_ + wg_packet_send_staged_packets(peer); + } + +-static void wg_packet_create_data_done(struct sk_buff *first, +- struct wg_peer *peer) ++static void wg_packet_create_data_done(struct wg_peer *peer, struct sk_buff *first) + { + struct sk_buff *skb, *next; + bool is_keepalive, data_sent = false; +@@ -262,22 +261,19 @@ static void wg_packet_create_data_done(s + + void wg_packet_tx_worker(struct work_struct *work) + { +- struct crypt_queue *queue = container_of(work, struct crypt_queue, +- work); ++ struct wg_peer *peer = container_of(work, struct wg_peer, transmit_packet_work); + struct noise_keypair *keypair; + enum packet_state state; + struct sk_buff *first; +- struct wg_peer *peer; + +- while ((first = __ptr_ring_peek(&queue->ring)) != NULL && ++ while ((first = wg_prev_queue_peek(&peer->tx_queue)) != NULL && + (state = atomic_read_acquire(&PACKET_CB(first)->state)) != + PACKET_STATE_UNCRYPTED) { +- __ptr_ring_discard_one(&queue->ring); +- peer = PACKET_PEER(first); ++ wg_prev_queue_drop_peeked(&peer->tx_queue); + keypair = PACKET_CB(first)->keypair; + + if (likely(state == PACKET_STATE_CRYPTED)) +- wg_packet_create_data_done(first, peer); ++ wg_packet_create_data_done(peer, first); + else + kfree_skb_list(first); + +@@ -306,16 +302,14 @@ void wg_packet_encrypt_worker(struct wor + break; + } + } +- wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first, +- state); ++ wg_queue_enqueue_per_peer_tx(first, state); + if (need_resched()) + cond_resched(); + } + } + +-static void wg_packet_create_data(struct sk_buff *first) ++static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first) + { +- struct wg_peer *peer = PACKET_PEER(first); + struct wg_device *wg = peer->device; + int ret = -EINVAL; + +@@ -323,13 +317,10 @@ static void wg_packet_create_data(struct + if (unlikely(READ_ONCE(peer->is_dead))) + goto err; + +- ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, +- &peer->tx_queue, first, +- wg->packet_crypt_wq, +- &wg->encrypt_queue.last_cpu); ++ ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, &peer->tx_queue, first, ++ wg->packet_crypt_wq, &wg->encrypt_queue.last_cpu); + if (unlikely(ret == -EPIPE)) +- wg_queue_enqueue_per_peer(&peer->tx_queue, first, +- PACKET_STATE_DEAD); ++ wg_queue_enqueue_per_peer_tx(first, PACKET_STATE_DEAD); + err: + rcu_read_unlock_bh(); + if (likely(!ret || ret == -EPIPE)) +@@ -393,7 +384,7 @@ void wg_packet_send_staged_packets(struc + packets.prev->next = NULL; + wg_peer_get(keypair->entry.peer); + PACKET_CB(packets.next)->keypair = keypair; +- wg_packet_create_data(packets.next); ++ wg_packet_create_data(peer, packets.next); + return; + + out_invalid: diff --git a/queue-5.11/wireguard-selftests-test-multiple-parallel-streams.patch b/queue-5.11/wireguard-selftests-test-multiple-parallel-streams.patch new file mode 100644 index 00000000000..9f7f6f1a5bd --- /dev/null +++ b/queue-5.11/wireguard-selftests-test-multiple-parallel-streams.patch @@ -0,0 +1,54 @@ +From d5a49aa6c3e264a93a7d08485d66e346be0969dd Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" +Date: Mon, 22 Feb 2021 17:25:45 +0100 +Subject: wireguard: selftests: test multiple parallel streams + +From: Jason A. Donenfeld + +commit d5a49aa6c3e264a93a7d08485d66e346be0969dd upstream. + +In order to test ndo_start_xmit being called in parallel, explicitly add +separate tests, which should all run on different cores. This should +help tease out bugs associated with queueing up packets from different +cores in parallel. Currently, it hasn't found those types of bugs, but +given future planned work, this is a useful regression to avoid. + +Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") +Signed-off-by: Jason A. Donenfeld +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/wireguard/netns.sh | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +--- a/tools/testing/selftests/wireguard/netns.sh ++++ b/tools/testing/selftests/wireguard/netns.sh +@@ -39,7 +39,7 @@ ip0() { pretty 0 "ip $*"; ip -n $netns0 + ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; } + ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; } + sleep() { read -t "$1" -N 1 || true; } +-waitiperf() { pretty "${1//*-}" "wait for iperf:5201 pid $2"; while [[ $(ss -N "$1" -tlpH 'sport = 5201') != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; } ++waitiperf() { pretty "${1//*-}" "wait for iperf:${3:-5201} pid $2"; while [[ $(ss -N "$1" -tlpH "sport = ${3:-5201}") != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; } + waitncatudp() { pretty "${1//*-}" "wait for udp:1111 pid $2"; while [[ $(ss -N "$1" -ulpH 'sport = 1111') != *\"ncat\",pid=$2,fd=* ]]; do sleep 0.1; done; } + waitiface() { pretty "${1//*-}" "wait for $2 to come up"; ip netns exec "$1" bash -c "while [[ \$(< \"/sys/class/net/$2/operstate\") != up ]]; do read -t .1 -N 0 || true; done;"; } + +@@ -141,6 +141,19 @@ tests() { + n2 iperf3 -s -1 -B fd00::2 & + waitiperf $netns2 $! + n1 iperf3 -Z -t 3 -b 0 -u -c fd00::2 ++ ++ # TCP over IPv4, in parallel ++ for max in 4 5 50; do ++ local pids=( ) ++ for ((i=0; i < max; ++i)) do ++ n2 iperf3 -p $(( 5200 + i )) -s -1 -B 192.168.241.2 & ++ pids+=( $! ); waitiperf $netns2 $! $(( 5200 + i )) ++ done ++ for ((i=0; i < max; ++i)) do ++ n1 iperf3 -Z -t 3 -p $(( 5200 + i )) -c 192.168.241.2 & ++ done ++ wait "${pids[@]}" ++ done + } + + [[ $(ip1 link show dev wg0) =~ mtu\ ([0-9]+) ]] && orig_mtu="${BASH_REMATCH[1]}" -- 2.47.3