git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.10-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 8 Jun 2021 12:14:10 +0000 (14:14 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 8 Jun 2021 12:14:10 +0000 (14:14 +0200)
added patches:
bluetooth-use-correct-lock-to-prevent-uaf-of-hdev-object.patch
wireguard-allowedips-allocate-nodes-in-kmem_cache.patch
wireguard-allowedips-free-empty-intermediate-nodes-when-removing-single-node.patch
wireguard-allowedips-initialize-list-head-in-selftest.patch
wireguard-allowedips-remove-nodes-in-o-1.patch
wireguard-do-not-use-o3.patch
wireguard-peer-allocate-in-kmem_cache.patch
wireguard-selftests-make-sure-rp_filter-is-disabled-on-vethc.patch
wireguard-selftests-remove-old-conntrack-kconfig-value.patch
wireguard-use-synchronize_net-rather-than-synchronize_rcu.patch

queue-5.10/bluetooth-use-correct-lock-to-prevent-uaf-of-hdev-object.patch [new file with mode: 0644]
queue-5.10/series
queue-5.10/wireguard-allowedips-allocate-nodes-in-kmem_cache.patch [new file with mode: 0644]
queue-5.10/wireguard-allowedips-free-empty-intermediate-nodes-when-removing-single-node.patch [new file with mode: 0644]
queue-5.10/wireguard-allowedips-initialize-list-head-in-selftest.patch [new file with mode: 0644]
queue-5.10/wireguard-allowedips-remove-nodes-in-o-1.patch [new file with mode: 0644]
queue-5.10/wireguard-do-not-use-o3.patch [new file with mode: 0644]
queue-5.10/wireguard-peer-allocate-in-kmem_cache.patch [new file with mode: 0644]
queue-5.10/wireguard-selftests-make-sure-rp_filter-is-disabled-on-vethc.patch [new file with mode: 0644]
queue-5.10/wireguard-selftests-remove-old-conntrack-kconfig-value.patch [new file with mode: 0644]
queue-5.10/wireguard-use-synchronize_net-rather-than-synchronize_rcu.patch [new file with mode: 0644]

diff --git a/queue-5.10/bluetooth-use-correct-lock-to-prevent-uaf-of-hdev-object.patch b/queue-5.10/bluetooth-use-correct-lock-to-prevent-uaf-of-hdev-object.patch
new file mode 100644 (file)
index 0000000..ac9c59d
--- /dev/null
@@ -0,0 +1,43 @@
+From e305509e678b3a4af2b3cfd410f409f7cdaabb52 Mon Sep 17 00:00:00 2001
+From: Lin Ma <linma@zju.edu.cn>
+Date: Sun, 30 May 2021 21:37:43 +0800
+Subject: Bluetooth: use correct lock to prevent UAF of hdev object
+
+From: Lin Ma <linma@zju.edu.cn>
+
+commit e305509e678b3a4af2b3cfd410f409f7cdaabb52 upstream.
+
+The hci_sock_dev_event() function will clean up the hdev object for
+sockets even if this object may still be in use within the
+hci_sock_bound_ioctl() function, resulting in a UAF vulnerability.
+
+This patch replaces the BH context lock with lock_sock()/release_sock()
+to serialize these affairs and prevent the race condition.
+
+Signed-off-by: Lin Ma <linma@zju.edu.cn>
+Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/bluetooth/hci_sock.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/bluetooth/hci_sock.c
++++ b/net/bluetooth/hci_sock.c
+@@ -762,7 +762,7 @@ void hci_sock_dev_event(struct hci_dev *
+               /* Detach sockets from device */
+               read_lock(&hci_sk_list.lock);
+               sk_for_each(sk, &hci_sk_list.head) {
+-                      bh_lock_sock_nested(sk);
++                      lock_sock(sk);
+                       if (hci_pi(sk)->hdev == hdev) {
+                               hci_pi(sk)->hdev = NULL;
+                               sk->sk_err = EPIPE;
+@@ -771,7 +771,7 @@ void hci_sock_dev_event(struct hci_dev *
+                               hci_dev_put(hdev);
+                       }
+-                      bh_unlock_sock(sk);
++                      release_sock(sk);
+               }
+               read_unlock(&hci_sk_list.lock);
+       }
index 56c4de258372c22aa3c3ca88259553497761146f..62bd9b5abab510a77e8876cfa822f120d33d4247 100644 (file)
@@ -76,3 +76,13 @@ drm-amdgpu-vcn3-add-cancel_delayed_work_sync-before-.patch
 drm-amdgpu-jpeg2.5-add-cancel_delayed_work_sync-befo.patch
 drm-amdgpu-jpeg3-add-cancel_delayed_work_sync-before.patch
 bluetooth-fix-the-erroneous-flush_work-order.patch
+bluetooth-use-correct-lock-to-prevent-uaf-of-hdev-object.patch
+wireguard-do-not-use-o3.patch
+wireguard-peer-allocate-in-kmem_cache.patch
+wireguard-use-synchronize_net-rather-than-synchronize_rcu.patch
+wireguard-selftests-remove-old-conntrack-kconfig-value.patch
+wireguard-selftests-make-sure-rp_filter-is-disabled-on-vethc.patch
+wireguard-allowedips-initialize-list-head-in-selftest.patch
+wireguard-allowedips-remove-nodes-in-o-1.patch
+wireguard-allowedips-allocate-nodes-in-kmem_cache.patch
+wireguard-allowedips-free-empty-intermediate-nodes-when-removing-single-node.patch
diff --git a/queue-5.10/wireguard-allowedips-allocate-nodes-in-kmem_cache.patch b/queue-5.10/wireguard-allowedips-allocate-nodes-in-kmem_cache.patch
new file mode 100644 (file)
index 0000000..d7f9b71
--- /dev/null
@@ -0,0 +1,175 @@
+From dc680de28ca849dfe589dc15ac56d22505f0ef11 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:37 +0200
+Subject: wireguard: allowedips: allocate nodes in kmem_cache
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit dc680de28ca849dfe589dc15ac56d22505f0ef11 upstream.
+
+The previous commit moved from O(n) to O(1) for removal, but in the
+process introduced an additional pointer member to a struct that
+increased the size from 60 to 68 bytes, putting nodes in the 128-byte
+slab. With deployed systems having as many as 2 million nodes, this
+represents a significant doubling in memory usage (128 MiB -> 256 MiB).
+Fix this by using our own kmem_cache, that's sized exactly right. This
+also makes wireguard's memory usage more transparent in tools like
+slabtop and /proc/slabinfo.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Suggested-by: Arnd Bergmann <arnd@arndb.de>
+Suggested-by: Matthew Wilcox <willy@infradead.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/wireguard/allowedips.c |   31 +++++++++++++++++++++++++------
+ drivers/net/wireguard/allowedips.h |    5 ++++-
+ drivers/net/wireguard/main.c       |   10 +++++++++-
+ 3 files changed, 38 insertions(+), 8 deletions(-)
+
+--- a/drivers/net/wireguard/allowedips.c
++++ b/drivers/net/wireguard/allowedips.c
+@@ -6,6 +6,8 @@
+ #include "allowedips.h"
+ #include "peer.h"
++static struct kmem_cache *node_cache;
++
+ static void swap_endian(u8 *dst, const u8 *src, u8 bits)
+ {
+       if (bits == 32) {
+@@ -40,6 +42,11 @@ static void push_rcu(struct allowedips_n
+       }
+ }
++static void node_free_rcu(struct rcu_head *rcu)
++{
++      kmem_cache_free(node_cache, container_of(rcu, struct allowedips_node, rcu));
++}
++
+ static void root_free_rcu(struct rcu_head *rcu)
+ {
+       struct allowedips_node *node, *stack[128] = {
+@@ -49,7 +56,7 @@ static void root_free_rcu(struct rcu_hea
+       while (len > 0 && (node = stack[--len])) {
+               push_rcu(stack, node->bit[0], &len);
+               push_rcu(stack, node->bit[1], &len);
+-              kfree(node);
++              kmem_cache_free(node_cache, node);
+       }
+ }
+@@ -164,7 +171,7 @@ static int add(struct allowedips_node __
+               return -EINVAL;
+       if (!rcu_access_pointer(*trie)) {
+-              node = kzalloc(sizeof(*node), GFP_KERNEL);
++              node = kmem_cache_zalloc(node_cache, GFP_KERNEL);
+               if (unlikely(!node))
+                       return -ENOMEM;
+               RCU_INIT_POINTER(node->peer, peer);
+@@ -180,7 +187,7 @@ static int add(struct allowedips_node __
+               return 0;
+       }
+-      newnode = kzalloc(sizeof(*newnode), GFP_KERNEL);
++      newnode = kmem_cache_zalloc(node_cache, GFP_KERNEL);
+       if (unlikely(!newnode))
+               return -ENOMEM;
+       RCU_INIT_POINTER(newnode->peer, peer);
+@@ -213,10 +220,10 @@ static int add(struct allowedips_node __
+               return 0;
+       }
+-      node = kzalloc(sizeof(*node), GFP_KERNEL);
++      node = kmem_cache_zalloc(node_cache, GFP_KERNEL);
+       if (unlikely(!node)) {
+               list_del(&newnode->peer_list);
+-              kfree(newnode);
++              kmem_cache_free(node_cache, newnode);
+               return -ENOMEM;
+       }
+       INIT_LIST_HEAD(&node->peer_list);
+@@ -306,7 +313,7 @@ void wg_allowedips_remove_by_peer(struct
+               if (child)
+                       child->parent_bit = node->parent_bit;
+               *rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
+-              kfree_rcu(node, rcu);
++              call_rcu(&node->rcu, node_free_rcu);
+               /* TODO: Note that we currently don't walk up and down in order to
+                * free any potential filler nodes. This means that this function
+@@ -350,4 +357,16 @@ struct wg_peer *wg_allowedips_lookup_src
+       return NULL;
+ }
++int __init wg_allowedips_slab_init(void)
++{
++      node_cache = KMEM_CACHE(allowedips_node, 0);
++      return node_cache ? 0 : -ENOMEM;
++}
++
++void wg_allowedips_slab_uninit(void)
++{
++      rcu_barrier();
++      kmem_cache_destroy(node_cache);
++}
++
+ #include "selftest/allowedips.c"
+--- a/drivers/net/wireguard/allowedips.h
++++ b/drivers/net/wireguard/allowedips.h
+@@ -19,7 +19,7 @@ struct allowedips_node {
+       u8 bits[16] __aligned(__alignof(u64));
+       /* Keep rarely used members at bottom to be beyond cache line. */
+-      struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
++      struct allowedips_node *__rcu *parent_bit;
+       union {
+               struct list_head peer_list;
+               struct rcu_head rcu;
+@@ -53,4 +53,7 @@ struct wg_peer *wg_allowedips_lookup_src
+ bool wg_allowedips_selftest(void);
+ #endif
++int wg_allowedips_slab_init(void);
++void wg_allowedips_slab_uninit(void);
++
+ #endif /* _WG_ALLOWEDIPS_H */
+--- a/drivers/net/wireguard/main.c
++++ b/drivers/net/wireguard/main.c
+@@ -21,10 +21,15 @@ static int __init mod_init(void)
+ {
+       int ret;
++      ret = wg_allowedips_slab_init();
++      if (ret < 0)
++              goto err_allowedips;
++
+ #ifdef DEBUG
++      ret = -ENOTRECOVERABLE;
+       if (!wg_allowedips_selftest() || !wg_packet_counter_selftest() ||
+           !wg_ratelimiter_selftest())
+-              return -ENOTRECOVERABLE;
++              goto err_peer;
+ #endif
+       wg_noise_init();
+@@ -50,6 +55,8 @@ err_netlink:
+ err_device:
+       wg_peer_uninit();
+ err_peer:
++      wg_allowedips_slab_uninit();
++err_allowedips:
+       return ret;
+ }
+@@ -58,6 +65,7 @@ static void __exit mod_exit(void)
+       wg_genetlink_uninit();
+       wg_device_uninit();
+       wg_peer_uninit();
++      wg_allowedips_slab_uninit();
+ }
+ module_init(mod_init);
diff --git a/queue-5.10/wireguard-allowedips-free-empty-intermediate-nodes-when-removing-single-node.patch b/queue-5.10/wireguard-allowedips-free-empty-intermediate-nodes-when-removing-single-node.patch
new file mode 100644 (file)
index 0000000..0484ea8
--- /dev/null
@@ -0,0 +1,522 @@
+From bf7b042dc62a31f66d3a41dd4dfc7806f267b307 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:38 +0200
+Subject: wireguard: allowedips: free empty intermediate nodes when removing single node
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit bf7b042dc62a31f66d3a41dd4dfc7806f267b307 upstream.
+
+When removing single nodes, it's possible that that node's parent is an
+empty intermediate node, in which case, it too should be removed.
+Otherwise the trie fills up and never is fully emptied, leading to
+gradual memory leaks over time for tries that are modified often. There
+was originally code to do this, but it was removed during refactoring in
+2016 and never reworked. Now that we have proper parent pointers from
+the previous commits, we can implement this properly.
+
+In order to reduce branching and expensive comparisons, we want to keep
+the double pointer for parent assignment (which lets us easily chain up
+to the root), but we still need to actually get the parent's base
+address. So encode the bit number into the last two bits of the pointer,
+and pack and unpack it as needed. This is a little bit clumsy but is the
+fastest and least memory wasteful of the compromises. Note that we align
+the root struct here to a minimum of 4, because it's embedded into a
+larger struct, and we're relying on having the bottom two bits for our
+flag, which would only be 16-bit aligned on m68k.
+
+The existing macro-based helpers were a bit unwieldy for adding the bit
+packing to, so this commit replaces them with safer and clearer ordinary
+functions.
+
+We add a test to the randomized/fuzzer part of the selftests, to free
+the randomized tries by-peer, refuzz it, and repeat, until it's supposed
+to be empty, and then see if that actually resulted in the whole
+thing being emptied. That combined with kmemcheck should hopefully make
+sure this commit is doing what it should. Along the way this resulted in
+various other cleanups of the tests and fixes for recent graphviz.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/wireguard/allowedips.c          |  102 ++++++++++-------
+ drivers/net/wireguard/allowedips.h          |    4 
+ drivers/net/wireguard/selftest/allowedips.c |  162 +++++++++++++---------------
+ 3 files changed, 137 insertions(+), 131 deletions(-)
+
+--- a/drivers/net/wireguard/allowedips.c
++++ b/drivers/net/wireguard/allowedips.c
+@@ -30,8 +30,11 @@ static void copy_and_assign_cidr(struct
+       node->bitlen = bits;
+       memcpy(node->bits, src, bits / 8U);
+ }
+-#define CHOOSE_NODE(parent, key) \
+-      parent->bit[(key[parent->bit_at_a] >> parent->bit_at_b) & 1]
++
++static inline u8 choose(struct allowedips_node *node, const u8 *key)
++{
++      return (key[node->bit_at_a] >> node->bit_at_b) & 1;
++}
+ static void push_rcu(struct allowedips_node **stack,
+                    struct allowedips_node __rcu *p, unsigned int *len)
+@@ -112,7 +115,7 @@ static struct allowedips_node *find_node
+                       found = node;
+               if (node->cidr == bits)
+                       break;
+-              node = rcu_dereference_bh(CHOOSE_NODE(node, key));
++              node = rcu_dereference_bh(node->bit[choose(node, key)]);
+       }
+       return found;
+ }
+@@ -144,8 +147,7 @@ static bool node_placement(struct allowe
+                          u8 cidr, u8 bits, struct allowedips_node **rnode,
+                          struct mutex *lock)
+ {
+-      struct allowedips_node *node = rcu_dereference_protected(trie,
+-                                              lockdep_is_held(lock));
++      struct allowedips_node *node = rcu_dereference_protected(trie, lockdep_is_held(lock));
+       struct allowedips_node *parent = NULL;
+       bool exact = false;
+@@ -155,13 +157,24 @@ static bool node_placement(struct allowe
+                       exact = true;
+                       break;
+               }
+-              node = rcu_dereference_protected(CHOOSE_NODE(parent, key),
+-                                               lockdep_is_held(lock));
++              node = rcu_dereference_protected(parent->bit[choose(parent, key)], lockdep_is_held(lock));
+       }
+       *rnode = parent;
+       return exact;
+ }
++static inline void connect_node(struct allowedips_node **parent, u8 bit, struct allowedips_node *node)
++{
++      node->parent_bit_packed = (unsigned long)parent | bit;
++      rcu_assign_pointer(*parent, node);
++}
++
++static inline void choose_and_connect_node(struct allowedips_node *parent, struct allowedips_node *node)
++{
++      u8 bit = choose(parent, node->bits);
++      connect_node(&parent->bit[bit], bit, node);
++}
++
+ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
+              u8 cidr, struct wg_peer *peer, struct mutex *lock)
+ {
+@@ -177,8 +190,7 @@ static int add(struct allowedips_node __
+               RCU_INIT_POINTER(node->peer, peer);
+               list_add_tail(&node->peer_list, &peer->allowedips_list);
+               copy_and_assign_cidr(node, key, cidr, bits);
+-              rcu_assign_pointer(node->parent_bit, trie);
+-              rcu_assign_pointer(*trie, node);
++              connect_node(trie, 2, node);
+               return 0;
+       }
+       if (node_placement(*trie, key, cidr, bits, &node, lock)) {
+@@ -197,10 +209,10 @@ static int add(struct allowedips_node __
+       if (!node) {
+               down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
+       } else {
+-              down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
++              const u8 bit = choose(node, key);
++              down = rcu_dereference_protected(node->bit[bit], lockdep_is_held(lock));
+               if (!down) {
+-                      rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
+-                      rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
++                      connect_node(&node->bit[bit], bit, newnode);
+                       return 0;
+               }
+       }
+@@ -208,15 +220,11 @@ static int add(struct allowedips_node __
+       parent = node;
+       if (newnode->cidr == cidr) {
+-              rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
+-              rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
+-              if (!parent) {
+-                      rcu_assign_pointer(newnode->parent_bit, trie);
+-                      rcu_assign_pointer(*trie, newnode);
+-              } else {
+-                      rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
+-                      rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
+-              }
++              choose_and_connect_node(newnode, down);
++              if (!parent)
++                      connect_node(trie, 2, newnode);
++              else
++                      choose_and_connect_node(parent, newnode);
+               return 0;
+       }
+@@ -229,17 +237,12 @@ static int add(struct allowedips_node __
+       INIT_LIST_HEAD(&node->peer_list);
+       copy_and_assign_cidr(node, newnode->bits, cidr, bits);
+-      rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
+-      rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
+-      rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
+-      rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
+-      if (!parent) {
+-              rcu_assign_pointer(node->parent_bit, trie);
+-              rcu_assign_pointer(*trie, node);
+-      } else {
+-              rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
+-              rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
+-      }
++      choose_and_connect_node(node, down);
++      choose_and_connect_node(node, newnode);
++      if (!parent)
++              connect_node(trie, 2, node);
++      else
++              choose_and_connect_node(parent, node);
+       return 0;
+ }
+@@ -297,7 +300,8 @@ int wg_allowedips_insert_v6(struct allow
+ void wg_allowedips_remove_by_peer(struct allowedips *table,
+                                 struct wg_peer *peer, struct mutex *lock)
+ {
+-      struct allowedips_node *node, *child, *tmp;
++      struct allowedips_node *node, *child, **parent_bit, *parent, *tmp;
++      bool free_parent;
+       if (list_empty(&peer->allowedips_list))
+               return;
+@@ -307,19 +311,29 @@ void wg_allowedips_remove_by_peer(struct
+               RCU_INIT_POINTER(node->peer, NULL);
+               if (node->bit[0] && node->bit[1])
+                       continue;
+-              child = rcu_dereference_protected(
+-                              node->bit[!rcu_access_pointer(node->bit[0])],
+-                              lockdep_is_held(lock));
++              child = rcu_dereference_protected(node->bit[!rcu_access_pointer(node->bit[0])],
++                                                lockdep_is_held(lock));
+               if (child)
+-                      child->parent_bit = node->parent_bit;
+-              *rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
++                      child->parent_bit_packed = node->parent_bit_packed;
++              parent_bit = (struct allowedips_node **)(node->parent_bit_packed & ~3UL);
++              *parent_bit = child;
++              parent = (void *)parent_bit -
++                       offsetof(struct allowedips_node, bit[node->parent_bit_packed & 1]);
++              free_parent = !rcu_access_pointer(node->bit[0]) &&
++                            !rcu_access_pointer(node->bit[1]) &&
++                            (node->parent_bit_packed & 3) <= 1 &&
++                            !rcu_access_pointer(parent->peer);
++              if (free_parent)
++                      child = rcu_dereference_protected(
++                                      parent->bit[!(node->parent_bit_packed & 1)],
++                                      lockdep_is_held(lock));
+               call_rcu(&node->rcu, node_free_rcu);
+-
+-              /* TODO: Note that we currently don't walk up and down in order to
+-               * free any potential filler nodes. This means that this function
+-               * doesn't free up as much as it could, which could be revisited
+-               * at some point.
+-               */
++              if (!free_parent)
++                      continue;
++              if (child)
++                      child->parent_bit_packed = parent->parent_bit_packed;
++              *(struct allowedips_node **)(parent->parent_bit_packed & ~3UL) = child;
++              call_rcu(&parent->rcu, node_free_rcu);
+       }
+ }
+--- a/drivers/net/wireguard/allowedips.h
++++ b/drivers/net/wireguard/allowedips.h
+@@ -19,7 +19,7 @@ struct allowedips_node {
+       u8 bits[16] __aligned(__alignof(u64));
+       /* Keep rarely used members at bottom to be beyond cache line. */
+-      struct allowedips_node *__rcu *parent_bit;
++      unsigned long parent_bit_packed;
+       union {
+               struct list_head peer_list;
+               struct rcu_head rcu;
+@@ -30,7 +30,7 @@ struct allowedips {
+       struct allowedips_node __rcu *root4;
+       struct allowedips_node __rcu *root6;
+       u64 seq;
+-};
++} __aligned(4); /* We pack the lower 2 bits of &root, but m68k only gives 16-bit alignment. */
+ void wg_allowedips_init(struct allowedips *table);
+ void wg_allowedips_free(struct allowedips *table, struct mutex *mutex);
+--- a/drivers/net/wireguard/selftest/allowedips.c
++++ b/drivers/net/wireguard/selftest/allowedips.c
+@@ -19,32 +19,22 @@
+ #include <linux/siphash.h>
+-static __init void swap_endian_and_apply_cidr(u8 *dst, const u8 *src, u8 bits,
+-                                            u8 cidr)
+-{
+-      swap_endian(dst, src, bits);
+-      memset(dst + (cidr + 7) / 8, 0, bits / 8 - (cidr + 7) / 8);
+-      if (cidr)
+-              dst[(cidr + 7) / 8 - 1] &= ~0U << ((8 - (cidr % 8)) % 8);
+-}
+-
+ static __init void print_node(struct allowedips_node *node, u8 bits)
+ {
+       char *fmt_connection = KERN_DEBUG "\t\"%p/%d\" -> \"%p/%d\";\n";
+-      char *fmt_declaration = KERN_DEBUG
+-              "\t\"%p/%d\"[style=%s, color=\"#%06x\"];\n";
++      char *fmt_declaration = KERN_DEBUG "\t\"%p/%d\"[style=%s, color=\"#%06x\"];\n";
++      u8 ip1[16], ip2[16], cidr1, cidr2;
+       char *style = "dotted";
+-      u8 ip1[16], ip2[16];
+       u32 color = 0;
++      if (node == NULL)
++              return;
+       if (bits == 32) {
+               fmt_connection = KERN_DEBUG "\t\"%pI4/%d\" -> \"%pI4/%d\";\n";
+-              fmt_declaration = KERN_DEBUG
+-                      "\t\"%pI4/%d\"[style=%s, color=\"#%06x\"];\n";
++              fmt_declaration = KERN_DEBUG "\t\"%pI4/%d\"[style=%s, color=\"#%06x\"];\n";
+       } else if (bits == 128) {
+               fmt_connection = KERN_DEBUG "\t\"%pI6/%d\" -> \"%pI6/%d\";\n";
+-              fmt_declaration = KERN_DEBUG
+-                      "\t\"%pI6/%d\"[style=%s, color=\"#%06x\"];\n";
++              fmt_declaration = KERN_DEBUG "\t\"%pI6/%d\"[style=%s, color=\"#%06x\"];\n";
+       }
+       if (node->peer) {
+               hsiphash_key_t key = { { 0 } };
+@@ -55,24 +45,20 @@ static __init void print_node(struct all
+                       hsiphash_1u32(0xabad1dea, &key) % 200;
+               style = "bold";
+       }
+-      swap_endian_and_apply_cidr(ip1, node->bits, bits, node->cidr);
+-      printk(fmt_declaration, ip1, node->cidr, style, color);
++      wg_allowedips_read_node(node, ip1, &cidr1);
++      printk(fmt_declaration, ip1, cidr1, style, color);
+       if (node->bit[0]) {
+-              swap_endian_and_apply_cidr(ip2,
+-                              rcu_dereference_raw(node->bit[0])->bits, bits,
+-                              node->cidr);
+-              printk(fmt_connection, ip1, node->cidr, ip2,
+-                     rcu_dereference_raw(node->bit[0])->cidr);
+-              print_node(rcu_dereference_raw(node->bit[0]), bits);
++              wg_allowedips_read_node(rcu_dereference_raw(node->bit[0]), ip2, &cidr2);
++              printk(fmt_connection, ip1, cidr1, ip2, cidr2);
+       }
+       if (node->bit[1]) {
+-              swap_endian_and_apply_cidr(ip2,
+-                              rcu_dereference_raw(node->bit[1])->bits,
+-                              bits, node->cidr);
+-              printk(fmt_connection, ip1, node->cidr, ip2,
+-                     rcu_dereference_raw(node->bit[1])->cidr);
+-              print_node(rcu_dereference_raw(node->bit[1]), bits);
++              wg_allowedips_read_node(rcu_dereference_raw(node->bit[1]), ip2, &cidr2);
++              printk(fmt_connection, ip1, cidr1, ip2, cidr2);
+       }
++      if (node->bit[0])
++              print_node(rcu_dereference_raw(node->bit[0]), bits);
++      if (node->bit[1])
++              print_node(rcu_dereference_raw(node->bit[1]), bits);
+ }
+ static __init void print_tree(struct allowedips_node __rcu *top, u8 bits)
+@@ -121,8 +107,8 @@ static __init inline union nf_inet_addr
+ {
+       union nf_inet_addr mask;
+-      memset(&mask, 0x00, 128 / 8);
+-      memset(&mask, 0xff, cidr / 8);
++      memset(&mask, 0, sizeof(mask));
++      memset(&mask.all, 0xff, cidr / 8);
+       if (cidr % 32)
+               mask.all[cidr / 32] = (__force u32)htonl(
+                       (0xFFFFFFFFUL << (32 - (cidr % 32))) & 0xFFFFFFFFUL);
+@@ -149,42 +135,36 @@ horrible_mask_self(struct horrible_allow
+ }
+ static __init inline bool
+-horrible_match_v4(const struct horrible_allowedips_node *node,
+-                struct in_addr *ip)
++horrible_match_v4(const struct horrible_allowedips_node *node, struct in_addr *ip)
+ {
+       return (ip->s_addr & node->mask.ip) == node->ip.ip;
+ }
+ static __init inline bool
+-horrible_match_v6(const struct horrible_allowedips_node *node,
+-                struct in6_addr *ip)
++horrible_match_v6(const struct horrible_allowedips_node *node, struct in6_addr *ip)
+ {
+-      return (ip->in6_u.u6_addr32[0] & node->mask.ip6[0]) ==
+-                     node->ip.ip6[0] &&
+-             (ip->in6_u.u6_addr32[1] & node->mask.ip6[1]) ==
+-                     node->ip.ip6[1] &&
+-             (ip->in6_u.u6_addr32[2] & node->mask.ip6[2]) ==
+-                     node->ip.ip6[2] &&
++      return (ip->in6_u.u6_addr32[0] & node->mask.ip6[0]) == node->ip.ip6[0] &&
++             (ip->in6_u.u6_addr32[1] & node->mask.ip6[1]) == node->ip.ip6[1] &&
++             (ip->in6_u.u6_addr32[2] & node->mask.ip6[2]) == node->ip.ip6[2] &&
+              (ip->in6_u.u6_addr32[3] & node->mask.ip6[3]) == node->ip.ip6[3];
+ }
+ static __init void
+-horrible_insert_ordered(struct horrible_allowedips *table,
+-                      struct horrible_allowedips_node *node)
++horrible_insert_ordered(struct horrible_allowedips *table, struct horrible_allowedips_node *node)
+ {
+       struct horrible_allowedips_node *other = NULL, *where = NULL;
+       u8 my_cidr = horrible_mask_to_cidr(node->mask);
+       hlist_for_each_entry(other, &table->head, table) {
+-              if (!memcmp(&other->mask, &node->mask,
+-                          sizeof(union nf_inet_addr)) &&
+-                  !memcmp(&other->ip, &node->ip,
+-                          sizeof(union nf_inet_addr)) &&
+-                  other->ip_version == node->ip_version) {
++              if (other->ip_version == node->ip_version &&
++                  !memcmp(&other->mask, &node->mask, sizeof(union nf_inet_addr)) &&
++                  !memcmp(&other->ip, &node->ip, sizeof(union nf_inet_addr))) {
+                       other->value = node->value;
+                       kfree(node);
+                       return;
+               }
++      }
++      hlist_for_each_entry(other, &table->head, table) {
+               where = other;
+               if (horrible_mask_to_cidr(other->mask) <= my_cidr)
+                       break;
+@@ -201,8 +181,7 @@ static __init int
+ horrible_allowedips_insert_v4(struct horrible_allowedips *table,
+                             struct in_addr *ip, u8 cidr, void *value)
+ {
+-      struct horrible_allowedips_node *node = kzalloc(sizeof(*node),
+-                                                      GFP_KERNEL);
++      struct horrible_allowedips_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
+       if (unlikely(!node))
+               return -ENOMEM;
+@@ -219,8 +198,7 @@ static __init int
+ horrible_allowedips_insert_v6(struct horrible_allowedips *table,
+                             struct in6_addr *ip, u8 cidr, void *value)
+ {
+-      struct horrible_allowedips_node *node = kzalloc(sizeof(*node),
+-                                                      GFP_KERNEL);
++      struct horrible_allowedips_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
+       if (unlikely(!node))
+               return -ENOMEM;
+@@ -234,39 +212,43 @@ horrible_allowedips_insert_v6(struct hor
+ }
+ static __init void *
+-horrible_allowedips_lookup_v4(struct horrible_allowedips *table,
+-                            struct in_addr *ip)
++horrible_allowedips_lookup_v4(struct horrible_allowedips *table, struct in_addr *ip)
+ {
+       struct horrible_allowedips_node *node;
+-      void *ret = NULL;
+       hlist_for_each_entry(node, &table->head, table) {
+-              if (node->ip_version != 4)
+-                      continue;
+-              if (horrible_match_v4(node, ip)) {
+-                      ret = node->value;
+-                      break;
+-              }
++              if (node->ip_version == 4 && horrible_match_v4(node, ip))
++                      return node->value;
+       }
+-      return ret;
++      return NULL;
+ }
+ static __init void *
+-horrible_allowedips_lookup_v6(struct horrible_allowedips *table,
+-                            struct in6_addr *ip)
++horrible_allowedips_lookup_v6(struct horrible_allowedips *table, struct in6_addr *ip)
+ {
+       struct horrible_allowedips_node *node;
+-      void *ret = NULL;
+       hlist_for_each_entry(node, &table->head, table) {
+-              if (node->ip_version != 6)
++              if (node->ip_version == 6 && horrible_match_v6(node, ip))
++                      return node->value;
++      }
++      return NULL;
++}
++
++
++static __init void
++horrible_allowedips_remove_by_value(struct horrible_allowedips *table, void *value)
++{
++      struct horrible_allowedips_node *node;
++      struct hlist_node *h;
++
++      hlist_for_each_entry_safe(node, h, &table->head, table) {
++              if (node->value != value)
+                       continue;
+-              if (horrible_match_v6(node, ip)) {
+-                      ret = node->value;
+-                      break;
+-              }
++              hlist_del(&node->table);
++              kfree(node);
+       }
+-      return ret;
++
+ }
+ static __init bool randomized_test(void)
+@@ -397,23 +379,33 @@ static __init bool randomized_test(void)
+               print_tree(t.root6, 128);
+       }
+-      for (i = 0; i < NUM_QUERIES; ++i) {
+-              prandom_bytes(ip, 4);
+-              if (lookup(t.root4, 32, ip) !=
+-                  horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip)) {
+-                      pr_err("allowedips random self-test: FAIL\n");
+-                      goto free;
++      for (j = 0;; ++j) {
++              for (i = 0; i < NUM_QUERIES; ++i) {
++                      prandom_bytes(ip, 4);
++                      if (lookup(t.root4, 32, ip) != horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip)) {
++                              horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip);
++                              pr_err("allowedips random v4 self-test: FAIL\n");
++                              goto free;
++                      }
++                      prandom_bytes(ip, 16);
++                      if (lookup(t.root6, 128, ip) != horrible_allowedips_lookup_v6(&h, (struct in6_addr *)ip)) {
++                              pr_err("allowedips random v6 self-test: FAIL\n");
++                              goto free;
++                      }
+               }
++              if (j >= NUM_PEERS)
++                      break;
++              mutex_lock(&mutex);
++              wg_allowedips_remove_by_peer(&t, peers[j], &mutex);
++              mutex_unlock(&mutex);
++              horrible_allowedips_remove_by_value(&h, peers[j]);
+       }
+-      for (i = 0; i < NUM_QUERIES; ++i) {
+-              prandom_bytes(ip, 16);
+-              if (lookup(t.root6, 128, ip) !=
+-                  horrible_allowedips_lookup_v6(&h, (struct in6_addr *)ip)) {
+-                      pr_err("allowedips random self-test: FAIL\n");
+-                      goto free;
+-              }
++      if (t.root4 || t.root6) {
++              pr_err("allowedips random self-test removal: FAIL\n");
++              goto free;
+       }
++
+       ret = true;
+ free:
diff --git a/queue-5.10/wireguard-allowedips-initialize-list-head-in-selftest.patch b/queue-5.10/wireguard-allowedips-initialize-list-head-in-selftest.patch
new file mode 100644 (file)
index 0000000..b7df8d2
--- /dev/null
@@ -0,0 +1,45 @@
+From 46cfe8eee285cde465b420637507884551f5d7ca Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:35 +0200
+Subject: wireguard: allowedips: initialize list head in selftest
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit 46cfe8eee285cde465b420637507884551f5d7ca upstream.
+
+The randomized trie tests weren't initializing the dummy peer list head,
+resulting in a NULL pointer dereference when used. Fix this by
+initializing it in the randomized trie test, just like we do for the
+static unit test.
+
+While we're at it, all of the other strings like this have the word
+"self-test", so add it to the missing place here.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/wireguard/selftest/allowedips.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/wireguard/selftest/allowedips.c
++++ b/drivers/net/wireguard/selftest/allowedips.c
+@@ -296,6 +296,7 @@ static __init bool randomized_test(void)
+                       goto free;
+               }
+               kref_init(&peers[i]->refcount);
++              INIT_LIST_HEAD(&peers[i]->allowedips_list);
+       }
+       mutex_lock(&mutex);
+@@ -333,7 +334,7 @@ static __init bool randomized_test(void)
+                       if (wg_allowedips_insert_v4(&t,
+                                                   (struct in_addr *)mutated,
+                                                   cidr, peer, &mutex) < 0) {
+-                              pr_err("allowedips random malloc: FAIL\n");
++                              pr_err("allowedips random self-test malloc: FAIL\n");
+                               goto free_locked;
+                       }
+                       if (horrible_allowedips_insert_v4(&h,
diff --git a/queue-5.10/wireguard-allowedips-remove-nodes-in-o-1.patch b/queue-5.10/wireguard-allowedips-remove-nodes-in-o-1.patch
new file mode 100644 (file)
index 0000000..641db1d
--- /dev/null
@@ -0,0 +1,239 @@
+From f634f418c227c912e7ea95a3299efdc9b10e4022 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:36 +0200
+Subject: wireguard: allowedips: remove nodes in O(1)
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit f634f418c227c912e7ea95a3299efdc9b10e4022 upstream.
+
+Previously, deleting peers would require traversing the entire trie in
+order to rebalance nodes and safely free them. This meant that removing
+1000 peers from a trie with a half million nodes would take an extremely
+long time, during which we're holding the rtnl lock. Large-scale users
+were reporting 200ms latencies added to the networking stack as a whole
+every time their userspace software would queue up significant removals.
+That's a serious situation.
+
+This commit fixes that by maintaining a double pointer to the parent's
+bit pointer for each node, and then using the already existing node list
+belonging to each peer to go directly to the node, fix up its pointers,
+and free it with RCU. This means removal is O(1) instead of O(n), and we
+don't use gobs of stack.
+
+The removal algorithm has the same downside as the code that it fixes:
+it won't collapse needlessly long runs of fillers.  We can enhance that
+in the future if it ever becomes a problem. This commit documents that
+limitation with a TODO comment in code, a small but meaningful
+improvement over the prior situation.
+
+Currently the biggest flaw, which the next commit addresses, is that
+this increases the node size on 64-bit machines from 60 bytes to
+68 bytes. 60 rounds up to 64, but 68 rounds up to 128. So we wind up
+using twice as much memory per node, because of power-of-two
+allocations, which is a big bummer. We'll need to figure something out
+there.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/wireguard/allowedips.c |  130 +++++++++++++++----------------------
+ drivers/net/wireguard/allowedips.h |    9 --
+ 2 files changed, 56 insertions(+), 83 deletions(-)
+
+--- a/drivers/net/wireguard/allowedips.c
++++ b/drivers/net/wireguard/allowedips.c
+@@ -66,60 +66,6 @@ static void root_remove_peer_lists(struc
+       }
+ }
+-static void walk_remove_by_peer(struct allowedips_node __rcu **top,
+-                              struct wg_peer *peer, struct mutex *lock)
+-{
+-#define REF(p) rcu_access_pointer(p)
+-#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
+-#define PUSH(p) ({                                                             \
+-              WARN_ON(IS_ENABLED(DEBUG) && len >= 128);                      \
+-              stack[len++] = p;                                              \
+-      })
+-
+-      struct allowedips_node __rcu **stack[128], **nptr;
+-      struct allowedips_node *node, *prev;
+-      unsigned int len;
+-
+-      if (unlikely(!peer || !REF(*top)))
+-              return;
+-
+-      for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
+-              nptr = stack[len - 1];
+-              node = DEREF(nptr);
+-              if (!node) {
+-                      --len;
+-                      continue;
+-              }
+-              if (!prev || REF(prev->bit[0]) == node ||
+-                  REF(prev->bit[1]) == node) {
+-                      if (REF(node->bit[0]))
+-                              PUSH(&node->bit[0]);
+-                      else if (REF(node->bit[1]))
+-                              PUSH(&node->bit[1]);
+-              } else if (REF(node->bit[0]) == prev) {
+-                      if (REF(node->bit[1]))
+-                              PUSH(&node->bit[1]);
+-              } else {
+-                      if (rcu_dereference_protected(node->peer,
+-                              lockdep_is_held(lock)) == peer) {
+-                              RCU_INIT_POINTER(node->peer, NULL);
+-                              list_del_init(&node->peer_list);
+-                              if (!node->bit[0] || !node->bit[1]) {
+-                                      rcu_assign_pointer(*nptr, DEREF(
+-                                             &node->bit[!REF(node->bit[0])]));
+-                                      kfree_rcu(node, rcu);
+-                                      node = DEREF(nptr);
+-                              }
+-                      }
+-                      --len;
+-              }
+-      }
+-
+-#undef REF
+-#undef DEREF
+-#undef PUSH
+-}
+-
+ static unsigned int fls128(u64 a, u64 b)
+ {
+       return a ? fls64(a) + 64U : fls64(b);
+@@ -224,6 +170,7 @@ static int add(struct allowedips_node __
+               RCU_INIT_POINTER(node->peer, peer);
+               list_add_tail(&node->peer_list, &peer->allowedips_list);
+               copy_and_assign_cidr(node, key, cidr, bits);
++              rcu_assign_pointer(node->parent_bit, trie);
+               rcu_assign_pointer(*trie, node);
+               return 0;
+       }
+@@ -243,9 +190,9 @@ static int add(struct allowedips_node __
+       if (!node) {
+               down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
+       } else {
+-              down = rcu_dereference_protected(CHOOSE_NODE(node, key),
+-                                               lockdep_is_held(lock));
++              down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
+               if (!down) {
++                      rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
+                       rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
+                       return 0;
+               }
+@@ -254,29 +201,37 @@ static int add(struct allowedips_node __
+       parent = node;
+       if (newnode->cidr == cidr) {
++              rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
+               rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
+-              if (!parent)
++              if (!parent) {
++                      rcu_assign_pointer(newnode->parent_bit, trie);
+                       rcu_assign_pointer(*trie, newnode);
+-              else
+-                      rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
+-                                         newnode);
+-      } else {
+-              node = kzalloc(sizeof(*node), GFP_KERNEL);
+-              if (unlikely(!node)) {
+-                      list_del(&newnode->peer_list);
+-                      kfree(newnode);
+-                      return -ENOMEM;
++              } else {
++                      rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
++                      rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
+               }
+-              INIT_LIST_HEAD(&node->peer_list);
+-              copy_and_assign_cidr(node, newnode->bits, cidr, bits);
++              return 0;
++      }
++
++      node = kzalloc(sizeof(*node), GFP_KERNEL);
++      if (unlikely(!node)) {
++              list_del(&newnode->peer_list);
++              kfree(newnode);
++              return -ENOMEM;
++      }
++      INIT_LIST_HEAD(&node->peer_list);
++      copy_and_assign_cidr(node, newnode->bits, cidr, bits);
+-              rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
+-              rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
+-              if (!parent)
+-                      rcu_assign_pointer(*trie, node);
+-              else
+-                      rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
+-                                         node);
++      rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
++      rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
++      rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
++      rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
++      if (!parent) {
++              rcu_assign_pointer(node->parent_bit, trie);
++              rcu_assign_pointer(*trie, node);
++      } else {
++              rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
++              rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
+       }
+       return 0;
+ }
+@@ -335,9 +290,30 @@ int wg_allowedips_insert_v6(struct allow
+ void wg_allowedips_remove_by_peer(struct allowedips *table,
+                                 struct wg_peer *peer, struct mutex *lock)
+ {
++      struct allowedips_node *node, *child, *tmp;
++
++      if (list_empty(&peer->allowedips_list))
++              return;
+       ++table->seq;
+-      walk_remove_by_peer(&table->root4, peer, lock);
+-      walk_remove_by_peer(&table->root6, peer, lock);
++      list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) {
++              list_del_init(&node->peer_list);
++              RCU_INIT_POINTER(node->peer, NULL);
++              if (node->bit[0] && node->bit[1])
++                      continue;
++              child = rcu_dereference_protected(
++                              node->bit[!rcu_access_pointer(node->bit[0])],
++                              lockdep_is_held(lock));
++              if (child)
++                      child->parent_bit = node->parent_bit;
++              *rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
++              kfree_rcu(node, rcu);
++
++              /* TODO: Note that we currently don't walk up and down in order to
++               * free any potential filler nodes. This means that this function
++               * doesn't free up as much as it could, which could be revisited
++               * at some point.
++               */
++      }
+ }
+ int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)
+--- a/drivers/net/wireguard/allowedips.h
++++ b/drivers/net/wireguard/allowedips.h
+@@ -15,14 +15,11 @@ struct wg_peer;
+ struct allowedips_node {
+       struct wg_peer __rcu *peer;
+       struct allowedips_node __rcu *bit[2];
+-      /* While it may seem scandalous that we waste space for v4,
+-       * we're alloc'ing to the nearest power of 2 anyway, so this
+-       * doesn't actually make a difference.
+-       */
+-      u8 bits[16] __aligned(__alignof(u64));
+       u8 cidr, bit_at_a, bit_at_b, bitlen;
++      u8 bits[16] __aligned(__alignof(u64));
+-      /* Keep rarely used list at bottom to be beyond cache line. */
++      /* Keep rarely used members at bottom to be beyond cache line. */
++      struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
+       union {
+               struct list_head peer_list;
+               struct rcu_head rcu;
diff --git a/queue-5.10/wireguard-do-not-use-o3.patch b/queue-5.10/wireguard-do-not-use-o3.patch
new file mode 100644 (file)
index 0000000..6ef05dc
--- /dev/null
@@ -0,0 +1,35 @@
+From cc5060ca0285efe2728bced399a1955a7ce808b2 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:32 +0200
+Subject: wireguard: do not use -O3
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit cc5060ca0285efe2728bced399a1955a7ce808b2 upstream.
+
+Apparently, various versions of gcc have O3-related miscompiles. Looking
+at the difference between -O2 and -O3 for gcc 11 doesn't indicate
+miscompiles, but the difference also doesn't seem so significant for
+performance that it's worth risking.
+
+Link: https://lore.kernel.org/lkml/CAHk-=wjuoGyxDhAF8SsrTkN0-YfCx7E6jUN3ikC_tn2AKWTTsA@mail.gmail.com/
+Link: https://lore.kernel.org/lkml/CAHmME9otB5Wwxp7H8bR_i2uH2esEMvoBMC8uEXBMH9p0q1s6Bw@mail.gmail.com/
+Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/wireguard/Makefile |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/drivers/net/wireguard/Makefile
++++ b/drivers/net/wireguard/Makefile
+@@ -1,5 +1,4 @@
+-ccflags-y := -O3
+-ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
++ccflags-y := -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
+ ccflags-$(CONFIG_WIREGUARD_DEBUG) += -DDEBUG
+ wireguard-y := main.o
+ wireguard-y += noise.o
diff --git a/queue-5.10/wireguard-peer-allocate-in-kmem_cache.patch b/queue-5.10/wireguard-peer-allocate-in-kmem_cache.patch
new file mode 100644 (file)
index 0000000..d516291
--- /dev/null
@@ -0,0 +1,127 @@
+From a4e9f8e3287c9eb6bf70df982870980dd3341863 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:34 +0200
+Subject: wireguard: peer: allocate in kmem_cache
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit a4e9f8e3287c9eb6bf70df982870980dd3341863 upstream.
+
+With deployments having upwards of 600k peers now, this somewhat heavy
+structure could benefit from more fine-grained allocations.
+Specifically, instead of using a 2048-byte slab for a 1544-byte object,
+we can now use 1544-byte objects directly, thus saving almost 25%
+per-peer, or with 600k peers, that's a savings of 303 MiB. This also
+makes wireguard's memory usage more transparent in tools like slabtop
+and /proc/slabinfo.
+
+Fixes: 8b5553ace83c ("wireguard: queueing: get rid of per-peer ring buffers")
+Suggested-by: Arnd Bergmann <arnd@arndb.de>
+Suggested-by: Matthew Wilcox <willy@infradead.org>
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/wireguard/main.c |    7 +++++++
+ drivers/net/wireguard/peer.c |   21 +++++++++++++++++----
+ drivers/net/wireguard/peer.h |    3 +++
+ 3 files changed, 27 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/wireguard/main.c
++++ b/drivers/net/wireguard/main.c
+@@ -28,6 +28,10 @@ static int __init mod_init(void)
+ #endif
+       wg_noise_init();
++      ret = wg_peer_init();
++      if (ret < 0)
++              goto err_peer;
++
+       ret = wg_device_init();
+       if (ret < 0)
+               goto err_device;
+@@ -44,6 +48,8 @@ static int __init mod_init(void)
+ err_netlink:
+       wg_device_uninit();
+ err_device:
++      wg_peer_uninit();
++err_peer:
+       return ret;
+ }
+@@ -51,6 +57,7 @@ static void __exit mod_exit(void)
+ {
+       wg_genetlink_uninit();
+       wg_device_uninit();
++      wg_peer_uninit();
+ }
+ module_init(mod_init);
+--- a/drivers/net/wireguard/peer.c
++++ b/drivers/net/wireguard/peer.c
+@@ -15,6 +15,7 @@
+ #include <linux/rcupdate.h>
+ #include <linux/list.h>
++static struct kmem_cache *peer_cache;
+ static atomic64_t peer_counter = ATOMIC64_INIT(0);
+ struct wg_peer *wg_peer_create(struct wg_device *wg,
+@@ -29,10 +30,10 @@ struct wg_peer *wg_peer_create(struct wg
+       if (wg->num_peers >= MAX_PEERS_PER_DEVICE)
+               return ERR_PTR(ret);
+-      peer = kzalloc(sizeof(*peer), GFP_KERNEL);
++      peer = kmem_cache_zalloc(peer_cache, GFP_KERNEL);
+       if (unlikely(!peer))
+               return ERR_PTR(ret);
+-      if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
++      if (unlikely(dst_cache_init(&peer->endpoint_cache, GFP_KERNEL)))
+               goto err;
+       peer->device = wg;
+@@ -64,7 +65,7 @@ struct wg_peer *wg_peer_create(struct wg
+       return peer;
+ err:
+-      kfree(peer);
++      kmem_cache_free(peer_cache, peer);
+       return ERR_PTR(ret);
+ }
+@@ -193,7 +194,8 @@ static void rcu_release(struct rcu_head
+       /* The final zeroing takes care of clearing any remaining handshake key
+        * material and other potentially sensitive information.
+        */
+-      kfree_sensitive(peer);
++      memzero_explicit(peer, sizeof(*peer));
++      kmem_cache_free(peer_cache, peer);
+ }
+ static void kref_release(struct kref *refcount)
+@@ -225,3 +227,14 @@ void wg_peer_put(struct wg_peer *peer)
+               return;
+       kref_put(&peer->refcount, kref_release);
+ }
++
++int __init wg_peer_init(void)
++{
++      peer_cache = KMEM_CACHE(wg_peer, 0);
++      return peer_cache ? 0 : -ENOMEM;
++}
++
++void wg_peer_uninit(void)
++{
++      kmem_cache_destroy(peer_cache);
++}
+--- a/drivers/net/wireguard/peer.h
++++ b/drivers/net/wireguard/peer.h
+@@ -80,4 +80,7 @@ void wg_peer_put(struct wg_peer *peer);
+ void wg_peer_remove(struct wg_peer *peer);
+ void wg_peer_remove_all(struct wg_device *wg);
++int wg_peer_init(void);
++void wg_peer_uninit(void);
++
+ #endif /* _WG_PEER_H */
diff --git a/queue-5.10/wireguard-selftests-make-sure-rp_filter-is-disabled-on-vethc.patch b/queue-5.10/wireguard-selftests-make-sure-rp_filter-is-disabled-on-vethc.patch
new file mode 100644 (file)
index 0000000..a484338
--- /dev/null
@@ -0,0 +1,32 @@
+From f8873d11d4121aad35024f9379e431e0c83abead Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:31 +0200
+Subject: wireguard: selftests: make sure rp_filter is disabled on vethc
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit f8873d11d4121aad35024f9379e431e0c83abead upstream.
+
+Some distros may enable strict rp_filter by default, which will prevent
+vethc from receiving the packets with an unrouteable reverse path address.
+
+Reported-by: Hangbin Liu <liuhangbin@gmail.com>
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/wireguard/netns.sh |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -363,6 +363,7 @@ ip1 -6 rule add table main suppress_pref
+ ip1 -4 route add default dev wg0 table 51820
+ ip1 -4 rule add not fwmark 51820 table 51820
+ ip1 -4 rule add table main suppress_prefixlength 0
++n1 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/vethc/rp_filter'
+ # Flood the pings instead of sending just one, to trigger routing table reference counting bugs.
+ n1 ping -W 1 -c 100 -f 192.168.99.7
+ n1 ping -W 1 -c 100 -f abab::1111
diff --git a/queue-5.10/wireguard-selftests-remove-old-conntrack-kconfig-value.patch b/queue-5.10/wireguard-selftests-remove-old-conntrack-kconfig-value.patch
new file mode 100644 (file)
index 0000000..5ddf3ec
--- /dev/null
@@ -0,0 +1,31 @@
+From acf2492b51c9a3c4dfb947f4d3477a86d315150f Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:30 +0200
+Subject: wireguard: selftests: remove old conntrack kconfig value
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit acf2492b51c9a3c4dfb947f4d3477a86d315150f upstream.
+
+On recent kernels, this config symbol is no longer used.
+
+Reported-by: Rui Salvaterra <rsalvaterra@gmail.com>
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/wireguard/qemu/kernel.config |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/tools/testing/selftests/wireguard/qemu/kernel.config
++++ b/tools/testing/selftests/wireguard/qemu/kernel.config
+@@ -19,7 +19,6 @@ CONFIG_NETFILTER_XTABLES=y
+ CONFIG_NETFILTER_XT_NAT=y
+ CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+ CONFIG_NETFILTER_XT_MARK=y
+-CONFIG_NF_CONNTRACK_IPV4=y
+ CONFIG_NF_NAT_IPV4=y
+ CONFIG_IP_NF_IPTABLES=y
+ CONFIG_IP_NF_FILTER=y
diff --git a/queue-5.10/wireguard-use-synchronize_net-rather-than-synchronize_rcu.patch b/queue-5.10/wireguard-use-synchronize_net-rather-than-synchronize_rcu.patch
new file mode 100644 (file)
index 0000000..07ae36f
--- /dev/null
@@ -0,0 +1,68 @@
+From 24b70eeeb4f46c09487f8155239ebfb1f875774a Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 4 Jun 2021 17:17:33 +0200
+Subject: wireguard: use synchronize_net rather than synchronize_rcu
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+commit 24b70eeeb4f46c09487f8155239ebfb1f875774a upstream.
+
+Many of the synchronization points are sometimes called under the rtnl
+lock, which means we should use synchronize_net rather than
+synchronize_rcu. Under the hood, this expands to using the expedited
+flavor of function in the event that rtnl is held, in order to not stall
+other concurrent changes.
+
+This fixes some very, very long delays when removing multiple peers at
+once, which would cause some operations to take several minutes.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/wireguard/peer.c   |    6 +++---
+ drivers/net/wireguard/socket.c |    2 +-
+ 2 files changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/wireguard/peer.c
++++ b/drivers/net/wireguard/peer.c
+@@ -89,7 +89,7 @@ static void peer_make_dead(struct wg_pee
+       /* Mark as dead, so that we don't allow jumping contexts after. */
+       WRITE_ONCE(peer->is_dead, true);
+-      /* The caller must now synchronize_rcu() for this to take effect. */
++      /* The caller must now synchronize_net() for this to take effect. */
+ }
+ static void peer_remove_after_dead(struct wg_peer *peer)
+@@ -161,7 +161,7 @@ void wg_peer_remove(struct wg_peer *peer
+       lockdep_assert_held(&peer->device->device_update_lock);
+       peer_make_dead(peer);
+-      synchronize_rcu();
++      synchronize_net();
+       peer_remove_after_dead(peer);
+ }
+@@ -179,7 +179,7 @@ void wg_peer_remove_all(struct wg_device
+               peer_make_dead(peer);
+               list_add_tail(&peer->peer_list, &dead_peers);
+       }
+-      synchronize_rcu();
++      synchronize_net();
+       list_for_each_entry_safe(peer, temp, &dead_peers, peer_list)
+               peer_remove_after_dead(peer);
+ }
+--- a/drivers/net/wireguard/socket.c
++++ b/drivers/net/wireguard/socket.c
+@@ -430,7 +430,7 @@ void wg_socket_reinit(struct wg_device *
+       if (new4)
+               wg->incoming_port = ntohs(inet_sk(new4)->inet_sport);
+       mutex_unlock(&wg->socket_update_lock);
+-      synchronize_rcu();
++      synchronize_net();
+       sock_free(old4);
+       sock_free(old6);
+ }