-From a4b3867e7b8f208e720e8084fce3a8c783f35150 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
+From 8e15aee621618a3ee3abecaf1fd8c1428098b7ef Mon Sep 17 00:00:00 2001
+From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 17 Oct 2023 18:38:16 -0700
Subject: net: move altnames together with the netdevice
From: Jakub Kicinski <kuba@kernel.org>
-[ Upstream commit 8e15aee621618a3ee3abecaf1fd8c1428098b7ef ]
+commit 8e15aee621618a3ee3abecaf1fd8c1428098b7ef upstream.
The altname nodes are currently not moved to the new netns
when netdevice itself moves:
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
-Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
- net/core/dev.c | 13 +++++++++----
+ net/core/dev.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
-diff --git a/net/core/dev.c b/net/core/dev.c
-index a450234f68a54..b1ba00e57a1ab 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
-@@ -379,6 +379,7 @@ static void netdev_name_node_alt_flush(struct net_device *dev)
+@@ -379,6 +379,7 @@ static void netdev_name_node_alt_flush(s
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
struct net *net = dev_net(dev);
ASSERT_RTNL();
-@@ -389,6 +390,10 @@ static void list_netdevice(struct net_device *dev)
- hlist_add_head_rcu(&dev->index_hlist,
+@@ -390,6 +391,9 @@ static void list_netdevice(struct net_de
dev_index_hash(net, dev->ifindex));
write_unlock(&dev_base_lock);
-+
+
+ netdev_for_each_altname(dev, name_node)
+ netdev_name_node_add(net, name_node);
+
- /* We reserved the ifindex, this can't fail */
- WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
+ dev_base_seq_inc(net);
+ }
-@@ -400,12 +405,16 @@ static void list_netdevice(struct net_device *dev)
+@@ -398,8 +402,13 @@ static void list_netdevice(struct net_de
*/
static void unlist_netdevice(struct net_device *dev, bool lock)
{
+ struct netdev_name_node *name_node;
- struct net *net = dev_net(dev);
-
++
ASSERT_RTNL();
- xa_erase(&net->dev_by_index, dev->ifindex);
-
+ netdev_for_each_altname(dev, name_node)
+ netdev_name_node_del(name_node);
+
/* Unlink dev from the device chain */
if (lock)
write_lock(&dev_base_lock);
-@@ -10873,7 +10882,6 @@ void unregister_netdevice_many_notify(struct list_head *head,
+@@ -10854,7 +10863,6 @@ void unregister_netdevice_many_notify(st
synchronize_net();
list_for_each_entry(dev, head, unreg_list) {
struct sk_buff *skb = NULL;
/* Shutdown queueing discipline. */
-@@ -10901,9 +10909,6 @@ void unregister_netdevice_many_notify(struct list_head *head,
+@@ -10882,9 +10890,6 @@ void unregister_netdevice_many_notify(st
dev_uc_flush(dev);
dev_mc_flush(dev);
netdev_name_node_alt_flush(dev);
netdev_name_node_free(dev->name_node);
---
-2.40.1
-
+++ /dev/null
-From 3e438fc7e62173a7f73219f1e7a2a4012ea1d0d5 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Wed, 26 Jul 2023 11:55:29 -0700
-Subject: net: store netdevs in an xarray
-
-From: Jakub Kicinski <kuba@kernel.org>
-
-[ Upstream commit 759ab1edb56c88906830fd6b2e7b12514dd32758 ]
-
-Iterating over the netdev hash table for netlink dumps is hard.
-Dumps are done in "chunks" so we need to save the position
-after each chunk, so we know where to restart from. Because
-netdevs are stored in a hash table we remember which bucket
-we were in and how many devices we dumped.
-
-Since we don't hold any locks across the "chunks" - devices may
-come and go while we're dumping. If that happens we may miss
-a device (if device is deleted from the bucket we were in).
-We indicate to user space that this may have happened by setting
-NLM_F_DUMP_INTR. User space is supposed to dump again (I think)
-if it sees that. Somehow I doubt most user space gets this right.
-
-To illustrate let's look at an example:
-
- System state:
- start: # [A, B, C]
- del: B # [A, C]
-
-with the hash table we may dump [A, B], missing C completely even
-though it existed both before and after the "del B".
-
-Add an xarray and use it to allocate ifindexes. This way we
-can iterate ifindexes in order, without the worry that we'll
-skip one. We may still generate a dump of a state which "never
-existed", for example for a set of values and sequence of ops:
-
- System state:
- start: # [A, B]
- add: C # [A, C, B]
- del: B # [A, C]
-
-we may generate a dump of [A], if C got an index between A and B.
-System has never been in such state. But I'm 90% sure that's perfectly
-fine, important part is that we can't _miss_ devices which exist before
-and after. User space which wants to mirror kernel's state subscribes
-to notifications and does periodic dumps so it will know that C exists
-from the notification about its creation or from the next dump
-(next dump is _guaranteed_ to include C, if it doesn't get removed).
-
-To avoid any perf regressions keep the hash table for now. Most
-net namespaces have very few devices and microbenchmarking 1M lookups
-on Skylake I get the following results (not counting loopback
-to number of devs):
-
- #devs | hash | xa | delta
- 2 | 18.3 | 20.1 | + 9.8%
- 16 | 18.3 | 20.1 | + 9.5%
- 64 | 18.3 | 26.3 | +43.8%
- 128 | 20.4 | 26.3 | +28.6%
- 256 | 20.0 | 26.4 | +32.1%
- 1024 | 26.6 | 26.7 | + 0.2%
- 8192 |541.3 | 33.5 | -93.8%
-
-No surprises since the hash table has 256 entries.
-The microbenchmark scans indexes in order, if the pattern is more
-random xa starts to win at 512 devices already. But that's a lot
-of devices, in practice.
-
-Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
-Link: https://lore.kernel.org/r/20230726185530.2247698-2-kuba@kernel.org
-Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-Stable-dep-of: 8e15aee62161 ("net: move altnames together with the netdevice")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- include/net/net_namespace.h | 4 +-
- net/core/dev.c | 82 ++++++++++++++++++++++++-------------
- 2 files changed, 57 insertions(+), 29 deletions(-)
-
-diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
-index 78beaa765c733..9f6add96de2d7 100644
---- a/include/net/net_namespace.h
-+++ b/include/net/net_namespace.h
-@@ -42,6 +42,7 @@
- #include <linux/idr.h>
- #include <linux/skbuff.h>
- #include <linux/notifier.h>
-+#include <linux/xarray.h>
-
- struct user_namespace;
- struct proc_dir_entry;
-@@ -69,7 +70,7 @@ struct net {
- atomic_t dev_unreg_count;
-
- unsigned int dev_base_seq; /* protected by rtnl_mutex */
-- int ifindex;
-+ u32 ifindex;
-
- spinlock_t nsid_lock;
- atomic_t fnhe_genid;
-@@ -110,6 +111,7 @@ struct net {
-
- struct hlist_head *dev_name_head;
- struct hlist_head *dev_index_head;
-+ struct xarray dev_by_index;
- struct raw_notifier_head netdev_chain;
-
- /* Note that @hash_mix can be read millions times per second,
-diff --git a/net/core/dev.c b/net/core/dev.c
-index 45ec18d8b0f6b..a450234f68a54 100644
---- a/net/core/dev.c
-+++ b/net/core/dev.c
-@@ -389,6 +389,8 @@ static void list_netdevice(struct net_device *dev)
- hlist_add_head_rcu(&dev->index_hlist,
- dev_index_hash(net, dev->ifindex));
- write_unlock(&dev_base_lock);
-+ /* We reserved the ifindex, this can't fail */
-+ WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
-
- dev_base_seq_inc(net);
- }
-@@ -398,8 +400,12 @@ static void list_netdevice(struct net_device *dev)
- */
- static void unlist_netdevice(struct net_device *dev, bool lock)
- {
-+ struct net *net = dev_net(dev);
-+
- ASSERT_RTNL();
-
-+ xa_erase(&net->dev_by_index, dev->ifindex);
-+
- /* Unlink dev from the device chain */
- if (lock)
- write_lock(&dev_base_lock);
-@@ -9529,23 +9535,35 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
- }
-
- /**
-- * dev_new_index - allocate an ifindex
-- * @net: the applicable net namespace
-+ * dev_index_reserve() - allocate an ifindex in a namespace
-+ * @net: the applicable net namespace
-+ * @ifindex: requested ifindex, pass %0 to get one allocated
-+ *
-+ * Allocate an ifindex for a new device. Caller must either use the ifindex
-+ * to store the device (via list_netdevice()) or call dev_index_release()
-+ * to give the index up.
- *
-- * Returns a suitable unique value for a new device interface
-- * number. The caller must hold the rtnl semaphore or the
-- * dev_base_lock to be sure it remains unique.
-+ * Return: a suitable unique value for a new device interface number or -errno.
- */
--static int dev_new_index(struct net *net)
-+static int dev_index_reserve(struct net *net, u32 ifindex)
- {
-- int ifindex = net->ifindex;
-+ int err;
-
-- for (;;) {
-- if (++ifindex <= 0)
-- ifindex = 1;
-- if (!__dev_get_by_index(net, ifindex))
-- return net->ifindex = ifindex;
-- }
-+ if (!ifindex)
-+ err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
-+ xa_limit_31b, &net->ifindex, GFP_KERNEL);
-+ else
-+ err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
-+ if (err < 0)
-+ return err;
-+
-+ return ifindex;
-+}
-+
-+static void dev_index_release(struct net *net, int ifindex)
-+{
-+ /* Expect only unused indexes, unlist_netdevice() removes the used */
-+ WARN_ON(xa_erase(&net->dev_by_index, ifindex));
- }
-
- /* Delayed registration/unregisteration */
-@@ -10015,11 +10033,10 @@ int register_netdevice(struct net_device *dev)
- goto err_uninit;
- }
-
-- ret = -EBUSY;
-- if (!dev->ifindex)
-- dev->ifindex = dev_new_index(net);
-- else if (__dev_get_by_index(net, dev->ifindex))
-+ ret = dev_index_reserve(net, dev->ifindex);
-+ if (ret < 0)
- goto err_uninit;
-+ dev->ifindex = ret;
-
- /* Transfer changeable features to wanted_features and enable
- * software offloads (GSO and GRO).
-@@ -10066,7 +10083,7 @@ int register_netdevice(struct net_device *dev)
- ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
- ret = notifier_to_errno(ret);
- if (ret)
-- goto err_uninit;
-+ goto err_ifindex_release;
-
- ret = netdev_register_kobject(dev);
- write_lock(&dev_base_lock);
-@@ -10122,6 +10139,8 @@ int register_netdevice(struct net_device *dev)
-
- err_uninit_notify:
- call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
-+err_ifindex_release:
-+ dev_index_release(net, dev->ifindex);
- err_uninit:
- if (dev->netdev_ops->ndo_uninit)
- dev->netdev_ops->ndo_uninit(dev);
-@@ -11009,9 +11028,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
- goto out;
-
- /* Check that new_ifindex isn't used yet. */
-- err = -EBUSY;
-- if (new_ifindex && __dev_get_by_index(net, new_ifindex))
-- goto out;
-+ if (new_ifindex) {
-+ err = dev_index_reserve(net, new_ifindex);
-+ if (err < 0)
-+ goto out;
-+ } else {
-+ /* If there is an ifindex conflict assign a new one */
-+ err = dev_index_reserve(net, dev->ifindex);
-+ if (err == -EBUSY)
-+ err = dev_index_reserve(net, 0);
-+ if (err < 0)
-+ goto out;
-+ new_ifindex = err;
-+ }
-
- /*
- * And now a mini version of register_netdevice unregister_netdevice.
-@@ -11039,13 +11068,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
- rcu_barrier();
-
- new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
-- /* If there is an ifindex conflict assign a new one */
-- if (!new_ifindex) {
-- if (__dev_get_by_index(net, dev->ifindex))
-- new_ifindex = dev_new_index(net);
-- else
-- new_ifindex = dev->ifindex;
-- }
-
- rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
- new_ifindex);
-@@ -11226,6 +11248,9 @@ static int __net_init netdev_init(struct net *net)
- if (net->dev_index_head == NULL)
- goto err_idx;
-
-+ net->ifindex = 1;
-+ xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC);
-+
- RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
-
- return 0;
-@@ -11323,6 +11348,7 @@ static void __net_exit netdev_exit(struct net *net)
- {
- kfree(net->dev_name_head);
- kfree(net->dev_index_head);
-+ xa_destroy(&net->dev_by_index);
- if (net != &init_net)
- WARN_ON_ONCE(!list_empty(&net->dev_base_head));
- }
---
-2.40.1
-