From 173b1e3d5f75b028b4f2650e6b4003b6d7ab3c66 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 30 Dec 2024 15:34:47 +0100 Subject: [PATCH] 5.15-stable patches added patches: net-dsa-improve-shutdown-sequence.patch nfsd-cancel-nfsd_shrinker_work-using-sync-mode-in-nfs4_state_shutdown_net.patch --- .../net-dsa-improve-shutdown-sequence.patch | 130 ++++++++++++++++++ ...sync-mode-in-nfs4_state_shutdown_net.patch | 97 +++++++++++++ queue-5.15/series | 2 + 3 files changed, 229 insertions(+) create mode 100644 queue-5.15/net-dsa-improve-shutdown-sequence.patch create mode 100644 queue-5.15/nfsd-cancel-nfsd_shrinker_work-using-sync-mode-in-nfs4_state_shutdown_net.patch diff --git a/queue-5.15/net-dsa-improve-shutdown-sequence.patch b/queue-5.15/net-dsa-improve-shutdown-sequence.patch new file mode 100644 index 00000000000..6323725d729 --- /dev/null +++ b/queue-5.15/net-dsa-improve-shutdown-sequence.patch @@ -0,0 +1,130 @@ +From 6c24a03a61a245fe34d47582898331fa034b6ccd Mon Sep 17 00:00:00 2001 +From: Vladimir Oltean +Date: Fri, 13 Sep 2024 23:35:49 +0300 +Subject: net: dsa: improve shutdown sequence + +From: Vladimir Oltean + +commit 6c24a03a61a245fe34d47582898331fa034b6ccd upstream. + +Alexander Sverdlin presents 2 problems during shutdown with the +lan9303 driver. One is specific to lan9303 and the other just happens +to reproduce there. + +The first problem is that lan9303 is unique among DSA drivers in that it +calls dev_get_drvdata() at "arbitrary runtime" (not probe, not shutdown, +not remove): + +phy_state_machine() +-> ... + -> dsa_user_phy_read() + -> ds->ops->phy_read() + -> lan9303_phy_read() + -> chip->ops->phy_read() + -> lan9303_mdio_phy_read() + -> dev_get_drvdata() + +But we never stop the phy_state_machine(), so it may continue to run +after dsa_switch_shutdown(). Our common pattern in all DSA drivers is +to set drvdata to NULL to suppress the remove() method that may come +afterwards. But in this case it will result in an NPD. + +The second problem is that the way in which we set +dp->conduit->dsa_ptr = NULL; is concurrent with receive packet +processing. dsa_switch_rcv() checks once whether dev->dsa_ptr is NULL, +but afterwards, rather than continuing to use that non-NULL value, +dev->dsa_ptr is dereferenced again and again without NULL checks: +dsa_conduit_find_user() and many other places. In between dereferences, +there is no locking to ensure that what was valid once continues to be +valid. + +Both problems have the common aspect that closing the conduit interface +solves them. + +In the first case, dev_close(conduit) triggers the NETDEV_GOING_DOWN +event in dsa_user_netdevice_event() which closes user ports as well. +dsa_port_disable_rt() calls phylink_stop(), which synchronously stops +the phylink state machine, and ds->ops->phy_read() will thus no longer +call into the driver after this point. + +In the second case, dev_close(conduit) should do this, as per +Documentation/networking/driver.rst: + +| Quiescence +| ---------- +| +| After the ndo_stop routine has been called, the hardware must +| not receive or transmit any data. All in flight packets must +| be aborted. If necessary, poll or wait for completion of +| any reset commands. + +So it should be sufficient to ensure that later, when we zeroize +conduit->dsa_ptr, there will be no concurrent dsa_switch_rcv() call +on this conduit. + +The addition of the netif_device_detach() function is to ensure that +ioctls, rtnetlinks and ethtool requests on the user ports no longer +propagate down to the driver - we're no longer prepared to handle them. + +The race condition actually did not exist when commit 0650bf52b31f +("net: dsa: be compatible with masters which unregister on shutdown") +first introduced dsa_switch_shutdown(). It was created later, when we +stopped unregistering the user interfaces from a bad spot, and we just +replaced that sequence with a racy zeroization of conduit->dsa_ptr +(one which doesn't ensure that the interfaces aren't up). + +Reported-by: Alexander Sverdlin +Closes: https://lore.kernel.org/netdev/2d2e3bba17203c14a5ffdabc174e3b6bbb9ad438.camel@siemens.com/ +Closes: https://lore.kernel.org/netdev/c1bf4de54e829111e0e4a70e7bd1cf523c9550ff.camel@siemens.com/ +Fixes: ee534378f005 ("net: dsa: fix panic when DSA master device unbinds on shutdown") +Reviewed-by: Alexander Sverdlin +Tested-by: Alexander Sverdlin +Signed-off-by: Vladimir Oltean +Link: https://patch.msgid.link/20240913203549.3081071-1-vladimir.oltean@nxp.com +Signed-off-by: Paolo Abeni +Suggested-by: Vladimir Oltean +Signed-off-by: Peilin He +Reviewed-by: xu xin +Signed-off-by: Kun Jiang +Cc: Fan Yu +Cc: Yutan Qiu +Cc: Yaxin Wang +Cc: tuqiang +Cc: Yang Yang +Cc: ye xingchen +Cc: Yunkai Zhang +Signed-off-by: Greg Kroah-Hartman +--- + net/dsa/dsa2.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/net/dsa/dsa2.c ++++ b/net/dsa/dsa2.c +@@ -1656,6 +1656,7 @@ EXPORT_SYMBOL_GPL(dsa_unregister_switch) + void dsa_switch_shutdown(struct dsa_switch *ds) + { + struct net_device *master, *slave_dev; ++ LIST_HEAD(close_list); + struct dsa_port *dp; + + mutex_lock(&dsa2_mutex); +@@ -1665,6 +1666,11 @@ void dsa_switch_shutdown(struct dsa_swit + + rtnl_lock(); + ++ dsa_switch_for_each_cpu_port(dp, ds) ++ list_add(&dp->master->close_list, &close_list); ++ ++ dev_close_many(&close_list, true); ++ + list_for_each_entry(dp, &ds->dst->ports, list) { + if (dp->ds != ds) + continue; +@@ -1675,6 +1681,7 @@ void dsa_switch_shutdown(struct dsa_swit + master = dp->cpu_dp->master; + slave_dev = dp->slave; + ++ netif_device_detach(slave_dev); + netdev_upper_dev_unlink(master, slave_dev); + } + diff --git a/queue-5.15/nfsd-cancel-nfsd_shrinker_work-using-sync-mode-in-nfs4_state_shutdown_net.patch b/queue-5.15/nfsd-cancel-nfsd_shrinker_work-using-sync-mode-in-nfs4_state_shutdown_net.patch new file mode 100644 index 00000000000..20611957b08 --- /dev/null +++ b/queue-5.15/nfsd-cancel-nfsd_shrinker_work-using-sync-mode-in-nfs4_state_shutdown_net.patch @@ -0,0 +1,97 @@ +From d5ff2fb2e7167e9483846e34148e60c0c016a1f6 Mon Sep 17 00:00:00 2001 +From: Yang Erkun +Date: Mon, 21 Oct 2024 16:25:40 +0800 +Subject: nfsd: cancel nfsd_shrinker_work using sync mode in nfs4_state_shutdown_net + +From: Yang Erkun + +commit d5ff2fb2e7167e9483846e34148e60c0c016a1f6 upstream. + +In the normal case, when we excute `echo 0 > /proc/fs/nfsd/threads`, the +function `nfs4_state_destroy_net` in `nfs4_state_shutdown_net` will +release all resources related to the hashed `nfs4_client`. If the +`nfsd_client_shrinker` is running concurrently, the `expire_client` +function will first unhash this client and then destroy it. This can +lead to the following warning. Additionally, numerous use-after-free +errors may occur as well. + +nfsd_client_shrinker echo 0 > /proc/fs/nfsd/threads + +expire_client nfsd_shutdown_net + unhash_client ... + nfs4_state_shutdown_net + /* won't wait shrinker exit */ + /* cancel_work(&nn->nfsd_shrinker_work) + * nfsd_file for this /* won't destroy unhashed client1 */ + * client1 still alive nfs4_state_destroy_net + */ + + nfsd_file_cache_shutdown + /* trigger warning */ + kmem_cache_destroy(nfsd_file_slab) + kmem_cache_destroy(nfsd_file_mark_slab) + /* release nfsd_file and mark */ + __destroy_client + +==================================================================== +BUG nfsd_file (Not tainted): Objects remaining in nfsd_file on +__kmem_cache_shutdown() +-------------------------------------------------------------------- +CPU: 4 UID: 0 PID: 764 Comm: sh Not tainted 6.12.0-rc3+ #1 + + dump_stack_lvl+0x53/0x70 + slab_err+0xb0/0xf0 + __kmem_cache_shutdown+0x15c/0x310 + kmem_cache_destroy+0x66/0x160 + nfsd_file_cache_shutdown+0xac/0x210 [nfsd] + nfsd_destroy_serv+0x251/0x2a0 [nfsd] + nfsd_svc+0x125/0x1e0 [nfsd] + write_threads+0x16a/0x2a0 [nfsd] + nfsctl_transaction_write+0x74/0xa0 [nfsd] + vfs_write+0x1a5/0x6d0 + ksys_write+0xc1/0x160 + do_syscall_64+0x5f/0x170 + entry_SYSCALL_64_after_hwframe+0x76/0x7e + +==================================================================== +BUG nfsd_file_mark (Tainted: G B W ): Objects remaining +nfsd_file_mark on __kmem_cache_shutdown() +-------------------------------------------------------------------- + + dump_stack_lvl+0x53/0x70 + slab_err+0xb0/0xf0 + __kmem_cache_shutdown+0x15c/0x310 + kmem_cache_destroy+0x66/0x160 + nfsd_file_cache_shutdown+0xc8/0x210 [nfsd] + nfsd_destroy_serv+0x251/0x2a0 [nfsd] + nfsd_svc+0x125/0x1e0 [nfsd] + write_threads+0x16a/0x2a0 [nfsd] + nfsctl_transaction_write+0x74/0xa0 [nfsd] + vfs_write+0x1a5/0x6d0 + ksys_write+0xc1/0x160 + do_syscall_64+0x5f/0x170 + entry_SYSCALL_64_after_hwframe+0x76/0x7e + +To resolve this issue, cancel `nfsd_shrinker_work` using synchronous +mode in nfs4_state_shutdown_net. + +Fixes: 7c24fa225081 ("NFSD: replace delayed_work with work_struct for nfsd_client_shrinker") +Signed-off-by: Yang Erkun +Reviewed-by: Jeff Layton +Signed-off-by: Chuck Lever +Signed-off-by: Greg Kroah-Hartman +--- + fs/nfsd/nfs4state.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/nfsd/nfs4state.c ++++ b/fs/nfsd/nfs4state.c +@@ -8211,7 +8211,7 @@ nfs4_state_shutdown_net(struct net *net) + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + unregister_shrinker(&nn->nfsd_client_shrinker); +- cancel_work(&nn->nfsd_shrinker_work); ++ cancel_work_sync(&nn->nfsd_shrinker_work); + cancel_delayed_work_sync(&nn->laundromat_work); + locks_end_grace(&nn->nfsd4_manager); + diff --git a/queue-5.15/series b/queue-5.15/series index e3c81cbe225..4f9ec79dcb6 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -99,3 +99,5 @@ tracing-constify-string-literal-data-member-in-struct-trace_event_call.patch tracing-prevent-bad-count-for-tracing_cpumask_write.patch power-supply-gpio-charger-fix-set-charge-current-limits.patch btrfs-avoid-monopolizing-a-core-when-activating-a-swap-file.patch +nfsd-cancel-nfsd_shrinker_work-using-sync-mode-in-nfs4_state_shutdown_net.patch +net-dsa-improve-shutdown-sequence.patch -- 2.47.3