From: Sasha Levin
Date: Mon, 25 Sep 2023 19:11:40 +0000 (-0400)
Subject: Fixes for 5.15
X-Git-Tag: v6.5.6~90
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=9b0c0e5da473e099e88d43c0fae1d639ebf9699d;p=thirdparty%2Fkernel%2Fstable-queue.git

Fixes for 5.15

Signed-off-by: Sasha Levin
---

diff --git a/queue-5.15/series b/queue-5.15/series
index 9e9cd5b6a9b..0671f8a25dd 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -69,3 +69,9 @@ gpio-tb10x-fix-an-error-handling-path-in-tb10x_gpio_.patch
 i2c-mux-demux-pinctrl-check-the-return-value-of-devm.patch
 i2c-mux-gpio-replace-custom-acpi_get_local_address.patch
 i2c-mux-gpio-add-missing-fwnode_handle_put.patch
+xfs-bound-maximum-wait-time-for-inodegc-work.patch
+xfs-introduce-xfs_inodegc_push.patch
+xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch
+xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch
+xfs-disable-reaping-in-fscounters-scrub.patch
+xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch
diff --git a/queue-5.15/xfs-bound-maximum-wait-time-for-inodegc-work.patch b/queue-5.15/xfs-bound-maximum-wait-time-for-inodegc-work.patch
new file mode 100644
index 00000000000..d20e4837305
--- /dev/null
+++ b/queue-5.15/xfs-bound-maximum-wait-time-for-inodegc-work.patch
@@ -0,0 +1,161 @@
+From 5749af0ac33eef67973f0631a0ebe3f21d7a9abd Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 21 Sep 2023 18:01:51 -0700
+Subject: xfs: bound maximum wait time for inodegc work
+
+From: Dave Chinner
+
+[ Upstream commit 7cf2b0f9611b9971d663e1fc3206eeda3b902922 ]
+
+Currently inodegc work can sit queued on the per-cpu queue until
+the workqueue is either flushed or the queue reaches a depth that
+triggers work queuing (and later throttling). This means that we
+could queue work that waits for a long time for some other event to
+trigger flushing.
+
+Hence instead of just queueing work at a specific depth, use a
+delayed work that queues the work at a bound time. We can still
+schedule the work immediately at a given depth, but we no longer need
+to worry about leaving a number of items on the list that won't get
+processed until external events prevail.
+
+Signed-off-by: Dave Chinner
+Reviewed-by: Darrick J. Wong
+Signed-off-by: Darrick J. Wong
+Signed-off-by: Leah Rumancik
+Acked-by: Darrick J. 
Wong +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_icache.c | 36 ++++++++++++++++++++++-------------- + fs/xfs/xfs_mount.h | 2 +- + fs/xfs/xfs_super.c | 2 +- + 3 files changed, 24 insertions(+), 16 deletions(-) + +diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c +index 5e44d7bbd8fca..2c3ef553f5ef0 100644 +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -458,7 +458,7 @@ xfs_inodegc_queue_all( + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) +- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); ++ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); + } + } + +@@ -1851,8 +1851,8 @@ void + xfs_inodegc_worker( + struct work_struct *work) + { +- struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, +- work); ++ struct xfs_inodegc *gc = container_of(to_delayed_work(work), ++ struct xfs_inodegc, work); + struct llist_node *node = llist_del_all(&gc->list); + struct xfs_inode *ip, *n; + +@@ -2021,6 +2021,7 @@ xfs_inodegc_queue( + struct xfs_inodegc *gc; + int items; + unsigned int shrinker_hits; ++ unsigned long queue_delay = 1; + + trace_xfs_inode_set_need_inactive(ip); + spin_lock(&ip->i_flags_lock); +@@ -2032,19 +2033,26 @@ xfs_inodegc_queue( + items = READ_ONCE(gc->items); + WRITE_ONCE(gc->items, items + 1); + shrinker_hits = READ_ONCE(gc->shrinker_hits); +- put_cpu_ptr(gc); + +- if (!xfs_is_inodegc_enabled(mp)) ++ /* ++ * We queue the work while holding the current CPU so that the work ++ * is scheduled to run on this CPU. ++ */ ++ if (!xfs_is_inodegc_enabled(mp)) { ++ put_cpu_ptr(gc); + return; +- +- if (xfs_inodegc_want_queue_work(ip, items)) { +- trace_xfs_inodegc_queue(mp, __return_address); +- queue_work(mp->m_inodegc_wq, &gc->work); + } + ++ if (xfs_inodegc_want_queue_work(ip, items)) ++ queue_delay = 0; ++ ++ trace_xfs_inodegc_queue(mp, __return_address); ++ mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay); ++ put_cpu_ptr(gc); ++ + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { + trace_xfs_inodegc_throttle(mp, __return_address); +- flush_work(&gc->work); ++ flush_delayed_work(&gc->work); + } + } + +@@ -2061,7 +2069,7 @@ xfs_inodegc_cpu_dead( + unsigned int count = 0; + + dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); +- cancel_work_sync(&dead_gc->work); ++ cancel_delayed_work_sync(&dead_gc->work); + + if (llist_empty(&dead_gc->list)) + return; +@@ -2080,12 +2088,12 @@ xfs_inodegc_cpu_dead( + llist_add_batch(first, last, &gc->list); + count += READ_ONCE(gc->items); + WRITE_ONCE(gc->items, count); +- put_cpu_ptr(gc); + + if (xfs_is_inodegc_enabled(mp)) { + trace_xfs_inodegc_queue(mp, __return_address); +- queue_work(mp->m_inodegc_wq, &gc->work); ++ mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0); + } ++ put_cpu_ptr(gc); + } + + /* +@@ -2180,7 +2188,7 @@ xfs_inodegc_shrinker_scan( + unsigned int h = READ_ONCE(gc->shrinker_hits); + + WRITE_ONCE(gc->shrinker_hits, h + 1); +- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); ++ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); + no_items = false; + } + } +diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h +index 86564295fce6d..3d58938a6f75b 100644 +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -61,7 +61,7 @@ struct xfs_error_cfg { + */ + struct xfs_inodegc { + struct llist_head list; +- struct work_struct work; ++ struct delayed_work work; + + /* approximate count of inodes in the list */ + unsigned int items; +diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c +index df1d6be61bfa3..8fe6ca9208de6 100644 +--- 
a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1061,7 +1061,7 @@ xfs_inodegc_init_percpu( + gc = per_cpu_ptr(mp->m_inodegc, cpu); + init_llist_head(&gc->list); + gc->items = 0; +- INIT_WORK(&gc->work, xfs_inodegc_worker); ++ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); + } + return 0; + } +-- +2.40.1 + diff --git a/queue-5.15/xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch b/queue-5.15/xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch new file mode 100644 index 00000000000..2d809345342 --- /dev/null +++ b/queue-5.15/xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch @@ -0,0 +1,69 @@ +From afd86b3c5e1ab8fc89d632a031839219d31d54b7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Sep 2023 18:01:54 -0700 +Subject: xfs: check that per-cpu inodegc workers actually run on that cpu + +From: Darrick J. Wong + +[ Upstream commit b37c4c8339cd394ea6b8b415026603320a185651 ] + +Now that we've allegedly worked out the problem of the per-cpu inodegc +workers being scheduled on the wrong cpu, let's put in a debugging knob +to let us know if a worker ever gets mis-scheduled again. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_icache.c | 2 ++ + fs/xfs/xfs_mount.h | 3 +++ + fs/xfs/xfs_super.c | 3 +++ + 3 files changed, 8 insertions(+) + +diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c +index ab8181f8d08a9..02022164772de 100644 +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1856,6 +1856,8 @@ xfs_inodegc_worker( + struct llist_node *node = llist_del_all(&gc->list); + struct xfs_inode *ip, *n; + ++ ASSERT(gc->cpu == smp_processor_id()); ++ + WRITE_ONCE(gc->items, 0); + + if (!node) +diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h +index 3d58938a6f75b..29f35169bf9cf 100644 +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -66,6 +66,9 @@ struct xfs_inodegc { + /* approximate count of inodes in the list */ + unsigned int items; + unsigned int shrinker_hits; ++#if defined(DEBUG) || defined(XFS_WARN) ++ unsigned int cpu; ++#endif + }; + + /* +diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c +index 9b3af7611eaa9..569960e4ea3a6 100644 +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1062,6 +1062,9 @@ xfs_inodegc_init_percpu( + + for_each_possible_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); ++#if defined(DEBUG) || defined(XFS_WARN) ++ gc->cpu = cpu; ++#endif + init_llist_head(&gc->list); + gc->items = 0; + INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); +-- +2.40.1 + diff --git a/queue-5.15/xfs-disable-reaping-in-fscounters-scrub.patch b/queue-5.15/xfs-disable-reaping-in-fscounters-scrub.patch new file mode 100644 index 00000000000..fa2db1d5e1c --- /dev/null +++ b/queue-5.15/xfs-disable-reaping-in-fscounters-scrub.patch @@ -0,0 +1,141 @@ +From fb88730e10e7b3525c580bf34d31d95815a3c35c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Sep 2023 18:01:55 -0700 +Subject: xfs: disable reaping in fscounters scrub + +From: Darrick J. Wong + +[ Upstream commit 2d5f38a31980d7090f5bf91021488dc61a0ba8ee ] + +The fscounters scrub code doesn't work properly because it cannot +quiesce updates to the percpu counters in the filesystem, hence it +returns false corruption reports. This has been fixed properly in +one of the online repair patchsets that are under review by replacing +the xchk_disable_reaping calls with an exclusive filesystem freeze. 
+Disabling background gc isn't sufficient to fix the problem. + +In other words, scrub doesn't need to call xfs_inodegc_stop, which is +just as well since it wasn't correct to allow scrub to call +xfs_inodegc_start when something else could be calling xfs_inodegc_stop +(e.g. trying to freeze the filesystem). + +Neuter the scrubber for now, and remove the xchk_*_reaping functions. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Sasha Levin +--- + fs/xfs/scrub/common.c | 25 ------------------------- + fs/xfs/scrub/common.h | 2 -- + fs/xfs/scrub/fscounters.c | 13 ++++++------- + fs/xfs/scrub/scrub.c | 2 -- + fs/xfs/scrub/scrub.h | 1 - + 5 files changed, 6 insertions(+), 37 deletions(-) + +diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c +index bf1f3607d0b60..08df23edea720 100644 +--- a/fs/xfs/scrub/common.c ++++ b/fs/xfs/scrub/common.c +@@ -864,28 +864,3 @@ xchk_ilock_inverted( + return -EDEADLOCK; + } + +-/* Pause background reaping of resources. */ +-void +-xchk_stop_reaping( +- struct xfs_scrub *sc) +-{ +- sc->flags |= XCHK_REAPING_DISABLED; +- xfs_blockgc_stop(sc->mp); +- xfs_inodegc_stop(sc->mp); +-} +- +-/* Restart background reaping of resources. */ +-void +-xchk_start_reaping( +- struct xfs_scrub *sc) +-{ +- /* +- * Readonly filesystems do not perform inactivation or speculative +- * preallocation, so there's no need to restart the workers. +- */ +- if (!xfs_is_readonly(sc->mp)) { +- xfs_inodegc_start(sc->mp); +- xfs_blockgc_start(sc->mp); +- } +- sc->flags &= ~XCHK_REAPING_DISABLED; +-} +diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h +index 454145db10e71..2ca80102e704a 100644 +--- a/fs/xfs/scrub/common.h ++++ b/fs/xfs/scrub/common.h +@@ -148,7 +148,5 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) + + int xchk_metadata_inode_forks(struct xfs_scrub *sc); + int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode); +-void xchk_stop_reaping(struct xfs_scrub *sc); +-void xchk_start_reaping(struct xfs_scrub *sc); + + #endif /* __XFS_SCRUB_COMMON_H__ */ +diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c +index 48a6cbdf95d08..037541339d80a 100644 +--- a/fs/xfs/scrub/fscounters.c ++++ b/fs/xfs/scrub/fscounters.c +@@ -128,13 +128,6 @@ xchk_setup_fscounters( + if (error) + return error; + +- /* +- * Pause background reclaim while we're scrubbing to reduce the +- * likelihood of background perturbations to the counters throwing off +- * our calculations. +- */ +- xchk_stop_reaping(sc); +- + return xchk_trans_alloc(sc, 0); + } + +@@ -353,6 +346,12 @@ xchk_fscounters( + if (fdblocks > mp->m_sb.sb_dblocks) + xchk_set_corrupt(sc); + ++ /* ++ * XXX: We can't quiesce percpu counter updates, so exit early. ++ * This can be re-enabled when we gain exclusive freeze functionality. ++ */ ++ return 0; ++ + /* + * If ifree exceeds icount by more than the minimum variance then + * something's probably wrong with the counters. 
+diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c +index 51e4c61916d20..e4d2a41983f73 100644 +--- a/fs/xfs/scrub/scrub.c ++++ b/fs/xfs/scrub/scrub.c +@@ -171,8 +171,6 @@ xchk_teardown( + } + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mnt_drop_write_file(sc->file); +- if (sc->flags & XCHK_REAPING_DISABLED) +- xchk_start_reaping(sc); + if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK; +diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h +index 80e5026bba44a..e8d9fe9de26ed 100644 +--- a/fs/xfs/scrub/scrub.h ++++ b/fs/xfs/scrub/scrub.h +@@ -89,7 +89,6 @@ struct xfs_scrub { + /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ + #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ + #define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */ +-#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ + #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ + + /* Metadata scrubbers */ +-- +2.40.1 + diff --git a/queue-5.15/xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch b/queue-5.15/xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch new file mode 100644 index 00000000000..d1414c95d1a --- /dev/null +++ b/queue-5.15/xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch @@ -0,0 +1,75 @@ +From b210ea2535b4205d2e826b1d6d4b82d4d8917c79 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Sep 2023 18:01:53 -0700 +Subject: xfs: explicitly specify cpu when forcing inodegc delayed work to run + immediately + +From: Darrick J. Wong + +[ Upstream commit 03e0add80f4cf3f7393edb574eeb3a89a1db7758 ] + +I've been noticing odd racing behavior in the inodegc code that could +only be explained by one cpu adding an inode to its inactivation llist +at the same time that another cpu is processing that cpu's llist. +Preemption is disabled between get/put_cpu_ptr, so the only explanation +is scheduler mayhem. I inserted the following debug code into +xfs_inodegc_worker (see the next patch): + + ASSERT(gc->cpu == smp_processor_id()); + +This assertion tripped during overnight tests on the arm64 machines, but +curiously not on x86_64. I think we haven't observed any resource leaks +here because the lockfree list code can handle simultaneous llist_add +and llist_del_all functions operating on the same list. However, the +whole point of having percpu inodegc lists is to take advantage of warm +memory caches by inactivating inodes on the last processor to touch the +inode. + +The incorrect scheduling seems to occur after an inodegc worker is +subjected to mod_delayed_work(). This wraps mod_delayed_work_on with +WORK_CPU_UNBOUND specified as the cpu number. Unbound allows for +scheduling on any cpu, not necessarily the same one that scheduled the +work. + +Because preemption is disabled for as long as we have the gc pointer, I +think it's safe to use current_cpu() (aka smp_processor_id) to queue the +delayed work item on the correct cpu. + +Fixes: 7cf2b0f9611b ("xfs: bound maximum wait time for inodegc work") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. 
Wong
+Signed-off-by: Sasha Levin
+---
+ fs/xfs/xfs_icache.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index e9ebfe6f80150..ab8181f8d08a9 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -2057,7 +2057,8 @@ xfs_inodegc_queue(
+ queue_delay = 0;
+
+ trace_xfs_inodegc_queue(mp, __return_address);
+- mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
++ mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++ queue_delay);
+ put_cpu_ptr(gc);
+
+ if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+@@ -2101,7 +2102,8 @@ xfs_inodegc_cpu_dead(
+
+ if (xfs_is_inodegc_enabled(mp)) {
+ trace_xfs_inodegc_queue(mp, __return_address);
+- mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
++ mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++ 0);
+ }
+ put_cpu_ptr(gc);
+ }
+-- 
+2.40.1
+
diff --git a/queue-5.15/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch b/queue-5.15/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch
new file mode 100644
index 00000000000..903f7de4ad9
--- /dev/null
+++ b/queue-5.15/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch
@@ -0,0 +1,188 @@
+From 40c124cf66ae4ddfa82dbe3c34a357a0cae38c66 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 21 Sep 2023 18:01:56 -0700
+Subject: xfs: fix xfs_inodegc_stop racing with mod_delayed_work
+
+From: Darrick J. Wong
+
+[ Upstream commit 2254a7396a0ca6309854948ee1c0a33fa4268cec ]
+
+syzbot reported this warning from the faux inodegc shrinker that tries
+to kick off inodegc work:
+
+------------[ cut here ]------------
+WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+Call Trace:
+ __queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
+ mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
+ xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
+ xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
+ do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
+ shrink_slab+0x175/0x660 mm/vmscan.c:1013
+ shrink_one+0x502/0x810 mm/vmscan.c:5343
+ shrink_many mm/vmscan.c:5394 [inline]
+ lru_gen_shrink_node mm/vmscan.c:5511 [inline]
+ shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
+ kswapd_shrink_node mm/vmscan.c:7262 [inline]
+ balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
+ kswapd+0x677/0xd60 mm/vmscan.c:7712
+ kthread+0x2e8/0x3a0 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
+
+This warning corresponds to this code in __queue_work:
+
+ /*
+ * For a draining wq, only works from the same workqueue are
+ * allowed. The __WQ_DESTROYING helps to spot the issue that
+ * queues a new work item to a wq after destroy_workqueue(wq).
+ */
+ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+ WARN_ON_ONCE(!is_chained_work(wq))))
+ return;
+
+For this to trip, we must have a thread draining the inodegc workqueue
+and a second thread trying to queue inodegc work to that workqueue. 
+This can happen if freezing or a ro remount races with reclaim poking
+our faux inodegc shrinker and another thread dropping an unlinked
+O_RDONLY file:
+
+Thread 0                    Thread 1                    Thread 2
+
+xfs_inodegc_stop
+
+                            xfs_inodegc_shrinker_scan
+                            xfs_is_inodegc_enabled
+                            <yes, will continue>
+
+xfs_clear_inodegc_enabled
+xfs_inodegc_queue_all
+<list empty, do not queue inodegc worker>
+
+                                                        xfs_inodegc_queue
+                                                        <add to list>
+                                                        xfs_is_inodegc_enabled
+                                                        <no, do not queue worker>
+
+drain_workqueue
+<set WQ_DRAINING>
+
+                            llist_empty
+                            <no, queue inodegc worker>
+                            mod_delayed_work_on(..., 0)
+                            __queue_work
+                            <sees WQ_DRAINING, trips the WARN_ON>
+
+In other words, everything between the access to inodegc_enabled state
+and the decision to poke the inodegc workqueue requires some kind of
+coordination to avoid the WQ_DRAINING state.  We could perhaps introduce
+a lock here, but we could also try to eliminate WQ_DRAINING from the
+picture.
+
+We could replace the drain_workqueue call with a loop that flushes the
+workqueue and queues workers as long as there is at least one inode
+present in the per-cpu inodegc llists.  We've disabled inodegc at this
+point, so we know that the number of queued inodes will eventually hit
+zero as long as xfs_inodegc_start cannot reactivate the workers.
+
+There are four callers of xfs_inodegc_start. Three of them come from the
+VFS with s_umount held: filesystem thawing, failed filesystem freezing,
+and the rw remount transition. The fourth caller is mounting rw (no
+remount or freezing possible).
+
+There are three callers of xfs_inodegc_stop. One is unmounting (no
+remount or thaw possible). Two of them come from the VFS with s_umount
+held: fs freezing and ro remount transition.
+
+Hence, it is correct to replace the drain_workqueue call with a loop
+that drains the inodegc llists.
+
+Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
+Signed-off-by: Darrick J. Wong
+Reviewed-by: Dave Chinner
+Signed-off-by: Dave Chinner
+Signed-off-by: Leah Rumancik
+Acked-by: Darrick J. Wong
+Signed-off-by: Sasha Levin
+---
+ fs/xfs/xfs_icache.c | 32 +++++++++++++++++++++++++++-----
+ 1 file changed, 27 insertions(+), 5 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 02022164772de..eab98d76dbe1f 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -448,18 +448,23 @@ xfs_iget_check_free_state(
+ }
+
+ /* Make all pending inactivation work start immediately. */
+-static void
++static bool
+ xfs_inodegc_queue_all(
+ struct xfs_mount *mp)
+ {
+ struct xfs_inodegc *gc;
+ int cpu;
++ bool ret = false;
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+- if (!llist_empty(&gc->list))
++ if (!llist_empty(&gc->list)) {
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
++ ret = true;
++ }
+ }
++
++ return ret;
+ }
+
+ /*
+@@ -1902,24 +1907,41 @@ xfs_inodegc_flush(
+
+ /*
+ * Flush all the pending work and then disable the inode inactivation background
+- * workers and wait for them to stop.
++ * workers and wait for them to stop. Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+ */
+ void
+ xfs_inodegc_stop(
+ struct xfs_mount *mp)
+ {
++ bool rerun;
++
+ if (!xfs_clear_inodegc_enabled(mp))
+ return;
+
++ /*
++ * Drain all pending inodegc work, including inodes that could be
++ * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
++ * threads that sample the inodegc state just prior to us clearing it.
++ * The inodegc flag state prevents new threads from queuing more
++ * inodes, so we queue pending work items and flush the workqueue until
++ * all inodegc lists are empty. IOWs, we cannot use drain_workqueue
++ * here because it does not allow other unserialized mechanisms to
++ * reschedule inodegc work while this draining is in progress.
++ */
+ xfs_inodegc_queue_all(mp);
+- drain_workqueue(mp->m_inodegc_wq);
++ do {
++ flush_workqueue(mp->m_inodegc_wq);
++ rerun = xfs_inodegc_queue_all(mp);
++ } while (rerun);
+
+ trace_xfs_inodegc_stop(mp, __return_address);
+ }
+
+ /*
+ * Enable the inode inactivation background workers and schedule deferred inode
+- * inactivation work if there is any.
++ * inactivation work if there is any. Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+ */
+ void
+ xfs_inodegc_start(
+-- 
+2.40.1
+
diff --git a/queue-5.15/xfs-introduce-xfs_inodegc_push.patch b/queue-5.15/xfs-introduce-xfs_inodegc_push.patch
new file mode 100644
index 00000000000..db5c76f7505
--- /dev/null
+++ b/queue-5.15/xfs-introduce-xfs_inodegc_push.patch
@@ -0,0 +1,157 @@
+From f764f1d54ed4d383d265427d89575057e36dc870 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 21 Sep 2023 18:01:52 -0700
+Subject: xfs: introduce xfs_inodegc_push()
+
+From: Dave Chinner
+
+[ Upstream commit 5e672cd69f0a534a445df4372141fd0d1d00901d ]
+
+The current blocking mechanism for pushing the inodegc queue out to
+disk can result in systems becoming unusable when there is a
+long-running inodegc operation. This is because the statfs()
+implementation currently issues a blocking flush of the inodegc
+queue and a significant number of common system utilities will call
+statfs() to discover something about the underlying filesystem.
+
+This can result in userspace operations getting stuck on inodegc
+progress, and when trying to remove a heavily reflinked file on slow
+storage with a full journal, this can result in delays measuring in
+hours.
+
+Avoid this problem by adding a "push" function that expedites the
+flushing of the inodegc queue, but doesn't wait for it to complete.
+
+Convert xfs_fs_statfs() and xfs_qm_scall_getquota() to use this
+mechanism so they don't block but still ensure that queued
+operations are expedited.
+
+Fixes: ab23a7768739 ("xfs: per-cpu deferred inode inactivation queues")
+Reported-by: Chris Dunlop
+Signed-off-by: Dave Chinner
+[djwong: fix _getquota_next to use _inodegc_push too]
+Reviewed-by: Darrick J. Wong
+Signed-off-by: Darrick J. Wong
+Signed-off-by: Leah Rumancik
+Acked-by: Darrick J. Wong
+Signed-off-by: Sasha Levin
+---
+ fs/xfs/xfs_icache.c | 20 +++++++++++++++-----
+ fs/xfs/xfs_icache.h | 1 +
+ fs/xfs/xfs_qm_syscalls.c | 9 ++++++---
+ fs/xfs/xfs_super.c | 7 +++++--
+ fs/xfs/xfs_trace.h | 1 +
+ 5 files changed, 28 insertions(+), 10 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 2c3ef553f5ef0..e9ebfe6f80150 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1872,19 +1872,29 @@ xfs_inodegc_worker(
+ }
+
+ /*
+- * Force all currently queued inode inactivation work to run immediately and
+- * wait for the work to finish.
++ * Expedite all pending inodegc work to run immediately. This does not wait for
++ * completion of the work.
+ */
+ void
+-xfs_inodegc_flush(
++xfs_inodegc_push(
+ struct xfs_mount *mp)
+ {
+ if (!xfs_is_inodegc_enabled(mp))
+ return;
++ trace_xfs_inodegc_push(mp, __return_address);
++ xfs_inodegc_queue_all(mp);
++}
+
++/*
++ * Force all currently queued inode inactivation work to run immediately and
++ * wait for the work to finish. 
++ */ ++void ++xfs_inodegc_flush( ++ struct xfs_mount *mp) ++{ ++ xfs_inodegc_push(mp); + trace_xfs_inodegc_flush(mp, __return_address); +- +- xfs_inodegc_queue_all(mp); + flush_workqueue(mp->m_inodegc_wq); + } + +diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h +index 2e4cfddf8b8ed..6cd180721659b 100644 +--- a/fs/xfs/xfs_icache.h ++++ b/fs/xfs/xfs_icache.h +@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp); + void xfs_blockgc_start(struct xfs_mount *mp); + + void xfs_inodegc_worker(struct work_struct *work); ++void xfs_inodegc_push(struct xfs_mount *mp); + void xfs_inodegc_flush(struct xfs_mount *mp); + void xfs_inodegc_stop(struct xfs_mount *mp); + void xfs_inodegc_start(struct xfs_mount *mp); +diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c +index 47fe60e1a8873..322a111dfbc0f 100644 +--- a/fs/xfs/xfs_qm_syscalls.c ++++ b/fs/xfs/xfs_qm_syscalls.c +@@ -481,9 +481,12 @@ xfs_qm_scall_getquota( + struct xfs_dquot *dqp; + int error; + +- /* Flush inodegc work at the start of a quota reporting scan. */ ++ /* ++ * Expedite pending inodegc work at the start of a quota reporting ++ * scan but don't block waiting for it to complete. ++ */ + if (id == 0) +- xfs_inodegc_flush(mp); ++ xfs_inodegc_push(mp); + + /* + * Try to get the dquot. We don't want it allocated on disk, so don't +@@ -525,7 +528,7 @@ xfs_qm_scall_getquota_next( + + /* Flush inodegc work at the start of a quota reporting scan. */ + if (*id == 0) +- xfs_inodegc_flush(mp); ++ xfs_inodegc_push(mp); + + error = xfs_qm_dqget_next(mp, *id, type, &dqp); + if (error) +diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c +index 8fe6ca9208de6..9b3af7611eaa9 100644 +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -795,8 +795,11 @@ xfs_fs_statfs( + xfs_extlen_t lsize; + int64_t ffree; + +- /* Wait for whatever inactivations are in progress. */ +- xfs_inodegc_flush(mp); ++ /* ++ * Expedite background inodegc but don't wait. We do not want to block ++ * here waiting hours for a billion extent file to be truncated. ++ */ ++ xfs_inodegc_push(mp); + + statp->f_type = XFS_SUPER_MAGIC; + statp->f_namelen = MAXNAMELEN - 1; +diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h +index 1033a95fbf8e6..ebd17ddba0243 100644 +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \ + TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ + TP_ARGS(mp, caller_ip)) + DEFINE_FS_EVENT(xfs_inodegc_flush); ++DEFINE_FS_EVENT(xfs_inodegc_push); + DEFINE_FS_EVENT(xfs_inodegc_start); + DEFINE_FS_EVENT(xfs_inodegc_stop); + DEFINE_FS_EVENT(xfs_inodegc_queue); +-- +2.40.1 +
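
---

The six backports above all revolve around one workqueue pattern: inodegc items
are queued as delayed work with a small default delay so they can never sit
unprocessed indefinitely, the delay drops to zero once the queue is deep enough,
a non-blocking "push" expedites pending work for callers such as statfs() that
must not stall, and teardown drains with a flush-until-empty loop instead of
drain_workqueue(). The self-contained module below sketches that pattern in
miniature. It is an illustration only, not code from the series: the demo_*
names, the DEMO_MAX_BATCH threshold, the single global list, and the use of
system_wq are assumptions made for the example; the real XFS code keeps per-cpu
lists (with preemption disabled while queueing) and a dedicated workqueue.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical sketch of bounded-delay deferred work, cf. the series above. */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/llist.h>
#include <linux/slab.h>

struct demo_item {
	struct llist_node node;
};

static LLIST_HEAD(demo_list);
static struct delayed_work demo_work;
static unsigned int demo_items;		/* approximate queue depth */

#define DEMO_MAX_BATCH	32		/* assumed expedite threshold */

static void demo_worker(struct work_struct *work)
{
	struct llist_node *node = llist_del_all(&demo_list);
	struct demo_item *item, *n;

	WRITE_ONCE(demo_items, 0);
	llist_for_each_entry_safe(item, n, node, node)
		kfree(item);		/* stand-in for real inactivation work */
}

/* Queue an item: run within one tick, or immediately once the list is deep. */
static void demo_queue(struct demo_item *item)
{
	unsigned long delay = 1;
	unsigned int items;

	llist_add(&item->node, &demo_list);
	items = READ_ONCE(demo_items) + 1;
	WRITE_ONCE(demo_items, items);
	if (items >= DEMO_MAX_BATCH)
		delay = 0;
	/* mod_delayed_work() re-arms a pending timer to the new delay, so a
	 * delay of 0 expedites work that was queued with a longer delay. */
	mod_delayed_work(system_wq, &demo_work, delay);
}

/* "Push": expedite pending work without waiting for it, cf. statfs(). */
static void demo_push(void)
{
	if (!llist_empty(&demo_list))
		mod_delayed_work(system_wq, &demo_work, 0);
}

/* Flush: push, then wait for the worker to finish. */
static void demo_flush(void)
{
	demo_push();
	flush_delayed_work(&demo_work);
}

static int __init demo_init(void)
{
	struct demo_item *item = kmalloc(sizeof(*item), GFP_KERNEL);

	INIT_DELAYED_WORK(&demo_work, demo_worker);
	if (!item)
		return -ENOMEM;
	demo_queue(item);
	demo_flush();		/* statfs-style callers would use demo_push() */
	return 0;
}

static void __exit demo_exit(void)
{
	demo_flush();
	cancel_delayed_work_sync(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

As in the series, the blocking flush is just the non-blocking push followed by
a wait, so hot paths that only need the queue expedited never stall behind a
long-running worker.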