From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 16 Jul 2023 15:05:15 +0000 (+0200)
Subject: 6.1-stable patches
X-Git-Tag: v6.1.39~61
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=dc6184c1c58c04507edc2b0f04e056705e14375e;p=thirdparty%2Fkernel%2Fstable-queue.git

6.1-stable patches

added patches:
	mm-mmap-fix-extra-maple-tree-write.patch
	xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch
	xfs-disable-reaping-in-fscounters-scrub.patch
	xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch
	xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch
---

diff --git a/queue-6.1/mm-mmap-fix-extra-maple-tree-write.patch b/queue-6.1/mm-mmap-fix-extra-maple-tree-write.patch
new file mode 100644
index 00000000000..14dc636d2a8
--- /dev/null
+++ b/queue-6.1/mm-mmap-fix-extra-maple-tree-write.patch
@@ -0,0 +1,45 @@
+From Liam.Howlett@oracle.com  Sun Jul 16 17:02:51 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Thu,  6 Jul 2023 14:51:35 -0400
+Subject: mm/mmap: Fix extra maple tree write
+To: linux-kernel@vger.kernel.org
+Cc: Andrew Morton <akpm@linux-foundation.org>, "Liam R. Howlett" <Liam.Howlett@oracle.com>, John Hsu <John.Hsu@mediatek.com>, stable@vger.kernel.org, linux-mm@kvack.org
+Message-ID: <20230706185135.2235532-1-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+
+based on commit 0503ea8f5ba73eb3ab13a81c1eefbaf51405385a upstream.
+
+This was inadvertently fixed during the removal of __vma_adjust().
+
+When __vma_adjust() is adjusting next with a negative value (pushing
+vma->vm_end lower), there would be two writes to the maple tree.  The
+first write is unnecessary and uses all allocated nodes in the maple
+state.  The second write is necessary but will need to allocate nodes
+since the first write has used the allocated nodes.  This may be a
+problem as it may not be safe to allocate at this time, such as a low
+memory situation.  Fix the issue by avoiding the first write and only
+write the adjusted "next" VMA.
+
+Reported-by: John Hsu <John.Hsu@mediatek.com>
+Link: https://lore.kernel.org/lkml/9cb8c599b1d7f9c1c300d1a334d5eb70ec4d7357.camel@mediatek.com/
+Cc: stable@vger.kernel.org
+Cc: linux-mm@kvack.org
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mmap.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -767,7 +767,8 @@ int __vma_adjust(struct vm_area_struct *
+ 	}
+ 	if (end != vma->vm_end) {
+ 		if (vma->vm_end > end) {
+-			if (!insert || (insert->vm_start != end)) {
++			if ((vma->vm_end + adjust_next != end) &&
++			    (!insert || (insert->vm_start != end))) {
+ 				vma_mas_szero(&mas, end, vma->vm_end);
+ 				mas_reset(&mas);
+ 				VM_WARN_ON(insert &&
diff --git a/queue-6.1/series b/queue-6.1/series
index fc83f5eb22b..ae25d8f18ee 100644
--- a/queue-6.1/series
+++ b/queue-6.1/series
@@ -583,3 +583,8 @@ arm-orion5x-fix-d2net-gpio-initialization.patch
 leds-trigger-netdev-recheck-netdev_led_mode_linkup-on-dev-rename.patch
 blktrace-use-inline-function-for-blk_trace_remove-while-blktrace-is-disabled.patch
 fs-no-need-to-check-source.patch
+xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch
+xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch
+xfs-disable-reaping-in-fscounters-scrub.patch
+xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch
+mm-mmap-fix-extra-maple-tree-write.patch
diff --git a/queue-6.1/xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch b/queue-6.1/xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch
new file mode 100644
index 00000000000..714f5ba526c
--- /dev/null
+++ b/queue-6.1/xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch
@@ -0,0 +1,63 @@
+From stable-owner@vger.kernel.org Sat Jul 15 08:31:33 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 15 Jul 2023 09:31:12 +0300
+Subject: xfs: check that per-cpu inodegc workers actually run on that cpu
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, "Darrick J . Wong" <djwong@kernel.org>, linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230715063114.1485841-3-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit b37c4c8339cd394ea6b8b415026603320a185651 upstream.
+
+Now that we've allegedly worked out the problem of the per-cpu inodegc
+workers being scheduled on the wrong cpu, let's put in a debugging knob
+to let us know if a worker ever gets mis-scheduled again.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c |    2 ++
+ fs/xfs/xfs_mount.h  |    3 +++
+ fs/xfs/xfs_super.c  |    3 +++
+ 3 files changed, 8 insertions(+)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1848,6 +1848,8 @@ xfs_inodegc_worker(
+ 	struct llist_node	*node = llist_del_all(&gc->list);
+ 	struct xfs_inode	*ip, *n;
+ 
++	ASSERT(gc->cpu == smp_processor_id());
++
+ 	WRITE_ONCE(gc->items, 0);
+ 
+ 	if (!node)
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -66,6 +66,9 @@ struct xfs_inodegc {
+ 	/* approximate count of inodes in the list */
+ 	unsigned int		items;
+ 	unsigned int		shrinker_hits;
++#if defined(DEBUG) || defined(XFS_WARN)
++	unsigned int		cpu;
++#endif
+ };
+ 
+ /*
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1084,6 +1084,9 @@ xfs_inodegc_init_percpu(
+ 
+ 	for_each_possible_cpu(cpu) {
+ 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
++#if defined(DEBUG) || defined(XFS_WARN)
++		gc->cpu = cpu;
++#endif
+ 		init_llist_head(&gc->list);
+ 		gc->items = 0;
+ 		INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
diff --git a/queue-6.1/xfs-disable-reaping-in-fscounters-scrub.patch b/queue-6.1/xfs-disable-reaping-in-fscounters-scrub.patch
new file mode 100644
index 00000000000..90153d13b0c
--- /dev/null
+++ b/queue-6.1/xfs-disable-reaping-in-fscounters-scrub.patch
@@ -0,0 +1,132 @@
+From stable-owner@vger.kernel.org Sat Jul 15 08:31:33 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 15 Jul 2023 09:31:13 +0300
+Subject: xfs: disable reaping in fscounters scrub
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, "Darrick J . Wong" <djwong@kernel.org>, linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230715063114.1485841-4-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit 2d5f38a31980d7090f5bf91021488dc61a0ba8ee upstream.
+
+The fscounters scrub code doesn't work properly because it cannot
+quiesce updates to the percpu counters in the filesystem, hence it
+returns false corruption reports.  This has been fixed properly in
+one of the online repair patchsets that are under review by replacing
+the xchk_disable_reaping calls with an exclusive filesystem freeze.
+Disabling background gc isn't sufficient to fix the problem.
+
+In other words, scrub doesn't need to call xfs_inodegc_stop, which is
+just as well since it wasn't correct to allow scrub to call
+xfs_inodegc_start when something else could be calling xfs_inodegc_stop
+(e.g. trying to freeze the filesystem).
+
+Neuter the scrubber for now, and remove the xchk_*_reaping functions.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/scrub/common.c     |   26 --------------------------
+ fs/xfs/scrub/common.h     |    2 --
+ fs/xfs/scrub/fscounters.c |   13 ++++++-------
+ fs/xfs/scrub/scrub.c      |    2 --
+ fs/xfs/scrub/scrub.h      |    1 -
+ 5 files changed, 6 insertions(+), 38 deletions(-)
+
+--- a/fs/xfs/scrub/common.c
++++ b/fs/xfs/scrub/common.c
+@@ -865,29 +865,3 @@ xchk_ilock_inverted(
+ 	}
+ 	return -EDEADLOCK;
+ }
+-
+-/* Pause background reaping of resources. */
+-void
+-xchk_stop_reaping(
+-	struct xfs_scrub	*sc)
+-{
+-	sc->flags |= XCHK_REAPING_DISABLED;
+-	xfs_blockgc_stop(sc->mp);
+-	xfs_inodegc_stop(sc->mp);
+-}
+-
+-/* Restart background reaping of resources. */
+-void
+-xchk_start_reaping(
+-	struct xfs_scrub	*sc)
+-{
+-	/*
+-	 * Readonly filesystems do not perform inactivation or speculative
+-	 * preallocation, so there's no need to restart the workers.
+-	 */
+-	if (!xfs_is_readonly(sc->mp)) {
+-		xfs_inodegc_start(sc->mp);
+-		xfs_blockgc_start(sc->mp);
+-	}
+-	sc->flags &= ~XCHK_REAPING_DISABLED;
+-}
+--- a/fs/xfs/scrub/common.h
++++ b/fs/xfs/scrub/common.h
+@@ -148,7 +148,5 @@ static inline bool xchk_skip_xref(struct
+ 
+ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
+ int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
+-void xchk_stop_reaping(struct xfs_scrub *sc);
+-void xchk_start_reaping(struct xfs_scrub *sc);
+ 
+ #endif	/* __XFS_SCRUB_COMMON_H__ */
+--- a/fs/xfs/scrub/fscounters.c
++++ b/fs/xfs/scrub/fscounters.c
+@@ -128,13 +128,6 @@ xchk_setup_fscounters(
+ 	if (error)
+ 		return error;
+ 
+-	/*
+-	 * Pause background reclaim while we're scrubbing to reduce the
+-	 * likelihood of background perturbations to the counters throwing off
+-	 * our calculations.
+-	 */
+-	xchk_stop_reaping(sc);
+-
+ 	return xchk_trans_alloc(sc, 0);
+ }
+ 
+@@ -354,6 +347,12 @@ xchk_fscounters(
+ 		xchk_set_corrupt(sc);
+ 
+ 	/*
++	 * XXX: We can't quiesce percpu counter updates, so exit early.
++	 * This can be re-enabled when we gain exclusive freeze functionality.
++	 */
++	return 0;
++
++	/*
+ 	 * If ifree exceeds icount by more than the minimum variance then
+ 	 * something's probably wrong with the counters.
+ 	 */
+--- a/fs/xfs/scrub/scrub.c
++++ b/fs/xfs/scrub/scrub.c
+@@ -171,8 +171,6 @@ xchk_teardown(
+ 	}
+ 	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ 		mnt_drop_write_file(sc->file);
+-	if (sc->flags & XCHK_REAPING_DISABLED)
+-		xchk_start_reaping(sc);
+ 	if (sc->buf) {
+ 		kmem_free(sc->buf);
+ 		sc->buf = NULL;
+--- a/fs/xfs/scrub/scrub.h
++++ b/fs/xfs/scrub/scrub.h
+@@ -88,7 +88,6 @@ struct xfs_scrub {
+ 
+ /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
+ #define XCHK_TRY_HARDER		(1 << 0)  /* can't get resources, try again */
+-#define XCHK_REAPING_DISABLED	(1 << 2)  /* background block reaping paused */
+ #define XREP_ALREADY_FIXED	(1 << 31) /* checking our repair work */
+ 
+ /* Metadata scrubbers */
diff --git a/queue-6.1/xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch b/queue-6.1/xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch
new file mode 100644
index 00000000000..2bba34427b3
--- /dev/null
+++ b/queue-6.1/xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch
@@ -0,0 +1,72 @@
+From stable-owner@vger.kernel.org Sat Jul 15 08:31:32 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 15 Jul 2023 09:31:11 +0300
+Subject: xfs: explicitly specify cpu when forcing inodegc delayed work to run immediately
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, "Darrick J . Wong" <djwong@kernel.org>, linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230715063114.1485841-2-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit 03e0add80f4cf3f7393edb574eeb3a89a1db7758 upstream.
+
+I've been noticing odd racing behavior in the inodegc code that could
+only be explained by one cpu adding an inode to its inactivation llist
+at the same time that another cpu is processing that cpu's llist.
+Preemption is disabled between get/put_cpu_ptr, so the only explanation
+is scheduler mayhem.  I inserted the following debug code into
+xfs_inodegc_worker (see the next patch):
+
+	ASSERT(gc->cpu == smp_processor_id());
+
+This assertion tripped during overnight tests on the arm64 machines, but
+curiously not on x86_64.  I think we haven't observed any resource leaks
+here because the lockfree list code can handle simultaneous llist_add
+and llist_del_all functions operating on the same list.  However, the
+whole point of having percpu inodegc lists is to take advantage of warm
+memory caches by inactivating inodes on the last processor to touch the
+inode.
+
+The incorrect scheduling seems to occur after an inodegc worker is
+subjected to mod_delayed_work().  This wraps mod_delayed_work_on with
+WORK_CPU_UNBOUND specified as the cpu number.  Unbound allows for
+scheduling on any cpu, not necessarily the same one that scheduled the
+work.
+
+Because preemption is disabled for as long as we have the gc pointer, I
+think it's safe to use current_cpu() (aka smp_processor_id) to queue the
+delayed work item on the correct cpu.
+
+Fixes: 7cf2b0f9611b ("xfs: bound maximum wait time for inodegc work")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -2052,7 +2052,8 @@ xfs_inodegc_queue(
+ 		queue_delay = 0;
+ 
+ 	trace_xfs_inodegc_queue(mp, __return_address);
+-	mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
++	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++			queue_delay);
+ 	put_cpu_ptr(gc);
+ 
+ 	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+@@ -2096,7 +2097,8 @@ xfs_inodegc_cpu_dead(
+ 
+ 	if (xfs_is_inodegc_enabled(mp)) {
+ 		trace_xfs_inodegc_queue(mp, __return_address);
+-		mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
++		mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++				0);
+ 	}
+ 	put_cpu_ptr(gc);
+ }
diff --git a/queue-6.1/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch b/queue-6.1/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch
new file mode 100644
index 00000000000..3db57ab266f
--- /dev/null
+++ b/queue-6.1/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch
@@ -0,0 +1,186 @@
+From stable-owner@vger.kernel.org Sat Jul 15 08:31:33 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 15 Jul 2023 09:31:14 +0300
+Subject: xfs: fix xfs_inodegc_stop racing with mod_delayed_work
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, "Darrick J . Wong" <djwong@kernel.org>, linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230715063114.1485841-5-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit 2254a7396a0ca6309854948ee1c0a33fa4268cec upstream.
+
+syzbot reported this warning from the faux inodegc shrinker that tries
+to kick off inodegc work:
+
+------------[ cut here ]------------
+WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+Call Trace:
+ __queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
+ mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
+ xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
+ xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
+ do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
+ shrink_slab+0x175/0x660 mm/vmscan.c:1013
+ shrink_one+0x502/0x810 mm/vmscan.c:5343
+ shrink_many mm/vmscan.c:5394 [inline]
+ lru_gen_shrink_node mm/vmscan.c:5511 [inline]
+ shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
+ kswapd_shrink_node mm/vmscan.c:7262 [inline]
+ balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
+ kswapd+0x677/0xd60 mm/vmscan.c:7712
+ kthread+0x2e8/0x3a0 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
+
+This warning corresponds to this code in __queue_work:
+
+	/*
+	 * For a draining wq, only works from the same workqueue are
+	 * allowed. The __WQ_DESTROYING helps to spot the issue that
+	 * queues a new work item to a wq after destroy_workqueue(wq).
+	 */
+	if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+		     WARN_ON_ONCE(!is_chained_work(wq))))
+		return;
+
+For this to trip, we must have a thread draining the inodegc workqueue
+and a second thread trying to queue inodegc work to that workqueue.
+This can happen if freezing or a ro remount race with reclaim poking our
+faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
+file:
+
+Thread 0	Thread 1	Thread 2
+
+xfs_inodegc_stop
+
+				xfs_inodegc_shrinker_scan
+				xfs_is_inodegc_enabled
+				<yes, will continue>
+
+xfs_clear_inodegc_enabled
+xfs_inodegc_queue_all
+<list empty, do not queue inodegc worker>
+
+		xfs_inodegc_queue
+		<add to list>
+		xfs_is_inodegc_enabled
+		<no, returns>
+
+drain_workqueue
+<set WQ_DRAINING>
+
+				llist_empty
+				<no, will queue list>
+				mod_delayed_work_on(..., 0)
+				__queue_work
+				<sees WQ_DRAINING, kaboom>
+
+In other words, everything between the access to inodegc_enabled state
+and the decision to poke the inodegc workqueue requires some kind of
+coordination to avoid the WQ_DRAINING state.  We could perhaps introduce
+a lock here, but we could also try to eliminate WQ_DRAINING from the
+picture.
+
+We could replace the drain_workqueue call with a loop that flushes the
+workqueue and queues workers as long as there is at least one inode
+present in the per-cpu inodegc llists.  We've disabled inodegc at this
+point, so we know that the number of queued inodes will eventually hit
+zero as long as xfs_inodegc_start cannot reactivate the workers.
+
+There are four callers of xfs_inodegc_start.  Three of them come from the
+VFS with s_umount held: filesystem thawing, failed filesystem freezing,
+and the rw remount transition.  The fourth caller is mounting rw (no
+remount or freezing possible).
+
+There are three callers of xfs_inodegc_stop.  One is unmounting (no
+remount or thaw possible).  Two of them come from the VFS with s_umount
+held: fs freezing and ro remount transition.
+
+Hence, it is correct to replace the drain_workqueue call with a loop
+that drains the inodegc llists.
+
+Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c |   32 +++++++++++++++++++++++++++-----
+ 1 file changed, 27 insertions(+), 5 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -431,18 +431,23 @@ xfs_iget_check_free_state(
+ }
+ 
+ /* Make all pending inactivation work start immediately. */
+-static void
++static bool
+ xfs_inodegc_queue_all(
+ 	struct xfs_mount	*mp)
+ {
+ 	struct xfs_inodegc	*gc;
+ 	int			cpu;
++	bool			ret = false;
+ 
+ 	for_each_online_cpu(cpu) {
+ 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
+-		if (!llist_empty(&gc->list))
++		if (!llist_empty(&gc->list)) {
+ 			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
++			ret = true;
++		}
+ 	}
++
++	return ret;
+ }
+ 
+ /*
+@@ -1894,24 +1899,41 @@ xfs_inodegc_flush(
+ 
+ /*
+  * Flush all the pending work and then disable the inode inactivation background
+- * workers and wait for them to stop.
++ * workers and wait for them to stop.  Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+  */
+ void
+ xfs_inodegc_stop(
+ 	struct xfs_mount	*mp)
+ {
++	bool			rerun;
++
+ 	if (!xfs_clear_inodegc_enabled(mp))
+ 		return;
+ 
++	/*
++	 * Drain all pending inodegc work, including inodes that could be
++	 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
++	 * threads that sample the inodegc state just prior to us clearing it.
++	 * The inodegc flag state prevents new threads from queuing more
++	 * inodes, so we queue pending work items and flush the workqueue until
++	 * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
++	 * here because it does not allow other unserialized mechanisms to
++	 * reschedule inodegc work while this draining is in progress.
++	 */
+ 	xfs_inodegc_queue_all(mp);
+-	drain_workqueue(mp->m_inodegc_wq);
++	do {
++		flush_workqueue(mp->m_inodegc_wq);
++		rerun = xfs_inodegc_queue_all(mp);
++	} while (rerun);
+ 
+ 	trace_xfs_inodegc_stop(mp, __return_address);
+ }
+ 
+ /*
+  * Enable the inode inactivation background workers and schedule deferred inode
+- * inactivation work if there is any.
++ * inactivation work if there is any.  Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+  */
+ void
+ xfs_inodegc_start(