git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.1-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 16 Jul 2023 15:05:15 +0000 (17:05 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 16 Jul 2023 15:05:15 +0000 (17:05 +0200)
added patches:
mm-mmap-fix-extra-maple-tree-write.patch
xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch
xfs-disable-reaping-in-fscounters-scrub.patch
xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch
xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch

queue-6.1/mm-mmap-fix-extra-maple-tree-write.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch [new file with mode: 0644]
queue-6.1/xfs-disable-reaping-in-fscounters-scrub.patch [new file with mode: 0644]
queue-6.1/xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch [new file with mode: 0644]
queue-6.1/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch [new file with mode: 0644]

diff --git a/queue-6.1/mm-mmap-fix-extra-maple-tree-write.patch b/queue-6.1/mm-mmap-fix-extra-maple-tree-write.patch
new file mode 100644 (file)
index 0000000..14dc636
--- /dev/null
@@ -0,0 +1,45 @@
+From Liam.Howlett@oracle.com  Sun Jul 16 17:02:51 2023
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Thu,  6 Jul 2023 14:51:35 -0400
+Subject: mm/mmap: Fix extra maple tree write
+To: linux-kernel@vger.kernel.org
+Cc: Andrew Morton <akpm@linux-foundation.org>, "Liam R. Howlett" <Liam.Howlett@oracle.com>, John Hsu <John.Hsu@mediatek.com>, stable@vger.kernel.org, linux-mm@kvack.org
+Message-ID: <20230706185135.2235532-1-Liam.Howlett@oracle.com>
+
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+
+based on commit 0503ea8f5ba73eb3ab13a81c1eefbaf51405385a upstream.
+
+This was inadvertently fixed during the removal of __vma_adjust().
+
+When __vma_adjust() is adjusting next with a negative value (pushing
+vma->vm_end lower), there would be two writes to the maple tree.  The
+first write is unnecessary and uses all allocated nodes in the maple
+state.  The second write is necessary but will need to allocate nodes
+since the first write has used the allocated nodes.  This may be a
+problem as it may not be safe to allocate at this time, such as a low
+memory situation.  Fix the issue by avoiding the first write and only
+write the adjusted "next" VMA.
+
+Reported-by: John Hsu <John.Hsu@mediatek.com>
+Link: https://lore.kernel.org/lkml/9cb8c599b1d7f9c1c300d1a334d5eb70ec4d7357.camel@mediatek.com/
+Cc: stable@vger.kernel.org
+Cc: linux-mm@kvack.org
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mmap.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -767,7 +767,8 @@ int __vma_adjust(struct vm_area_struct *
+       }
+       if (end != vma->vm_end) {
+               if (vma->vm_end > end) {
+-                      if (!insert || (insert->vm_start != end)) {
++                      if ((vma->vm_end + adjust_next != end) &&
++                          (!insert || (insert->vm_start != end))) {
+                               vma_mas_szero(&mas, end, vma->vm_end);
+                               mas_reset(&mas);
+                               VM_WARN_ON(insert &&
index fc83f5eb22b5e547ee23f2d12c879401dc63f947..ae25d8f18ee8da46203e6ee436732848dacaabed 100644 (file)
@@ -583,3 +583,8 @@ arm-orion5x-fix-d2net-gpio-initialization.patch
 leds-trigger-netdev-recheck-netdev_led_mode_linkup-on-dev-rename.patch
 blktrace-use-inline-function-for-blk_trace_remove-while-blktrace-is-disabled.patch
 fs-no-need-to-check-source.patch
+xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch
+xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch
+xfs-disable-reaping-in-fscounters-scrub.patch
+xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch
+mm-mmap-fix-extra-maple-tree-write.patch
diff --git a/queue-6.1/xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch b/queue-6.1/xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch
new file mode 100644 (file)
index 0000000..714f5ba
--- /dev/null
@@ -0,0 +1,63 @@
+From stable-owner@vger.kernel.org Sat Jul 15 08:31:33 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 15 Jul 2023 09:31:12 +0300
+Subject: xfs: check that per-cpu inodegc workers actually run on that cpu
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, "Darrick J . Wong" <djwong@kernel.org>, linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230715063114.1485841-3-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit b37c4c8339cd394ea6b8b415026603320a185651 upstream.
+
+Now that we've allegedly worked out the problem of the per-cpu inodegc
+workers being scheduled on the wrong cpu, let's put in a debugging knob
+to let us know if a worker ever gets mis-scheduled again.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c |    2 ++
+ fs/xfs/xfs_mount.h  |    3 +++
+ fs/xfs/xfs_super.c  |    3 +++
+ 3 files changed, 8 insertions(+)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1848,6 +1848,8 @@ xfs_inodegc_worker(
+       struct llist_node       *node = llist_del_all(&gc->list);
+       struct xfs_inode        *ip, *n;
++      ASSERT(gc->cpu == smp_processor_id());
++
+       WRITE_ONCE(gc->items, 0);
+       if (!node)
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -66,6 +66,9 @@ struct xfs_inodegc {
+       /* approximate count of inodes in the list */
+       unsigned int            items;
+       unsigned int            shrinker_hits;
++#if defined(DEBUG) || defined(XFS_WARN)
++      unsigned int            cpu;
++#endif
+ };
+ /*
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1084,6 +1084,9 @@ xfs_inodegc_init_percpu(
+       for_each_possible_cpu(cpu) {
+               gc = per_cpu_ptr(mp->m_inodegc, cpu);
++#if defined(DEBUG) || defined(XFS_WARN)
++              gc->cpu = cpu;
++#endif
+               init_llist_head(&gc->list);
+               gc->items = 0;
+               INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
diff --git a/queue-6.1/xfs-disable-reaping-in-fscounters-scrub.patch b/queue-6.1/xfs-disable-reaping-in-fscounters-scrub.patch
new file mode 100644 (file)
index 0000000..90153d1
--- /dev/null
@@ -0,0 +1,132 @@
+From stable-owner@vger.kernel.org Sat Jul 15 08:31:33 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 15 Jul 2023 09:31:13 +0300
+Subject: xfs: disable reaping in fscounters scrub
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, "Darrick J . Wong" <djwong@kernel.org>, linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230715063114.1485841-4-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit 2d5f38a31980d7090f5bf91021488dc61a0ba8ee upstream.
+
+The fscounters scrub code doesn't work properly because it cannot
+quiesce updates to the percpu counters in the filesystem, hence it
+returns false corruption reports.  This has been fixed properly in
+one of the online repair patchsets that are under review by replacing
+the xchk_disable_reaping calls with an exclusive filesystem freeze.
+Disabling background gc isn't sufficient to fix the problem.
+
+In other words, scrub doesn't need to call xfs_inodegc_stop, which is
+just as well since it wasn't correct to allow scrub to call
+xfs_inodegc_start when something else could be calling xfs_inodegc_stop
+(e.g. trying to freeze the filesystem).
+
+Neuter the scrubber for now, and remove the xchk_*_reaping functions.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/scrub/common.c     |   26 --------------------------
+ fs/xfs/scrub/common.h     |    2 --
+ fs/xfs/scrub/fscounters.c |   13 ++++++-------
+ fs/xfs/scrub/scrub.c      |    2 --
+ fs/xfs/scrub/scrub.h      |    1 -
+ 5 files changed, 6 insertions(+), 38 deletions(-)
+
+--- a/fs/xfs/scrub/common.c
++++ b/fs/xfs/scrub/common.c
+@@ -865,29 +865,3 @@ xchk_ilock_inverted(
+       }
+       return -EDEADLOCK;
+ }
+-
+-/* Pause background reaping of resources. */
+-void
+-xchk_stop_reaping(
+-      struct xfs_scrub        *sc)
+-{
+-      sc->flags |= XCHK_REAPING_DISABLED;
+-      xfs_blockgc_stop(sc->mp);
+-      xfs_inodegc_stop(sc->mp);
+-}
+-
+-/* Restart background reaping of resources. */
+-void
+-xchk_start_reaping(
+-      struct xfs_scrub        *sc)
+-{
+-      /*
+-       * Readonly filesystems do not perform inactivation or speculative
+-       * preallocation, so there's no need to restart the workers.
+-       */
+-      if (!xfs_is_readonly(sc->mp)) {
+-              xfs_inodegc_start(sc->mp);
+-              xfs_blockgc_start(sc->mp);
+-      }
+-      sc->flags &= ~XCHK_REAPING_DISABLED;
+-}
+--- a/fs/xfs/scrub/common.h
++++ b/fs/xfs/scrub/common.h
+@@ -148,7 +148,5 @@ static inline bool xchk_skip_xref(struct
+ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
+ int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
+-void xchk_stop_reaping(struct xfs_scrub *sc);
+-void xchk_start_reaping(struct xfs_scrub *sc);
+ #endif        /* __XFS_SCRUB_COMMON_H__ */
+--- a/fs/xfs/scrub/fscounters.c
++++ b/fs/xfs/scrub/fscounters.c
+@@ -128,13 +128,6 @@ xchk_setup_fscounters(
+       if (error)
+               return error;
+-      /*
+-       * Pause background reclaim while we're scrubbing to reduce the
+-       * likelihood of background perturbations to the counters throwing off
+-       * our calculations.
+-       */
+-      xchk_stop_reaping(sc);
+-
+       return xchk_trans_alloc(sc, 0);
+ }
+@@ -354,6 +347,12 @@ xchk_fscounters(
+               xchk_set_corrupt(sc);
+       /*
++       * XXX: We can't quiesce percpu counter updates, so exit early.
++       * This can be re-enabled when we gain exclusive freeze functionality.
++       */
++      return 0;
++
++      /*
+        * If ifree exceeds icount by more than the minimum variance then
+        * something's probably wrong with the counters.
+        */
+--- a/fs/xfs/scrub/scrub.c
++++ b/fs/xfs/scrub/scrub.c
+@@ -171,8 +171,6 @@ xchk_teardown(
+       }
+       if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+               mnt_drop_write_file(sc->file);
+-      if (sc->flags & XCHK_REAPING_DISABLED)
+-              xchk_start_reaping(sc);
+       if (sc->buf) {
+               kmem_free(sc->buf);
+               sc->buf = NULL;
+--- a/fs/xfs/scrub/scrub.h
++++ b/fs/xfs/scrub/scrub.h
+@@ -88,7 +88,6 @@ struct xfs_scrub {
+ /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
+ #define XCHK_TRY_HARDER               (1 << 0)  /* can't get resources, try again */
+-#define XCHK_REAPING_DISABLED (1 << 2)  /* background block reaping paused */
+ #define XREP_ALREADY_FIXED    (1 << 31) /* checking our repair work */
+ /* Metadata scrubbers */
diff --git a/queue-6.1/xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch b/queue-6.1/xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch
new file mode 100644 (file)
index 0000000..2bba344
--- /dev/null
@@ -0,0 +1,72 @@
+From stable-owner@vger.kernel.org Sat Jul 15 08:31:32 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 15 Jul 2023 09:31:11 +0300
+Subject: xfs: explicitly specify cpu when forcing inodegc delayed work to run immediately
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, "Darrick J . Wong" <djwong@kernel.org>, linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230715063114.1485841-2-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit 03e0add80f4cf3f7393edb574eeb3a89a1db7758 upstream.
+
+I've been noticing odd racing behavior in the inodegc code that could
+only be explained by one cpu adding an inode to its inactivation llist
+at the same time that another cpu is processing that cpu's llist.
+Preemption is disabled between get/put_cpu_ptr, so the only explanation
+is scheduler mayhem.  I inserted the following debug code into
+xfs_inodegc_worker (see the next patch):
+
+       ASSERT(gc->cpu == smp_processor_id());
+
+This assertion tripped during overnight tests on the arm64 machines, but
+curiously not on x86_64.  I think we haven't observed any resource leaks
+here because the lockfree list code can handle simultaneous llist_add
+and llist_del_all functions operating on the same list.  However, the
+whole point of having percpu inodegc lists is to take advantage of warm
+memory caches by inactivating inodes on the last processor to touch the
+inode.
+
+The incorrect scheduling seems to occur after an inodegc worker is
+subjected to mod_delayed_work().  This wraps mod_delayed_work_on with
+WORK_CPU_UNBOUND specified as the cpu number.  Unbound allows for
+scheduling on any cpu, not necessarily the same one that scheduled the
+work.
+
+Because preemption is disabled for as long as we have the gc pointer, I
+think it's safe to use current_cpu() (aka smp_processor_id) to queue the
+delayed work item on the correct cpu.
+
+Fixes: 7cf2b0f9611b ("xfs: bound maximum wait time for inodegc work")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -2052,7 +2052,8 @@ xfs_inodegc_queue(
+               queue_delay = 0;
+       trace_xfs_inodegc_queue(mp, __return_address);
+-      mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
++      mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++                      queue_delay);
+       put_cpu_ptr(gc);
+       if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+@@ -2096,7 +2097,8 @@ xfs_inodegc_cpu_dead(
+       if (xfs_is_inodegc_enabled(mp)) {
+               trace_xfs_inodegc_queue(mp, __return_address);
+-              mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
++              mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++                              0);
+       }
+       put_cpu_ptr(gc);
+ }
diff --git a/queue-6.1/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch b/queue-6.1/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch
new file mode 100644 (file)
index 0000000..3db57ab
--- /dev/null
@@ -0,0 +1,186 @@
+From stable-owner@vger.kernel.org Sat Jul 15 08:31:33 2023
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Sat, 15 Jul 2023 09:31:14 +0300
+Subject: xfs: fix xfs_inodegc_stop racing with mod_delayed_work
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, "Darrick J . Wong" <djwong@kernel.org>, linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>
+Message-ID: <20230715063114.1485841-5-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit 2254a7396a0ca6309854948ee1c0a33fa4268cec upstream.
+
+syzbot reported this warning from the faux inodegc shrinker that tries
+to kick off inodegc work:
+
+------------[ cut here ]------------
+WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+Call Trace:
+ __queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
+ mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
+ xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
+ xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
+ do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
+ shrink_slab+0x175/0x660 mm/vmscan.c:1013
+ shrink_one+0x502/0x810 mm/vmscan.c:5343
+ shrink_many mm/vmscan.c:5394 [inline]
+ lru_gen_shrink_node mm/vmscan.c:5511 [inline]
+ shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
+ kswapd_shrink_node mm/vmscan.c:7262 [inline]
+ balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
+ kswapd+0x677/0xd60 mm/vmscan.c:7712
+ kthread+0x2e8/0x3a0 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
+
+This warning corresponds to this code in __queue_work:
+
+       /*
+        * For a draining wq, only works from the same workqueue are
+        * allowed. The __WQ_DESTROYING helps to spot the issue that
+        * queues a new work item to a wq after destroy_workqueue(wq).
+        */
+       if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+                    WARN_ON_ONCE(!is_chained_work(wq))))
+               return;
+
+For this to trip, we must have a thread draining the inodedgc workqueue
+and a second thread trying to queue inodegc work to that workqueue.
+This can happen if freezing or a ro remount race with reclaim poking our
+faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
+file:
+
+Thread 0       Thread 1        Thread 2
+
+xfs_inodegc_stop
+
+                               xfs_inodegc_shrinker_scan
+                               xfs_is_inodegc_enabled
+                               <yes, will continue>
+
+xfs_clear_inodegc_enabled
+xfs_inodegc_queue_all
+<list empty, do not queue inodegc worker>
+
+               xfs_inodegc_queue
+               <add to list>
+               xfs_is_inodegc_enabled
+               <no, returns>
+
+drain_workqueue
+<set WQ_DRAINING>
+
+                               llist_empty
+                               <no, will queue list>
+                               mod_delayed_work_on(..., 0)
+                               __queue_work
+                               <sees WQ_DRAINING, kaboom>
+
+In other words, everything between the access to inodegc_enabled state
+and the decision to poke the inodegc workqueue requires some kind of
+coordination to avoid the WQ_DRAINING state.  We could perhaps introduce
+a lock here, but we could also try to eliminate WQ_DRAINING from the
+picture.
+
+We could replace the drain_workqueue call with a loop that flushes the
+workqueue and queues workers as long as there is at least one inode
+present in the per-cpu inodegc llists.  We've disabled inodegc at this
+point, so we know that the number of queued inodes will eventually hit
+zero as long as xfs_inodegc_start cannot reactivate the workers.
+
+There are four callers of xfs_inodegc_start.  Three of them come from the
+VFS with s_umount held: filesystem thawing, failed filesystem freezing,
+and the rw remount transition.  The fourth caller is mounting rw (no
+remount or freezing possible).
+
+There are three callers of xfs_inodegc_stop.  One is unmounting (no
+remount or thaw possible).  Two of them come from the VFS with s_umount
+held: fs freezing and ro remount transition.
+
+Hence, it is correct to replace the drain_workqueue call with a loop
+that drains the inodegc llists.
+
+Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c |   32 +++++++++++++++++++++++++++-----
+ 1 file changed, 27 insertions(+), 5 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -431,18 +431,23 @@ xfs_iget_check_free_state(
+ }
+ /* Make all pending inactivation work start immediately. */
+-static void
++static bool
+ xfs_inodegc_queue_all(
+       struct xfs_mount        *mp)
+ {
+       struct xfs_inodegc      *gc;
+       int                     cpu;
++      bool                    ret = false;
+       for_each_online_cpu(cpu) {
+               gc = per_cpu_ptr(mp->m_inodegc, cpu);
+-              if (!llist_empty(&gc->list))
++              if (!llist_empty(&gc->list)) {
+                       mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
++                      ret = true;
++              }
+       }
++
++      return ret;
+ }
+ /*
+@@ -1894,24 +1899,41 @@ xfs_inodegc_flush(
+ /*
+  * Flush all the pending work and then disable the inode inactivation background
+- * workers and wait for them to stop.
++ * workers and wait for them to stop.  Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+  */
+ void
+ xfs_inodegc_stop(
+       struct xfs_mount        *mp)
+ {
++      bool                    rerun;
++
+       if (!xfs_clear_inodegc_enabled(mp))
+               return;
++      /*
++       * Drain all pending inodegc work, including inodes that could be
++       * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
++       * threads that sample the inodegc state just prior to us clearing it.
++       * The inodegc flag state prevents new threads from queuing more
++       * inodes, so we queue pending work items and flush the workqueue until
++       * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
++       * here because it does not allow other unserialized mechanisms to
++       * reschedule inodegc work while this draining is in progress.
++       */
+       xfs_inodegc_queue_all(mp);
+-      drain_workqueue(mp->m_inodegc_wq);
++      do {
++              flush_workqueue(mp->m_inodegc_wq);
++              rerun = xfs_inodegc_queue_all(mp);
++      } while (rerun);
+       trace_xfs_inodegc_stop(mp, __return_address);
+ }
+ /*
+  * Enable the inode inactivation background workers and schedule deferred inode
+- * inactivation work if there is any.
++ * inactivation work if there is any.  Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+  */
+ void
+ xfs_inodegc_start(