git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 5.15
author    Sasha Levin <sashal@kernel.org>
Mon, 25 Sep 2023 19:11:40 +0000 (15:11 -0400)
committer Sasha Levin <sashal@kernel.org>
Mon, 25 Sep 2023 19:11:40 +0000 (15:11 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-5.15/series
queue-5.15/xfs-bound-maximum-wait-time-for-inodegc-work.patch [new file with mode: 0644]
queue-5.15/xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch [new file with mode: 0644]
queue-5.15/xfs-disable-reaping-in-fscounters-scrub.patch [new file with mode: 0644]
queue-5.15/xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch [new file with mode: 0644]
queue-5.15/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch [new file with mode: 0644]
queue-5.15/xfs-introduce-xfs_inodegc_push.patch [new file with mode: 0644]

diff --git a/queue-5.15/series b/queue-5.15/series
index 9e9cd5b6a9b66079dc3d8bf45740623b46443b8b..0671f8a25ddf38a55093cd2f8b3f7db30d6bf373 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -69,3 +69,9 @@ gpio-tb10x-fix-an-error-handling-path-in-tb10x_gpio_.patch
 i2c-mux-demux-pinctrl-check-the-return-value-of-devm.patch
 i2c-mux-gpio-replace-custom-acpi_get_local_address.patch
 i2c-mux-gpio-add-missing-fwnode_handle_put.patch
+xfs-bound-maximum-wait-time-for-inodegc-work.patch
+xfs-introduce-xfs_inodegc_push.patch
+xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch
+xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch
+xfs-disable-reaping-in-fscounters-scrub.patch
+xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch
diff --git a/queue-5.15/xfs-bound-maximum-wait-time-for-inodegc-work.patch b/queue-5.15/xfs-bound-maximum-wait-time-for-inodegc-work.patch
new file mode 100644
index 0000000..d20e483
--- /dev/null
+++ b/queue-5.15/xfs-bound-maximum-wait-time-for-inodegc-work.patch
@@ -0,0 +1,161 @@
+From 5749af0ac33eef67973f0631a0ebe3f21d7a9abd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:51 -0700
+Subject: xfs: bound maximum wait time for inodegc work
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 7cf2b0f9611b9971d663e1fc3206eeda3b902922 ]
+
+Currently inodegc work can sit queued on the per-cpu queue until
+the workqueue is either flushed or the queue reaches a depth that
+triggers work queuing (and later throttling). This means that
+queued work can wait a long time for some other event to trigger
+flushing.
+
+Hence, instead of just queueing work at a specific depth, use a
+delayed work item that queues the work at a bounded time. We can
+still schedule the work immediately at a given depth, but we no
+longer need to worry about leaving items on the list that won't get
+processed until external events prevail.
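+
+Condensed, the new queueing policy looks like this (a sketch; the
+depth check is the existing xfs_inodegc_want_queue_work() helper):
+
+	unsigned long	queue_delay = 1;	/* bounded: run within a tick */
+
+	if (xfs_inodegc_want_queue_work(ip, items))
+		queue_delay = 0;		/* deep queue: run immediately */
+	mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);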
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_icache.c | 36 ++++++++++++++++++++++--------------
+ fs/xfs/xfs_mount.h  |  2 +-
+ fs/xfs/xfs_super.c  |  2 +-
+ 3 files changed, 24 insertions(+), 16 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 5e44d7bbd8fca..2c3ef553f5ef0 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -458,7 +458,7 @@ xfs_inodegc_queue_all(
+       for_each_online_cpu(cpu) {
+               gc = per_cpu_ptr(mp->m_inodegc, cpu);
+               if (!llist_empty(&gc->list))
+-                      queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
++                      mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
+       }
+ }
+@@ -1851,8 +1851,8 @@ void
+ xfs_inodegc_worker(
+       struct work_struct      *work)
+ {
+-      struct xfs_inodegc      *gc = container_of(work, struct xfs_inodegc,
+-                                                      work);
++      struct xfs_inodegc      *gc = container_of(to_delayed_work(work),
++                                              struct xfs_inodegc, work);
+       struct llist_node       *node = llist_del_all(&gc->list);
+       struct xfs_inode        *ip, *n;
+@@ -2021,6 +2021,7 @@ xfs_inodegc_queue(
+       struct xfs_inodegc      *gc;
+       int                     items;
+       unsigned int            shrinker_hits;
++      unsigned long           queue_delay = 1;
+       trace_xfs_inode_set_need_inactive(ip);
+       spin_lock(&ip->i_flags_lock);
+@@ -2032,19 +2033,26 @@ xfs_inodegc_queue(
+       items = READ_ONCE(gc->items);
+       WRITE_ONCE(gc->items, items + 1);
+       shrinker_hits = READ_ONCE(gc->shrinker_hits);
+-      put_cpu_ptr(gc);
+-      if (!xfs_is_inodegc_enabled(mp))
++      /*
++       * We queue the work while holding the current CPU so that the work
++       * is scheduled to run on this CPU.
++       */
++      if (!xfs_is_inodegc_enabled(mp)) {
++              put_cpu_ptr(gc);
+               return;
+-
+-      if (xfs_inodegc_want_queue_work(ip, items)) {
+-              trace_xfs_inodegc_queue(mp, __return_address);
+-              queue_work(mp->m_inodegc_wq, &gc->work);
+       }
++      if (xfs_inodegc_want_queue_work(ip, items))
++              queue_delay = 0;
++
++      trace_xfs_inodegc_queue(mp, __return_address);
++      mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
++      put_cpu_ptr(gc);
++
+       if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+               trace_xfs_inodegc_throttle(mp, __return_address);
+-              flush_work(&gc->work);
++              flush_delayed_work(&gc->work);
+       }
+ }
+@@ -2061,7 +2069,7 @@ xfs_inodegc_cpu_dead(
+       unsigned int            count = 0;
+       dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
+-      cancel_work_sync(&dead_gc->work);
++      cancel_delayed_work_sync(&dead_gc->work);
+       if (llist_empty(&dead_gc->list))
+               return;
+@@ -2080,12 +2088,12 @@ xfs_inodegc_cpu_dead(
+       llist_add_batch(first, last, &gc->list);
+       count += READ_ONCE(gc->items);
+       WRITE_ONCE(gc->items, count);
+-      put_cpu_ptr(gc);
+       if (xfs_is_inodegc_enabled(mp)) {
+               trace_xfs_inodegc_queue(mp, __return_address);
+-              queue_work(mp->m_inodegc_wq, &gc->work);
++              mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
+       }
++      put_cpu_ptr(gc);
+ }
+ /*
+@@ -2180,7 +2188,7 @@ xfs_inodegc_shrinker_scan(
+                       unsigned int    h = READ_ONCE(gc->shrinker_hits);
+                       WRITE_ONCE(gc->shrinker_hits, h + 1);
+-                      queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
++                      mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
+                       no_items = false;
+               }
+       }
+diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
+index 86564295fce6d..3d58938a6f75b 100644
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -61,7 +61,7 @@ struct xfs_error_cfg {
+  */
+ struct xfs_inodegc {
+       struct llist_head       list;
+-      struct work_struct      work;
++      struct delayed_work     work;
+       /* approximate count of inodes in the list */
+       unsigned int            items;
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index df1d6be61bfa3..8fe6ca9208de6 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1061,7 +1061,7 @@ xfs_inodegc_init_percpu(
+               gc = per_cpu_ptr(mp->m_inodegc, cpu);
+               init_llist_head(&gc->list);
+               gc->items = 0;
+-              INIT_WORK(&gc->work, xfs_inodegc_worker);
++              INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
+       }
+       return 0;
+ }
+-- 
+2.40.1
+
diff --git a/queue-5.15/xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch b/queue-5.15/xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch
new file mode 100644
index 0000000..2d80934
--- /dev/null
+++ b/queue-5.15/xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch
@@ -0,0 +1,69 @@
+From afd86b3c5e1ab8fc89d632a031839219d31d54b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:54 -0700
+Subject: xfs: check that per-cpu inodegc workers actually run on that cpu
+
+From: Darrick J. Wong <djwong@kernel.org>
+
+[ Upstream commit b37c4c8339cd394ea6b8b415026603320a185651 ]
+
+Now that we've allegedly worked out the problem of the per-cpu inodegc
+workers being scheduled on the wrong cpu, let's put in a debugging knob
+to let us know if a worker ever gets mis-scheduled again.
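+
+Condensed, the knob is a cpu id recorded at setup time and checked in
+the worker (a two-line summary of the hunks below):
+
+	gc->cpu = cpu;				/* xfs_inodegc_init_percpu */
+
+	ASSERT(gc->cpu == smp_processor_id());	/* xfs_inodegc_worker */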
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_icache.c | 2 ++
+ fs/xfs/xfs_mount.h  | 3 +++
+ fs/xfs/xfs_super.c  | 3 +++
+ 3 files changed, 8 insertions(+)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index ab8181f8d08a9..02022164772de 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1856,6 +1856,8 @@ xfs_inodegc_worker(
+       struct llist_node       *node = llist_del_all(&gc->list);
+       struct xfs_inode        *ip, *n;
++      ASSERT(gc->cpu == smp_processor_id());
++
+       WRITE_ONCE(gc->items, 0);
+       if (!node)
+diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
+index 3d58938a6f75b..29f35169bf9cf 100644
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -66,6 +66,9 @@ struct xfs_inodegc {
+       /* approximate count of inodes in the list */
+       unsigned int            items;
+       unsigned int            shrinker_hits;
++#if defined(DEBUG) || defined(XFS_WARN)
++      unsigned int            cpu;
++#endif
+ };
+ /*
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 9b3af7611eaa9..569960e4ea3a6 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1062,6 +1062,9 @@ xfs_inodegc_init_percpu(
+       for_each_possible_cpu(cpu) {
+               gc = per_cpu_ptr(mp->m_inodegc, cpu);
++#if defined(DEBUG) || defined(XFS_WARN)
++              gc->cpu = cpu;
++#endif
+               init_llist_head(&gc->list);
+               gc->items = 0;
+               INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
+-- 
+2.40.1
+
diff --git a/queue-5.15/xfs-disable-reaping-in-fscounters-scrub.patch b/queue-5.15/xfs-disable-reaping-in-fscounters-scrub.patch
new file mode 100644
index 0000000..fa2db1d
--- /dev/null
+++ b/queue-5.15/xfs-disable-reaping-in-fscounters-scrub.patch
@@ -0,0 +1,141 @@
+From fb88730e10e7b3525c580bf34d31d95815a3c35c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:55 -0700
+Subject: xfs: disable reaping in fscounters scrub
+
+From: Darrick J. Wong <djwong@kernel.org>
+
+[ Upstream commit 2d5f38a31980d7090f5bf91021488dc61a0ba8ee ]
+
+The fscounters scrub code doesn't work properly because it cannot
+quiesce updates to the percpu counters in the filesystem, hence it
+returns false corruption reports.  This has been fixed properly in
+one of the online repair patchsets that are under review by replacing
+the xchk_disable_reaping calls with an exclusive filesystem freeze.
+Disabling background gc isn't sufficient to fix the problem.
+
+In other words, scrub doesn't need to call xfs_inodegc_stop, which is
+just as well since it wasn't correct to allow scrub to call
+xfs_inodegc_start when something else could be calling xfs_inodegc_stop
+(e.g. trying to freeze the filesystem).
+
+Neuter the scrubber for now, and remove the xchk_*_reaping functions.
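+
+The race in miniature (a sketch, not code from this patch;
+expected_icount stands in for the scrubber's recomputed value):
+
+	icount = percpu_counter_sum(&mp->m_icount);
+	/* other cpus can still update m_icount here, so a mismatch
+	 * below may be transient skew, not on-disk corruption */
+	if (icount != expected_icount)
+		xchk_set_corrupt(sc);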
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/scrub/common.c     | 25 -------------------------
+ fs/xfs/scrub/common.h     |  2 --
+ fs/xfs/scrub/fscounters.c | 13 ++++++-------
+ fs/xfs/scrub/scrub.c      |  2 --
+ fs/xfs/scrub/scrub.h      |  1 -
+ 5 files changed, 6 insertions(+), 37 deletions(-)
+
+diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
+index bf1f3607d0b60..08df23edea720 100644
+--- a/fs/xfs/scrub/common.c
++++ b/fs/xfs/scrub/common.c
+@@ -864,28 +864,3 @@ xchk_ilock_inverted(
+       return -EDEADLOCK;
+ }
+-/* Pause background reaping of resources. */
+-void
+-xchk_stop_reaping(
+-      struct xfs_scrub        *sc)
+-{
+-      sc->flags |= XCHK_REAPING_DISABLED;
+-      xfs_blockgc_stop(sc->mp);
+-      xfs_inodegc_stop(sc->mp);
+-}
+-
+-/* Restart background reaping of resources. */
+-void
+-xchk_start_reaping(
+-      struct xfs_scrub        *sc)
+-{
+-      /*
+-       * Readonly filesystems do not perform inactivation or speculative
+-       * preallocation, so there's no need to restart the workers.
+-       */
+-      if (!xfs_is_readonly(sc->mp)) {
+-              xfs_inodegc_start(sc->mp);
+-              xfs_blockgc_start(sc->mp);
+-      }
+-      sc->flags &= ~XCHK_REAPING_DISABLED;
+-}
+diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
+index 454145db10e71..2ca80102e704a 100644
+--- a/fs/xfs/scrub/common.h
++++ b/fs/xfs/scrub/common.h
+@@ -148,7 +148,5 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
+ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
+ int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
+-void xchk_stop_reaping(struct xfs_scrub *sc);
+-void xchk_start_reaping(struct xfs_scrub *sc);
+ #endif        /* __XFS_SCRUB_COMMON_H__ */
+diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
+index 48a6cbdf95d08..037541339d80a 100644
+--- a/fs/xfs/scrub/fscounters.c
++++ b/fs/xfs/scrub/fscounters.c
+@@ -128,13 +128,6 @@ xchk_setup_fscounters(
+       if (error)
+               return error;
+-      /*
+-       * Pause background reclaim while we're scrubbing to reduce the
+-       * likelihood of background perturbations to the counters throwing off
+-       * our calculations.
+-       */
+-      xchk_stop_reaping(sc);
+-
+       return xchk_trans_alloc(sc, 0);
+ }
+@@ -353,6 +346,12 @@ xchk_fscounters(
+       if (fdblocks > mp->m_sb.sb_dblocks)
+               xchk_set_corrupt(sc);
++      /*
++       * XXX: We can't quiesce percpu counter updates, so exit early.
++       * This can be re-enabled when we gain exclusive freeze functionality.
++       */
++      return 0;
++
+       /*
+        * If ifree exceeds icount by more than the minimum variance then
+        * something's probably wrong with the counters.
+diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
+index 51e4c61916d20..e4d2a41983f73 100644
+--- a/fs/xfs/scrub/scrub.c
++++ b/fs/xfs/scrub/scrub.c
+@@ -171,8 +171,6 @@ xchk_teardown(
+       }
+       if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+               mnt_drop_write_file(sc->file);
+-      if (sc->flags & XCHK_REAPING_DISABLED)
+-              xchk_start_reaping(sc);
+       if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) {
+               mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+               sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK;
+diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
+index 80e5026bba44a..e8d9fe9de26ed 100644
+--- a/fs/xfs/scrub/scrub.h
++++ b/fs/xfs/scrub/scrub.h
+@@ -89,7 +89,6 @@ struct xfs_scrub {
+ /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
+ #define XCHK_TRY_HARDER               (1 << 0)  /* can't get resources, try again */
+ #define XCHK_HAS_QUOTAOFFLOCK (1 << 1)  /* we hold the quotaoff lock */
+-#define XCHK_REAPING_DISABLED (1 << 2)  /* background block reaping paused */
+ #define XREP_ALREADY_FIXED    (1 << 31) /* checking our repair work */
+ /* Metadata scrubbers */
+-- 
+2.40.1
+
diff --git a/queue-5.15/xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch b/queue-5.15/xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch
new file mode 100644
index 0000000..d1414c9
--- /dev/null
+++ b/queue-5.15/xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch
@@ -0,0 +1,75 @@
+From b210ea2535b4205d2e826b1d6d4b82d4d8917c79 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:53 -0700
+Subject: xfs: explicitly specify cpu when forcing inodegc delayed work to run
+ immediately
+
+From: Darrick J. Wong <djwong@kernel.org>
+
+[ Upstream commit 03e0add80f4cf3f7393edb574eeb3a89a1db7758 ]
+
+I've been noticing odd racing behavior in the inodegc code that could
+only be explained by one cpu adding an inode to its inactivation llist
+at the same time that another cpu is processing that cpu's llist.
+Preemption is disabled between get/put_cpu_ptr, so the only explanation
+is scheduler mayhem.  I inserted the following debug code into
+xfs_inodegc_worker (see the next patch):
+
+       ASSERT(gc->cpu == smp_processor_id());
+
+This assertion tripped during overnight tests on the arm64 machines, but
+curiously not on x86_64.  I think we haven't observed any resource leaks
+here because the lockfree list code can handle simultaneous llist_add
+and llist_del_all functions operating on the same list.  However, the
+whole point of having percpu inodegc lists is to take advantage of warm
+memory caches by inactivating inodes on the last processor to touch the
+inode.
+
+The incorrect scheduling seems to occur after an inodegc worker is
+subjected to mod_delayed_work().  This wraps mod_delayed_work_on with
+WORK_CPU_UNBOUND specified as the cpu number.  Unbound allows for
+scheduling on any cpu, not necessarily the same one that scheduled the
+work.
+
+Because preemption is disabled for as long as we have the gc pointer, I
+think it's safe to use current_cpu() (aka smp_processor_id) to queue the
+delayed work item on the correct cpu.
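+
+In sketch form, the corrected sequence is (with smp_processor_id()
+standing in for current_cpu()):
+
+	gc = get_cpu_ptr(mp->m_inodegc);	/* disables preemption */
+	llist_add(&ip->i_gclist, &gc->list);
+	/* pin the worker to this cpu instead of WORK_CPU_UNBOUND */
+	mod_delayed_work_on(smp_processor_id(), mp->m_inodegc_wq,
+			&gc->work, queue_delay);
+	put_cpu_ptr(gc);			/* re-enables preemption */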
+
+Fixes: 7cf2b0f9611b ("xfs: bound maximum wait time for inodegc work")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_icache.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index e9ebfe6f80150..ab8181f8d08a9 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -2057,7 +2057,8 @@ xfs_inodegc_queue(
+               queue_delay = 0;
+       trace_xfs_inodegc_queue(mp, __return_address);
+-      mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
++      mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++                      queue_delay);
+       put_cpu_ptr(gc);
+       if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+@@ -2101,7 +2102,8 @@ xfs_inodegc_cpu_dead(
+       if (xfs_is_inodegc_enabled(mp)) {
+               trace_xfs_inodegc_queue(mp, __return_address);
+-              mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
++              mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++                              0);
+       }
+       put_cpu_ptr(gc);
+ }
+-- 
+2.40.1
+
diff --git a/queue-5.15/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch b/queue-5.15/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch
new file mode 100644
index 0000000..903f7de
--- /dev/null
+++ b/queue-5.15/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch
@@ -0,0 +1,188 @@
+From 40c124cf66ae4ddfa82dbe3c34a357a0cae38c66 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:56 -0700
+Subject: xfs: fix xfs_inodegc_stop racing with mod_delayed_work
+
+From: Darrick J. Wong <djwong@kernel.org>
+
+[ Upstream commit 2254a7396a0ca6309854948ee1c0a33fa4268cec ]
+
+syzbot reported this warning from the faux inodegc shrinker that tries
+to kick off inodegc work:
+
+------------[ cut here ]------------
+WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+Call Trace:
+ __queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
+ mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
+ xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
+ xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
+ do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
+ shrink_slab+0x175/0x660 mm/vmscan.c:1013
+ shrink_one+0x502/0x810 mm/vmscan.c:5343
+ shrink_many mm/vmscan.c:5394 [inline]
+ lru_gen_shrink_node mm/vmscan.c:5511 [inline]
+ shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
+ kswapd_shrink_node mm/vmscan.c:7262 [inline]
+ balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
+ kswapd+0x677/0xd60 mm/vmscan.c:7712
+ kthread+0x2e8/0x3a0 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
+
+This warning corresponds to this code in __queue_work:
+
+       /*
+        * For a draining wq, only works from the same workqueue are
+        * allowed. The __WQ_DESTROYING helps to spot the issue that
+        * queues a new work item to a wq after destroy_workqueue(wq).
+        */
+       if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+                    WARN_ON_ONCE(!is_chained_work(wq))))
+               return;
+
+For this to trip, we must have a thread draining the inodegc workqueue
+and a second thread trying to queue inodegc work to that workqueue.
+This can happen if freezing or a ro remount race with reclaim poking our
+faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
+file:
+
+Thread 0       Thread 1        Thread 2
+
+xfs_inodegc_stop
+
+                               xfs_inodegc_shrinker_scan
+                               xfs_is_inodegc_enabled
+                               <yes, will continue>
+
+xfs_clear_inodegc_enabled
+xfs_inodegc_queue_all
+<list empty, do not queue inodegc worker>
+
+               xfs_inodegc_queue
+               <add to list>
+               xfs_is_inodegc_enabled
+               <no, returns>
+
+drain_workqueue
+<set WQ_DRAINING>
+
+                               llist_empty
+                               <no, will queue list>
+                               mod_delayed_work_on(..., 0)
+                               __queue_work
+                               <sees WQ_DRAINING, kaboom>
+
+In other words, everything between the access to inodegc_enabled state
+and the decision to poke the inodegc workqueue requires some kind of
+coordination to avoid the WQ_DRAINING state.  We could perhaps introduce
+a lock here, but we could also try to eliminate WQ_DRAINING from the
+picture.
+
+We could replace the drain_workqueue call with a loop that flushes the
+workqueue and queues workers as long as there is at least one inode
+present in the per-cpu inodegc llists.  We've disabled inodegc at this
+point, so we know that the number of queued inodes will eventually hit
+zero as long as xfs_inodegc_start cannot reactivate the workers.
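+
+Conceptually the replacement is (condensed from the hunk below):
+
+	xfs_inodegc_queue_all(mp);	/* kick anything already queued */
+	do {
+		flush_workqueue(mp->m_inodegc_wq);
+	} while (xfs_inodegc_queue_all(mp));	/* re-kick racers until idle */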
+
+There are four callers of xfs_inodegc_start.  Three of them come from the
+VFS with s_umount held: filesystem thawing, failed filesystem freezing,
+and the rw remount transition.  The fourth caller is mounting rw (no
+remount or freezing possible).
+
+There are three callers of xfs_inodegc_stop.  One is unmounting (no
+remount or thaw possible).  Two of them come from the VFS with s_umount
+held: fs freezing and ro remount transition.
+
+Hence, it is correct to replace the drain_workqueue call with a loop
+that drains the inodegc llists.
+
+Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_icache.c | 32 +++++++++++++++++++++++++++-----
+ 1 file changed, 27 insertions(+), 5 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 02022164772de..eab98d76dbe1f 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -448,18 +448,23 @@ xfs_iget_check_free_state(
+ }
+ /* Make all pending inactivation work start immediately. */
+-static void
++static bool
+ xfs_inodegc_queue_all(
+       struct xfs_mount        *mp)
+ {
+       struct xfs_inodegc      *gc;
+       int                     cpu;
++      bool                    ret = false;
+       for_each_online_cpu(cpu) {
+               gc = per_cpu_ptr(mp->m_inodegc, cpu);
+-              if (!llist_empty(&gc->list))
++              if (!llist_empty(&gc->list)) {
+                       mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
++                      ret = true;
++              }
+       }
++
++      return ret;
+ }
+ /*
+@@ -1902,24 +1907,41 @@ xfs_inodegc_flush(
+ /*
+  * Flush all the pending work and then disable the inode inactivation background
+- * workers and wait for them to stop.
++ * workers and wait for them to stop.  Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+  */
+ void
+ xfs_inodegc_stop(
+       struct xfs_mount        *mp)
+ {
++      bool                    rerun;
++
+       if (!xfs_clear_inodegc_enabled(mp))
+               return;
++      /*
++       * Drain all pending inodegc work, including inodes that could be
++       * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
++       * threads that sample the inodegc state just prior to us clearing it.
++       * The inodegc flag state prevents new threads from queuing more
++       * inodes, so we queue pending work items and flush the workqueue until
++       * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
++       * here because it does not allow other unserialized mechanisms to
++       * reschedule inodegc work while this draining is in progress.
++       */
+       xfs_inodegc_queue_all(mp);
+-      drain_workqueue(mp->m_inodegc_wq);
++      do {
++              flush_workqueue(mp->m_inodegc_wq);
++              rerun = xfs_inodegc_queue_all(mp);
++      } while (rerun);
+       trace_xfs_inodegc_stop(mp, __return_address);
+ }
+ /*
+  * Enable the inode inactivation background workers and schedule deferred inode
+- * inactivation work if there is any.
++ * inactivation work if there is any.  Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+  */
+ void
+ xfs_inodegc_start(
+-- 
+2.40.1
+
diff --git a/queue-5.15/xfs-introduce-xfs_inodegc_push.patch b/queue-5.15/xfs-introduce-xfs_inodegc_push.patch
new file mode 100644
index 0000000..db5c76f
--- /dev/null
+++ b/queue-5.15/xfs-introduce-xfs_inodegc_push.patch
@@ -0,0 +1,157 @@
+From f764f1d54ed4d383d265427d89575057e36dc870 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:52 -0700
+Subject: xfs: introduce xfs_inodegc_push()
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 5e672cd69f0a534a445df4372141fd0d1d00901d ]
+
+The current blocking mechanism for pushing the inodegc queue out to
+disk can result in systems becoming unusable when there is a long
+running inodegc operation. This is because the statfs()
+implementation currently issues a blocking flush of the inodegc
+queue and a significant number of common system utilities will call
+statfs() to discover something about the underlying filesystem.
+
+This can result in userspace operations getting stuck on inodegc
+progress, and when trying to remove a heavily reflinked file on slow
+storage with a full journal, this can result in delays measuring in
+hours.
+
+Avoid this problem by adding a "push" function that expedites the
+flushing of the inodegc queue, but doesn't wait for it to complete.
+
+Convert xfs_fs_statfs() and xfs_qm_scall_getquota() to use this
+mechanism so they don't block but still ensure that queued
+operations are expedited.
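+
+The split is simple (condensed from the diff below; trace points
+omitted):
+
+	void xfs_inodegc_push(struct xfs_mount *mp)
+	{
+		if (!xfs_is_inodegc_enabled(mp))
+			return;
+		xfs_inodegc_queue_all(mp);	/* expedite, don't wait */
+	}
+
+	void xfs_inodegc_flush(struct xfs_mount *mp)
+	{
+		xfs_inodegc_push(mp);
+		flush_workqueue(mp->m_inodegc_wq);	/* now block until idle */
+	}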
+
+Fixes: ab23a7768739 ("xfs: per-cpu deferred inode inactivation queues")
+Reported-by: Chris Dunlop <chris@onthe.net.au>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+[djwong: fix _getquota_next to use _inodegc_push too]
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_icache.c      | 20 +++++++++++++++-----
+ fs/xfs/xfs_icache.h      |  1 +
+ fs/xfs/xfs_qm_syscalls.c |  9 ++++++---
+ fs/xfs/xfs_super.c       |  7 +++++--
+ fs/xfs/xfs_trace.h       |  1 +
+ 5 files changed, 28 insertions(+), 10 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 2c3ef553f5ef0..e9ebfe6f80150 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1872,19 +1872,29 @@ xfs_inodegc_worker(
+ }
+ /*
+- * Force all currently queued inode inactivation work to run immediately and
+- * wait for the work to finish.
++ * Expedite all pending inodegc work to run immediately. This does not wait for
++ * completion of the work.
+  */
+ void
+-xfs_inodegc_flush(
++xfs_inodegc_push(
+       struct xfs_mount        *mp)
+ {
+       if (!xfs_is_inodegc_enabled(mp))
+               return;
++      trace_xfs_inodegc_push(mp, __return_address);
++      xfs_inodegc_queue_all(mp);
++}
++/*
++ * Force all currently queued inode inactivation work to run immediately and
++ * wait for the work to finish.
++ */
++void
++xfs_inodegc_flush(
++      struct xfs_mount        *mp)
++{
++      xfs_inodegc_push(mp);
+       trace_xfs_inodegc_flush(mp, __return_address);
+-
+-      xfs_inodegc_queue_all(mp);
+       flush_workqueue(mp->m_inodegc_wq);
+ }
+diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
+index 2e4cfddf8b8ed..6cd180721659b 100644
+--- a/fs/xfs/xfs_icache.h
++++ b/fs/xfs/xfs_icache.h
+@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp);
+ void xfs_blockgc_start(struct xfs_mount *mp);
+ void xfs_inodegc_worker(struct work_struct *work);
++void xfs_inodegc_push(struct xfs_mount *mp);
+ void xfs_inodegc_flush(struct xfs_mount *mp);
+ void xfs_inodegc_stop(struct xfs_mount *mp);
+ void xfs_inodegc_start(struct xfs_mount *mp);
+diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
+index 47fe60e1a8873..322a111dfbc0f 100644
+--- a/fs/xfs/xfs_qm_syscalls.c
++++ b/fs/xfs/xfs_qm_syscalls.c
+@@ -481,9 +481,12 @@ xfs_qm_scall_getquota(
+       struct xfs_dquot        *dqp;
+       int                     error;
+-      /* Flush inodegc work at the start of a quota reporting scan. */
++      /*
++       * Expedite pending inodegc work at the start of a quota reporting
++       * scan but don't block waiting for it to complete.
++       */
+       if (id == 0)
+-              xfs_inodegc_flush(mp);
++              xfs_inodegc_push(mp);
+       /*
+        * Try to get the dquot. We don't want it allocated on disk, so don't
+@@ -525,7 +528,7 @@ xfs_qm_scall_getquota_next(
+       /* Flush inodegc work at the start of a quota reporting scan. */
+       if (*id == 0)
+-              xfs_inodegc_flush(mp);
++              xfs_inodegc_push(mp);
+       error = xfs_qm_dqget_next(mp, *id, type, &dqp);
+       if (error)
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 8fe6ca9208de6..9b3af7611eaa9 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -795,8 +795,11 @@ xfs_fs_statfs(
+       xfs_extlen_t            lsize;
+       int64_t                 ffree;
+-      /* Wait for whatever inactivations are in progress. */
+-      xfs_inodegc_flush(mp);
++      /*
++       * Expedite background inodegc but don't wait. We do not want to block
++       * here waiting hours for a billion extent file to be truncated.
++       */
++      xfs_inodegc_push(mp);
+       statp->f_type = XFS_SUPER_MAGIC;
+       statp->f_namelen = MAXNAMELEN - 1;
+diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
+index 1033a95fbf8e6..ebd17ddba0243 100644
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name,                                   \
+       TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
+       TP_ARGS(mp, caller_ip))
+ DEFINE_FS_EVENT(xfs_inodegc_flush);
++DEFINE_FS_EVENT(xfs_inodegc_push);
+ DEFINE_FS_EVENT(xfs_inodegc_start);
+ DEFINE_FS_EVENT(xfs_inodegc_stop);
+ DEFINE_FS_EVENT(xfs_inodegc_queue);
+-- 
+2.40.1
+