i2c-mux-demux-pinctrl-check-the-return-value-of-devm.patch
i2c-mux-gpio-replace-custom-acpi_get_local_address.patch
i2c-mux-gpio-add-missing-fwnode_handle_put.patch
+xfs-bound-maximum-wait-time-for-inodegc-work.patch
+xfs-introduce-xfs_inodegc_push.patch
+xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch
+xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch
+xfs-disable-reaping-in-fscounters-scrub.patch
+xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch
--- /dev/null
+From 5749af0ac33eef67973f0631a0ebe3f21d7a9abd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:51 -0700
+Subject: xfs: bound maximum wait time for inodegc work
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 7cf2b0f9611b9971d663e1fc3206eeda3b902922 ]
+
+Currently inodegc work can sit queued on the per-cpu queue until
+the workqueue is either flushed or the queue reaches a depth that
+triggers work queuing (and later throttling). This means that
+queued work could wait a long time for some other event to
+trigger flushing.
+
+Hence instead of just queueing work at a specific depth, use a
+delayed work that queues the work at a bound time. We can still
+schedule the work immediately at a given depth, but we no longer
+need to worry about leaving a number of items on the list that
+won't get processed until external events prevail.
+
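+As a minimal sketch of the pattern (demo_q and DEMO_DEPTH are made-up
+names; the real queueing logic appears in the diff below), a bound
+delay is used by default and dropped to zero once the queue is deep
+enough:
+
+    /* sketch only: not the XFS code */
+    struct demo_q {
+        struct llist_head   list;
+        struct delayed_work work;
+        unsigned int        items;
+    };
+
+    static void demo_q_add(struct demo_q *q, struct llist_node *n)
+    {
+        unsigned long delay = 1;    /* jiffies: bounded wait */
+
+        llist_add(n, &q->list);
+        WRITE_ONCE(q->items, READ_ONCE(q->items) + 1);
+        if (READ_ONCE(q->items) >= DEMO_DEPTH)
+            delay = 0;              /* deep queue: run immediately */
+        /* restarts the timer if the work is already pending */
+        mod_delayed_work(system_wq, &q->work, delay);
+    }
+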
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_icache.c | 36 ++++++++++++++++++++++--------------
+ fs/xfs/xfs_mount.h | 2 +-
+ fs/xfs/xfs_super.c | 2 +-
+ 3 files changed, 24 insertions(+), 16 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 5e44d7bbd8fca..2c3ef553f5ef0 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -458,7 +458,7 @@ xfs_inodegc_queue_all(
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (!llist_empty(&gc->list))
+- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
++ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
+ }
+ }
+
+@@ -1851,8 +1851,8 @@ void
+ xfs_inodegc_worker(
+ struct work_struct *work)
+ {
+- struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc,
+- work);
++ struct xfs_inodegc *gc = container_of(to_delayed_work(work),
++ struct xfs_inodegc, work);
+ struct llist_node *node = llist_del_all(&gc->list);
+ struct xfs_inode *ip, *n;
+
+@@ -2021,6 +2021,7 @@ xfs_inodegc_queue(
+ struct xfs_inodegc *gc;
+ int items;
+ unsigned int shrinker_hits;
++ unsigned long queue_delay = 1;
+
+ trace_xfs_inode_set_need_inactive(ip);
+ spin_lock(&ip->i_flags_lock);
+@@ -2032,19 +2033,26 @@ xfs_inodegc_queue(
+ items = READ_ONCE(gc->items);
+ WRITE_ONCE(gc->items, items + 1);
+ shrinker_hits = READ_ONCE(gc->shrinker_hits);
+- put_cpu_ptr(gc);
+
+- if (!xfs_is_inodegc_enabled(mp))
++ /*
++ * We queue the work while holding the current CPU so that the work
++ * is scheduled to run on this CPU.
++ */
++ if (!xfs_is_inodegc_enabled(mp)) {
++ put_cpu_ptr(gc);
+ return;
+-
+- if (xfs_inodegc_want_queue_work(ip, items)) {
+- trace_xfs_inodegc_queue(mp, __return_address);
+- queue_work(mp->m_inodegc_wq, &gc->work);
+ }
+
++ if (xfs_inodegc_want_queue_work(ip, items))
++ queue_delay = 0;
++
++ trace_xfs_inodegc_queue(mp, __return_address);
++ mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
++ put_cpu_ptr(gc);
++
+ if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+ trace_xfs_inodegc_throttle(mp, __return_address);
+- flush_work(&gc->work);
++ flush_delayed_work(&gc->work);
+ }
+ }
+
+@@ -2061,7 +2069,7 @@ xfs_inodegc_cpu_dead(
+ unsigned int count = 0;
+
+ dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
+- cancel_work_sync(&dead_gc->work);
++ cancel_delayed_work_sync(&dead_gc->work);
+
+ if (llist_empty(&dead_gc->list))
+ return;
+@@ -2080,12 +2088,12 @@ xfs_inodegc_cpu_dead(
+ llist_add_batch(first, last, &gc->list);
+ count += READ_ONCE(gc->items);
+ WRITE_ONCE(gc->items, count);
+- put_cpu_ptr(gc);
+
+ if (xfs_is_inodegc_enabled(mp)) {
+ trace_xfs_inodegc_queue(mp, __return_address);
+- queue_work(mp->m_inodegc_wq, &gc->work);
++ mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
+ }
++ put_cpu_ptr(gc);
+ }
+
+ /*
+@@ -2180,7 +2188,7 @@ xfs_inodegc_shrinker_scan(
+ unsigned int h = READ_ONCE(gc->shrinker_hits);
+
+ WRITE_ONCE(gc->shrinker_hits, h + 1);
+- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
++ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
+ no_items = false;
+ }
+ }
+diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
+index 86564295fce6d..3d58938a6f75b 100644
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -61,7 +61,7 @@ struct xfs_error_cfg {
+ */
+ struct xfs_inodegc {
+ struct llist_head list;
+- struct work_struct work;
++ struct delayed_work work;
+
+ /* approximate count of inodes in the list */
+ unsigned int items;
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index df1d6be61bfa3..8fe6ca9208de6 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1061,7 +1061,7 @@ xfs_inodegc_init_percpu(
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ init_llist_head(&gc->list);
+ gc->items = 0;
+- INIT_WORK(&gc->work, xfs_inodegc_worker);
++ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
+ }
+ return 0;
+ }
+--
+2.40.1
+
--- /dev/null
+From afd86b3c5e1ab8fc89d632a031839219d31d54b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:54 -0700
+Subject: xfs: check that per-cpu inodegc workers actually run on that cpu
+
+From: Darrick J. Wong <djwong@kernel.org>
+
+[ Upstream commit b37c4c8339cd394ea6b8b415026603320a185651 ]
+
+Now that we've allegedly worked out the problem of the per-cpu inodegc
+workers being scheduled on the wrong cpu, let's put in a debugging knob
+to let us know if a worker ever gets mis-scheduled again.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_icache.c | 2 ++
+ fs/xfs/xfs_mount.h | 3 +++
+ fs/xfs/xfs_super.c | 3 +++
+ 3 files changed, 8 insertions(+)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index ab8181f8d08a9..02022164772de 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1856,6 +1856,8 @@ xfs_inodegc_worker(
+ struct llist_node *node = llist_del_all(&gc->list);
+ struct xfs_inode *ip, *n;
+
++ ASSERT(gc->cpu == smp_processor_id());
++
+ WRITE_ONCE(gc->items, 0);
+
+ if (!node)
+diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
+index 3d58938a6f75b..29f35169bf9cf 100644
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -66,6 +66,9 @@ struct xfs_inodegc {
+ /* approximate count of inodes in the list */
+ unsigned int items;
+ unsigned int shrinker_hits;
++#if defined(DEBUG) || defined(XFS_WARN)
++ unsigned int cpu;
++#endif
+ };
+
+ /*
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 9b3af7611eaa9..569960e4ea3a6 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1062,6 +1062,9 @@ xfs_inodegc_init_percpu(
+
+ for_each_possible_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
++#if defined(DEBUG) || defined(XFS_WARN)
++ gc->cpu = cpu;
++#endif
+ init_llist_head(&gc->list);
+ gc->items = 0;
+ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
+--
+2.40.1
+
--- /dev/null
+From fb88730e10e7b3525c580bf34d31d95815a3c35c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:55 -0700
+Subject: xfs: disable reaping in fscounters scrub
+
+From: Darrick J. Wong <djwong@kernel.org>
+
+[ Upstream commit 2d5f38a31980d7090f5bf91021488dc61a0ba8ee ]
+
+The fscounters scrub code doesn't work properly because it cannot
+quiesce updates to the percpu counters in the filesystem, hence it
+returns false corruption reports. This has been fixed properly in
+one of the online repair patchsets that are under review by replacing
+the xchk_disable_reaping calls with an exclusive filesystem freeze.
+Disabling background gc isn't sufficient to fix the problem.
+
+In other words, scrub doesn't need to call xfs_inodegc_stop, which is
+just as well since it wasn't correct to allow scrub to call
+xfs_inodegc_start when something else could be calling xfs_inodegc_stop
+(e.g. trying to freeze the filesystem).
+
+Neuter the scrubber for now, and remove the xchk_*_reaping functions.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/scrub/common.c | 25 -------------------------
+ fs/xfs/scrub/common.h | 2 --
+ fs/xfs/scrub/fscounters.c | 13 ++++++-------
+ fs/xfs/scrub/scrub.c | 2 --
+ fs/xfs/scrub/scrub.h | 1 -
+ 5 files changed, 6 insertions(+), 37 deletions(-)
+
+diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
+index bf1f3607d0b60..08df23edea720 100644
+--- a/fs/xfs/scrub/common.c
++++ b/fs/xfs/scrub/common.c
+@@ -864,28 +864,3 @@ xchk_ilock_inverted(
+ return -EDEADLOCK;
+ }
+
+-/* Pause background reaping of resources. */
+-void
+-xchk_stop_reaping(
+- struct xfs_scrub *sc)
+-{
+- sc->flags |= XCHK_REAPING_DISABLED;
+- xfs_blockgc_stop(sc->mp);
+- xfs_inodegc_stop(sc->mp);
+-}
+-
+-/* Restart background reaping of resources. */
+-void
+-xchk_start_reaping(
+- struct xfs_scrub *sc)
+-{
+- /*
+- * Readonly filesystems do not perform inactivation or speculative
+- * preallocation, so there's no need to restart the workers.
+- */
+- if (!xfs_is_readonly(sc->mp)) {
+- xfs_inodegc_start(sc->mp);
+- xfs_blockgc_start(sc->mp);
+- }
+- sc->flags &= ~XCHK_REAPING_DISABLED;
+-}
+diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
+index 454145db10e71..2ca80102e704a 100644
+--- a/fs/xfs/scrub/common.h
++++ b/fs/xfs/scrub/common.h
+@@ -148,7 +148,5 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
+
+ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
+ int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
+-void xchk_stop_reaping(struct xfs_scrub *sc);
+-void xchk_start_reaping(struct xfs_scrub *sc);
+
+ #endif /* __XFS_SCRUB_COMMON_H__ */
+diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
+index 48a6cbdf95d08..037541339d80a 100644
+--- a/fs/xfs/scrub/fscounters.c
++++ b/fs/xfs/scrub/fscounters.c
+@@ -128,13 +128,6 @@ xchk_setup_fscounters(
+ if (error)
+ return error;
+
+- /*
+- * Pause background reclaim while we're scrubbing to reduce the
+- * likelihood of background perturbations to the counters throwing off
+- * our calculations.
+- */
+- xchk_stop_reaping(sc);
+-
+ return xchk_trans_alloc(sc, 0);
+ }
+
+@@ -353,6 +346,12 @@ xchk_fscounters(
+ if (fdblocks > mp->m_sb.sb_dblocks)
+ xchk_set_corrupt(sc);
+
++ /*
++ * XXX: We can't quiesce percpu counter updates, so exit early.
++ * This can be re-enabled when we gain exclusive freeze functionality.
++ */
++ return 0;
++
+ /*
+ * If ifree exceeds icount by more than the minimum variance then
+ * something's probably wrong with the counters.
+diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
+index 51e4c61916d20..e4d2a41983f73 100644
+--- a/fs/xfs/scrub/scrub.c
++++ b/fs/xfs/scrub/scrub.c
+@@ -171,8 +171,6 @@ xchk_teardown(
+ }
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ mnt_drop_write_file(sc->file);
+- if (sc->flags & XCHK_REAPING_DISABLED)
+- xchk_start_reaping(sc);
+ if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) {
+ mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+ sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK;
+diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
+index 80e5026bba44a..e8d9fe9de26ed 100644
+--- a/fs/xfs/scrub/scrub.h
++++ b/fs/xfs/scrub/scrub.h
+@@ -89,7 +89,6 @@ struct xfs_scrub {
+ /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
+ #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */
+ #define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */
+-#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */
+ #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */
+
+ /* Metadata scrubbers */
+--
+2.40.1
+
--- /dev/null
+From b210ea2535b4205d2e826b1d6d4b82d4d8917c79 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:53 -0700
+Subject: xfs: explicitly specify cpu when forcing inodegc delayed work to run
+ immediately
+
+From: Darrick J. Wong <djwong@kernel.org>
+
+[ Upstream commit 03e0add80f4cf3f7393edb574eeb3a89a1db7758 ]
+
+I've been noticing odd racing behavior in the inodegc code that could
+only be explained by one cpu adding an inode to its inactivation llist
+at the same time that another cpu is processing that cpu's llist.
+Preemption is disabled between get/put_cpu_ptr, so the only explanation
+is scheduler mayhem. I inserted the following debug code into
+xfs_inodegc_worker (see the next patch):
+
+ ASSERT(gc->cpu == smp_processor_id());
+
+This assertion tripped during overnight tests on the arm64 machines, but
+curiously not on x86_64. I think we haven't observed any resource leaks
+here because the lockfree list code can handle simultaneous llist_add
+and llist_del_all functions operating on the same list. However, the
+whole point of having percpu inodegc lists is to take advantage of warm
+memory caches by inactivating inodes on the last processor to touch the
+inode.
+
+The incorrect scheduling seems to occur after an inodegc worker is
+subjected to mod_delayed_work(). This wraps mod_delayed_work_on with
+WORK_CPU_UNBOUND specified as the cpu number. Unbound allows for
+scheduling on any cpu, not necessarily the same one that scheduled the
+work.
+
+Because preemption is disabled for as long as we have the gc pointer, I
+think it's safe to use current_cpu() (aka smp_processor_id) to queue the
+delayed work item on the correct cpu.
+
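+A minimal sketch of the resulting queueing step (error paths and the
+items/shrinker bookkeeping elided; current_cpu() is XFS's alias for the
+running CPU id, aka smp_processor_id()):
+
+    gc = get_cpu_ptr(mp->m_inodegc);    /* disables preemption */
+    llist_add(&ip->i_gclist, &gc->list);
+    /* the CPU id is stable here because preemption is off */
+    mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
+            queue_delay);
+    put_cpu_ptr(gc);
+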
+Fixes: 7cf2b0f9611b ("xfs: bound maximum wait time for inodegc work")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_icache.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index e9ebfe6f80150..ab8181f8d08a9 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -2057,7 +2057,8 @@ xfs_inodegc_queue(
+ queue_delay = 0;
+
+ trace_xfs_inodegc_queue(mp, __return_address);
+- mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
++ mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++ queue_delay);
+ put_cpu_ptr(gc);
+
+ if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+@@ -2101,7 +2102,8 @@ xfs_inodegc_cpu_dead(
+
+ if (xfs_is_inodegc_enabled(mp)) {
+ trace_xfs_inodegc_queue(mp, __return_address);
+- mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
++ mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++ 0);
+ }
+ put_cpu_ptr(gc);
+ }
+--
+2.40.1
+
--- /dev/null
+From 40c124cf66ae4ddfa82dbe3c34a357a0cae38c66 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:56 -0700
+Subject: xfs: fix xfs_inodegc_stop racing with mod_delayed_work
+
+From: Darrick J. Wong <djwong@kernel.org>
+
+[ Upstream commit 2254a7396a0ca6309854948ee1c0a33fa4268cec ]
+
+syzbot reported this warning from the faux inodegc shrinker that tries
+to kick off inodegc work:
+
+------------[ cut here ]------------
+WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+Call Trace:
+ __queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
+ mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
+ xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
+ xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
+ do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
+ shrink_slab+0x175/0x660 mm/vmscan.c:1013
+ shrink_one+0x502/0x810 mm/vmscan.c:5343
+ shrink_many mm/vmscan.c:5394 [inline]
+ lru_gen_shrink_node mm/vmscan.c:5511 [inline]
+ shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
+ kswapd_shrink_node mm/vmscan.c:7262 [inline]
+ balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
+ kswapd+0x677/0xd60 mm/vmscan.c:7712
+ kthread+0x2e8/0x3a0 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
+
+This warning corresponds to this code in __queue_work:
+
+ /*
+ * For a draining wq, only works from the same workqueue are
+ * allowed. The __WQ_DESTROYING helps to spot the issue that
+ * queues a new work item to a wq after destroy_workqueue(wq).
+ */
+ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+ WARN_ON_ONCE(!is_chained_work(wq))))
+ return;
+
+For this to trip, we must have a thread draining the inodegc workqueue
+and a second thread trying to queue inodegc work to that workqueue.
+This can happen if freezing or a ro remount races with reclaim poking our
+faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
+file:
+
+Thread 0 Thread 1 Thread 2
+
+xfs_inodegc_stop
+
+ xfs_inodegc_shrinker_scan
+ xfs_is_inodegc_enabled
+ <yes, will continue>
+
+xfs_clear_inodegc_enabled
+xfs_inodegc_queue_all
+<list empty, do not queue inodegc worker>
+
+ xfs_inodegc_queue
+ <add to list>
+ xfs_is_inodegc_enabled
+ <no, returns>
+
+drain_workqueue
+<set WQ_DRAINING>
+
+ llist_empty
+ <no, will queue list>
+ mod_delayed_work_on(..., 0)
+ __queue_work
+ <sees WQ_DRAINING, kaboom>
+
+In other words, everything between the access to inodegc_enabled state
+and the decision to poke the inodegc workqueue requires some kind of
+coordination to avoid the WQ_DRAINING state. We could perhaps introduce
+a lock here, but we could also try to eliminate WQ_DRAINING from the
+picture.
+
+We could replace the drain_workqueue call with a loop that flushes the
+workqueue and queues workers as long as there is at least one inode
+present in the per-cpu inodegc llists. We've disabled inodegc at this
+point, so we know that the number of queued inodes will eventually hit
+zero as long as xfs_inodegc_start cannot reactivate the workers.
+
+There are four callers of xfs_inodegc_start. Three of them come from the
+VFS with s_umount held: filesystem thawing, failed filesystem freezing,
+and the rw remount transition. The fourth caller is mounting rw (no
+remount or freezing possible).
+
+There are three callers of xfs_inodegc_stop. One is unmounting (no
+remount or thaw possible). Two of them come from the VFS with s_umount
+held: fs freezing and ro remount transition.
+
+Hence, it is correct to replace the drain_workqueue call with a loop
+that drains the inodegc llists.
+
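+In sketch form (matching the hunk below, with the tracepoint omitted),
+the drain becomes a flush-and-requeue loop that terminates once no
+per-cpu llist needed another pass:
+
+    xfs_inodegc_queue_all(mp);
+    do {
+        flush_workqueue(mp->m_inodegc_wq);  /* wait for queued workers */
+        rerun = xfs_inodegc_queue_all(mp);  /* catch racing additions */
+    } while (rerun);
+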
+Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_icache.c | 32 +++++++++++++++++++++++++++-----
+ 1 file changed, 27 insertions(+), 5 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 02022164772de..eab98d76dbe1f 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -448,18 +448,23 @@ xfs_iget_check_free_state(
+ }
+
+ /* Make all pending inactivation work start immediately. */
+-static void
++static bool
+ xfs_inodegc_queue_all(
+ struct xfs_mount *mp)
+ {
+ struct xfs_inodegc *gc;
+ int cpu;
++ bool ret = false;
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+- if (!llist_empty(&gc->list))
++ if (!llist_empty(&gc->list)) {
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
++ ret = true;
++ }
+ }
++
++ return ret;
+ }
+
+ /*
+@@ -1902,24 +1907,41 @@ xfs_inodegc_flush(
+
+ /*
+ * Flush all the pending work and then disable the inode inactivation background
+- * workers and wait for them to stop.
++ * workers and wait for them to stop. Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+ */
+ void
+ xfs_inodegc_stop(
+ struct xfs_mount *mp)
+ {
++ bool rerun;
++
+ if (!xfs_clear_inodegc_enabled(mp))
+ return;
+
++ /*
++ * Drain all pending inodegc work, including inodes that could be
++ * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
++ * threads that sample the inodegc state just prior to us clearing it.
++ * The inodegc flag state prevents new threads from queuing more
++ * inodes, so we queue pending work items and flush the workqueue until
++ * all inodegc lists are empty. IOWs, we cannot use drain_workqueue
++ * here because it does not allow other unserialized mechanisms to
++ * reschedule inodegc work while this draining is in progress.
++ */
+ xfs_inodegc_queue_all(mp);
+- drain_workqueue(mp->m_inodegc_wq);
++ do {
++ flush_workqueue(mp->m_inodegc_wq);
++ rerun = xfs_inodegc_queue_all(mp);
++ } while (rerun);
+
+ trace_xfs_inodegc_stop(mp, __return_address);
+ }
+
+ /*
+ * Enable the inode inactivation background workers and schedule deferred inode
+- * inactivation work if there is any.
++ * inactivation work if there is any. Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+ */
+ void
+ xfs_inodegc_start(
+--
+2.40.1
+
--- /dev/null
+From f764f1d54ed4d383d265427d89575057e36dc870 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Sep 2023 18:01:52 -0700
+Subject: xfs: introduce xfs_inodegc_push()
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 5e672cd69f0a534a445df4372141fd0d1d00901d ]
+
+The current blocking mechanism for pushing the inodegc queue out to
+disk can result in systems becoming unusable when there is a long
+running inodegc operation. This is because the statfs()
+implementation currently issues a blocking flush of the inodegc
+queue and a significant number of common system utilities will call
+statfs() to discover something about the underlying filesystem.
+
+This can result in userspace operations getting stuck on inodegc
+progress, and when trying to remove a heavily reflinked file on slow
+storage with a full journal, this can result in delays measuring in
+hours.
+
+Avoid this problem by adding a "push" function that expedites the
+flushing of the inodegc queue, but doesn't wait for it to complete.
+
+Convert xfs_fs_statfs() and xfs_qm_scall_getquota() to use this
+mechanism so they don't block but still ensure that queued
+operations are expedited.
+
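+In sketch form, push and flush then differ only in the final wait (the
+tracepoints in the actual diff are omitted here):
+
+    void xfs_inodegc_push(struct xfs_mount *mp)
+    {
+        if (!xfs_is_inodegc_enabled(mp))
+            return;
+        xfs_inodegc_queue_all(mp);          /* kick workers, don't wait */
+    }
+
+    void xfs_inodegc_flush(struct xfs_mount *mp)
+    {
+        xfs_inodegc_push(mp);
+        flush_workqueue(mp->m_inodegc_wq);  /* block until all done */
+    }
+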
+Fixes: ab23a7768739 ("xfs: per-cpu deferred inode inactivation queues")
+Reported-by: Chris Dunlop <chris@onthe.net.au>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+[djwong: fix _getquota_next to use _inodegc_push too]
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_icache.c | 20 +++++++++++++++-----
+ fs/xfs/xfs_icache.h | 1 +
+ fs/xfs/xfs_qm_syscalls.c | 9 ++++++---
+ fs/xfs/xfs_super.c | 7 +++++--
+ fs/xfs/xfs_trace.h | 1 +
+ 5 files changed, 28 insertions(+), 10 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 2c3ef553f5ef0..e9ebfe6f80150 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1872,19 +1872,29 @@ xfs_inodegc_worker(
+ }
+
+ /*
+- * Force all currently queued inode inactivation work to run immediately and
+- * wait for the work to finish.
++ * Expedite all pending inodegc work to run immediately. This does not wait for
++ * completion of the work.
+ */
+ void
+-xfs_inodegc_flush(
++xfs_inodegc_push(
+ struct xfs_mount *mp)
+ {
+ if (!xfs_is_inodegc_enabled(mp))
+ return;
++ trace_xfs_inodegc_push(mp, __return_address);
++ xfs_inodegc_queue_all(mp);
++}
+
++/*
++ * Force all currently queued inode inactivation work to run immediately and
++ * wait for the work to finish.
++ */
++void
++xfs_inodegc_flush(
++ struct xfs_mount *mp)
++{
++ xfs_inodegc_push(mp);
+ trace_xfs_inodegc_flush(mp, __return_address);
+-
+- xfs_inodegc_queue_all(mp);
+ flush_workqueue(mp->m_inodegc_wq);
+ }
+
+diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
+index 2e4cfddf8b8ed..6cd180721659b 100644
+--- a/fs/xfs/xfs_icache.h
++++ b/fs/xfs/xfs_icache.h
+@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp);
+ void xfs_blockgc_start(struct xfs_mount *mp);
+
+ void xfs_inodegc_worker(struct work_struct *work);
++void xfs_inodegc_push(struct xfs_mount *mp);
+ void xfs_inodegc_flush(struct xfs_mount *mp);
+ void xfs_inodegc_stop(struct xfs_mount *mp);
+ void xfs_inodegc_start(struct xfs_mount *mp);
+diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
+index 47fe60e1a8873..322a111dfbc0f 100644
+--- a/fs/xfs/xfs_qm_syscalls.c
++++ b/fs/xfs/xfs_qm_syscalls.c
+@@ -481,9 +481,12 @@ xfs_qm_scall_getquota(
+ struct xfs_dquot *dqp;
+ int error;
+
+- /* Flush inodegc work at the start of a quota reporting scan. */
++ /*
++ * Expedite pending inodegc work at the start of a quota reporting
++ * scan but don't block waiting for it to complete.
++ */
+ if (id == 0)
+- xfs_inodegc_flush(mp);
++ xfs_inodegc_push(mp);
+
+ /*
+ * Try to get the dquot. We don't want it allocated on disk, so don't
+@@ -525,7 +528,7 @@ xfs_qm_scall_getquota_next(
+
+ /* Flush inodegc work at the start of a quota reporting scan. */
+ if (*id == 0)
+- xfs_inodegc_flush(mp);
++ xfs_inodegc_push(mp);
+
+ error = xfs_qm_dqget_next(mp, *id, type, &dqp);
+ if (error)
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 8fe6ca9208de6..9b3af7611eaa9 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -795,8 +795,11 @@ xfs_fs_statfs(
+ xfs_extlen_t lsize;
+ int64_t ffree;
+
+- /* Wait for whatever inactivations are in progress. */
+- xfs_inodegc_flush(mp);
++ /*
++ * Expedite background inodegc but don't wait. We do not want to block
++ * here waiting hours for a billion extent file to be truncated.
++ */
++ xfs_inodegc_push(mp);
+
+ statp->f_type = XFS_SUPER_MAGIC;
+ statp->f_namelen = MAXNAMELEN - 1;
+diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
+index 1033a95fbf8e6..ebd17ddba0243 100644
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
+ TP_ARGS(mp, caller_ip))
+ DEFINE_FS_EVENT(xfs_inodegc_flush);
++DEFINE_FS_EVENT(xfs_inodegc_push);
+ DEFINE_FS_EVENT(xfs_inodegc_start);
+ DEFINE_FS_EVENT(xfs_inodegc_stop);
+ DEFINE_FS_EVENT(xfs_inodegc_queue);
+--
+2.40.1
+