From: Sasha Levin
Date: Mon, 25 Sep 2023 19:11:40 +0000 (-0400)
Subject: Fixes for 5.15
X-Git-Tag: v6.5.6~90
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=9b0c0e5da473e099e88d43c0fae1d639ebf9699d;p=thirdparty%2Fkernel%2Fstable-queue.git

Fixes for 5.15

Signed-off-by: Sasha Levin
---

diff --git a/queue-5.15/series b/queue-5.15/series
index 9e9cd5b6a9b..0671f8a25dd 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -69,3 +69,9 @@ gpio-tb10x-fix-an-error-handling-path-in-tb10x_gpio_.patch
 i2c-mux-demux-pinctrl-check-the-return-value-of-devm.patch
 i2c-mux-gpio-replace-custom-acpi_get_local_address.patch
 i2c-mux-gpio-add-missing-fwnode_handle_put.patch
+xfs-bound-maximum-wait-time-for-inodegc-work.patch
+xfs-introduce-xfs_inodegc_push.patch
+xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch
+xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch
+xfs-disable-reaping-in-fscounters-scrub.patch
+xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch
diff --git a/queue-5.15/xfs-bound-maximum-wait-time-for-inodegc-work.patch b/queue-5.15/xfs-bound-maximum-wait-time-for-inodegc-work.patch
new file mode 100644
index 00000000000..d20e4837305
--- /dev/null
+++ b/queue-5.15/xfs-bound-maximum-wait-time-for-inodegc-work.patch
@@ -0,0 +1,161 @@
+From 5749af0ac33eef67973f0631a0ebe3f21d7a9abd Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 21 Sep 2023 18:01:51 -0700
+Subject: xfs: bound maximum wait time for inodegc work
+
+From: Dave Chinner
+
+[ Upstream commit 7cf2b0f9611b9971d663e1fc3206eeda3b902922 ]
+
+Currently inodegc work can sit queued on the per-cpu queue until
+the workqueue is either flushed or the queue reaches a depth that
+triggers work queuing (and later throttling). This means that we
+could queue work that waits for a long time for some other event to
+trigger flushing.
+
+Hence instead of just queueing work at a specific depth, use a
+delayed work that queues the work at a bound time. We can still
+schedule the work immediately at a given depth, but we no longer need
+to worry about leaving a number of items on the list that won't get
+processed until external events prevail.
+
+Signed-off-by: Dave Chinner
+Reviewed-by: Darrick J. Wong
+Signed-off-by: Darrick J. Wong
+Signed-off-by: Leah Rumancik
+Acked-by: Darrick J. 
Wong +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_icache.c | 36 ++++++++++++++++++++++-------------- + fs/xfs/xfs_mount.h | 2 +- + fs/xfs/xfs_super.c | 2 +- + 3 files changed, 24 insertions(+), 16 deletions(-) + +diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c +index 5e44d7bbd8fca..2c3ef553f5ef0 100644 +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -458,7 +458,7 @@ xfs_inodegc_queue_all( + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) +- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); ++ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); + } + } + +@@ -1851,8 +1851,8 @@ void + xfs_inodegc_worker( + struct work_struct *work) + { +- struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, +- work); ++ struct xfs_inodegc *gc = container_of(to_delayed_work(work), ++ struct xfs_inodegc, work); + struct llist_node *node = llist_del_all(&gc->list); + struct xfs_inode *ip, *n; + +@@ -2021,6 +2021,7 @@ xfs_inodegc_queue( + struct xfs_inodegc *gc; + int items; + unsigned int shrinker_hits; ++ unsigned long queue_delay = 1; + + trace_xfs_inode_set_need_inactive(ip); + spin_lock(&ip->i_flags_lock); +@@ -2032,19 +2033,26 @@ xfs_inodegc_queue( + items = READ_ONCE(gc->items); + WRITE_ONCE(gc->items, items + 1); + shrinker_hits = READ_ONCE(gc->shrinker_hits); +- put_cpu_ptr(gc); + +- if (!xfs_is_inodegc_enabled(mp)) ++ /* ++ * We queue the work while holding the current CPU so that the work ++ * is scheduled to run on this CPU. ++ */ ++ if (!xfs_is_inodegc_enabled(mp)) { ++ put_cpu_ptr(gc); + return; +- +- if (xfs_inodegc_want_queue_work(ip, items)) { +- trace_xfs_inodegc_queue(mp, __return_address); +- queue_work(mp->m_inodegc_wq, &gc->work); + } + ++ if (xfs_inodegc_want_queue_work(ip, items)) ++ queue_delay = 0; ++ ++ trace_xfs_inodegc_queue(mp, __return_address); ++ mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay); ++ put_cpu_ptr(gc); ++ + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { + trace_xfs_inodegc_throttle(mp, __return_address); +- flush_work(&gc->work); ++ flush_delayed_work(&gc->work); + } + } + +@@ -2061,7 +2069,7 @@ xfs_inodegc_cpu_dead( + unsigned int count = 0; + + dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); +- cancel_work_sync(&dead_gc->work); ++ cancel_delayed_work_sync(&dead_gc->work); + + if (llist_empty(&dead_gc->list)) + return; +@@ -2080,12 +2088,12 @@ xfs_inodegc_cpu_dead( + llist_add_batch(first, last, &gc->list); + count += READ_ONCE(gc->items); + WRITE_ONCE(gc->items, count); +- put_cpu_ptr(gc); + + if (xfs_is_inodegc_enabled(mp)) { + trace_xfs_inodegc_queue(mp, __return_address); +- queue_work(mp->m_inodegc_wq, &gc->work); ++ mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0); + } ++ put_cpu_ptr(gc); + } + + /* +@@ -2180,7 +2188,7 @@ xfs_inodegc_shrinker_scan( + unsigned int h = READ_ONCE(gc->shrinker_hits); + + WRITE_ONCE(gc->shrinker_hits, h + 1); +- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); ++ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); + no_items = false; + } + } +diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h +index 86564295fce6d..3d58938a6f75b 100644 +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -61,7 +61,7 @@ struct xfs_error_cfg { + */ + struct xfs_inodegc { + struct llist_head list; +- struct work_struct work; ++ struct delayed_work work; + + /* approximate count of inodes in the list */ + unsigned int items; +diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c +index df1d6be61bfa3..8fe6ca9208de6 100644 +--- 
a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1061,7 +1061,7 @@ xfs_inodegc_init_percpu( + gc = per_cpu_ptr(mp->m_inodegc, cpu); + init_llist_head(&gc->list); + gc->items = 0; +- INIT_WORK(&gc->work, xfs_inodegc_worker); ++ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); + } + return 0; + } +-- +2.40.1 + diff --git a/queue-5.15/xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch b/queue-5.15/xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch new file mode 100644 index 00000000000..2d809345342 --- /dev/null +++ b/queue-5.15/xfs-check-that-per-cpu-inodegc-workers-actually-run-.patch @@ -0,0 +1,69 @@ +From afd86b3c5e1ab8fc89d632a031839219d31d54b7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Sep 2023 18:01:54 -0700 +Subject: xfs: check that per-cpu inodegc workers actually run on that cpu + +From: Darrick J. Wong + +[ Upstream commit b37c4c8339cd394ea6b8b415026603320a185651 ] + +Now that we've allegedly worked out the problem of the per-cpu inodegc +workers being scheduled on the wrong cpu, let's put in a debugging knob +to let us know if a worker ever gets mis-scheduled again. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_icache.c | 2 ++ + fs/xfs/xfs_mount.h | 3 +++ + fs/xfs/xfs_super.c | 3 +++ + 3 files changed, 8 insertions(+) + +diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c +index ab8181f8d08a9..02022164772de 100644 +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1856,6 +1856,8 @@ xfs_inodegc_worker( + struct llist_node *node = llist_del_all(&gc->list); + struct xfs_inode *ip, *n; + ++ ASSERT(gc->cpu == smp_processor_id()); ++ + WRITE_ONCE(gc->items, 0); + + if (!node) +diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h +index 3d58938a6f75b..29f35169bf9cf 100644 +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -66,6 +66,9 @@ struct xfs_inodegc { + /* approximate count of inodes in the list */ + unsigned int items; + unsigned int shrinker_hits; ++#if defined(DEBUG) || defined(XFS_WARN) ++ unsigned int cpu; ++#endif + }; + + /* +diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c +index 9b3af7611eaa9..569960e4ea3a6 100644 +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1062,6 +1062,9 @@ xfs_inodegc_init_percpu( + + for_each_possible_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); ++#if defined(DEBUG) || defined(XFS_WARN) ++ gc->cpu = cpu; ++#endif + init_llist_head(&gc->list); + gc->items = 0; + INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); +-- +2.40.1 + diff --git a/queue-5.15/xfs-disable-reaping-in-fscounters-scrub.patch b/queue-5.15/xfs-disable-reaping-in-fscounters-scrub.patch new file mode 100644 index 00000000000..fa2db1d5e1c --- /dev/null +++ b/queue-5.15/xfs-disable-reaping-in-fscounters-scrub.patch @@ -0,0 +1,141 @@ +From fb88730e10e7b3525c580bf34d31d95815a3c35c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Sep 2023 18:01:55 -0700 +Subject: xfs: disable reaping in fscounters scrub + +From: Darrick J. Wong + +[ Upstream commit 2d5f38a31980d7090f5bf91021488dc61a0ba8ee ] + +The fscounters scrub code doesn't work properly because it cannot +quiesce updates to the percpu counters in the filesystem, hence it +returns false corruption reports. This has been fixed properly in +one of the online repair patchsets that are under review by replacing +the xchk_disable_reaping calls with an exclusive filesystem freeze. 
+Disabling background gc isn't sufficient to fix the problem. + +In other words, scrub doesn't need to call xfs_inodegc_stop, which is +just as well since it wasn't correct to allow scrub to call +xfs_inodegc_start when something else could be calling xfs_inodegc_stop +(e.g. trying to freeze the filesystem). + +Neuter the scrubber for now, and remove the xchk_*_reaping functions. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Sasha Levin +--- + fs/xfs/scrub/common.c | 25 ------------------------- + fs/xfs/scrub/common.h | 2 -- + fs/xfs/scrub/fscounters.c | 13 ++++++------- + fs/xfs/scrub/scrub.c | 2 -- + fs/xfs/scrub/scrub.h | 1 - + 5 files changed, 6 insertions(+), 37 deletions(-) + +diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c +index bf1f3607d0b60..08df23edea720 100644 +--- a/fs/xfs/scrub/common.c ++++ b/fs/xfs/scrub/common.c +@@ -864,28 +864,3 @@ xchk_ilock_inverted( + return -EDEADLOCK; + } + +-/* Pause background reaping of resources. */ +-void +-xchk_stop_reaping( +- struct xfs_scrub *sc) +-{ +- sc->flags |= XCHK_REAPING_DISABLED; +- xfs_blockgc_stop(sc->mp); +- xfs_inodegc_stop(sc->mp); +-} +- +-/* Restart background reaping of resources. */ +-void +-xchk_start_reaping( +- struct xfs_scrub *sc) +-{ +- /* +- * Readonly filesystems do not perform inactivation or speculative +- * preallocation, so there's no need to restart the workers. +- */ +- if (!xfs_is_readonly(sc->mp)) { +- xfs_inodegc_start(sc->mp); +- xfs_blockgc_start(sc->mp); +- } +- sc->flags &= ~XCHK_REAPING_DISABLED; +-} +diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h +index 454145db10e71..2ca80102e704a 100644 +--- a/fs/xfs/scrub/common.h ++++ b/fs/xfs/scrub/common.h +@@ -148,7 +148,5 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) + + int xchk_metadata_inode_forks(struct xfs_scrub *sc); + int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode); +-void xchk_stop_reaping(struct xfs_scrub *sc); +-void xchk_start_reaping(struct xfs_scrub *sc); + + #endif /* __XFS_SCRUB_COMMON_H__ */ +diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c +index 48a6cbdf95d08..037541339d80a 100644 +--- a/fs/xfs/scrub/fscounters.c ++++ b/fs/xfs/scrub/fscounters.c +@@ -128,13 +128,6 @@ xchk_setup_fscounters( + if (error) + return error; + +- /* +- * Pause background reclaim while we're scrubbing to reduce the +- * likelihood of background perturbations to the counters throwing off +- * our calculations. +- */ +- xchk_stop_reaping(sc); +- + return xchk_trans_alloc(sc, 0); + } + +@@ -353,6 +346,12 @@ xchk_fscounters( + if (fdblocks > mp->m_sb.sb_dblocks) + xchk_set_corrupt(sc); + ++ /* ++ * XXX: We can't quiesce percpu counter updates, so exit early. ++ * This can be re-enabled when we gain exclusive freeze functionality. ++ */ ++ return 0; ++ + /* + * If ifree exceeds icount by more than the minimum variance then + * something's probably wrong with the counters. 
+diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c +index 51e4c61916d20..e4d2a41983f73 100644 +--- a/fs/xfs/scrub/scrub.c ++++ b/fs/xfs/scrub/scrub.c +@@ -171,8 +171,6 @@ xchk_teardown( + } + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mnt_drop_write_file(sc->file); +- if (sc->flags & XCHK_REAPING_DISABLED) +- xchk_start_reaping(sc); + if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK; +diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h +index 80e5026bba44a..e8d9fe9de26ed 100644 +--- a/fs/xfs/scrub/scrub.h ++++ b/fs/xfs/scrub/scrub.h +@@ -89,7 +89,6 @@ struct xfs_scrub { + /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ + #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ + #define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */ +-#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ + #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ + + /* Metadata scrubbers */ +-- +2.40.1 + diff --git a/queue-5.15/xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch b/queue-5.15/xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch new file mode 100644 index 00000000000..d1414c95d1a --- /dev/null +++ b/queue-5.15/xfs-explicitly-specify-cpu-when-forcing-inodegc-dela.patch @@ -0,0 +1,75 @@ +From b210ea2535b4205d2e826b1d6d4b82d4d8917c79 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Sep 2023 18:01:53 -0700 +Subject: xfs: explicitly specify cpu when forcing inodegc delayed work to run + immediately + +From: Darrick J. Wong + +[ Upstream commit 03e0add80f4cf3f7393edb574eeb3a89a1db7758 ] + +I've been noticing odd racing behavior in the inodegc code that could +only be explained by one cpu adding an inode to its inactivation llist +at the same time that another cpu is processing that cpu's llist. +Preemption is disabled between get/put_cpu_ptr, so the only explanation +is scheduler mayhem. I inserted the following debug code into +xfs_inodegc_worker (see the next patch): + + ASSERT(gc->cpu == smp_processor_id()); + +This assertion tripped during overnight tests on the arm64 machines, but +curiously not on x86_64. I think we haven't observed any resource leaks +here because the lockfree list code can handle simultaneous llist_add +and llist_del_all functions operating on the same list. However, the +whole point of having percpu inodegc lists is to take advantage of warm +memory caches by inactivating inodes on the last processor to touch the +inode. + +The incorrect scheduling seems to occur after an inodegc worker is +subjected to mod_delayed_work(). This wraps mod_delayed_work_on with +WORK_CPU_UNBOUND specified as the cpu number. Unbound allows for +scheduling on any cpu, not necessarily the same one that scheduled the +work. + +Because preemption is disabled for as long as we have the gc pointer, I +think it's safe to use current_cpu() (aka smp_processor_id) to queue the +delayed work item on the correct cpu. + +Fixes: 7cf2b0f9611b ("xfs: bound maximum wait time for inodegc work") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. 
Wong
+Signed-off-by: Sasha Levin
+---
+ fs/xfs/xfs_icache.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index e9ebfe6f80150..ab8181f8d08a9 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -2057,7 +2057,8 @@ xfs_inodegc_queue(
+ queue_delay = 0;
+
+ trace_xfs_inodegc_queue(mp, __return_address);
+- mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
++ mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++ queue_delay);
+ put_cpu_ptr(gc);
+
+ if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+@@ -2101,7 +2102,8 @@ xfs_inodegc_cpu_dead(
+
+ if (xfs_is_inodegc_enabled(mp)) {
+ trace_xfs_inodegc_queue(mp, __return_address);
+- mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
++ mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
++ 0);
+ }
+ put_cpu_ptr(gc);
+ }
+-- 
+2.40.1
+
diff --git a/queue-5.15/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch b/queue-5.15/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch
new file mode 100644
index 00000000000..903f7de4ad9
--- /dev/null
+++ b/queue-5.15/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_wor.patch
@@ -0,0 +1,188 @@
+From 40c124cf66ae4ddfa82dbe3c34a357a0cae38c66 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 21 Sep 2023 18:01:56 -0700
+Subject: xfs: fix xfs_inodegc_stop racing with mod_delayed_work
+
+From: Darrick J. Wong
+
+[ Upstream commit 2254a7396a0ca6309854948ee1c0a33fa4268cec ]
+
+syzbot reported this warning from the faux inodegc shrinker that tries
+to kick off inodegc work:
+
+------------[ cut here ]------------
+WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
+Call Trace:
+ __queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
+ mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
+ xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
+ xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
+ do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
+ shrink_slab+0x175/0x660 mm/vmscan.c:1013
+ shrink_one+0x502/0x810 mm/vmscan.c:5343
+ shrink_many mm/vmscan.c:5394 [inline]
+ lru_gen_shrink_node mm/vmscan.c:5511 [inline]
+ shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
+ kswapd_shrink_node mm/vmscan.c:7262 [inline]
+ balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
+ kswapd+0x677/0xd60 mm/vmscan.c:7712
+ kthread+0x2e8/0x3a0 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
+
+This warning corresponds to this code in __queue_work:
+
+ /*
+ * For a draining wq, only works from the same workqueue are
+ * allowed. The __WQ_DESTROYING helps to spot the issue that
+ * queues a new work item to a wq after destroy_workqueue(wq).
+ */
+ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+ WARN_ON_ONCE(!is_chained_work(wq))))
+ return;
+
+For this to trip, we must have a thread draining the inodegc workqueue
+and a second thread trying to queue inodegc work to that workqueue. 
+This can happen if freezing or a ro remount races with reclaim poking
+our faux inodegc shrinker and another thread dropping an unlinked
+O_RDONLY file:
+
+Thread 0                    Thread 1                    Thread 2
+
+xfs_inodegc_stop
+
+                            xfs_inodegc_shrinker_scan
+                            xfs_is_inodegc_enabled
+                            <yes, will continue>
+
+xfs_clear_inodegc_enabled
+xfs_inodegc_queue_all
+<list empty, do not queue inodegc worker>
+
+                                                        xfs_inodegc_queue
+                                                        <add to list>
+                                                        xfs_is_inodegc_enabled
+                                                        <no, do not queue worker>
+
+drain_workqueue
+<set WQ_DRAINING>
+
+                            llist_empty
+                            <no, queue inodegc worker>
+                            mod_delayed_work_on(..., 0)
+                            __queue_work
+                            <sees WQ_DRAINING, trips the WARN_ON>
+
+In other words, everything between the access to inodegc_enabled state
+and the decision to poke the inodegc workqueue requires some kind of
+coordination to avoid the WQ_DRAINING state.  We could perhaps introduce
+a lock here, but we could also try to eliminate WQ_DRAINING from the
+picture.
+
+We could replace the drain_workqueue call with a loop that flushes the
+workqueue and queues workers as long as there is at least one inode
+present in the per-cpu inodegc llists.  We've disabled inodegc at this
+point, so we know that the number of queued inodes will eventually hit
+zero as long as xfs_inodegc_start cannot reactivate the workers.
+
+There are four callers of xfs_inodegc_start. Three of them come from the
+VFS with s_umount held: filesystem thawing, failed filesystem freezing,
+and the rw remount transition. The fourth caller is mounting rw (no
+remount or freezing possible).
+
+There are three callers of xfs_inodegc_stop. One is unmounting (no
+remount or thaw possible). Two of them come from the VFS with s_umount
+held: fs freezing and ro remount transition.
+
+Hence, it is correct to replace the drain_workqueue call with a loop
+that drains the inodegc llists.
+
+Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
+Signed-off-by: Darrick J. Wong
+Reviewed-by: Dave Chinner
+Signed-off-by: Dave Chinner
+Signed-off-by: Leah Rumancik
+Acked-by: Darrick J. Wong
+Signed-off-by: Sasha Levin
+---
+ fs/xfs/xfs_icache.c | 32 +++++++++++++++++++++++++++-----
+ 1 file changed, 27 insertions(+), 5 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 02022164772de..eab98d76dbe1f 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -448,18 +448,23 @@ xfs_iget_check_free_state(
+ }
+
+ /* Make all pending inactivation work start immediately. */
+-static void
++static bool
+ xfs_inodegc_queue_all(
+ struct xfs_mount *mp)
+ {
+ struct xfs_inodegc *gc;
+ int cpu;
++ bool ret = false;
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+- if (!llist_empty(&gc->list))
++ if (!llist_empty(&gc->list)) {
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
++ ret = true;
++ }
+ }
++
++ return ret;
+ }
+
+ /*
+@@ -1902,24 +1907,41 @@ xfs_inodegc_flush(
+
+ /*
+ * Flush all the pending work and then disable the inode inactivation background
+- * workers and wait for them to stop.
++ * workers and wait for them to stop. Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+ */
+ void
+ xfs_inodegc_stop(
+ struct xfs_mount *mp)
+ {
++ bool rerun;
++
+ if (!xfs_clear_inodegc_enabled(mp))
+ return;
+
++ /*
++ * Drain all pending inodegc work, including inodes that could be
++ * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
++ * threads that sample the inodegc state just prior to us clearing it.
++ * The inodegc flag state prevents new threads from queuing more
++ * inodes, so we queue pending work items and flush the workqueue until
++ * all inodegc lists are empty. IOWs, we cannot use drain_workqueue
++ * here because it does not allow other unserialized mechanisms to
++ * reschedule inodegc work while this draining is in progress.
++ */
+ xfs_inodegc_queue_all(mp);
+- drain_workqueue(mp->m_inodegc_wq);
++ do {
++ flush_workqueue(mp->m_inodegc_wq);
++ rerun = xfs_inodegc_queue_all(mp);
++ } while (rerun);
+
+ trace_xfs_inodegc_stop(mp, __return_address);
+ }
+
+ /*
+ * Enable the inode inactivation background workers and schedule deferred inode
+- * inactivation work if there is any.
++ * inactivation work if there is any. Caller must hold sb->s_umount to
++ * coordinate changes in the inodegc_enabled state.
+ */
+ void
+ xfs_inodegc_start(
+-- 
+2.40.1
+
diff --git a/queue-5.15/xfs-introduce-xfs_inodegc_push.patch b/queue-5.15/xfs-introduce-xfs_inodegc_push.patch
new file mode 100644
index 00000000000..db5c76f7505
--- /dev/null
+++ b/queue-5.15/xfs-introduce-xfs_inodegc_push.patch
@@ -0,0 +1,157 @@
+From f764f1d54ed4d383d265427d89575057e36dc870 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 21 Sep 2023 18:01:52 -0700
+Subject: xfs: introduce xfs_inodegc_push()
+
+From: Dave Chinner
+
+[ Upstream commit 5e672cd69f0a534a445df4372141fd0d1d00901d ]
+
+The current blocking mechanism for pushing the inodegc queue out to
+disk can result in systems becoming unusable when there is a
+long-running inodegc operation. This is because the statfs()
+implementation currently issues a blocking flush of the inodegc
+queue and a significant number of common system utilities will call
+statfs() to discover something about the underlying filesystem.
+
+This can result in userspace operations getting stuck on inodegc
+progress, and when trying to remove a heavily reflinked file on slow
+storage with a full journal, this can result in delays measuring in
+hours.
+
+Avoid this problem by adding a "push" function that expedites the
+flushing of the inodegc queue, but doesn't wait for it to complete.
+
+Convert xfs_fs_statfs() and xfs_qm_scall_getquota() to use this
+mechanism so they don't block but still ensure that queued
+operations are expedited.
+
+Fixes: ab23a7768739 ("xfs: per-cpu deferred inode inactivation queues")
+Reported-by: Chris Dunlop
+Signed-off-by: Dave Chinner
+[djwong: fix _getquota_next to use _inodegc_push too]
+Reviewed-by: Darrick J. Wong
+Signed-off-by: Darrick J. Wong
+Signed-off-by: Leah Rumancik
+Acked-by: Darrick J. Wong
+Signed-off-by: Sasha Levin
+---
+ fs/xfs/xfs_icache.c | 20 +++++++++++++++-----
+ fs/xfs/xfs_icache.h | 1 +
+ fs/xfs/xfs_qm_syscalls.c | 9 ++++++---
+ fs/xfs/xfs_super.c | 7 +++++--
+ fs/xfs/xfs_trace.h | 1 +
+ 5 files changed, 28 insertions(+), 10 deletions(-)
+
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 2c3ef553f5ef0..e9ebfe6f80150 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1872,19 +1872,29 @@ xfs_inodegc_worker(
+ }
+
+ /*
+- * Force all currently queued inode inactivation work to run immediately and
+- * wait for the work to finish.
++ * Expedite all pending inodegc work to run immediately. This does not wait for
++ * completion of the work.
+ */
+ void
+-xfs_inodegc_flush(
++xfs_inodegc_push(
+ struct xfs_mount *mp)
+ {
+ if (!xfs_is_inodegc_enabled(mp))
+ return;
++ trace_xfs_inodegc_push(mp, __return_address);
++ xfs_inodegc_queue_all(mp);
++}
+
++/*
++ * Force all currently queued inode inactivation work to run immediately and
++ * wait for the work to finish. 
++ */ ++void ++xfs_inodegc_flush( ++ struct xfs_mount *mp) ++{ ++ xfs_inodegc_push(mp); + trace_xfs_inodegc_flush(mp, __return_address); +- +- xfs_inodegc_queue_all(mp); + flush_workqueue(mp->m_inodegc_wq); + } + +diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h +index 2e4cfddf8b8ed..6cd180721659b 100644 +--- a/fs/xfs/xfs_icache.h ++++ b/fs/xfs/xfs_icache.h +@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp); + void xfs_blockgc_start(struct xfs_mount *mp); + + void xfs_inodegc_worker(struct work_struct *work); ++void xfs_inodegc_push(struct xfs_mount *mp); + void xfs_inodegc_flush(struct xfs_mount *mp); + void xfs_inodegc_stop(struct xfs_mount *mp); + void xfs_inodegc_start(struct xfs_mount *mp); +diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c +index 47fe60e1a8873..322a111dfbc0f 100644 +--- a/fs/xfs/xfs_qm_syscalls.c ++++ b/fs/xfs/xfs_qm_syscalls.c +@@ -481,9 +481,12 @@ xfs_qm_scall_getquota( + struct xfs_dquot *dqp; + int error; + +- /* Flush inodegc work at the start of a quota reporting scan. */ ++ /* ++ * Expedite pending inodegc work at the start of a quota reporting ++ * scan but don't block waiting for it to complete. ++ */ + if (id == 0) +- xfs_inodegc_flush(mp); ++ xfs_inodegc_push(mp); + + /* + * Try to get the dquot. We don't want it allocated on disk, so don't +@@ -525,7 +528,7 @@ xfs_qm_scall_getquota_next( + + /* Flush inodegc work at the start of a quota reporting scan. */ + if (*id == 0) +- xfs_inodegc_flush(mp); ++ xfs_inodegc_push(mp); + + error = xfs_qm_dqget_next(mp, *id, type, &dqp); + if (error) +diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c +index 8fe6ca9208de6..9b3af7611eaa9 100644 +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -795,8 +795,11 @@ xfs_fs_statfs( + xfs_extlen_t lsize; + int64_t ffree; + +- /* Wait for whatever inactivations are in progress. */ +- xfs_inodegc_flush(mp); ++ /* ++ * Expedite background inodegc but don't wait. We do not want to block ++ * here waiting hours for a billion extent file to be truncated. ++ */ ++ xfs_inodegc_push(mp); + + statp->f_type = XFS_SUPER_MAGIC; + statp->f_namelen = MAXNAMELEN - 1; +diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h +index 1033a95fbf8e6..ebd17ddba0243 100644 +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \ + TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ + TP_ARGS(mp, caller_ip)) + DEFINE_FS_EVENT(xfs_inodegc_flush); ++DEFINE_FS_EVENT(xfs_inodegc_push); + DEFINE_FS_EVENT(xfs_inodegc_start); + DEFINE_FS_EVENT(xfs_inodegc_stop); + DEFINE_FS_EVENT(xfs_inodegc_queue); +-- +2.40.1 +
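
---

The six backports above all revolve around one workqueue pattern: inodegc items
are queued as delayed work with a small default delay so they can never sit
unprocessed indefinitely, the delay drops to zero once the queue is deep enough,
a non-blocking "push" expedites pending work for callers such as statfs() that
must not stall, and teardown drains with a flush-until-empty loop instead of
drain_workqueue(). The self-contained module below sketches that pattern in
miniature. It is an illustration only, not code from the series: the demo_*
names, the DEMO_MAX_BATCH threshold, the single global list, and the use of
system_wq are assumptions made for the example; the real XFS code keeps per-cpu
lists (with preemption disabled while queueing) and a dedicated workqueue.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical sketch of bounded-delay deferred work, cf. the series above. */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/llist.h>
#include <linux/slab.h>

struct demo_item {
	struct llist_node node;
};

static LLIST_HEAD(demo_list);
static struct delayed_work demo_work;
static unsigned int demo_items;		/* approximate queue depth */

#define DEMO_MAX_BATCH	32		/* assumed expedite threshold */

static void demo_worker(struct work_struct *work)
{
	struct llist_node *node = llist_del_all(&demo_list);
	struct demo_item *item, *n;

	WRITE_ONCE(demo_items, 0);
	llist_for_each_entry_safe(item, n, node, node)
		kfree(item);		/* stand-in for real inactivation work */
}

/* Queue an item: run within one tick, or immediately once the list is deep. */
static void demo_queue(struct demo_item *item)
{
	unsigned long delay = 1;
	unsigned int items;

	llist_add(&item->node, &demo_list);
	items = READ_ONCE(demo_items) + 1;
	WRITE_ONCE(demo_items, items);
	if (items >= DEMO_MAX_BATCH)
		delay = 0;
	/* mod_delayed_work() re-arms a pending timer to the new delay, so a
	 * delay of 0 expedites work that was queued with a longer delay. */
	mod_delayed_work(system_wq, &demo_work, delay);
}

/* "Push": expedite pending work without waiting for it, cf. statfs(). */
static void demo_push(void)
{
	if (!llist_empty(&demo_list))
		mod_delayed_work(system_wq, &demo_work, 0);
}

/* Flush: push, then wait for the worker to finish. */
static void demo_flush(void)
{
	demo_push();
	flush_delayed_work(&demo_work);
}

static int __init demo_init(void)
{
	struct demo_item *item = kmalloc(sizeof(*item), GFP_KERNEL);

	INIT_DELAYED_WORK(&demo_work, demo_worker);
	if (!item)
		return -ENOMEM;
	demo_queue(item);
	demo_flush();		/* statfs-style callers would use demo_push() */
	return 0;
}

static void __exit demo_exit(void)
{
	demo_flush();
	cancel_delayed_work_sync(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

As in the series, the blocking flush is just the non-blocking push followed by
a wait, so hot paths that only need the queue expedited never stall behind a
long-running worker.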