From: Greg Kroah-Hartman Date: Sun, 16 Jul 2023 15:05:15 +0000 (+0200) Subject: 6.1-stable patches X-Git-Tag: v6.1.39~61 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=dc6184c1c58c04507edc2b0f04e056705e14375e;p=thirdparty%2Fkernel%2Fstable-queue.git 6.1-stable patches added patches: mm-mmap-fix-extra-maple-tree-write.patch xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch xfs-disable-reaping-in-fscounters-scrub.patch xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch --- diff --git a/queue-6.1/mm-mmap-fix-extra-maple-tree-write.patch b/queue-6.1/mm-mmap-fix-extra-maple-tree-write.patch new file mode 100644 index 00000000000..14dc636d2a8 --- /dev/null +++ b/queue-6.1/mm-mmap-fix-extra-maple-tree-write.patch @@ -0,0 +1,45 @@ +From Liam.Howlett@oracle.com Sun Jul 16 17:02:51 2023 +From: "Liam R. Howlett" +Date: Thu, 6 Jul 2023 14:51:35 -0400 +Subject: mm/mmap: Fix extra maple tree write +To: linux-kernel@vger.kernel.org +Cc: Andrew Morton , "Liam R. Howlett" , John Hsu , stable@vger.kernel.org, linux-mm@kvack.org +Message-ID: <20230706185135.2235532-1-Liam.Howlett@oracle.com> + +From: "Liam R. Howlett" + +based on commit 0503ea8f5ba73eb3ab13a81c1eefbaf51405385a upstream. + +This was inadvertently fixed during the removal of __vma_adjust(). + +When __vma_adjust() is adjusting next with a negative value (pushing +vma->vm_end lower), there would be two writes to the maple tree. The +first write is unnecessary and uses all allocated nodes in the maple +state. The second write is necessary but will need to allocate nodes +since the first write has used the allocated nodes. This may be a +problem as it may not be safe to allocate at this time, such as a low +memory situation. Fix the issue by avoiding the first write and only +write the adjusted "next" VMA. 
+ +Reported-by: John Hsu +Link: https://lore.kernel.org/lkml/9cb8c599b1d7f9c1c300d1a334d5eb70ec4d7357.camel@mediatek.com/ +Cc: stable@vger.kernel.org +Cc: linux-mm@kvack.org +Signed-off-by: Liam R. Howlett +Signed-off-by: Greg Kroah-Hartman +--- + mm/mmap.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -767,7 +767,8 @@ int __vma_adjust(struct vm_area_struct * + } + if (end != vma->vm_end) { + if (vma->vm_end > end) { +- if (!insert || (insert->vm_start != end)) { ++ if ((vma->vm_end + adjust_next != end) && ++ (!insert || (insert->vm_start != end))) { + vma_mas_szero(&mas, end, vma->vm_end); + mas_reset(&mas); + VM_WARN_ON(insert && diff --git a/queue-6.1/series b/queue-6.1/series index fc83f5eb22b..ae25d8f18ee 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -583,3 +583,8 @@ arm-orion5x-fix-d2net-gpio-initialization.patch leds-trigger-netdev-recheck-netdev_led_mode_linkup-on-dev-rename.patch blktrace-use-inline-function-for-blk_trace_remove-while-blktrace-is-disabled.patch fs-no-need-to-check-source.patch +xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch +xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch +xfs-disable-reaping-in-fscounters-scrub.patch +xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch +mm-mmap-fix-extra-maple-tree-write.patch diff --git a/queue-6.1/xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch b/queue-6.1/xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch new file mode 100644 index 00000000000..714f5ba526c --- /dev/null +++ b/queue-6.1/xfs-check-that-per-cpu-inodegc-workers-actually-run-on-that-cpu.patch @@ -0,0 +1,63 @@ +From stable-owner@vger.kernel.org Sat Jul 15 08:31:33 2023 +From: Amir Goldstein +Date: Sat, 15 Jul 2023 09:31:12 +0300 +Subject: xfs: check that per-cpu inodegc workers actually run on that cpu +To: Greg Kroah-Hartman +Cc: Sasha Levin , Leah Rumancik , Chandan Babu 
R , "Darrick J . Wong" , linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner , Dave Chinner +Message-ID: <20230715063114.1485841-3-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit b37c4c8339cd394ea6b8b415026603320a185651 upstream. + +Now that we've allegedly worked out the problem of the per-cpu inodegc +workers being scheduled on the wrong cpu, let's put in a debugging knob +to let us know if a worker ever gets mis-scheduled again. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 2 ++ + fs/xfs/xfs_mount.h | 3 +++ + fs/xfs/xfs_super.c | 3 +++ + 3 files changed, 8 insertions(+) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1848,6 +1848,8 @@ xfs_inodegc_worker( + struct llist_node *node = llist_del_all(&gc->list); + struct xfs_inode *ip, *n; + ++ ASSERT(gc->cpu == smp_processor_id()); ++ + WRITE_ONCE(gc->items, 0); + + if (!node) +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -66,6 +66,9 @@ struct xfs_inodegc { + /* approximate count of inodes in the list */ + unsigned int items; + unsigned int shrinker_hits; ++#if defined(DEBUG) || defined(XFS_WARN) ++ unsigned int cpu; ++#endif + }; + + /* +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1084,6 +1084,9 @@ xfs_inodegc_init_percpu( + + for_each_possible_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); ++#if defined(DEBUG) || defined(XFS_WARN) ++ gc->cpu = cpu; ++#endif + init_llist_head(&gc->list); + gc->items = 0; + INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); diff --git a/queue-6.1/xfs-disable-reaping-in-fscounters-scrub.patch b/queue-6.1/xfs-disable-reaping-in-fscounters-scrub.patch new file mode 100644 index 00000000000..90153d13b0c --- /dev/null +++ b/queue-6.1/xfs-disable-reaping-in-fscounters-scrub.patch @@ -0,0 +1,132 @@ +From stable-owner@vger.kernel.org Sat 
Jul 15 08:31:33 2023 +From: Amir Goldstein +Date: Sat, 15 Jul 2023 09:31:13 +0300 +Subject: xfs: disable reaping in fscounters scrub +To: Greg Kroah-Hartman +Cc: Sasha Levin , Leah Rumancik , Chandan Babu R , "Darrick J . Wong" , linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner , Dave Chinner +Message-ID: <20230715063114.1485841-4-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit 2d5f38a31980d7090f5bf91021488dc61a0ba8ee upstream. + +The fscounters scrub code doesn't work properly because it cannot +quiesce updates to the percpu counters in the filesystem, hence it +returns false corruption reports. This has been fixed properly in +one of the online repair patchsets that are under review by replacing +the xchk_disable_reaping calls with an exclusive filesystem freeze. +Disabling background gc isn't sufficient to fix the problem. + +In other words, scrub doesn't need to call xfs_inodegc_stop, which is +just as well since it wasn't correct to allow scrub to call +xfs_inodegc_start when something else could be calling xfs_inodegc_stop +(e.g. trying to freeze the filesystem). + +Neuter the scrubber for now, and remove the xchk_*_reaping functions. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/scrub/common.c | 26 -------------------------- + fs/xfs/scrub/common.h | 2 -- + fs/xfs/scrub/fscounters.c | 13 ++++++------- + fs/xfs/scrub/scrub.c | 2 -- + fs/xfs/scrub/scrub.h | 1 - + 5 files changed, 6 insertions(+), 38 deletions(-) + +--- a/fs/xfs/scrub/common.c ++++ b/fs/xfs/scrub/common.c +@@ -865,29 +865,3 @@ xchk_ilock_inverted( + } + return -EDEADLOCK; + } +- +-/* Pause background reaping of resources. 
*/ +-void +-xchk_stop_reaping( +- struct xfs_scrub *sc) +-{ +- sc->flags |= XCHK_REAPING_DISABLED; +- xfs_blockgc_stop(sc->mp); +- xfs_inodegc_stop(sc->mp); +-} +- +-/* Restart background reaping of resources. */ +-void +-xchk_start_reaping( +- struct xfs_scrub *sc) +-{ +- /* +- * Readonly filesystems do not perform inactivation or speculative +- * preallocation, so there's no need to restart the workers. +- */ +- if (!xfs_is_readonly(sc->mp)) { +- xfs_inodegc_start(sc->mp); +- xfs_blockgc_start(sc->mp); +- } +- sc->flags &= ~XCHK_REAPING_DISABLED; +-} +--- a/fs/xfs/scrub/common.h ++++ b/fs/xfs/scrub/common.h +@@ -148,7 +148,5 @@ static inline bool xchk_skip_xref(struct + + int xchk_metadata_inode_forks(struct xfs_scrub *sc); + int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode); +-void xchk_stop_reaping(struct xfs_scrub *sc); +-void xchk_start_reaping(struct xfs_scrub *sc); + + #endif /* __XFS_SCRUB_COMMON_H__ */ +--- a/fs/xfs/scrub/fscounters.c ++++ b/fs/xfs/scrub/fscounters.c +@@ -128,13 +128,6 @@ xchk_setup_fscounters( + if (error) + return error; + +- /* +- * Pause background reclaim while we're scrubbing to reduce the +- * likelihood of background perturbations to the counters throwing off +- * our calculations. +- */ +- xchk_stop_reaping(sc); +- + return xchk_trans_alloc(sc, 0); + } + +@@ -354,6 +347,12 @@ xchk_fscounters( + xchk_set_corrupt(sc); + + /* ++ * XXX: We can't quiesce percpu counter updates, so exit early. ++ * This can be re-enabled when we gain exclusive freeze functionality. ++ */ ++ return 0; ++ ++ /* + * If ifree exceeds icount by more than the minimum variance then + * something's probably wrong with the counters. 
+ */ +--- a/fs/xfs/scrub/scrub.c ++++ b/fs/xfs/scrub/scrub.c +@@ -171,8 +171,6 @@ xchk_teardown( + } + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mnt_drop_write_file(sc->file); +- if (sc->flags & XCHK_REAPING_DISABLED) +- xchk_start_reaping(sc); + if (sc->buf) { + kmem_free(sc->buf); + sc->buf = NULL; +--- a/fs/xfs/scrub/scrub.h ++++ b/fs/xfs/scrub/scrub.h +@@ -88,7 +88,6 @@ struct xfs_scrub { + + /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ + #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ +-#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ + #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ + + /* Metadata scrubbers */ diff --git a/queue-6.1/xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch b/queue-6.1/xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch new file mode 100644 index 00000000000..2bba34427b3 --- /dev/null +++ b/queue-6.1/xfs-explicitly-specify-cpu-when-forcing-inodegc-delayed-work-to-run-immediately.patch @@ -0,0 +1,72 @@ +From stable-owner@vger.kernel.org Sat Jul 15 08:31:32 2023 +From: Amir Goldstein +Date: Sat, 15 Jul 2023 09:31:11 +0300 +Subject: xfs: explicitly specify cpu when forcing inodegc delayed work to run immediately +To: Greg Kroah-Hartman +Cc: Sasha Levin , Leah Rumancik , Chandan Babu R , "Darrick J . Wong" , linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner , Dave Chinner +Message-ID: <20230715063114.1485841-2-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit 03e0add80f4cf3f7393edb574eeb3a89a1db7758 upstream. + +I've been noticing odd racing behavior in the inodegc code that could +only be explained by one cpu adding an inode to its inactivation llist +at the same time that another cpu is processing that cpu's llist. 
+Preemption is disabled between get/put_cpu_ptr, so the only explanation +is scheduler mayhem. I inserted the following debug code into +xfs_inodegc_worker (see the next patch): + + ASSERT(gc->cpu == smp_processor_id()); + +This assertion tripped during overnight tests on the arm64 machines, but +curiously not on x86_64. I think we haven't observed any resource leaks +here because the lockfree list code can handle simultaneous llist_add +and llist_del_all functions operating on the same list. However, the +whole point of having percpu inodegc lists is to take advantage of warm +memory caches by inactivating inodes on the last processor to touch the +inode. + +The incorrect scheduling seems to occur after an inodegc worker is +subjected to mod_delayed_work(). This wraps mod_delayed_work_on with +WORK_CPU_UNBOUND specified as the cpu number. Unbound allows for +scheduling on any cpu, not necessarily the same one that scheduled the +work. + +Because preemption is disabled for as long as we have the gc pointer, I +think it's safe to use current_cpu() (aka smp_processor_id) to queue the +delayed work item on the correct cpu. + +Fixes: 7cf2b0f9611b ("xfs: bound maximum wait time for inodegc work") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -2052,7 +2052,8 @@ xfs_inodegc_queue( + queue_delay = 0; + + trace_xfs_inodegc_queue(mp, __return_address); +- mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay); ++ mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, ++ queue_delay); + put_cpu_ptr(gc); + + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { +@@ -2096,7 +2097,8 @@ xfs_inodegc_cpu_dead( + + if (xfs_is_inodegc_enabled(mp)) { + trace_xfs_inodegc_queue(mp, __return_address); +- mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0); ++ mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, ++ 0); + } + put_cpu_ptr(gc); + } diff --git a/queue-6.1/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch b/queue-6.1/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch new file mode 100644 index 00000000000..3db57ab266f --- /dev/null +++ b/queue-6.1/xfs-fix-xfs_inodegc_stop-racing-with-mod_delayed_work.patch @@ -0,0 +1,186 @@ +From stable-owner@vger.kernel.org Sat Jul 15 08:31:33 2023 +From: Amir Goldstein +Date: Sat, 15 Jul 2023 09:31:14 +0300 +Subject: xfs: fix xfs_inodegc_stop racing with mod_delayed_work +To: Greg Kroah-Hartman +Cc: Sasha Levin , Leah Rumancik , Chandan Babu R , "Darrick J . Wong" , linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, stable@vger.kernel.org, Dave Chinner , Dave Chinner +Message-ID: <20230715063114.1485841-5-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit 2254a7396a0ca6309854948ee1c0a33fa4268cec upstream. 
+ +syzbot reported this warning from the faux inodegc shrinker that tries +to kick off inodegc work: + +------------[ cut here ]------------ +WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444 +RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444 +Call Trace: + __queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672 + mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746 + xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline] + xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191 + do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853 + shrink_slab+0x175/0x660 mm/vmscan.c:1013 + shrink_one+0x502/0x810 mm/vmscan.c:5343 + shrink_many mm/vmscan.c:5394 [inline] + lru_gen_shrink_node mm/vmscan.c:5511 [inline] + shrink_node+0x2064/0x35f0 mm/vmscan.c:6459 + kswapd_shrink_node mm/vmscan.c:7262 [inline] + balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452 + kswapd+0x677/0xd60 mm/vmscan.c:7712 + kthread+0x2e8/0x3a0 kernel/kthread.c:376 + ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 + +This warning corresponds to this code in __queue_work: + + /* + * For a draining wq, only works from the same workqueue are + * allowed. The __WQ_DESTROYING helps to spot the issue that + * queues a new work item to a wq after destroy_workqueue(wq). + */ + if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && + WARN_ON_ONCE(!is_chained_work(wq)))) + return; + +For this to trip, we must have a thread draining the inodegc workqueue +and a second thread trying to queue inodegc work to that workqueue.
+This can happen if freezing or a ro remount race with reclaim poking our +faux inodegc shrinker and another thread dropping an unlinked O_RDONLY +file: + +Thread 0 Thread 1 Thread 2 + +xfs_inodegc_stop + + xfs_inodegc_shrinker_scan + xfs_is_inodegc_enabled + + +xfs_clear_inodegc_enabled +xfs_inodegc_queue_all + + + xfs_inodegc_queue + + xfs_is_inodegc_enabled + + +drain_workqueue + + + llist_empty + + mod_delayed_work_on(..., 0) + __queue_work + + +In other words, everything between the access to inodegc_enabled state +and the decision to poke the inodegc workqueue requires some kind of +coordination to avoid the WQ_DRAINING state. We could perhaps introduce +a lock here, but we could also try to eliminate WQ_DRAINING from the +picture. + +We could replace the drain_workqueue call with a loop that flushes the +workqueue and queues workers as long as there is at least one inode +present in the per-cpu inodegc llists. We've disabled inodegc at this +point, so we know that the number of queued inodes will eventually hit +zero as long as xfs_inodegc_start cannot reactivate the workers. + +There are four callers of xfs_inodegc_start. Three of them come from the +VFS with s_umount held: filesystem thawing, failed filesystem freezing, +and the rw remount transition. The fourth caller is mounting rw (no +remount or freezing possible). + +There are three callers of xfs_inodegc_stop. One is unmounting (no +remount or thaw possible). Two of them come from the VFS with s_umount +held: fs freezing and ro remount transition. + +Hence, it is correct to replace the drain_workqueue call with a loop +that drains the inodegc llists. + +Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Amir Goldstein +Acked-by: Darrick J.
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 32 +++++++++++++++++++++++++++----- + 1 file changed, 27 insertions(+), 5 deletions(-) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -431,18 +431,23 @@ xfs_iget_check_free_state( + } + + /* Make all pending inactivation work start immediately. */ +-static void ++static bool + xfs_inodegc_queue_all( + struct xfs_mount *mp) + { + struct xfs_inodegc *gc; + int cpu; ++ bool ret = false; + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); +- if (!llist_empty(&gc->list)) ++ if (!llist_empty(&gc->list)) { + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); ++ ret = true; ++ } + } ++ ++ return ret; + } + + /* +@@ -1894,24 +1899,41 @@ xfs_inodegc_flush( + + /* + * Flush all the pending work and then disable the inode inactivation background +- * workers and wait for them to stop. ++ * workers and wait for them to stop. Caller must hold sb->s_umount to ++ * coordinate changes in the inodegc_enabled state. + */ + void + xfs_inodegc_stop( + struct xfs_mount *mp) + { ++ bool rerun; ++ + if (!xfs_clear_inodegc_enabled(mp)) + return; + ++ /* ++ * Drain all pending inodegc work, including inodes that could be ++ * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan ++ * threads that sample the inodegc state just prior to us clearing it. ++ * The inodegc flag state prevents new threads from queuing more ++ * inodes, so we queue pending work items and flush the workqueue until ++ * all inodegc lists are empty. IOWs, we cannot use drain_workqueue ++ * here because it does not allow other unserialized mechanisms to ++ * reschedule inodegc work while this draining is in progress. 
++ */ + xfs_inodegc_queue_all(mp); +- drain_workqueue(mp->m_inodegc_wq); ++ do { ++ flush_workqueue(mp->m_inodegc_wq); ++ rerun = xfs_inodegc_queue_all(mp); ++ } while (rerun); + + trace_xfs_inodegc_stop(mp, __return_address); + } + + /* + * Enable the inode inactivation background workers and schedule deferred inode +- * inactivation work if there is any. ++ * inactivation work if there is any. Caller must hold sb->s_umount to ++ * coordinate changes in the inodegc_enabled state. + */ + void + xfs_inodegc_start(