From: Greg Kroah-Hartman Date: Thu, 29 May 2025 11:41:35 +0000 (+0200) Subject: 6.1-stable patches X-Git-Tag: v5.4.294~43 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2a1ce3921235ddfdda9bb86391e4023e471e7fa0;p=thirdparty%2Fkernel%2Fstable-queue.git 6.1-stable patches added patches: btrfs-check-folio-mapping-after-unlock-in-relocate_one_folio.patch hrtimers-force-migrate-away-hrtimers-queued-after-cpuhp_ap_hrtimers_dying.patch --- diff --git a/queue-6.1/btrfs-check-folio-mapping-after-unlock-in-relocate_one_folio.patch b/queue-6.1/btrfs-check-folio-mapping-after-unlock-in-relocate_one_folio.patch new file mode 100644 index 0000000000..75234eb067 --- /dev/null +++ b/queue-6.1/btrfs-check-folio-mapping-after-unlock-in-relocate_one_folio.patch @@ -0,0 +1,103 @@ +From 3e74859ee35edc33a022c3f3971df066ea0ca6b9 Mon Sep 17 00:00:00 2001 +From: Boris Burkov +Date: Fri, 13 Dec 2024 12:22:32 -0800 +Subject: btrfs: check folio mapping after unlock in relocate_one_folio() + +From: Boris Burkov + +commit 3e74859ee35edc33a022c3f3971df066ea0ca6b9 upstream. + +When we call btrfs_read_folio() to bring a folio uptodate, we unlock the +folio. The result of that is that a different thread can modify the +mapping (like remove it with invalidate) before we call folio_lock(). +This results in an invalid page and we need to try again. + +In particular, if we are relocating concurrently with aborting a +transaction, this can result in a crash like the following: + + BUG: kernel NULL pointer dereference, address: 0000000000000000 + PGD 0 P4D 0 + Oops: 0000 [#1] SMP + CPU: 76 PID: 1411631 Comm: kworker/u322:5 + Workqueue: events_unbound btrfs_reclaim_bgs_work + RIP: 0010:set_page_extent_mapped+0x20/0xb0 + RSP: 0018:ffffc900516a7be8 EFLAGS: 00010246 + RAX: ffffea009e851d08 RBX: ffffea009e0b1880 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: ffffc900516a7b90 RDI: ffffea009e0b1880 + RBP: 0000000003573000 R08: 0000000000000001 R09: ffff88c07fd2f3f0 + R10: 0000000000000000 R11: 0000194754b575be R12: 0000000003572000 + R13: 0000000003572fff R14: 0000000000100cca R15: 0000000005582fff + FS: 0000000000000000(0000) GS:ffff88c07fd00000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000000000 CR3: 000000407d00f002 CR4: 00000000007706f0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + PKRU: 55555554 + Call Trace: + + ? __die+0x78/0xc0 + ? page_fault_oops+0x2a8/0x3a0 + ? __switch_to+0x133/0x530 + ? wq_worker_running+0xa/0x40 + ? exc_page_fault+0x63/0x130 + ? asm_exc_page_fault+0x22/0x30 + ? set_page_extent_mapped+0x20/0xb0 + relocate_file_extent_cluster+0x1a7/0x940 + relocate_data_extent+0xaf/0x120 + relocate_block_group+0x20f/0x480 + btrfs_relocate_block_group+0x152/0x320 + btrfs_relocate_chunk+0x3d/0x120 + btrfs_reclaim_bgs_work+0x2ae/0x4e0 + process_scheduled_works+0x184/0x370 + worker_thread+0xc6/0x3e0 + ? blk_add_timer+0xb0/0xb0 + kthread+0xae/0xe0 + ? flush_tlb_kernel_range+0x90/0x90 + ret_from_fork+0x2f/0x40 + ? flush_tlb_kernel_range+0x90/0x90 + ret_from_fork_asm+0x11/0x20 + + +This occurs because cleanup_one_transaction() calls +destroy_delalloc_inodes() which calls invalidate_inode_pages2() which +takes the folio_lock before setting mapping to NULL. We fail to check +this, and subsequently call set_extent_mapping(), which assumes that +mapping != NULL (in fact it asserts that in debug mode) + +Note that the "fixes" patch here is not the one that introduced the +race (the very first iteration of this code from 2009) but a more recent +change that made this particular crash happen in practice. + +Fixes: e7f1326cc24e ("btrfs: set page extent mapped after read_folio in relocate_one_page") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Qu Wenruo +Signed-off-by: Boris Burkov +Signed-off-by: David Sterba +Signed-off-by: Zhaoyang Li +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/relocation.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -2977,6 +2977,7 @@ static int relocate_one_page(struct inod + int ret; + + ASSERT(page_index <= last_index); ++again: + page = find_lock_page(inode->i_mapping, page_index); + if (!page) { + page_cache_sync_readahead(inode->i_mapping, ra, NULL, +@@ -2998,6 +2999,11 @@ static int relocate_one_page(struct inod + ret = -EIO; + goto release_page; + } ++ if (page->mapping != inode->i_mapping) { ++ unlock_page(page); ++ put_page(page); ++ goto again; ++ } + } + + /* diff --git a/queue-6.1/hrtimers-force-migrate-away-hrtimers-queued-after-cpuhp_ap_hrtimers_dying.patch b/queue-6.1/hrtimers-force-migrate-away-hrtimers-queued-after-cpuhp_ap_hrtimers_dying.patch new file mode 100644 index 0000000000..a572bf84d4 --- /dev/null +++ b/queue-6.1/hrtimers-force-migrate-away-hrtimers-queued-after-cpuhp_ap_hrtimers_dying.patch @@ -0,0 +1,270 @@ +From 53dac345395c0d2493cbc2f4c85fe38aef5b63f5 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Sat, 18 Jan 2025 00:24:33 +0100 +Subject: hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING + +From: Frederic Weisbecker + +commit 53dac345395c0d2493cbc2f4c85fe38aef5b63f5 upstream. + +hrtimers are migrated away from the dying CPU to any online target at +the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers +handling tasks involved in the CPU hotplug forward progress. + +However wakeups can still be performed by the outgoing CPU after +CPUHP_AP_HRTIMERS_DYING. Those can result again in bandwidth timers being +armed. Depending on several considerations (crystal ball power management +based election, earliest timer already enqueued, timer migration enabled or +not), the target may eventually be the current CPU even if offline. If that +happens, the timer is eventually ignored. + +The most notable example is RCU which had to deal with each and every of +those wake-ups by deferring them to an online CPU, along with related +workarounds: + +_ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying) +_ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU) +_ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq) + +The problem isn't confined to RCU though as the stop machine kthread +(which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end +of its work through cpu_stop_signal_done() and performs a wake up that +eventually arms the deadline server timer: + + WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0 + CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted + Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0 + RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0 + Call Trace: + + start_dl_timer + enqueue_dl_entity + dl_server_start + enqueue_task_fair + enqueue_task + ttwu_do_activate + try_to_wake_up + complete + cpu_stopper_thread + +Instead of providing yet another bandaid to work around the situation, fix +it in the hrtimers infrastructure instead: always migrate away a timer to +an online target whenever it is enqueued from an offline CPU. + +This will also allow to revert all the above RCU disgraceful hacks. + +Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") +Reported-by: Vlad Poenaru +Reported-by: Usama Arif +Signed-off-by: Frederic Weisbecker +Signed-off-by: Paul E. McKenney +Signed-off-by: Thomas Gleixner +Cc: stable@vger.kernel.org +Tested-by: Paul E. McKenney +Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org +Closes: 20241213203739.1519801-1-usamaarif642@gmail.com +Signed-off-by: Zhaoyang Li +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/hrtimer.h | 1 + kernel/time/hrtimer.c | 103 ++++++++++++++++++++++++++++++++++++++---------- + 2 files changed, 83 insertions(+), 21 deletions(-) + +--- a/include/linux/hrtimer.h ++++ b/include/linux/hrtimer.h +@@ -237,6 +237,7 @@ struct hrtimer_cpu_base { + ktime_t softirq_expires_next; + struct hrtimer *softirq_next_timer; + struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; ++ call_single_data_t csd; + } ____cacheline_aligned; + + static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -58,6 +58,8 @@ + #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) + #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) + ++static void retrigger_next_event(void *arg); ++ + /* + * The timer bases: + * +@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + }, +- } ++ }, ++ .csd = CSD_INIT(retrigger_next_event, NULL) + }; + + static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { +@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_t + [CLOCK_TAI] = HRTIMER_BASE_TAI, + }; + ++static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) ++{ ++ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) ++ return true; ++ else ++ return likely(base->online); ++} ++ + /* + * Functions and macros which are different for UP/SMP systems are kept in a + * single place +@@ -177,27 +188,54 @@ struct hrtimer_clock_base *lock_hrtimer_ + } + + /* +- * We do not migrate the timer when it is expiring before the next +- * event on the target cpu. When high resolution is enabled, we cannot +- * reprogram the target cpu hardware and we would cause it to fire +- * late. To keep it simple, we handle the high resolution enabled and +- * disabled case similar. ++ * Check if the elected target is suitable considering its next ++ * event and the hotplug state of the current CPU. ++ * ++ * If the elected target is remote and its next event is after the timer ++ * to queue, then a remote reprogram is necessary. However there is no ++ * guarantee the IPI handling the operation would arrive in time to meet ++ * the high resolution deadline. In this case the local CPU becomes a ++ * preferred target, unless it is offline. ++ * ++ * High and low resolution modes are handled the same way for simplicity. + * + * Called with cpu_base->lock of target cpu held. + */ +-static int +-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) ++static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, ++ struct hrtimer_cpu_base *new_cpu_base, ++ struct hrtimer_cpu_base *this_cpu_base) + { + ktime_t expires; + ++ /* ++ * The local CPU clockevent can be reprogrammed. Also get_target_base() ++ * guarantees it is online. ++ */ ++ if (new_cpu_base == this_cpu_base) ++ return true; ++ ++ /* ++ * The offline local CPU can't be the default target if the ++ * next remote target event is after this timer. Keep the ++ * elected new base. An IPI will we issued to reprogram ++ * it as a last resort. ++ */ ++ if (!hrtimer_base_is_online(this_cpu_base)) ++ return true; ++ + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); +- return expires < new_base->cpu_base->expires_next; ++ ++ return expires >= new_base->cpu_base->expires_next; + } + +-static inline +-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, +- int pinned) ++static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) + { ++ if (!hrtimer_base_is_online(base)) { ++ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); ++ ++ return &per_cpu(hrtimer_bases, cpu); ++ } ++ + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && !pinned) + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +@@ -248,8 +286,8 @@ again: + raw_spin_unlock(&base->cpu_base->lock); + raw_spin_lock(&new_base->cpu_base->lock); + +- if (new_cpu_base != this_cpu_base && +- hrtimer_check_target(timer, new_base)) { ++ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, ++ this_cpu_base)) { + raw_spin_unlock(&new_base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_cpu_base; +@@ -258,8 +296,7 @@ again: + } + WRITE_ONCE(timer->base, new_base); + } else { +- if (new_cpu_base != this_cpu_base && +- hrtimer_check_target(timer, new_base)) { ++ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { + new_cpu_base = this_cpu_base; + goto again; + } +@@ -718,8 +755,6 @@ static inline int hrtimer_is_hres_enable + return hrtimer_hres_enabled; + } + +-static void retrigger_next_event(void *arg); +- + /* + * Switch to high resolution mode + */ +@@ -1205,6 +1240,7 @@ static int __hrtimer_start_range_ns(stru + u64 delta_ns, const enum hrtimer_mode mode, + struct hrtimer_clock_base *base) + { ++ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); + struct hrtimer_clock_base *new_base; + bool force_local, first; + +@@ -1216,10 +1252,16 @@ static int __hrtimer_start_range_ns(stru + * and enforce reprogramming after it is queued no matter whether + * it is the new first expiring timer again or not. + */ +- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); ++ force_local = base->cpu_base == this_cpu_base; + force_local &= base->cpu_base->next_timer == timer; + + /* ++ * Don't force local queuing if this enqueue happens on a unplugged ++ * CPU after hrtimer_cpu_dying() has been invoked. ++ */ ++ force_local &= this_cpu_base->online; ++ ++ /* + * Remove an active timer from the queue. In case it is not queued + * on the current CPU, make sure that remove_hrtimer() updates the + * remote data correctly. +@@ -1248,8 +1290,27 @@ static int __hrtimer_start_range_ns(stru + } + + first = enqueue_hrtimer(timer, new_base, mode); +- if (!force_local) +- return first; ++ if (!force_local) { ++ /* ++ * If the current CPU base is online, then the timer is ++ * never queued on a remote CPU if it would be the first ++ * expiring timer there. ++ */ ++ if (hrtimer_base_is_online(this_cpu_base)) ++ return first; ++ ++ /* ++ * Timer was enqueued remote because the current base is ++ * already offline. If the timer is the first to expire, ++ * kick the remote CPU to reprogram the clock event. ++ */ ++ if (first) { ++ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; ++ ++ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); ++ } ++ return 0; ++ } + + /* + * Timer was forced to stay on the current CPU to avoid diff --git a/queue-6.1/series b/queue-6.1/series index f6a8ed74da..578ed6403b 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -276,3 +276,5 @@ dmaengine-idxd-fix-passing-freed-memory-in-idxd_cdev_open.patch octeontx2-pf-fix-page_pool-creation-fail-for-rings-32k.patch octeontx2-pf-fix-page-pool-cache-index-corruption.patch octeontx2-pf-fix-page-pool-frag-allocation-warning.patch +hrtimers-force-migrate-away-hrtimers-queued-after-cpuhp_ap_hrtimers_dying.patch +btrfs-check-folio-mapping-after-unlock-in-relocate_one_folio.patch