git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for all trees
author Sasha Levin <sashal@kernel.org>
Thu, 30 Oct 2025 01:57:59 +0000 (21:57 -0400)
committer Sasha Levin <sashal@kernel.org>
Thu, 30 Oct 2025 01:57:59 +0000 (21:57 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
95 files changed:
queue-5.10/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch [new file with mode: 0644]
queue-5.10/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch [new file with mode: 0644]
queue-5.10/series
queue-5.10/x86-bugs-fix-reporting-of-lfence-retpoline.patch [new file with mode: 0644]
queue-5.15/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch [new file with mode: 0644]
queue-5.15/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch [new file with mode: 0644]
queue-5.15/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch [new file with mode: 0644]
queue-5.15/series
queue-5.15/x86-bugs-fix-reporting-of-lfence-retpoline.patch [new file with mode: 0644]
queue-5.4/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch [new file with mode: 0644]
queue-5.4/series
queue-5.4/x86-bugs-fix-reporting-of-lfence-retpoline.patch [new file with mode: 0644]
queue-6.1/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch [new file with mode: 0644]
queue-6.1/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch [new file with mode: 0644]
queue-6.1/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch [new file with mode: 0644]
queue-6.1/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch [new file with mode: 0644]
queue-6.1/btrfs-zoned-refine-extent-allocator-hint-selection.patch [new file with mode: 0644]
queue-6.1/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch [new file with mode: 0644]
queue-6.1/perf-have-get_perf_callchain-return-null-if-crosstas.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/x86-bugs-fix-reporting-of-lfence-retpoline.patch [new file with mode: 0644]
queue-6.12/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch [new file with mode: 0644]
queue-6.12/audit-record-fanotify-event-regardless-of-presence-o.patch [new file with mode: 0644]
queue-6.12/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch [new file with mode: 0644]
queue-6.12/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch [new file with mode: 0644]
queue-6.12/btrfs-abort-transaction-on-specific-error-places-whe.patch [new file with mode: 0644]
queue-6.12/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch [new file with mode: 0644]
queue-6.12/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch [new file with mode: 0644]
queue-6.12/btrfs-tree-checker-add-inode-extref-checks.patch [new file with mode: 0644]
queue-6.12/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch [new file with mode: 0644]
queue-6.12/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch [new file with mode: 0644]
queue-6.12/btrfs-zoned-refine-extent-allocator-hint-selection.patch [new file with mode: 0644]
queue-6.12/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch [new file with mode: 0644]
queue-6.12/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch [new file with mode: 0644]
queue-6.12/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch [new file with mode: 0644]
queue-6.12/perf-have-get_perf_callchain-return-null-if-crosstas.patch [new file with mode: 0644]
queue-6.12/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch [new file with mode: 0644]
queue-6.12/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch [new file with mode: 0644]
queue-6.12/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch [new file with mode: 0644]
queue-6.12/sched_ext-make-qmap-dump-operation-non-destructive.patch [new file with mode: 0644]
queue-6.12/seccomp-passthrough-uprobe-systemcall-without-filter.patch [new file with mode: 0644]
queue-6.12/series
queue-6.12/x86-bugs-fix-reporting-of-lfence-retpoline.patch [new file with mode: 0644]
queue-6.12/x86-bugs-report-correct-retbleed-mitigation-status.patch [new file with mode: 0644]
queue-6.17/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch [new file with mode: 0644]
queue-6.17/audit-record-fanotify-event-regardless-of-presence-o.patch [new file with mode: 0644]
queue-6.17/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch [new file with mode: 0644]
queue-6.17/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch [new file with mode: 0644]
queue-6.17/btrfs-abort-transaction-on-specific-error-places-whe.patch [new file with mode: 0644]
queue-6.17/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch [new file with mode: 0644]
queue-6.17/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch [new file with mode: 0644]
queue-6.17/btrfs-tree-checker-add-inode-extref-checks.patch [new file with mode: 0644]
queue-6.17/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch [new file with mode: 0644]
queue-6.17/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch [new file with mode: 0644]
queue-6.17/btrfs-zoned-refine-extent-allocator-hint-selection.patch [new file with mode: 0644]
queue-6.17/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch [new file with mode: 0644]
queue-6.17/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch [new file with mode: 0644]
queue-6.17/edac-fix-wrong-executable-file-modes-for-c-source-fi.patch [new file with mode: 0644]
queue-6.17/edac-ie31200-add-two-more-intel-alder-lake-s-socs-fo.patch [new file with mode: 0644]
queue-6.17/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch [new file with mode: 0644]
queue-6.17/genirq-chip-add-buslock-back-in-to-irq_set_handler.patch [new file with mode: 0644]
queue-6.17/genirq-manage-add-buslock-back-in-to-__disable_irq_n.patch [new file with mode: 0644]
queue-6.17/genirq-manage-add-buslock-back-in-to-enable_irq.patch [new file with mode: 0644]
queue-6.17/perf-have-get_perf_callchain-return-null-if-crosstas.patch [new file with mode: 0644]
queue-6.17/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch [new file with mode: 0644]
queue-6.17/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch [new file with mode: 0644]
queue-6.17/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch [new file with mode: 0644]
queue-6.17/sched-fair-update_cfs_group-for-throttled-cfs_rqs.patch [new file with mode: 0644]
queue-6.17/sched_ext-keep-bypass-on-between-enable-failure-and-.patch [new file with mode: 0644]
queue-6.17/sched_ext-make-qmap-dump-operation-non-destructive.patch [new file with mode: 0644]
queue-6.17/sched_ext-move-internal-type-and-accessor-definition.patch [new file with mode: 0644]
queue-6.17/sched_ext-put-event_stats_cpu-in-struct-scx_sched_pc.patch [new file with mode: 0644]
queue-6.17/sched_ext-sync-error_irq_work-before-freeing-scx_sch.patch [new file with mode: 0644]
queue-6.17/seccomp-passthrough-uprobe-systemcall-without-filter.patch [new file with mode: 0644]
queue-6.17/series [new file with mode: 0644]
queue-6.17/timekeeping-fix-aux-clocks-sysfs-initialization-loop.patch [new file with mode: 0644]
queue-6.17/x86-bugs-add-attack-vector-controls-for-vmscape.patch [new file with mode: 0644]
queue-6.17/x86-bugs-fix-reporting-of-lfence-retpoline.patch [new file with mode: 0644]
queue-6.17/x86-bugs-qualify-retbleed_intel_msg.patch [new file with mode: 0644]
queue-6.17/x86-bugs-report-correct-retbleed-mitigation-status.patch [new file with mode: 0644]
queue-6.6/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch [new file with mode: 0644]
queue-6.6/audit-record-fanotify-event-regardless-of-presence-o.patch [new file with mode: 0644]
queue-6.6/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch [new file with mode: 0644]
queue-6.6/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch [new file with mode: 0644]
queue-6.6/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch [new file with mode: 0644]
queue-6.6/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch [new file with mode: 0644]
queue-6.6/btrfs-zoned-refine-extent-allocator-hint-selection.patch [new file with mode: 0644]
queue-6.6/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch [new file with mode: 0644]
queue-6.6/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch [new file with mode: 0644]
queue-6.6/perf-have-get_perf_callchain-return-null-if-crosstas.patch [new file with mode: 0644]
queue-6.6/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch [new file with mode: 0644]
queue-6.6/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch [new file with mode: 0644]
queue-6.6/series
queue-6.6/x86-bugs-fix-reporting-of-lfence-retpoline.patch [new file with mode: 0644]
queue-6.6/x86-bugs-report-correct-retbleed-mitigation-status.patch [new file with mode: 0644]

diff --git a/queue-5.10/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-5.10/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
new file mode 100644 (file)
index 0000000..9ccc4f8
--- /dev/null
@@ -0,0 +1,63 @@
+From 7f87f0b776b9a3722815a1bc1b527e3d1c90f646 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Aug 2025 12:10:28 +0100
+Subject: btrfs: always drop log root tree reference in btrfs_replay_log()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ]
+
+Currently we have this odd behaviour:
+
+1) At btrfs_replay_log() we drop the reference of the log root tree if
+   the call to btrfs_recover_log_trees() failed;
+
+2) But if the call to btrfs_recover_log_trees() did not fail, we don't
+   drop the reference in btrfs_replay_log() - we expect that
+   btrfs_recover_log_trees() does it in case it returns success.
+
+Let's simplify this and make btrfs_replay_log() always drop the reference
+on the log root tree, not only this simplifies code as it's what makes
+sense since it's btrfs_replay_log() who grabbed the reference in the first
+place.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/disk-io.c  | 2 +-
+ fs/btrfs/tree-log.c | 1 -
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index 91475cb7d568b..29f0ba4adfbce 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2309,10 +2309,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+       }
+       /* returns with log_tree_root freed on success */
+       ret = btrfs_recover_log_trees(log_tree_root);
++      btrfs_put_root(log_tree_root);
+       if (ret) {
+               btrfs_handle_fs_error(fs_info, ret,
+                                     "Failed to recover log tree");
+-              btrfs_put_root(log_tree_root);
+               return ret;
+       }
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 6d715bb773643..cdb5a2770faf3 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -6432,7 +6432,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+       log_root_tree->log_root = NULL;
+       clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
+-      btrfs_put_root(log_root_tree);
+       return 0;
+ error:
+-- 
+2.51.0
+
diff --git a/queue-5.10/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-5.10/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
new file mode 100644 (file)
index 0000000..d21460e
--- /dev/null
@@ -0,0 +1,58 @@
+From 581461587a3316a3c933c4e90962e0a37cad44a6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Sep 2025 12:09:14 +0100
+Subject: btrfs: use smp_mb__after_atomic() when forcing COW in
+ create_pending_snapshot()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ]
+
+After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a
+full write barrier, smp_wmb(), but we don't need to, all we need is a
+smp_mb__after_atomic().  The use of the smp_wmb() is from the old days
+when we didn't use a bit and used instead an int field in the root to
+signal if cow is forced. After the int field was changed to a bit in
+the root's state (flags field), we forgot to update the memory barrier
+in create_pending_snapshot() to smp_mb__after_atomic(), but we did the
+change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That
+happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer
+data type for the some variants in btrfs_root"). On the reader side, in
+should_cow_block(), we also use the counterpart smp_mb__before_atomic()
+which generates further confusion.
+
+So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't
+even need any barrier at all since create_pending_snapshot() is called
+in the critical section of a transaction commit and therefore no one
+can concurrently join/attach the transaction, or start a new one, until
+the transaction is unblocked. By the time someone starts a new transaction
+and enters should_cow_block(), a lot of implicit memory barriers already
+took place by having acquired several locks such as fs_info->trans_lock
+and extent buffer locks on the root node at least. Nevertheless, for
+consistency use smp_mb__after_atomic() after setting the force cow bit
+in create_pending_snapshot().
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/transaction.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
+index f68cfcc1f8300..d558f354b8b82 100644
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1660,7 +1660,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+       }
+       /* see comments in should_cow_block() */
+       set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+-      smp_wmb();
++      smp_mb__after_atomic();
+       btrfs_set_root_node(new_root_item, tmp);
+       /* record when the snapshot was created in key.offset */
+-- 
+2.51.0
+
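[ Editorial note, not part of the queued patch: a minimal sketch of the bit-flag publication pattern the commit message describes. The flag bit, state word and helper names below are made up for illustration; only set_bit()/test_bit() and the smp_mb__{before,after}_atomic() barriers are real kernel APIs. The writer pairs the atomic set_bit() with smp_mb__after_atomic(), mirroring create_pending_snapshot(); the reader pairs test_bit() with smp_mb__before_atomic(), mirroring should_cow_block(). ]

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/types.h>

#define MY_FORCE_COW	0		/* hypothetical flag bit */
static unsigned long my_root_state;	/* hypothetical state word */

static void writer_publish_flag(void)
{
	set_bit(MY_FORCE_COW, &my_root_state);	/* atomic RMW, no ordering implied */
	smp_mb__after_atomic();			/* order the set_bit() before later stores */
}

static bool reader_sees_flag(void)
{
	smp_mb__before_atomic();		/* counterpart barrier on the reader side */
	return test_bit(MY_FORCE_COW, &my_root_state);
}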
diff --git a/queue-5.10/series b/queue-5.10/series
index 64bc2c1eea0a4e8a555bb14ebe15152b3d8d903c..a95895e8398374e781fa4a77926b723aba77091e 100644 (file)
@@ -1 +1,4 @@
 net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch
+x86-bugs-fix-reporting-of-lfence-retpoline.patch
+btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
+btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
diff --git a/queue-5.10/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-5.10/x86-bugs-fix-reporting-of-lfence-retpoline.patch
new file mode 100644 (file)
index 0000000..b7fb2b5
--- /dev/null
@@ -0,0 +1,51 @@
+From b65ea0af0079f32b432165b33550d4e95b18c8ed Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Sep 2025 08:47:05 -0500
+Subject: x86/bugs: Fix reporting of LFENCE retpoline
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ]
+
+The LFENCE retpoline mitigation is not secure but the kernel prints
+inconsistent messages about this fact.  The dmesg log says 'Mitigation:
+LFENCE', implying the system is mitigated.  But sysfs reports 'Vulnerable:
+LFENCE' implying the system (correctly) is not mitigated.
+
+Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere
+when this mitigation is selected.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index 8794e3f4974b3..57ba697e29180 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1508,7 +1508,7 @@ spectre_v2_user_select_mitigation(void)
+ static const char * const spectre_v2_strings[] = {
+       [SPECTRE_V2_NONE]                       = "Vulnerable",
+       [SPECTRE_V2_RETPOLINE]                  = "Mitigation: Retpolines",
+-      [SPECTRE_V2_LFENCE]                     = "Mitigation: LFENCE",
++      [SPECTRE_V2_LFENCE]                     = "Vulnerable: LFENCE",
+       [SPECTRE_V2_EIBRS]                      = "Mitigation: Enhanced / Automatic IBRS",
+       [SPECTRE_V2_EIBRS_LFENCE]               = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+       [SPECTRE_V2_EIBRS_RETPOLINE]            = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
+@@ -3011,9 +3011,6 @@ static char *pbrsb_eibrs_state(void)
+ static ssize_t spectre_v2_show_state(char *buf)
+ {
+-      if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+-              return sysfs_emit(buf, "Vulnerable: LFENCE\n");
+-
+       if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+               return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+-- 
+2.51.0
+
diff --git a/queue-5.15/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-5.15/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
new file mode 100644 (file)
index 0000000..9528641
--- /dev/null
@@ -0,0 +1,63 @@
+From 8b977547797ba2015b11f98dbf944df250edf3c0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Aug 2025 12:10:28 +0100
+Subject: btrfs: always drop log root tree reference in btrfs_replay_log()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ]
+
+Currently we have this odd behaviour:
+
+1) At btrfs_replay_log() we drop the reference of the log root tree if
+   the call to btrfs_recover_log_trees() failed;
+
+2) But if the call to btrfs_recover_log_trees() did not fail, we don't
+   drop the reference in btrfs_replay_log() - we expect that
+   btrfs_recover_log_trees() does it in case it returns success.
+
+Let's simplify this and make btrfs_replay_log() always drop the reference
+on the log root tree, not only this simplifies code as it's what makes
+sense since it's btrfs_replay_log() who grabbed the reference in the first
+place.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/disk-io.c  | 2 +-
+ fs/btrfs/tree-log.c | 1 -
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index 9c2d6f96f46da..136902f27e441 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2387,10 +2387,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+       }
+       /* returns with log_tree_root freed on success */
+       ret = btrfs_recover_log_trees(log_tree_root);
++      btrfs_put_root(log_tree_root);
+       if (ret) {
+               btrfs_handle_fs_error(fs_info, ret,
+                                     "Failed to recover log tree");
+-              btrfs_put_root(log_tree_root);
+               return ret;
+       }
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 34fedac4e1864..445c7a5641b62 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -6513,7 +6513,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+       log_root_tree->log_root = NULL;
+       clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
+-      btrfs_put_root(log_root_tree);
+       return 0;
+ error:
+-- 
+2.51.0
+
diff --git a/queue-5.15/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch b/queue-5.15/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch
new file mode 100644 (file)
index 0000000..4f329bf
--- /dev/null
@@ -0,0 +1,44 @@
+From f476a1c6de77130f5290f607f2fd2094dd813876 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 1 Sep 2025 17:01:44 +0200
+Subject: btrfs: scrub: replace max_t()/min_t() with clamp() in
+ scrub_throttle_dev_io()
+
+From: Thorsten Blum <thorsten.blum@linux.dev>
+
+[ Upstream commit a7f3dfb8293c4cee99743132d69863a92e8f4875 ]
+
+Replace max_t() followed by min_t() with a single clamp().
+
+As was pointed by David Laight in
+https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/
+the calculation may overflow u32 when the input value is too large, so
+clamp_t() is not used.  In practice the expected values are in range of
+megabytes to gigabytes (throughput limit) so the bug would not happen.
+
+Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ Use clamp() and add explanation. ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/scrub.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
+index 6ffd34d39e992..aac4ee5880952 100644
+--- a/fs/btrfs/scrub.c
++++ b/fs/btrfs/scrub.c
+@@ -2017,8 +2017,7 @@ static void scrub_throttle(struct scrub_ctx *sctx)
+        * Slice is divided into intervals when the IO is submitted, adjust by
+        * bwlimit and maximum of 64 intervals.
+        */
+-      div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+-      div = min_t(u32, 64, div);
++      div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);
+       /* Start new epoch, set deadline */
+       now = ktime_get();
+-- 
+2.51.0
+
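[ Editorial note, not part of the queued patch: the effect of the single clamp() is easy to check by hand with the values the commit message mentions. For a scrub throughput limit of 100 MiB/s, bwlimit / (16 * 1024 * 1024) = 104857600 / 16777216 = 6 by integer division, which lies inside [1, 64] and is used as-is; at exactly 1 GiB/s the quotient is 64; above that it is clamped down to 64, and below 16 MiB/s the quotient 0 is raised to 1. These are the same results the old max_t()/min_t() pair produced, but the division now stays in the u64 type of bwlimit, so the u32 truncation that clamp_t(u32, ...) could introduce for absurdly large limits cannot happen. ]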
diff --git a/queue-5.15/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-5.15/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
new file mode 100644 (file)
index 0000000..2c6a645
--- /dev/null
@@ -0,0 +1,58 @@
+From 9bd34bcc56053310ef3ea6b6c0255bd75c8227be Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Sep 2025 12:09:14 +0100
+Subject: btrfs: use smp_mb__after_atomic() when forcing COW in
+ create_pending_snapshot()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ]
+
+After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a
+full write barrier, smp_wmb(), but we don't need to, all we need is a
+smp_mb__after_atomic().  The use of the smp_wmb() is from the old days
+when we didn't use a bit and used instead an int field in the root to
+signal if cow is forced. After the int field was changed to a bit in
+the root's state (flags field), we forgot to update the memory barrier
+in create_pending_snapshot() to smp_mb__after_atomic(), but we did the
+change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That
+happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer
+data type for the some variants in btrfs_root"). On the reader side, in
+should_cow_block(), we also use the counterpart smp_mb__before_atomic()
+which generates further confusion.
+
+So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't
+even need any barrier at all since create_pending_snapshot() is called
+in the critical section of a transaction commit and therefore no one
+can concurrently join/attach the transaction, or start a new one, until
+the transaction is unblocked. By the time someone starts a new transaction
+and enters should_cow_block(), a lot of implicit memory barriers already
+took place by having acquired several locks such as fs_info->trans_lock
+and extent buffer locks on the root node at least. Nevertheless, for
+consistency use smp_mb__after_atomic() after setting the force cow bit
+in create_pending_snapshot().
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/transaction.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
+index 4fb5e12c87d1b..d96221ed835e9 100644
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1765,7 +1765,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+       }
+       /* see comments in should_cow_block() */
+       set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+-      smp_wmb();
++      smp_mb__after_atomic();
+       btrfs_set_root_node(new_root_item, tmp);
+       /* record when the snapshot was created in key.offset */
+-- 
+2.51.0
+
diff --git a/queue-5.15/series b/queue-5.15/series
index 64bc2c1eea0a4e8a555bb14ebe15152b3d8d903c..447d39e94a5341ef66700dc1ab611fa7c2e4d84c 100644 (file)
@@ -1 +1,5 @@
 net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch
+x86-bugs-fix-reporting-of-lfence-retpoline.patch
+btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch
+btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
+btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
diff --git a/queue-5.15/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-5.15/x86-bugs-fix-reporting-of-lfence-retpoline.patch
new file mode 100644 (file)
index 0000000..3f37fa6
--- /dev/null
@@ -0,0 +1,51 @@
+From 28fcb9a170c8be4e2920ecb17a4b5f15e5681b81 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Sep 2025 08:47:05 -0500
+Subject: x86/bugs: Fix reporting of LFENCE retpoline
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ]
+
+The LFENCE retpoline mitigation is not secure but the kernel prints
+inconsistent messages about this fact.  The dmesg log says 'Mitigation:
+LFENCE', implying the system is mitigated.  But sysfs reports 'Vulnerable:
+LFENCE' implying the system (correctly) is not mitigated.
+
+Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere
+when this mitigation is selected.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index 1628c00145892..8df48691f4910 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1539,7 +1539,7 @@ spectre_v2_user_select_mitigation(void)
+ static const char * const spectre_v2_strings[] = {
+       [SPECTRE_V2_NONE]                       = "Vulnerable",
+       [SPECTRE_V2_RETPOLINE]                  = "Mitigation: Retpolines",
+-      [SPECTRE_V2_LFENCE]                     = "Mitigation: LFENCE",
++      [SPECTRE_V2_LFENCE]                     = "Vulnerable: LFENCE",
+       [SPECTRE_V2_EIBRS]                      = "Mitigation: Enhanced / Automatic IBRS",
+       [SPECTRE_V2_EIBRS_LFENCE]               = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+       [SPECTRE_V2_EIBRS_RETPOLINE]            = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
+@@ -3168,9 +3168,6 @@ static const char *spectre_bhi_state(void)
+ static ssize_t spectre_v2_show_state(char *buf)
+ {
+-      if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+-              return sysfs_emit(buf, "Vulnerable: LFENCE\n");
+-
+       if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+               return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+-- 
+2.51.0
+
diff --git a/queue-5.4/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-5.4/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
new file mode 100644 (file)
index 0000000..438a7aa
--- /dev/null
@@ -0,0 +1,58 @@
+From 2524d3603db07d1acaf0af4f49c597e74dcd7b07 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Sep 2025 12:09:14 +0100
+Subject: btrfs: use smp_mb__after_atomic() when forcing COW in
+ create_pending_snapshot()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ]
+
+After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a
+full write barrier, smp_wmb(), but we don't need to, all we need is a
+smp_mb__after_atomic().  The use of the smp_wmb() is from the old days
+when we didn't use a bit and used instead an int field in the root to
+signal if cow is forced. After the int field was changed to a bit in
+the root's state (flags field), we forgot to update the memory barrier
+in create_pending_snapshot() to smp_mb__after_atomic(), but we did the
+change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That
+happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer
+data type for the some variants in btrfs_root"). On the reader side, in
+should_cow_block(), we also use the counterpart smp_mb__before_atomic()
+which generates further confusion.
+
+So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't
+even need any barrier at all since create_pending_snapshot() is called
+in the critical section of a transaction commit and therefore no one
+can concurrently join/attach the transaction, or start a new one, until
+the transaction is unblocked. By the time someone starts a new transaction
+and enters should_cow_block(), a lot of implicit memory barriers already
+took place by having acquired several locks such as fs_info->trans_lock
+and extent buffer locks on the root node at least. Nevertheless, for
+consistency use smp_mb__after_atomic() after setting the force cow bit
+in create_pending_snapshot().
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/transaction.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
+index 094b024bbf0cf..6618b42defed7 100644
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1546,7 +1546,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+       }
+       /* see comments in should_cow_block() */
+       set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+-      smp_wmb();
++      smp_mb__after_atomic();
+       btrfs_set_root_node(new_root_item, tmp);
+       /* record when the snapshot was created in key.offset */
+-- 
+2.51.0
+
diff --git a/queue-5.4/series b/queue-5.4/series
index 64bc2c1eea0a4e8a555bb14ebe15152b3d8d903c..342fc0f59035a1cc110922470bead724d4745665 100644 (file)
@@ -1 +1,3 @@
 net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch
+x86-bugs-fix-reporting-of-lfence-retpoline.patch
+btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
diff --git a/queue-5.4/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-5.4/x86-bugs-fix-reporting-of-lfence-retpoline.patch
new file mode 100644 (file)
index 0000000..b03e77f
--- /dev/null
@@ -0,0 +1,51 @@
+From e39da45843868dd70b02049cb59d491ab47105f3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Sep 2025 08:47:05 -0500
+Subject: x86/bugs: Fix reporting of LFENCE retpoline
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ]
+
+The LFENCE retpoline mitigation is not secure but the kernel prints
+inconsistent messages about this fact.  The dmesg log says 'Mitigation:
+LFENCE', implying the system is mitigated.  But sysfs reports 'Vulnerable:
+LFENCE' implying the system (correctly) is not mitigated.
+
+Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere
+when this mitigation is selected.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index 4f803aed2ef0e..b10e257799c16 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1188,7 +1188,7 @@ spectre_v2_user_select_mitigation(void)
+ static const char * const spectre_v2_strings[] = {
+       [SPECTRE_V2_NONE]                       = "Vulnerable",
+       [SPECTRE_V2_RETPOLINE]                  = "Mitigation: Retpolines",
+-      [SPECTRE_V2_LFENCE]                     = "Mitigation: LFENCE",
++      [SPECTRE_V2_LFENCE]                     = "Vulnerable: LFENCE",
+       [SPECTRE_V2_EIBRS]                      = "Mitigation: Enhanced / Automatic IBRS",
+       [SPECTRE_V2_EIBRS_LFENCE]               = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+       [SPECTRE_V2_EIBRS_RETPOLINE]            = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
+@@ -2280,9 +2280,6 @@ static char *pbrsb_eibrs_state(void)
+ static ssize_t spectre_v2_show_state(char *buf)
+ {
+-      if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+-              return sysfs_emit(buf, "Vulnerable: LFENCE\n");
+-
+       if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+               return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+-- 
+2.51.0
+
diff --git a/queue-6.1/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch b/queue-6.1/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch
new file mode 100644 (file)
index 0000000..60ccf17
--- /dev/null
@@ -0,0 +1,295 @@
+From ffbe1930ee87e820eb9bd3809625807a5535f61c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Sep 2025 14:09:13 +0800
+Subject: arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c
+
+From: Menglong Dong <menglong8.dong@gmail.com>
+
+[ Upstream commit 35561bab768977c9e05f1f1a9bc00134c85f3e28 ]
+
+The include/generated/asm-offsets.h is generated in Kbuild during
+compiling from arch/SRCARCH/kernel/asm-offsets.c. When we want to
+generate another similar offset header file, circular dependency can
+happen.
+
+For example, we want to generate a offset file include/generated/test.h,
+which is included in include/sched/sched.h. If we generate asm-offsets.h
+first, it will fail, as include/sched/sched.h is included in asm-offsets.c
+and include/generated/test.h doesn't exist; If we generate test.h first,
+it can't success neither, as include/generated/asm-offsets.h is included
+by it.
+
+In x86_64, the macro COMPILE_OFFSETS is used to avoid such circular
+dependency. We can generate asm-offsets.h first, and if the
+COMPILE_OFFSETS is defined, we don't include the "generated/test.h".
+
+And we define the macro COMPILE_OFFSETS for all the asm-offsets.c for this
+purpose.
+
+Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/alpha/kernel/asm-offsets.c      | 1 +
+ arch/arc/kernel/asm-offsets.c        | 1 +
+ arch/arm/kernel/asm-offsets.c        | 2 ++
+ arch/arm64/kernel/asm-offsets.c      | 1 +
+ arch/csky/kernel/asm-offsets.c       | 1 +
+ arch/hexagon/kernel/asm-offsets.c    | 1 +
+ arch/loongarch/kernel/asm-offsets.c  | 2 ++
+ arch/m68k/kernel/asm-offsets.c       | 1 +
+ arch/microblaze/kernel/asm-offsets.c | 1 +
+ arch/mips/kernel/asm-offsets.c       | 2 ++
+ arch/nios2/kernel/asm-offsets.c      | 1 +
+ arch/openrisc/kernel/asm-offsets.c   | 1 +
+ arch/parisc/kernel/asm-offsets.c     | 1 +
+ arch/powerpc/kernel/asm-offsets.c    | 1 +
+ arch/riscv/kernel/asm-offsets.c      | 1 +
+ arch/s390/kernel/asm-offsets.c       | 1 +
+ arch/sh/kernel/asm-offsets.c         | 1 +
+ arch/sparc/kernel/asm-offsets.c      | 1 +
+ arch/um/kernel/asm-offsets.c         | 2 ++
+ arch/xtensa/kernel/asm-offsets.c     | 1 +
+ 20 files changed, 24 insertions(+)
+
+diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c
+index 05d9296af5ea6..a251f1bc74acf 100644
+--- a/arch/alpha/kernel/asm-offsets.c
++++ b/arch/alpha/kernel/asm-offsets.c
+@@ -4,6 +4,7 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/types.h>
+ #include <linux/stddef.h>
+diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c
+index 0e884036ab743..897dcfc7c9fa0 100644
+--- a/arch/arc/kernel/asm-offsets.c
++++ b/arch/arc/kernel/asm-offsets.c
+@@ -2,6 +2,7 @@
+ /*
+  * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
+  */
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
+index 2c8d76fd7c662..820bc05685bab 100644
+--- a/arch/arm/kernel/asm-offsets.c
++++ b/arch/arm/kernel/asm-offsets.c
+@@ -7,6 +7,8 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/compiler.h>
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
+index 1197e7679882e..4785e8947f520 100644
+--- a/arch/arm64/kernel/asm-offsets.c
++++ b/arch/arm64/kernel/asm-offsets.c
+@@ -6,6 +6,7 @@
+  *               2001-2002 Keith Owens
+  * Copyright (C) 2012 ARM Ltd.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/arm_sdei.h>
+ #include <linux/sched.h>
+diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c
+index d1e9035794733..5525c8e7e1d9e 100644
+--- a/arch/csky/kernel/asm-offsets.c
++++ b/arch/csky/kernel/asm-offsets.c
+@@ -1,5 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0
+ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/kernel_stat.h>
+diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c
+index 03a7063f94561..50eea9fa6f137 100644
+--- a/arch/hexagon/kernel/asm-offsets.c
++++ b/arch/hexagon/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  *
+  * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/compat.h>
+ #include <linux/types.h>
+diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c
+index bdd88eda9513f..91b3eae9414f7 100644
+--- a/arch/loongarch/kernel/asm-offsets.c
++++ b/arch/loongarch/kernel/asm-offsets.c
+@@ -4,6 +4,8 @@
+  *
+  * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/types.h>
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c
+index 906d732305374..67a1990f9d748 100644
+--- a/arch/m68k/kernel/asm-offsets.c
++++ b/arch/m68k/kernel/asm-offsets.c
+@@ -9,6 +9,7 @@
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #define ASM_OFFSETS_C
+ #include <linux/stddef.h>
+diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c
+index 104c3ac5f30c8..b4b67d58e7f6a 100644
+--- a/arch/microblaze/kernel/asm-offsets.c
++++ b/arch/microblaze/kernel/asm-offsets.c
+@@ -7,6 +7,7 @@
+  * License. See the file "COPYING" in the main directory of this archive
+  * for more details.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/init.h>
+ #include <linux/stddef.h>
+diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
+index 08342b9eccdbd..0f9ed454faf19 100644
+--- a/arch/mips/kernel/asm-offsets.c
++++ b/arch/mips/kernel/asm-offsets.c
+@@ -9,6 +9,8 @@
+  * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com
+  * Copyright (C) 2000 MIPS Technologies, Inc.
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/compat.h>
+ #include <linux/types.h>
+ #include <linux/sched.h>
+diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c
+index e3d9b7b6fb48a..88190b503ce5d 100644
+--- a/arch/nios2/kernel/asm-offsets.c
++++ b/arch/nios2/kernel/asm-offsets.c
+@@ -2,6 +2,7 @@
+ /*
+  * Copyright (C) 2011 Tobias Klauser <tklauser@distanz.ch>
+  */
++#define COMPILE_OFFSETS
+ #include <linux/stddef.h>
+ #include <linux/sched.h>
+diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c
+index 710651d5aaae1..3cc826f2216b1 100644
+--- a/arch/openrisc/kernel/asm-offsets.c
++++ b/arch/openrisc/kernel/asm-offsets.c
+@@ -18,6 +18,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/signal.h>
+ #include <linux/sched.h>
+diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c
+index 94652e13c2603..21e900c0aa958 100644
+--- a/arch/parisc/kernel/asm-offsets.c
++++ b/arch/parisc/kernel/asm-offsets.c
+@@ -13,6 +13,7 @@
+  *    Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org>
+  *    Copyright (C) 2003 James Bottomley <jejb at parisc-linux.org>
+  */
++#define COMPILE_OFFSETS
+ #include <linux/types.h>
+ #include <linux/sched.h>
+diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
+index 65d79dd0c92ce..5a4edc1e5504f 100644
+--- a/arch/powerpc/kernel/asm-offsets.c
++++ b/arch/powerpc/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/compat.h>
+ #include <linux/signal.h>
+diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
+index 1ecafbcee9a0a..21f034b3fdbeb 100644
+--- a/arch/riscv/kernel/asm-offsets.c
++++ b/arch/riscv/kernel/asm-offsets.c
+@@ -3,6 +3,7 @@
+  * Copyright (C) 2012 Regents of the University of California
+  * Copyright (C) 2017 SiFive
+  */
++#define COMPILE_OFFSETS
+ #include <linux/kbuild.h>
+ #include <linux/mm.h>
+diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
+index d8ce965c0a97c..9ff68c7f61cc0 100644
+--- a/arch/s390/kernel/asm-offsets.c
++++ b/arch/s390/kernel/asm-offsets.c
+@@ -4,6 +4,7 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
+ #define ASM_OFFSETS_C
+diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c
+index a0322e8328456..429b6a7631468 100644
+--- a/arch/sh/kernel/asm-offsets.c
++++ b/arch/sh/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/stddef.h>
+ #include <linux/types.h>
+diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c
+index 5784f2df489a4..f1e27a7f800f4 100644
+--- a/arch/sparc/kernel/asm-offsets.c
++++ b/arch/sparc/kernel/asm-offsets.c
+@@ -10,6 +10,7 @@
+  *
+  * On sparc, thread_info data is static and TI_XXX offsets are computed by hand.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/mm_types.h>
+diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c
+index 1fb12235ab9c8..a69873aa697f4 100644
+--- a/arch/um/kernel/asm-offsets.c
++++ b/arch/um/kernel/asm-offsets.c
+@@ -1 +1,3 @@
++#define COMPILE_OFFSETS
++
+ #include <sysdep/kernel-offsets.h>
+diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c
+index da38de20ae598..cfbced95e944a 100644
+--- a/arch/xtensa/kernel/asm-offsets.c
++++ b/arch/xtensa/kernel/asm-offsets.c
+@@ -11,6 +11,7 @@
+  *
+  * Chris Zankel <chris@zankel.net>
+  */
++#define COMPILE_OFFSETS
+ #include <asm/processor.h>
+ #include <asm/coprocessor.h>
+-- 
+2.51.0
+
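[ Editorial note, not part of the queued patch: a minimal sketch of the guard that the newly defined macro enables, reusing the hypothetical generated/test.h name from the commit message. A header that wants the generated offsets wraps the include, so the asm-offsets.c pass, which now defines COMPILE_OFFSETS before any include, still compiles while the generated file does not yet exist: ]

/* some-header.h - illustration only, not a file touched by this patch */
#ifndef COMPILE_OFFSETS
#include <generated/test.h>	/* normal builds get the generated offsets */
#endif				/* the asm-offsets pass skips it, breaking the cycle */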
diff --git a/queue-6.1/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-6.1/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
new file mode 100644 (file)
index 0000000..e7cdf7f
--- /dev/null
@@ -0,0 +1,63 @@
+From 1a4b6afa3244441ca0aea69a7dde7f080c6686da Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Aug 2025 12:10:28 +0100
+Subject: btrfs: always drop log root tree reference in btrfs_replay_log()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ]
+
+Currently we have this odd behaviour:
+
+1) At btrfs_replay_log() we drop the reference of the log root tree if
+   the call to btrfs_recover_log_trees() failed;
+
+2) But if the call to btrfs_recover_log_trees() did not fail, we don't
+   drop the reference in btrfs_replay_log() - we expect that
+   btrfs_recover_log_trees() does it in case it returns success.
+
+Let's simplify this and make btrfs_replay_log() always drop the reference
+on the log root tree, not only this simplifies code as it's what makes
+sense since it's btrfs_replay_log() who grabbed the reference in the first
+place.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/disk-io.c  | 2 +-
+ fs/btrfs/tree-log.c | 1 -
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index 76a261cbf39d6..8576ba4aa0b7d 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2413,10 +2413,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+       /* returns with log_tree_root freed on success */
+       ret = btrfs_recover_log_trees(log_tree_root);
++      btrfs_put_root(log_tree_root);
+       if (ret) {
+               btrfs_handle_fs_error(fs_info, ret,
+                                     "Failed to recover log tree");
+-              btrfs_put_root(log_tree_root);
+               return ret;
+       }
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index e4cc287eee993..fdcf66ba318ad 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -7366,7 +7366,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+       log_root_tree->log_root = NULL;
+       clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
+-      btrfs_put_root(log_root_tree);
+       return 0;
+ error:
+-- 
+2.51.0
+
diff --git a/queue-6.1/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch b/queue-6.1/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch
new file mode 100644 (file)
index 0000000..87c2bbf
--- /dev/null
@@ -0,0 +1,44 @@
+From 6afe9f968816990cae616be5a5ce679304c90cec Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 1 Sep 2025 17:01:44 +0200
+Subject: btrfs: scrub: replace max_t()/min_t() with clamp() in
+ scrub_throttle_dev_io()
+
+From: Thorsten Blum <thorsten.blum@linux.dev>
+
+[ Upstream commit a7f3dfb8293c4cee99743132d69863a92e8f4875 ]
+
+Replace max_t() followed by min_t() with a single clamp().
+
+As was pointed by David Laight in
+https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/
+the calculation may overflow u32 when the input value is too large, so
+clamp_t() is not used.  In practice the expected values are in range of
+megabytes to gigabytes (throughput limit) so the bug would not happen.
+
+Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ Use clamp() and add explanation. ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/scrub.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
+index f48895a9b165e..ce8a9c226534f 100644
+--- a/fs/btrfs/scrub.c
++++ b/fs/btrfs/scrub.c
+@@ -2191,8 +2191,7 @@ static void scrub_throttle(struct scrub_ctx *sctx)
+        * Slice is divided into intervals when the IO is submitted, adjust by
+        * bwlimit and maximum of 64 intervals.
+        */
+-      div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+-      div = min_t(u32, 64, div);
++      div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);
+       /* Start new epoch, set deadline */
+       now = ktime_get();
+-- 
+2.51.0
+
diff --git a/queue-6.1/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-6.1/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
new file mode 100644 (file)
index 0000000..01d02c9
--- /dev/null
@@ -0,0 +1,58 @@
+From db12f5ea1c6e8ad6962fd524fdc61c9a6f3158ba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Sep 2025 12:09:14 +0100
+Subject: btrfs: use smp_mb__after_atomic() when forcing COW in
+ create_pending_snapshot()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ]
+
+After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a
+full write barrier, smp_wmb(), but we don't need to, all we need is a
+smp_mb__after_atomic().  The use of the smp_wmb() is from the old days
+when we didn't use a bit and used instead an int field in the root to
+signal if cow is forced. After the int field was changed to a bit in
+the root's state (flags field), we forgot to update the memory barrier
+in create_pending_snapshot() to smp_mb__after_atomic(), but we did the
+change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That
+happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer
+data type for the some variants in btrfs_root"). On the reader side, in
+should_cow_block(), we also use the counterpart smp_mb__before_atomic()
+which generates further confusion.
+
+So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't
+even need any barrier at all since create_pending_snapshot() is called
+in the critical section of a transaction commit and therefore no one
+can concurrently join/attach the transaction, or start a new one, until
+the transaction is unblocked. By the time someone starts a new transaction
+and enters should_cow_block(), a lot of implicit memory barriers already
+took place by having acquired several locks such as fs_info->trans_lock
+and extent buffer locks on the root node at least. Nevertheless, for
+consistency use smp_mb__after_atomic() after setting the force cow bit
+in create_pending_snapshot().
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/transaction.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
+index ff3e0d4cf4b48..54894a950c6f7 100644
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1787,7 +1787,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+       }
+       /* see comments in should_cow_block() */
+       set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+-      smp_wmb();
++      smp_mb__after_atomic();
+       btrfs_set_root_node(new_root_item, tmp);
+       /* record when the snapshot was created in key.offset */
+-- 
+2.51.0
+
diff --git a/queue-6.1/btrfs-zoned-refine-extent-allocator-hint-selection.patch b/queue-6.1/btrfs-zoned-refine-extent-allocator-hint-selection.patch
new file mode 100644 (file)
index 0000000..20c909b
--- /dev/null
@@ -0,0 +1,59 @@
+From 138ae4c24306cd63476f082697a728b825b23160 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Jul 2025 11:13:15 +0900
+Subject: btrfs: zoned: refine extent allocator hint selection
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 0d703963d297964451783e1a0688ebdf74cd6151 ]
+
+The hint block group selection in the extent allocator is wrong in the
+first place, as it can select the dedicated data relocation block group for
+the normal data allocation.
+
+Since we separated the normal data space_info and the data relocation
+space_info, we can easily identify a block group is for data relocation or
+not. Do not choose it for the normal data allocation.
+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent-tree.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index 5395e27f9e89a..7985ca56f6b70 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -4224,7 +4224,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+ }
+ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+-                                  struct find_free_extent_ctl *ffe_ctl)
++                                  struct find_free_extent_ctl *ffe_ctl,
++                                  struct btrfs_space_info *space_info)
+ {
+       if (ffe_ctl->for_treelog) {
+               spin_lock(&fs_info->treelog_bg_lock);
+@@ -4248,6 +4249,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+                       u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+                       if (block_group_bits(block_group, ffe_ctl->flags) &&
++                          block_group->space_info == space_info &&
+                           avail >= ffe_ctl->num_bytes) {
+                               ffe_ctl->hint_byte = block_group->start;
+                               break;
+@@ -4269,7 +4271,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
+               return prepare_allocation_clustered(fs_info, ffe_ctl,
+                                                   space_info, ins);
+       case BTRFS_EXTENT_ALLOC_ZONED:
+-              return prepare_allocation_zoned(fs_info, ffe_ctl);
++              return prepare_allocation_zoned(fs_info, ffe_ctl, space_info);
+       default:
+               BUG();
+       }
+-- 
+2.51.0
+
diff --git a/queue-6.1/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch b/queue-6.1/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch
new file mode 100644 (file)
index 0000000..2923101
--- /dev/null
@@ -0,0 +1,89 @@
+From 0eb6e482475c0c284a317ac5c506a4b8996c084e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 16 Sep 2025 20:30:17 +0000
+Subject: EDAC/mc_sysfs: Increase legacy channel support to 16
+
+From: Avadhut Naik <avadhut.naik@amd.com>
+
+[ Upstream commit 6e1c2c6c2c40ce99e0d2633b212f43c702c1a002 ]
+
+Newer AMD systems can support up to 16 channels per EDAC "mc" device.
+These are detected by the EDAC module running on the device, and the
+current EDAC interface is appropriately enumerated.
+
+The legacy EDAC sysfs interface however, provides device attributes for
+channels 0 through 11 only. Consequently, the last four channels, 12
+through 15, will not be enumerated and will not be visible through the
+legacy sysfs interface.
+
+Add additional device attributes to ensure that all 16 channels, if
+present, are enumerated by and visible through the legacy EDAC sysfs
+interface.
+
+Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250916203242.1281036-1-avadhut.naik@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/edac/edac_mc_sysfs.c | 24 ++++++++++++++++++++++++
+ 1 file changed, 24 insertions(+)
+
+diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
+index 15f63452a9bec..b01436d9ddaed 100644
+--- a/drivers/edac/edac_mc_sysfs.c
++++ b/drivers/edac/edac_mc_sysfs.c
+@@ -306,6 +306,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR,
+       channel_dimm_label_show, channel_dimm_label_store, 10);
+ DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR,
+       channel_dimm_label_show, channel_dimm_label_store, 11);
++DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 12);
++DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 13);
++DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 14);
++DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 15);
+ /* Total possible dynamic DIMM Label attribute file table */
+ static struct attribute *dynamic_csrow_dimm_attr[] = {
+@@ -321,6 +329,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = {
+       &dev_attr_legacy_ch9_dimm_label.attr.attr,
+       &dev_attr_legacy_ch10_dimm_label.attr.attr,
+       &dev_attr_legacy_ch11_dimm_label.attr.attr,
++      &dev_attr_legacy_ch12_dimm_label.attr.attr,
++      &dev_attr_legacy_ch13_dimm_label.attr.attr,
++      &dev_attr_legacy_ch14_dimm_label.attr.attr,
++      &dev_attr_legacy_ch15_dimm_label.attr.attr,
+       NULL
+ };
+@@ -349,6 +361,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO,
+                  channel_ce_count_show, NULL, 10);
+ DEVICE_CHANNEL(ch11_ce_count, S_IRUGO,
+                  channel_ce_count_show, NULL, 11);
++DEVICE_CHANNEL(ch12_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 12);
++DEVICE_CHANNEL(ch13_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 13);
++DEVICE_CHANNEL(ch14_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 14);
++DEVICE_CHANNEL(ch15_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 15);
+ /* Total possible dynamic ce_count attribute file table */
+ static struct attribute *dynamic_csrow_ce_count_attr[] = {
+@@ -364,6 +384,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = {
+       &dev_attr_legacy_ch9_ce_count.attr.attr,
+       &dev_attr_legacy_ch10_ce_count.attr.attr,
+       &dev_attr_legacy_ch11_ce_count.attr.attr,
++      &dev_attr_legacy_ch12_ce_count.attr.attr,
++      &dev_attr_legacy_ch13_ce_count.attr.attr,
++      &dev_attr_legacy_ch14_ce_count.attr.attr,
++      &dev_attr_legacy_ch15_ce_count.attr.attr,
+       NULL
+ };
+-- 
+2.51.0
+
diff --git a/queue-6.1/perf-have-get_perf_callchain-return-null-if-crosstas.patch b/queue-6.1/perf-have-get_perf_callchain-return-null-if-crosstas.patch
new file mode 100644 (file)
index 0000000..4a19fd1
--- /dev/null
@@ -0,0 +1,68 @@
+From db01bc1334e68bb784336c7c6a17f5330fe8bd7a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 14:03:40 -0400
+Subject: perf: Have get_perf_callchain() return NULL if crosstask and user are
+ set
+
+From: Josh Poimboeuf <jpoimboe@kernel.org>
+
+[ Upstream commit 153f9e74dec230f2e070e16fa061bc7adfd2c450 ]
+
+get_perf_callchain() doesn't support cross-task unwinding for user space
+stacks, so have it return NULL if both the crosstask and user arguments are
+set.
+
+Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20250820180428.426423415@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/callchain.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
+index 1273be84392cf..ce5534c97cd1d 100644
+--- a/kernel/events/callchain.c
++++ b/kernel/events/callchain.c
+@@ -184,6 +184,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+       struct perf_callchain_entry_ctx ctx;
+       int rctx;
++      /* crosstask is not supported for user stacks */
++      if (crosstask && user && !kernel)
++              return NULL;
++
+       entry = get_callchain_entry(&rctx);
+       if (!entry)
+               return NULL;
+@@ -200,7 +204,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               perf_callchain_kernel(&ctx, regs);
+       }
+-      if (user) {
++      if (user && !crosstask) {
+               if (!user_mode(regs)) {
+                       if  (current->mm)
+                               regs = task_pt_regs(current);
+@@ -209,9 +213,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               }
+               if (regs) {
+-                      if (crosstask)
+-                              goto exit_put;
+-
+                       if (add_mark)
+                               perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+@@ -219,7 +220,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               }
+       }
+-exit_put:
+       put_callchain_entry(rctx);
+       return entry;
+-- 
+2.51.0
+
index 64bc2c1eea0a4e8a555bb14ebe15152b3d8d903c..3c8a42c3f4d42b20690543383d21416e97ca0939 100644 (file)
@@ -1 +1,9 @@
 net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch
+perf-have-get_perf_callchain-return-null-if-crosstas.patch
+x86-bugs-fix-reporting-of-lfence-retpoline.patch
+edac-mc_sysfs-increase-legacy-channel-support-to-16.patch
+btrfs-zoned-refine-extent-allocator-hint-selection.patch
+btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch
+btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
+btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
+arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch
diff --git a/queue-6.1/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-6.1/x86-bugs-fix-reporting-of-lfence-retpoline.patch
new file mode 100644 (file)
index 0000000..93e2653
--- /dev/null
@@ -0,0 +1,51 @@
+From 83bed2b0363895cb49a0657a2060b85a29fbee20 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Sep 2025 08:47:05 -0500
+Subject: x86/bugs: Fix reporting of LFENCE retpoline
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ]
+
+The LFENCE retpoline mitigation is not secure but the kernel prints
+inconsistent messages about this fact.  The dmesg log says 'Mitigation:
+LFENCE', implying the system is mitigated.  But sysfs reports 'Vulnerable:
+LFENCE' implying the system (correctly) is not mitigated.
+
+Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere
+when this mitigation is selected.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index ff8965bce6c90..a0b362ac50a1b 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1539,7 +1539,7 @@ spectre_v2_user_select_mitigation(void)
+ static const char * const spectre_v2_strings[] = {
+       [SPECTRE_V2_NONE]                       = "Vulnerable",
+       [SPECTRE_V2_RETPOLINE]                  = "Mitigation: Retpolines",
+-      [SPECTRE_V2_LFENCE]                     = "Mitigation: LFENCE",
++      [SPECTRE_V2_LFENCE]                     = "Vulnerable: LFENCE",
+       [SPECTRE_V2_EIBRS]                      = "Mitigation: Enhanced / Automatic IBRS",
+       [SPECTRE_V2_EIBRS_LFENCE]               = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+       [SPECTRE_V2_EIBRS_RETPOLINE]            = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
+@@ -3169,9 +3169,6 @@ static const char *spectre_bhi_state(void)
+ static ssize_t spectre_v2_show_state(char *buf)
+ {
+-      if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+-              return sysfs_emit(buf, "Vulnerable: LFENCE\n");
+-
+       if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+               return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+-- 
+2.51.0
+
diff --git a/queue-6.12/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch b/queue-6.12/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch
new file mode 100644 (file)
index 0000000..f6370bd
--- /dev/null
@@ -0,0 +1,295 @@
+From 8905c9ee0afca1bccc75668f38209611ee29903d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Sep 2025 14:09:13 +0800
+Subject: arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c
+
+From: Menglong Dong <menglong8.dong@gmail.com>
+
+[ Upstream commit 35561bab768977c9e05f1f1a9bc00134c85f3e28 ]
+
+The include/generated/asm-offsets.h is generated in Kbuild during
+compiling from arch/SRCARCH/kernel/asm-offsets.c. When we want to
+generate another similar offset header file, circular dependency can
+happen.
+
+For example, we want to generate a offset file include/generated/test.h,
+which is included in include/sched/sched.h. If we generate asm-offsets.h
+first, it will fail, as include/sched/sched.h is included in asm-offsets.c
+and include/generated/test.h doesn't exist; If we generate test.h first,
+it can't success neither, as include/generated/asm-offsets.h is included
+by it.
+
+In x86_64, the macro COMPILE_OFFSETS is used to avoid such circular
+dependency. We can generate asm-offsets.h first, and if the
+COMPILE_OFFSETS is defined, we don't include the "generated/test.h".
+
+And we define the macro COMPILE_OFFSETS for all the asm-offsets.c for this
+purpose.
+
+Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/alpha/kernel/asm-offsets.c      | 1 +
+ arch/arc/kernel/asm-offsets.c        | 1 +
+ arch/arm/kernel/asm-offsets.c        | 2 ++
+ arch/arm64/kernel/asm-offsets.c      | 1 +
+ arch/csky/kernel/asm-offsets.c       | 1 +
+ arch/hexagon/kernel/asm-offsets.c    | 1 +
+ arch/loongarch/kernel/asm-offsets.c  | 2 ++
+ arch/m68k/kernel/asm-offsets.c       | 1 +
+ arch/microblaze/kernel/asm-offsets.c | 1 +
+ arch/mips/kernel/asm-offsets.c       | 2 ++
+ arch/nios2/kernel/asm-offsets.c      | 1 +
+ arch/openrisc/kernel/asm-offsets.c   | 1 +
+ arch/parisc/kernel/asm-offsets.c     | 1 +
+ arch/powerpc/kernel/asm-offsets.c    | 1 +
+ arch/riscv/kernel/asm-offsets.c      | 1 +
+ arch/s390/kernel/asm-offsets.c       | 1 +
+ arch/sh/kernel/asm-offsets.c         | 1 +
+ arch/sparc/kernel/asm-offsets.c      | 1 +
+ arch/um/kernel/asm-offsets.c         | 2 ++
+ arch/xtensa/kernel/asm-offsets.c     | 1 +
+ 20 files changed, 24 insertions(+)
+
+diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c
+index e9dad60b147f3..1ebb058904992 100644
+--- a/arch/alpha/kernel/asm-offsets.c
++++ b/arch/alpha/kernel/asm-offsets.c
+@@ -4,6 +4,7 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/types.h>
+ #include <linux/stddef.h>
+diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c
+index f77deb7991757..2978da85fcb65 100644
+--- a/arch/arc/kernel/asm-offsets.c
++++ b/arch/arc/kernel/asm-offsets.c
+@@ -2,6 +2,7 @@
+ /*
+  * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
+  */
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
+index 4853875740d0f..d9f129c584b1d 100644
+--- a/arch/arm/kernel/asm-offsets.c
++++ b/arch/arm/kernel/asm-offsets.c
+@@ -7,6 +7,8 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/compiler.h>
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
+index b21dd24b8efc3..020e01181a0f1 100644
+--- a/arch/arm64/kernel/asm-offsets.c
++++ b/arch/arm64/kernel/asm-offsets.c
+@@ -6,6 +6,7 @@
+  *               2001-2002 Keith Owens
+  * Copyright (C) 2012 ARM Ltd.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/arm_sdei.h>
+ #include <linux/sched.h>
+diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c
+index d1e9035794733..5525c8e7e1d9e 100644
+--- a/arch/csky/kernel/asm-offsets.c
++++ b/arch/csky/kernel/asm-offsets.c
+@@ -1,5 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0
+ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/kernel_stat.h>
+diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c
+index 03a7063f94561..50eea9fa6f137 100644
+--- a/arch/hexagon/kernel/asm-offsets.c
++++ b/arch/hexagon/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  *
+  * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/compat.h>
+ #include <linux/types.h>
+diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c
+index bee9f7a3108f0..d20d71d4bcae6 100644
+--- a/arch/loongarch/kernel/asm-offsets.c
++++ b/arch/loongarch/kernel/asm-offsets.c
+@@ -4,6 +4,8 @@
+  *
+  * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/types.h>
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c
+index 906d732305374..67a1990f9d748 100644
+--- a/arch/m68k/kernel/asm-offsets.c
++++ b/arch/m68k/kernel/asm-offsets.c
+@@ -9,6 +9,7 @@
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #define ASM_OFFSETS_C
+ #include <linux/stddef.h>
+diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c
+index 104c3ac5f30c8..b4b67d58e7f6a 100644
+--- a/arch/microblaze/kernel/asm-offsets.c
++++ b/arch/microblaze/kernel/asm-offsets.c
+@@ -7,6 +7,7 @@
+  * License. See the file "COPYING" in the main directory of this archive
+  * for more details.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/init.h>
+ #include <linux/stddef.h>
+diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
+index cb1045ebab062..22c99a2cd5707 100644
+--- a/arch/mips/kernel/asm-offsets.c
++++ b/arch/mips/kernel/asm-offsets.c
+@@ -9,6 +9,8 @@
+  * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com
+  * Copyright (C) 2000 MIPS Technologies, Inc.
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/compat.h>
+ #include <linux/types.h>
+ #include <linux/sched.h>
+diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c
+index e3d9b7b6fb48a..88190b503ce5d 100644
+--- a/arch/nios2/kernel/asm-offsets.c
++++ b/arch/nios2/kernel/asm-offsets.c
+@@ -2,6 +2,7 @@
+ /*
+  * Copyright (C) 2011 Tobias Klauser <tklauser@distanz.ch>
+  */
++#define COMPILE_OFFSETS
+ #include <linux/stddef.h>
+ #include <linux/sched.h>
+diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c
+index 710651d5aaae1..3cc826f2216b1 100644
+--- a/arch/openrisc/kernel/asm-offsets.c
++++ b/arch/openrisc/kernel/asm-offsets.c
+@@ -18,6 +18,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/signal.h>
+ #include <linux/sched.h>
+diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c
+index 757816a7bd4b2..9abfe65492c65 100644
+--- a/arch/parisc/kernel/asm-offsets.c
++++ b/arch/parisc/kernel/asm-offsets.c
+@@ -13,6 +13,7 @@
+  *    Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org>
+  *    Copyright (C) 2003 James Bottomley <jejb at parisc-linux.org>
+  */
++#define COMPILE_OFFSETS
+ #include <linux/types.h>
+ #include <linux/sched.h>
+diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
+index 131a8cc10dbe8..cbeeda45c00a2 100644
+--- a/arch/powerpc/kernel/asm-offsets.c
++++ b/arch/powerpc/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/compat.h>
+ #include <linux/signal.h>
+diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
+index c2f3129a8e5cf..05c6152a65310 100644
+--- a/arch/riscv/kernel/asm-offsets.c
++++ b/arch/riscv/kernel/asm-offsets.c
+@@ -3,6 +3,7 @@
+  * Copyright (C) 2012 Regents of the University of California
+  * Copyright (C) 2017 SiFive
+  */
++#define COMPILE_OFFSETS
+ #include <linux/kbuild.h>
+ #include <linux/mm.h>
+diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
+index 5529248d84fb8..3cfc4939033c9 100644
+--- a/arch/s390/kernel/asm-offsets.c
++++ b/arch/s390/kernel/asm-offsets.c
+@@ -4,6 +4,7 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
+ #define ASM_OFFSETS_C
+diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c
+index a0322e8328456..429b6a7631468 100644
+--- a/arch/sh/kernel/asm-offsets.c
++++ b/arch/sh/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/stddef.h>
+ #include <linux/types.h>
+diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c
+index 3d9b9855dce91..6e660bde48dd8 100644
+--- a/arch/sparc/kernel/asm-offsets.c
++++ b/arch/sparc/kernel/asm-offsets.c
+@@ -10,6 +10,7 @@
+  *
+  * On sparc, thread_info data is static and TI_XXX offsets are computed by hand.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/mm_types.h>
+diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c
+index 1fb12235ab9c8..a69873aa697f4 100644
+--- a/arch/um/kernel/asm-offsets.c
++++ b/arch/um/kernel/asm-offsets.c
+@@ -1 +1,3 @@
++#define COMPILE_OFFSETS
++
+ #include <sysdep/kernel-offsets.h>
+diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c
+index da38de20ae598..cfbced95e944a 100644
+--- a/arch/xtensa/kernel/asm-offsets.c
++++ b/arch/xtensa/kernel/asm-offsets.c
+@@ -11,6 +11,7 @@
+  *
+  * Chris Zankel <chris@zankel.net>
+  */
++#define COMPILE_OFFSETS
+ #include <asm/processor.h>
+ #include <asm/coprocessor.h>
+-- 
+2.51.0
+
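A minimal sketch of the guard pattern the commit message describes, using the hypothetical test.h/sched.h example from that message; this is an illustration only, not part of the queued patch:

/* Hypothetical include/sched/sched.h: consumer of the generated header. */
#ifndef COMPILE_OFFSETS
/* Normal builds pull in the generated header; while asm-offsets.h itself
 * is being generated the file does not exist yet, so skip it. */
#include <generated/test.h>
#endif

/* arch/<arch>/kernel/asm-offsets.c */
#define COMPILE_OFFSETS        /* defined before any include */
#include <linux/sched.h>       /* safe: generated/test.h is skipped above */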
diff --git a/queue-6.12/audit-record-fanotify-event-regardless-of-presence-o.patch b/queue-6.12/audit-record-fanotify-event-regardless-of-presence-o.patch
new file mode 100644 (file)
index 0000000..d2d02b8
--- /dev/null
@@ -0,0 +1,44 @@
+From 9c66e5dad6997a22b7ffbbecaed782d5db5c2542 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 17:04:07 -0400
+Subject: audit: record fanotify event regardless of presence of rules
+
+From: Richard Guy Briggs <rgb@redhat.com>
+
+[ Upstream commit ce8370e2e62a903e18be7dd0e0be2eee079501e1 ]
+
+When no audit rules are in place, fanotify event results are
+unconditionally dropped due to an explicit check for the existence of
+any audit rules.  Given this is a report from another security
+sub-system, allow it to be recorded regardless of the existence of any
+audit rules.
+
+To test, install and run the fapolicyd daemon with default config.  Then
+as an unprivileged user, create and run a very simple binary that should
+be denied.  Then check for an event with
+       ausearch -m FANOTIFY -ts recent
+
+Link: https://issues.redhat.com/browse/RHEL-9065
+Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
+Signed-off-by: Paul Moore <paul@paul-moore.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/audit.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/linux/audit.h b/include/linux/audit.h
+index a394614ccd0b8..e3f06eba9c6e6 100644
+--- a/include/linux/audit.h
++++ b/include/linux/audit.h
+@@ -527,7 +527,7 @@ static inline void audit_log_kern_module(const char *name)
+ static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
+ {
+-      if (!audit_dummy_context())
++      if (audit_enabled)
+               __audit_fanotify(response, friar);
+ }
+-- 
+2.51.0
+
diff --git a/queue-6.12/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch b/queue-6.12/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch
new file mode 100644 (file)
index 0000000..c2f5abe
--- /dev/null
@@ -0,0 +1,39 @@
+From 88d6ab1b772266a93ddd13ce9c15485bedc4322c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Sep 2025 17:43:04 +0100
+Subject: btrfs: abort transaction if we fail to update inode in log replay dir
+ fixup
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 5a0565cad3ef7cbf4cf43d1dd1e849b156205292 ]
+
+If we fail to update the inode at link_to_fixup_dir(), we don't abort the
+transaction, we only propagate the error up the call chain, which makes it hard
+to pinpoint the error to the inode update. So abort the transaction if the
+inode update call fails, so that if it happens we know immediately.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-log.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index b43a7c0c7cb7a..173e13e1d5b88 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -1778,6 +1778,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+               else
+                       inc_nlink(vfs_inode);
+               ret = btrfs_update_inode(trans, inode);
++              if (ret)
++                      btrfs_abort_transaction(trans, ret);
+       } else if (ret == -EEXIST) {
+               ret = 0;
+       }
+-- 
+2.51.0
+
diff --git a/queue-6.12/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch b/queue-6.12/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch
new file mode 100644 (file)
index 0000000..2157590
--- /dev/null
@@ -0,0 +1,77 @@
+From 9bdeac056de0e4eb2dfe729346d08037d6e5c175 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Jul 2025 15:49:31 +0100
+Subject: btrfs: abort transaction in the process_one_buffer() log tree walk
+ callback
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit e6dd405b6671b9753b98d8bdf76f8f0ed36c11cd ]
+
+In the process_one_buffer() log tree walk callback we return errors to the
+log tree walk caller and then the caller aborts the transaction, if we
+have one, or turns the fs into error state if we don't have one. While
+this reduces code it makes it harder to figure out where exactly an error
+came from. So add the transaction aborts after every failure inside the
+process_one_buffer() callback, so that it's easier to figure out why
+failures happen.
+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-log.c | 20 ++++++++++++++++----
+ 1 file changed, 16 insertions(+), 4 deletions(-)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index f3ca530f032df..1c207a6d71ecf 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -350,6 +350,7 @@ static int process_one_buffer(struct btrfs_root *log,
+                             struct extent_buffer *eb,
+                             struct walk_control *wc, u64 gen, int level)
+ {
++      struct btrfs_trans_handle *trans = wc->trans;
+       struct btrfs_fs_info *fs_info = log->fs_info;
+       int ret = 0;
+@@ -364,18 +365,29 @@ static int process_one_buffer(struct btrfs_root *log,
+               };
+               ret = btrfs_read_extent_buffer(eb, &check);
+-              if (ret)
++              if (ret) {
++                      if (trans)
++                              btrfs_abort_transaction(trans, ret);
++                      else
++                              btrfs_handle_fs_error(fs_info, ret, NULL);
+                       return ret;
++              }
+       }
+       if (wc->pin) {
+-              ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
+-              if (ret)
++              ASSERT(trans != NULL);
++              ret = btrfs_pin_extent_for_log_replay(trans, eb);
++              if (ret) {
++                      btrfs_abort_transaction(trans, ret);
+                       return ret;
++              }
+               if (btrfs_buffer_uptodate(eb, gen, 0) &&
+-                  btrfs_header_level(eb) == 0)
++                  btrfs_header_level(eb) == 0) {
+                       ret = btrfs_exclude_logged_extents(eb);
++                      if (ret)
++                              btrfs_abort_transaction(trans, ret);
++              }
+       }
+       return ret;
+ }
+-- 
+2.51.0
+
diff --git a/queue-6.12/btrfs-abort-transaction-on-specific-error-places-whe.patch b/queue-6.12/btrfs-abort-transaction-on-specific-error-places-whe.patch
new file mode 100644 (file)
index 0000000..fc41af4
--- /dev/null
@@ -0,0 +1,111 @@
+From 81186743181cc300f669cb0bfb781e773fa6ea6b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Jul 2025 14:56:11 +0100
+Subject: btrfs: abort transaction on specific error places when walking log
+ tree
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 6ebd726b104fa99d47c0d45979e6a6109844ac18 ]
+
+We do several things while walking a log tree (for replaying and for
+freeing a log tree) like reading extent buffers and cleaning them up,
+but we don't immediately abort the transaction, or turn the fs into an
+error state, when one of these things fails. Instead we do the transaction
+abort or turn the fs into an error state in the caller of the entry point
+function that walks a log tree - walk_log_tree() - which means we don't
+get to know exactly where an error came from.
+
+Improve on this by doing a transaction abort / turn fs into error state
+after each such failure so that when it happens we have a better
+understanding of where the failure comes from. This deliberately leaves
+the transaction abort / turn fs into error state in the callers of
+walk_log_tree() so as to ensure we don't get into an inconsistent state in
+case we forget to do it deeper in the call chain. It also deliberately does
+not do it after errors from the calls to the callback defined in
+struct walk_control::process_func(), as we will do that later in another
+patch.
+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-log.c | 33 ++++++++++++++++++++++++++++-----
+ 1 file changed, 28 insertions(+), 5 deletions(-)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 0022ad003791f..f3ca530f032df 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -2612,15 +2612,24 @@ static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+ static int clean_log_buffer(struct btrfs_trans_handle *trans,
+                           struct extent_buffer *eb)
+ {
++      int ret;
++
+       btrfs_tree_lock(eb);
+       btrfs_clear_buffer_dirty(trans, eb);
+       wait_on_extent_buffer_writeback(eb);
+       btrfs_tree_unlock(eb);
+-      if (trans)
+-              return btrfs_pin_reserved_extent(trans, eb);
++      if (trans) {
++              ret = btrfs_pin_reserved_extent(trans, eb);
++              if (ret)
++                      btrfs_abort_transaction(trans, ret);
++              return ret;
++      }
+-      return unaccount_log_buffer(eb->fs_info, eb->start);
++      ret = unaccount_log_buffer(eb->fs_info, eb->start);
++      if (ret)
++              btrfs_handle_fs_error(eb->fs_info, ret, NULL);
++      return ret;
+ }
+ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+@@ -2656,8 +2665,14 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+               next = btrfs_find_create_tree_block(fs_info, bytenr,
+                                                   btrfs_header_owner(cur),
+                                                   *level - 1);
+-              if (IS_ERR(next))
+-                      return PTR_ERR(next);
++              if (IS_ERR(next)) {
++                      ret = PTR_ERR(next);
++                      if (trans)
++                              btrfs_abort_transaction(trans, ret);
++                      else
++                              btrfs_handle_fs_error(fs_info, ret, NULL);
++                      return ret;
++              }
+               if (*level == 1) {
+                       ret = wc->process_func(root, next, wc, ptr_gen,
+@@ -2672,6 +2687,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+                               ret = btrfs_read_extent_buffer(next, &check);
+                               if (ret) {
+                                       free_extent_buffer(next);
++                                      if (trans)
++                                              btrfs_abort_transaction(trans, ret);
++                                      else
++                                              btrfs_handle_fs_error(fs_info, ret, NULL);
+                                       return ret;
+                               }
+@@ -2687,6 +2706,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+               ret = btrfs_read_extent_buffer(next, &check);
+               if (ret) {
+                       free_extent_buffer(next);
++                      if (trans)
++                              btrfs_abort_transaction(trans, ret);
++                      else
++                              btrfs_handle_fs_error(fs_info, ret, NULL);
+                       return ret;
+               }
+-- 
+2.51.0
+
diff --git a/queue-6.12/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-6.12/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
new file mode 100644 (file)
index 0000000..15c93c8
--- /dev/null
@@ -0,0 +1,63 @@
+From c6fd6b1021ea066aa250a87df9a586cacfc01851 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Aug 2025 12:10:28 +0100
+Subject: btrfs: always drop log root tree reference in btrfs_replay_log()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ]
+
+Currently we have this odd behaviour:
+
+1) At btrfs_replay_log() we drop the reference of the log root tree if
+   the call to btrfs_recover_log_trees() failed;
+
+2) But if the call to btrfs_recover_log_trees() did not fail, we don't
+   drop the reference in btrfs_replay_log() - we expect that
+   btrfs_recover_log_trees() does it in case it returns success.
+
+Let's simplify this and make btrfs_replay_log() always drop the reference
+on the log root tree. Not only does this simplify the code, it's also what
+makes sense, since it's btrfs_replay_log() that grabbed the reference in
+the first place.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/disk-io.c  | 2 +-
+ fs/btrfs/tree-log.c | 1 -
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index e655fa3bfd9be..3a73d218af464 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2100,10 +2100,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+       /* returns with log_tree_root freed on success */
+       ret = btrfs_recover_log_trees(log_tree_root);
++      btrfs_put_root(log_tree_root);
+       if (ret) {
+               btrfs_handle_fs_error(fs_info, ret,
+                                     "Failed to recover log tree");
+-              btrfs_put_root(log_tree_root);
+               return ret;
+       }
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 1c207a6d71ecf..63b14005f5066 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -7457,7 +7457,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+       log_root_tree->log_root = NULL;
+       clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
+-      btrfs_put_root(log_root_tree);
+       return 0;
+ error:
+-- 
+2.51.0
+
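The ownership rule the patch settles on — whoever takes the reference also drops it, on success and on error — reduced to a small userspace sketch with hypothetical names (not btrfs APIs):

#include <stdlib.h>

struct obj { int refs; };

static struct obj *obj_get(struct obj *o) { o->refs++; return o; }
static void obj_put(struct obj *o) { if (--o->refs == 0) free(o); }

static int do_work(struct obj *o) { (void)o; return 0; /* may fail */ }

static int replay(struct obj *o)
{
    struct obj *ref = obj_get(o);   /* grab the reference here ... */
    int ret = do_work(ref);

    obj_put(ref);                   /* ... so drop it here, unconditionally */
    return ret;
}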
diff --git a/queue-6.12/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch b/queue-6.12/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch
new file mode 100644 (file)
index 0000000..4104cec
--- /dev/null
@@ -0,0 +1,44 @@
+From 8110fe15ebe327b95bcf726f399a81011b0fa47e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 1 Sep 2025 17:01:44 +0200
+Subject: btrfs: scrub: replace max_t()/min_t() with clamp() in
+ scrub_throttle_dev_io()
+
+From: Thorsten Blum <thorsten.blum@linux.dev>
+
+[ Upstream commit a7f3dfb8293c4cee99743132d69863a92e8f4875 ]
+
+Replace max_t() followed by min_t() with a single clamp().
+
+As was pointed out by David Laight in
+https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/
+the calculation may overflow u32 when the input value is too large, so
+clamp_t() is not used.  In practice the expected values are in the range of
+megabytes to gigabytes (the throughput limit), so the bug would not happen.
+
+Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ Use clamp() and add explanation. ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/scrub.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
+index 3fcc7c092c5ec..9a6e0b047d3b6 100644
+--- a/fs/btrfs/scrub.c
++++ b/fs/btrfs/scrub.c
+@@ -1270,8 +1270,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
+        * Slice is divided into intervals when the IO is submitted, adjust by
+        * bwlimit and maximum of 64 intervals.
+        */
+-      div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+-      div = min_t(u32, 64, div);
++      div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);
+       /* Start new epoch, set deadline */
+       now = ktime_get();
+-- 
+2.51.0
+
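A small userspace sketch of the resulting calculation (clamp() is redefined locally here, purely for illustration): dividing the byte-rate limit by 16 MiB first keeps the value small, and the clamp then bounds it to 1..64 intervals:

#include <stdio.h>
#include <stdint.h>

/* Local stand-in for the kernel's clamp(); illustration only. */
#define clamp(val, lo, hi) ((val) < (lo) ? (lo) : ((val) > (hi) ? (hi) : (val)))

int main(void)
{
    /* Example throughput limits: 1 MiB/s, 300 MiB/s, 10 GiB/s. */
    uint64_t limits[] = { 1ULL << 20, 300ULL << 20, 10ULL << 30 };

    for (int i = 0; i < 3; i++) {
        uint32_t div = clamp(limits[i] / (16 * 1024 * 1024), 1, 64);

        printf("bwlimit=%llu -> div=%u\n",
               (unsigned long long)limits[i], div);
    }
    return 0;
}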
diff --git a/queue-6.12/btrfs-tree-checker-add-inode-extref-checks.patch b/queue-6.12/btrfs-tree-checker-add-inode-extref-checks.patch
new file mode 100644 (file)
index 0000000..030d2d5
--- /dev/null
@@ -0,0 +1,90 @@
+From 277716e473637786bb6a577d628c45e2e3465378 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 16 Sep 2025 08:34:05 +0930
+Subject: btrfs: tree-checker: add inode extref checks
+
+From: Qu Wenruo <wqu@suse.com>
+
+[ Upstream commit aab9458b9f0019e97fae394c2d6d9d1a03addfb3 ]
+
+Like inode refs, inode extrefs have a variable length name, which means
+we have to do a proper check to make sure neither the header nor the name
+can exceed the item limits.
+
+The check itself is very similar to check_inode_ref(), just a different
+structure (btrfs_inode_extref vs btrfs_inode_ref).
+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-checker.c | 37 +++++++++++++++++++++++++++++++++++++
+ 1 file changed, 37 insertions(+)
+
+diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
+index 14f96d217e6e1..986b1612d5b04 100644
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf,
+       /* Only these key->types needs to be checked */
+       ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
+              key->type == BTRFS_INODE_REF_KEY ||
++             key->type == BTRFS_INODE_EXTREF_KEY ||
+              key->type == BTRFS_DIR_INDEX_KEY ||
+              key->type == BTRFS_DIR_ITEM_KEY ||
+              key->type == BTRFS_EXTENT_DATA_KEY);
+@@ -1770,6 +1771,39 @@ static int check_inode_ref(struct extent_buffer *leaf,
+       return 0;
+ }
++static int check_inode_extref(struct extent_buffer *leaf,
++                            struct btrfs_key *key, struct btrfs_key *prev_key,
++                            int slot)
++{
++      unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
++      unsigned long end = ptr + btrfs_item_size(leaf, slot);
++
++      if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
++              return -EUCLEAN;
++
++      while (ptr < end) {
++              struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr;
++              u16 namelen;
++
++              if (unlikely(ptr + sizeof(*extref)) > end) {
++                      inode_ref_err(leaf, slot,
++                      "inode extref overflow, ptr %lu end %lu inode_extref size %zu",
++                                    ptr, end, sizeof(*extref));
++                      return -EUCLEAN;
++              }
++
++              namelen = btrfs_inode_extref_name_len(leaf, extref);
++              if (unlikely(ptr + sizeof(*extref) + namelen > end)) {
++                      inode_ref_err(leaf, slot,
++                              "inode extref overflow, ptr %lu end %lu namelen %u",
++                              ptr, end, namelen);
++                      return -EUCLEAN;
++              }
++              ptr += sizeof(*extref) + namelen;
++      }
++      return 0;
++}
++
+ static int check_raid_stripe_extent(const struct extent_buffer *leaf,
+                                   const struct btrfs_key *key, int slot)
+ {
+@@ -1881,6 +1915,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
+       case BTRFS_INODE_REF_KEY:
+               ret = check_inode_ref(leaf, key, prev_key, slot);
+               break;
++      case BTRFS_INODE_EXTREF_KEY:
++              ret = check_inode_extref(leaf, key, prev_key, slot);
++              break;
+       case BTRFS_BLOCK_GROUP_ITEM_KEY:
+               ret = check_block_group_item(leaf, key, slot);
+               break;
+-- 
+2.51.0
+
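The underlying bounds-check pattern for variable-length records — a fixed header followed by a name whose length is stored in the header — in a standalone userspace sketch (hypothetical record layout, not the btrfs on-disk format):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical record: a 2-byte name length, then name_len bytes of name. */
struct name_rec {
    uint16_t name_len;
};

static bool validate_records(const uint8_t *buf, size_t size)
{
    const uint8_t *ptr = buf;
    const uint8_t *end = buf + size;

    while (ptr < end) {
        struct name_rec rec;

        /* The fixed header must fit entirely inside the item. */
        if ((size_t)(end - ptr) < sizeof(rec))
            return false;
        memcpy(&rec, ptr, sizeof(rec));

        /* The header plus the variable-length name must also fit. */
        if ((size_t)(end - ptr) - sizeof(rec) < rec.name_len)
            return false;

        ptr += sizeof(rec) + rec.name_len;
    }
    return true;
}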
diff --git a/queue-6.12/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch b/queue-6.12/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch
new file mode 100644 (file)
index 0000000..011f9ea
--- /dev/null
@@ -0,0 +1,50 @@
+From bbe02836089d717f5f340b95731ecc35b7434a60 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 28 Aug 2025 17:46:18 +0100
+Subject: btrfs: use level argument in log tree walk callback
+ replay_one_buffer()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 6cb7f0b8c9b0d6a35682335fea88bd26f089306f ]
+
+We already have the extent buffer's level in an argument, so there's no need
+to first ensure the extent buffer's data is loaded (by calling
+btrfs_read_extent_buffer()) and then call btrfs_header_level() to check
+the level. So use the level argument and do the check before calling
+btrfs_read_extent_buffer().
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-log.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 63b14005f5066..b43a7c0c7cb7a 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -2443,15 +2443,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+       int i;
+       int ret;
++      if (level != 0)
++              return 0;
++
+       ret = btrfs_read_extent_buffer(eb, &check);
+       if (ret)
+               return ret;
+-      level = btrfs_header_level(eb);
+-
+-      if (level != 0)
+-              return 0;
+-
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+-- 
+2.51.0
+
diff --git a/queue-6.12/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-6.12/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
new file mode 100644 (file)
index 0000000..16b2efd
--- /dev/null
@@ -0,0 +1,58 @@
+From b6543dc88730987fe194ba4d74a9fb0de649075c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Sep 2025 12:09:14 +0100
+Subject: btrfs: use smp_mb__after_atomic() when forcing COW in
+ create_pending_snapshot()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ]
+
+After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a
+full write barrier, smp_wmb(), but we don't need to; all we need is a
+smp_mb__after_atomic().  The use of the smp_wmb() is from the old days
+when we didn't use a bit and instead used an int field in the root to
+signal if cow is forced. After the int field was changed to a bit in
+the root's state (flags field), we forgot to update the memory barrier
+in create_pending_snapshot() to smp_mb__after_atomic(), but we did the
+change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That
+happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer
+data type for the some variants in btrfs_root"). On the reader side, in
+should_cow_block(), we also use the counterpart smp_mb__before_atomic()
+which generates further confusion.
+
+So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't
+even need any barrier at all since create_pending_snapshot() is called
+in the critical section of a transaction commit and therefore no one
+can concurrently join/attach the transaction, or start a new one, until
+the transaction is unblocked. By the time someone starts a new transaction
+and enters should_cow_block(), a lot of implicit memory barriers already
+took place by having acquired several locks such as fs_info->trans_lock
+and extent buffer locks on the root node at least. Nevertheless, for
+consistency use smp_mb__after_atomic() after setting the force cow bit
+in create_pending_snapshot().
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/transaction.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
+index 1a029392eac52..f4dda72491feb 100644
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1810,7 +1810,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+       }
+       /* see comments in should_cow_block() */
+       set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+-      smp_wmb();
++      smp_mb__after_atomic();
+       btrfs_set_root_node(new_root_item, tmp);
+       /* record when the snapshot was created in key.offset */
+-- 
+2.51.0
+
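A loose userspace analogue of the pairing described above, written with C11 fences rather than the kernel primitives (assumption: this only illustrates the publish/observe ordering, not the exact kernel barrier semantics):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool force_cow;     /* stands in for the BTRFS_ROOT_FORCE_COW bit */
static _Atomic int root_node;     /* stands in for the published root node */

static void writer_force_cow(int new_node)
{
    atomic_store_explicit(&force_cow, true, memory_order_relaxed);
    atomic_thread_fence(memory_order_seq_cst);   /* ~ smp_mb__after_atomic() */
    atomic_store_explicit(&root_node, new_node, memory_order_relaxed);
}

static bool reader_should_cow(void)
{
    /* In the real code the reader has already loaded the new root node while
     * traversing the tree; the barrier guarantees it then sees the bit set. */
    atomic_thread_fence(memory_order_seq_cst);   /* ~ smp_mb__before_atomic() */
    return atomic_load_explicit(&force_cow, memory_order_relaxed);
}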
diff --git a/queue-6.12/btrfs-zoned-refine-extent-allocator-hint-selection.patch b/queue-6.12/btrfs-zoned-refine-extent-allocator-hint-selection.patch
new file mode 100644 (file)
index 0000000..0a81577
--- /dev/null
@@ -0,0 +1,59 @@
+From b99faec603c7d9385177f7858dac67ed1d08c110 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Jul 2025 11:13:15 +0900
+Subject: btrfs: zoned: refine extent allocator hint selection
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 0d703963d297964451783e1a0688ebdf74cd6151 ]
+
+The hint block group selection in the extent allocator is wrong in the
+first place, as it can select the dedicated data relocation block group for
+the normal data allocation.
+
+Since we separated the normal data space_info and the data relocation
+space_info, we can easily identify whether a block group is for data
+relocation or not. Do not choose it for the normal data allocation.
+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent-tree.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index bb3602059906d..7bab2512468d5 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -4299,7 +4299,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+ }
+ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+-                                  struct find_free_extent_ctl *ffe_ctl)
++                                  struct find_free_extent_ctl *ffe_ctl,
++                                  struct btrfs_space_info *space_info)
+ {
+       if (ffe_ctl->for_treelog) {
+               spin_lock(&fs_info->treelog_bg_lock);
+@@ -4323,6 +4324,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+                       u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+                       if (block_group_bits(block_group, ffe_ctl->flags) &&
++                          block_group->space_info == space_info &&
+                           avail >= ffe_ctl->num_bytes) {
+                               ffe_ctl->hint_byte = block_group->start;
+                               break;
+@@ -4344,7 +4346,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
+               return prepare_allocation_clustered(fs_info, ffe_ctl,
+                                                   space_info, ins);
+       case BTRFS_EXTENT_ALLOC_ZONED:
+-              return prepare_allocation_zoned(fs_info, ffe_ctl);
++              return prepare_allocation_zoned(fs_info, ffe_ctl, space_info);
+       default:
+               BUG();
+       }
+-- 
+2.51.0
+
diff --git a/queue-6.12/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch b/queue-6.12/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch
new file mode 100644 (file)
index 0000000..97d2401
--- /dev/null
@@ -0,0 +1,111 @@
+From 4a48cb581a19341097dba34421a1cb023b4d9e32 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 22 Jul 2025 13:39:11 +0200
+Subject: btrfs: zoned: return error from btrfs_zone_finish_endio()
+
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+
+[ Upstream commit 3c44cd3c79fcb38a86836dea6ff8fec322a9e68c ]
+
+Now that btrfs_zone_finish_endio_workfn() is directly calling
+do_zone_finish(), the only caller of btrfs_zone_finish_endio() is
+btrfs_finish_one_ordered().
+
+btrfs_finish_one_ordered() already has error handling in place, so
+btrfs_zone_finish_endio() can return an error if the block group lookup
+fails.
+
+Also as btrfs_zone_finish_endio() already checks for zoned filesystems and
+returns early, there's no need to do this in the caller.
+
+Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/inode.c | 7 ++++---
+ fs/btrfs/zoned.c | 8 +++++---
+ fs/btrfs/zoned.h | 9 ++++++---
+ 3 files changed, 15 insertions(+), 9 deletions(-)
+
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 19c0ec9c327c1..e32dd4193aea1 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3174,9 +3174,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
+               goto out;
+       }
+-      if (btrfs_is_zoned(fs_info))
+-              btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+-                                      ordered_extent->disk_num_bytes);
++      ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
++                                    ordered_extent->disk_num_bytes);
++      if (ret)
++              goto out;
+       if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
+               truncated = true;
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index 4966b4f5a7d24..64e0a5bf5f9a5 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -2384,16 +2384,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
+       return ret;
+ }
+-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+ {
+       struct btrfs_block_group *block_group;
+       u64 min_alloc_bytes;
+       if (!btrfs_is_zoned(fs_info))
+-              return;
++              return 0;
+       block_group = btrfs_lookup_block_group(fs_info, logical);
+-      ASSERT(block_group);
++      if (WARN_ON_ONCE(!block_group))
++              return -ENOENT;
+       /* No MIXED_BG on zoned btrfs. */
+       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+@@ -2410,6 +2411,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
+ out:
+       btrfs_put_block_group(block_group);
++      return 0;
+ }
+ static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
+index 7612e65726053..f7171ab6ed71e 100644
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+ bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+ int btrfs_zone_finish(struct btrfs_block_group *block_group);
+ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
+-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+                            u64 length);
+ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+                                  struct extent_buffer *eb);
+@@ -232,8 +232,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+       return true;
+ }
+-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+-                                         u64 logical, u64 length) { }
++static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
++                                         u64 logical, u64 length)
++{
++      return 0;
++}
+ static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+                                                struct extent_buffer *eb) { }
+-- 
+2.51.0
+
diff --git a/queue-6.12/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch b/queue-6.12/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch
new file mode 100644 (file)
index 0000000..146f792
--- /dev/null
@@ -0,0 +1,47 @@
+From e566c5390297c67166b64ac35818038e9c63be55 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Sep 2025 01:12:27 +0000
+Subject: cpuset: Use new excpus for nocpu error check when enabling root
+ partition
+
+From: Chen Ridong <chenridong@huawei.com>
+
+[ Upstream commit 59d5de3655698679ad8fd2cc82228de4679c4263 ]
+
+A previous patch fixed a bug where new_prs should be assigned before
+checking housekeeping conflicts. This patch addresses another potential
+issue: the nocpu error check currently uses xcpus, which is not updated.
+Although no issue has been observed so far, the check should be performed
+using the new effective exclusive cpus.
+
+The comment has been removed because the function returns an error if
+nocpu checking fails, which is unrelated to the parent.
+
+Signed-off-by: Chen Ridong <chenridong@huawei.com>
+Reviewed-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cpuset.c | 6 +-----
+ 1 file changed, 1 insertion(+), 5 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index 25f9565f798d4..13eb986172499 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -1679,11 +1679,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
+               if (prstate_housekeeping_conflict(new_prs, xcpus))
+                       return PERR_HKEEPING;
+-              /*
+-               * A parent can be left with no CPU as long as there is no
+-               * task directly associated with the parent partition.
+-               */
+-              if (nocpu)
++              if (tasks_nocpu_error(parent, cs, xcpus))
+                       return PERR_NOCPUS;
+               deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus);
+-- 
+2.51.0
+
diff --git a/queue-6.12/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch b/queue-6.12/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch
new file mode 100644 (file)
index 0000000..be8aab9
--- /dev/null
@@ -0,0 +1,89 @@
+From 71bc3080d5e3c82d57a7d368802c0eeb3fe35796 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 16 Sep 2025 20:30:17 +0000
+Subject: EDAC/mc_sysfs: Increase legacy channel support to 16
+
+From: Avadhut Naik <avadhut.naik@amd.com>
+
+[ Upstream commit 6e1c2c6c2c40ce99e0d2633b212f43c702c1a002 ]
+
+Newer AMD systems can support up to 16 channels per EDAC "mc" device.
+These are detected by the EDAC module running on the device, and the
+current EDAC interface is appropriately enumerated.
+
+The legacy EDAC sysfs interface, however, provides device attributes for
+channels 0 through 11 only. Consequently, the last four channels, 12
+through 15, will not be enumerated and will not be visible through the
+legacy sysfs interface.
+
+Add additional device attributes to ensure that all 16 channels, if
+present, are enumerated by and visible through the legacy EDAC sysfs
+interface.
+
+Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250916203242.1281036-1-avadhut.naik@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/edac/edac_mc_sysfs.c | 24 ++++++++++++++++++++++++
+ 1 file changed, 24 insertions(+)
+
+diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
+index 4200aec048318..70dc0ee1cc08f 100644
+--- a/drivers/edac/edac_mc_sysfs.c
++++ b/drivers/edac/edac_mc_sysfs.c
+@@ -305,6 +305,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR,
+       channel_dimm_label_show, channel_dimm_label_store, 10);
+ DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR,
+       channel_dimm_label_show, channel_dimm_label_store, 11);
++DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 12);
++DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 13);
++DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 14);
++DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 15);
+ /* Total possible dynamic DIMM Label attribute file table */
+ static struct attribute *dynamic_csrow_dimm_attr[] = {
+@@ -320,6 +328,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = {
+       &dev_attr_legacy_ch9_dimm_label.attr.attr,
+       &dev_attr_legacy_ch10_dimm_label.attr.attr,
+       &dev_attr_legacy_ch11_dimm_label.attr.attr,
++      &dev_attr_legacy_ch12_dimm_label.attr.attr,
++      &dev_attr_legacy_ch13_dimm_label.attr.attr,
++      &dev_attr_legacy_ch14_dimm_label.attr.attr,
++      &dev_attr_legacy_ch15_dimm_label.attr.attr,
+       NULL
+ };
+@@ -348,6 +360,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO,
+                  channel_ce_count_show, NULL, 10);
+ DEVICE_CHANNEL(ch11_ce_count, S_IRUGO,
+                  channel_ce_count_show, NULL, 11);
++DEVICE_CHANNEL(ch12_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 12);
++DEVICE_CHANNEL(ch13_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 13);
++DEVICE_CHANNEL(ch14_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 14);
++DEVICE_CHANNEL(ch15_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 15);
+ /* Total possible dynamic ce_count attribute file table */
+ static struct attribute *dynamic_csrow_ce_count_attr[] = {
+@@ -363,6 +383,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = {
+       &dev_attr_legacy_ch9_ce_count.attr.attr,
+       &dev_attr_legacy_ch10_ce_count.attr.attr,
+       &dev_attr_legacy_ch11_ce_count.attr.attr,
++      &dev_attr_legacy_ch12_ce_count.attr.attr,
++      &dev_attr_legacy_ch13_ce_count.attr.attr,
++      &dev_attr_legacy_ch14_ce_count.attr.attr,
++      &dev_attr_legacy_ch15_ce_count.attr.attr,
+       NULL
+ };
+-- 
+2.51.0
+
diff --git a/queue-6.12/perf-have-get_perf_callchain-return-null-if-crosstas.patch b/queue-6.12/perf-have-get_perf_callchain-return-null-if-crosstas.patch
new file mode 100644 (file)
index 0000000..b781a48
--- /dev/null
@@ -0,0 +1,68 @@
+From 52f1a40706a3d0a7fe1f00e045735ecd4d752fa9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 14:03:40 -0400
+Subject: perf: Have get_perf_callchain() return NULL if crosstask and user are
+ set
+
+From: Josh Poimboeuf <jpoimboe@kernel.org>
+
+[ Upstream commit 153f9e74dec230f2e070e16fa061bc7adfd2c450 ]
+
+get_perf_callchain() doesn't support cross-task unwinding for user space
+stacks, so have it return NULL if both the crosstask and user arguments are
+set.
+
+Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20250820180428.426423415@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/callchain.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
+index d1a09e6f514c9..49d87e6db553f 100644
+--- a/kernel/events/callchain.c
++++ b/kernel/events/callchain.c
+@@ -223,6 +223,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+       struct perf_callchain_entry_ctx ctx;
+       int rctx, start_entry_idx;
++      /* crosstask is not supported for user stacks */
++      if (crosstask && user && !kernel)
++              return NULL;
++
+       entry = get_callchain_entry(&rctx);
+       if (!entry)
+               return NULL;
+@@ -239,7 +243,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               perf_callchain_kernel(&ctx, regs);
+       }
+-      if (user) {
++      if (user && !crosstask) {
+               if (!user_mode(regs)) {
+                       if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+                               regs = NULL;
+@@ -248,9 +252,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               }
+               if (regs) {
+-                      if (crosstask)
+-                              goto exit_put;
+-
+                       if (add_mark)
+                               perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+@@ -260,7 +261,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               }
+       }
+-exit_put:
+       put_callchain_entry(rctx);
+       return entry;
+-- 
+2.51.0
+
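For readers following the logic rather than the diff, here is a minimal user-space C sketch of the guard this patch adds (the function name and the simplified boolean signature are illustrative only, not the kernel API): a cross-task request that asks only for the user stack is refused up front, instead of after a callchain entry has already been allocated.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of the early-return rule added to get_perf_callchain(). */
static bool callchain_possible(bool kernel, bool user, bool crosstask)
{
	/* Cross-task unwinding is not supported for user stacks. */
	if (crosstask && user && !kernel)
		return false;
	return true;
}

int main(void)
{
	/* kernel, user, crosstask */
	const bool cases[][3] = {
		{ false, true, true  },	/* user-only cross-task: rejected            */
		{ true,  true, true  },	/* kernel part present: only kernel unwound  */
		{ false, true, false },	/* same-task user unwind: allowed            */
	};

	for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("kernel=%d user=%d crosstask=%d -> %s\n",
		       cases[i][0], cases[i][1], cases[i][2],
		       callchain_possible(cases[i][0], cases[i][1], cases[i][2]) ?
		       "unwind" : "NULL");
	return 0;
}
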
diff --git a/queue-6.12/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch b/queue-6.12/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch
new file mode 100644 (file)
index 0000000..8cf342b
--- /dev/null
@@ -0,0 +1,37 @@
+From a41a16b8fdc8a8c7396811aa98fdc15b7d36235f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 14:03:43 -0400
+Subject: perf: Skip user unwind if the task is a kernel thread
+
+From: Josh Poimboeuf <jpoimboe@kernel.org>
+
+[ Upstream commit 16ed389227651330879e17bd83d43bd234006722 ]
+
+If the task is not a user thread, there's no user stack to unwind.
+
+Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20250820180428.930791978@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/core.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index 0339f60e34981..d6a86d8e9e59b 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -7847,7 +7847,8 @@ struct perf_callchain_entry *
+ perf_callchain(struct perf_event *event, struct pt_regs *regs)
+ {
+       bool kernel = !event->attr.exclude_callchain_kernel;
+-      bool user   = !event->attr.exclude_callchain_user;
++      bool user   = !event->attr.exclude_callchain_user &&
++              !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
+       /* Disallow cross-task user callchains. */
+       bool crosstask = event->ctx->task && event->ctx->task != current;
+       const u32 max_stack = event->attr.sample_max_stack;
+-- 
+2.51.0
+
diff --git a/queue-6.12/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch b/queue-6.12/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch
new file mode 100644 (file)
index 0000000..459f745
--- /dev/null
@@ -0,0 +1,67 @@
+From 5513a16de55c2fc4c77969e1300f73c3962cc296 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 14:03:41 -0400
+Subject: perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of
+ current->mm == NULL
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+[ Upstream commit 90942f9fac05702065ff82ed0bade0d08168d4ea ]
+
+To determine if a task is a kernel thread or not, it is more reliable to
+use (current->flags & (PF_KTHREAD|PF_USER_WORKER)) than to rely on
+current->mm being NULL.  That is because some kernel tasks (io_uring
+helpers) may have an mm field set.
+
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20250820180428.592367294@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/callchain.c | 6 +++---
+ kernel/events/core.c      | 4 ++--
+ 2 files changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
+index 8a47e52a454f4..d1a09e6f514c9 100644
+--- a/kernel/events/callchain.c
++++ b/kernel/events/callchain.c
+@@ -241,10 +241,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+       if (user) {
+               if (!user_mode(regs)) {
+-                      if  (current->mm)
+-                              regs = task_pt_regs(current);
+-                      else
++                      if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+                               regs = NULL;
++                      else
++                              regs = task_pt_regs(current);
+               }
+               if (regs) {
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index d60d48d482b01..0339f60e34981 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -7095,7 +7095,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
+       if (user_mode(regs)) {
+               regs_user->abi = perf_reg_abi(current);
+               regs_user->regs = regs;
+-      } else if (!(current->flags & PF_KTHREAD)) {
++      } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+               perf_get_regs_user(regs_user, regs);
+       } else {
+               regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+@@ -7735,7 +7735,7 @@ static u64 perf_virt_to_phys(u64 virt)
+                * Try IRQ-safe get_user_page_fast_only first.
+                * If failed, leave phys_addr as 0.
+                */
+-              if (current->mm != NULL) {
++              if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+                       struct page *p;
+                       pagefault_disable();
+-- 
+2.51.0
+
diff --git a/queue-6.12/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch b/queue-6.12/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch
new file mode 100644 (file)
index 0000000..e778d5c
--- /dev/null
@@ -0,0 +1,101 @@
+From 5d4fbce76c5677e241e63851a259064ad6435df3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 10:30:31 +0800
+Subject: perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into
+ INTEL_FIXED_BITS_MASK
+
+From: Dapeng Mi <dapeng1.mi@linux.intel.com>
+
+[ Upstream commit 2676dbf9f4fb7f6739d1207c0f1deaf63124642a ]
+
+ICL_FIXED_0_ADAPTIVE is missing from INTEL_FIXED_BITS_MASK, so
+add it.
+
+With the help of this new INTEL_FIXED_BITS_MASK, intel_pmu_enable_fixed()
+can be optimized: the old fixed counter control bits can be unconditionally
+cleared with INTEL_FIXED_BITS_MASK and the new control bits then set based
+on the new configuration.
+
+Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
+Tested-by: Yi Lai <yi1.lai@intel.com>
+Link: https://lore.kernel.org/r/20250820023032.17128-7-dapeng1.mi@linux.intel.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/events/intel/core.c      | 10 +++-------
+ arch/x86/include/asm/perf_event.h |  6 +++++-
+ arch/x86/kvm/pmu.h                |  2 +-
+ 3 files changed, 9 insertions(+), 9 deletions(-)
+
+diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
+index 36d8404f406de..acc0774519ce2 100644
+--- a/arch/x86/events/intel/core.c
++++ b/arch/x86/events/intel/core.c
+@@ -2812,8 +2812,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
+ {
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+-      u64 mask, bits = 0;
+       int idx = hwc->idx;
++      u64 bits = 0;
+       if (is_topdown_idx(idx)) {
+               struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+@@ -2849,14 +2849,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
+       idx -= INTEL_PMC_IDX_FIXED;
+       bits = intel_fixed_bits_by_idx(idx, bits);
+-      mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
+-
+-      if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
++      if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip)
+               bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
+-              mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
+-      }
+-      cpuc->fixed_ctrl_val &= ~mask;
++      cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
+       cpuc->fixed_ctrl_val |= bits;
+ }
+diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
+index aa351c4a20eee..c69b6498f6eaa 100644
+--- a/arch/x86/include/asm/perf_event.h
++++ b/arch/x86/include/asm/perf_event.h
+@@ -35,7 +35,6 @@
+ #define ARCH_PERFMON_EVENTSEL_EQ                      (1ULL << 36)
+ #define ARCH_PERFMON_EVENTSEL_UMASK2                  (0xFFULL << 40)
+-#define INTEL_FIXED_BITS_MASK                         0xFULL
+ #define INTEL_FIXED_BITS_STRIDE                       4
+ #define INTEL_FIXED_0_KERNEL                          (1ULL << 0)
+ #define INTEL_FIXED_0_USER                            (1ULL << 1)
+@@ -47,6 +46,11 @@
+ #define ICL_EVENTSEL_ADAPTIVE                         (1ULL << 34)
+ #define ICL_FIXED_0_ADAPTIVE                          (1ULL << 32)
++#define INTEL_FIXED_BITS_MASK                                 \
++      (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER |            \
++       INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI |   \
++       ICL_FIXED_0_ADAPTIVE)
++
+ #define intel_fixed_bits_by_idx(_idx, _bits)                  \
+       ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))
+diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
+index ad89d0bd60058..103604c4b33b5 100644
+--- a/arch/x86/kvm/pmu.h
++++ b/arch/x86/kvm/pmu.h
+@@ -13,7 +13,7 @@
+ #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
+                                         MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
+-/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
++/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */
+ #define fixed_ctrl_field(ctrl_reg, idx) \
+       (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
+-- 
+2.51.0
+
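For illustration, a small stand-alone C program that mirrors the macros introduced by the patch (the constants and the idx-based shift are copied from the hunk above; the program itself is only a sketch, not kernel code). It shows that clearing with the widened INTEL_FIXED_BITS_MASK also wipes a previously set adaptive bit before the new control bits are OR-ed in, which is what lets intel_pmu_enable_fixed() drop its separate mask variable.

#include <stdio.h>
#include <stdint.h>

/* Constants as defined in the patch (arch/x86/include/asm/perf_event.h). */
#define INTEL_FIXED_BITS_STRIDE   4
#define INTEL_FIXED_0_KERNEL      (1ULL << 0)
#define INTEL_FIXED_0_USER        (1ULL << 1)
#define INTEL_FIXED_0_ANYTHREAD   (1ULL << 2)
#define INTEL_FIXED_0_ENABLE_PMI  (1ULL << 3)
#define ICL_FIXED_0_ADAPTIVE      (1ULL << 32)

#define INTEL_FIXED_BITS_MASK \
	(INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \
	 INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \
	 ICL_FIXED_0_ADAPTIVE)

#define intel_fixed_bits_by_idx(idx, bits) \
	((bits) << ((idx) * INTEL_FIXED_BITS_STRIDE))

int main(void)
{
	uint64_t ctrl = ~0ULL;	/* pretend every control bit is currently set */
	int idx = 1;		/* fixed counter 1 */
	uint64_t bits = INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER |
			ICL_FIXED_0_ADAPTIVE;

	/* Single-mask update, as done in intel_pmu_enable_fixed() after the patch. */
	ctrl &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
	ctrl |= intel_fixed_bits_by_idx(idx, bits);

	printf("mask     = 0x%llx\n", (unsigned long long)INTEL_FIXED_BITS_MASK);
	printf("new ctrl = 0x%llx\n", (unsigned long long)ctrl);
	return 0;
}
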
diff --git a/queue-6.12/sched_ext-make-qmap-dump-operation-non-destructive.patch b/queue-6.12/sched_ext-make-qmap-dump-operation-non-destructive.patch
new file mode 100644 (file)
index 0000000..55de656
--- /dev/null
@@ -0,0 +1,70 @@
+From cbcae6872c4865b170a63648115d9a0c9d6b8783 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Sep 2025 09:03:26 -1000
+Subject: sched_ext: Make qmap dump operation non-destructive
+
+From: Tejun Heo <tj@kernel.org>
+
+[ Upstream commit d452972858e5cfa4262320ab74fe8f016460b96f ]
+
+The qmap dump operation was destructively consuming queue entries while
+displaying them. As dump can be triggered anytime, this can easily lead to
+stalls. Add a temporary dump_store queue and modify the dump logic to pop
+entries, display them, and then restore them back to the original queue.
+This allows dump operations to be performed without affecting the
+scheduler's queue state.
+
+Note that if racing against new enqueues during dump, ordering can get
+mixed up, but this is acceptable for debugging purposes.
+
+Acked-by: Andrea Righi <arighi@nvidia.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/sched_ext/scx_qmap.bpf.c | 18 +++++++++++++++++-
+ 1 file changed, 17 insertions(+), 1 deletion(-)
+
+diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
+index 5d1f880d1149e..e952f525599bd 100644
+--- a/tools/sched_ext/scx_qmap.bpf.c
++++ b/tools/sched_ext/scx_qmap.bpf.c
+@@ -56,7 +56,8 @@ struct qmap {
+   queue1 SEC(".maps"),
+   queue2 SEC(".maps"),
+   queue3 SEC(".maps"),
+-  queue4 SEC(".maps");
++  queue4 SEC(".maps"),
++  dump_store SEC(".maps");
+ struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+@@ -578,11 +579,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
+                       return;
+               scx_bpf_dump("QMAP FIFO[%d]:", i);
++
++              /*
++               * Dump can be invoked anytime and there is no way to iterate in
++               * a non-destructive way. Pop and store in dump_store and then
++               * restore afterwards. If racing against new enqueues, ordering
++               * can get mixed up.
++               */
+               bpf_repeat(4096) {
+                       if (bpf_map_pop_elem(fifo, &pid))
+                               break;
++                      bpf_map_push_elem(&dump_store, &pid, 0);
+                       scx_bpf_dump(" %d", pid);
+               }
++
++              bpf_repeat(4096) {
++                      if (bpf_map_pop_elem(&dump_store, &pid))
++                              break;
++                      bpf_map_push_elem(fifo, &pid, 0);
++              }
++
+               scx_bpf_dump("\n");
+       }
+ }
+-- 
+2.51.0
+
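The pop-then-restore idea is generic. Below is a minimal user-space C sketch of the same pattern on a plain array-backed FIFO (the queue type and helpers are invented for illustration; the real patch uses BPF queue maps with bpf_map_pop_elem()/bpf_map_push_elem()): entries are drained into a scratch queue so they can be printed, then pushed back, keeping the dump non-destructive at the cost of possible reordering against concurrent enqueues.

#include <stdbool.h>
#include <stdio.h>

#define QCAP 16

struct fifo {
	int buf[QCAP];
	int head, tail, len;
};

static bool fifo_push(struct fifo *q, int v)
{
	if (q->len == QCAP)
		return false;
	q->buf[q->tail] = v;
	q->tail = (q->tail + 1) % QCAP;
	q->len++;
	return true;
}

static bool fifo_pop(struct fifo *q, int *v)
{
	if (q->len == 0)
		return false;
	*v = q->buf[q->head];
	q->head = (q->head + 1) % QCAP;
	q->len--;
	return true;
}

/* Non-destructive dump: drain into a scratch queue, print, then restore. */
static void fifo_dump(struct fifo *q)
{
	struct fifo scratch = { 0 };
	int v;

	printf("FIFO:");
	while (fifo_pop(q, &v)) {
		fifo_push(&scratch, v);
		printf(" %d", v);
	}
	while (fifo_pop(&scratch, &v))
		fifo_push(q, v);
	printf("\n");
}

int main(void)
{
	struct fifo q = { 0 };

	for (int pid = 100; pid < 105; pid++)
		fifo_push(&q, pid);
	fifo_dump(&q);
	fifo_dump(&q);	/* still prints the same entries */
	return 0;
}
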
diff --git a/queue-6.12/seccomp-passthrough-uprobe-systemcall-without-filter.patch b/queue-6.12/seccomp-passthrough-uprobe-systemcall-without-filter.patch
new file mode 100644 (file)
index 0000000..ba43d7f
--- /dev/null
@@ -0,0 +1,85 @@
+From 5d620bd1214b6de59a55396c50bcce763e0e69e6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 20 Jul 2025 13:21:30 +0200
+Subject: seccomp: passthrough uprobe systemcall without filtering
+
+From: Jiri Olsa <jolsa@kernel.org>
+
+[ Upstream commit 89d1d8434d246c96309a6068dfcf9e36dc61227b ]
+
+Add uprobe as another exception to the seccomp filter, alongside the
+uretprobe syscall.
+
+Like uretprobe, the uprobe syscall is installed by the kernel as a
+replacement for the breakpoint exception, is limited to the x86_64
+architecture, and isn't expected to ever be supported on i386.
+
+Signed-off-by: Jiri Olsa <jolsa@kernel.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Kees Cook <kees@kernel.org>
+Link: https://lore.kernel.org/r/20250720112133.244369-21-jolsa@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/seccomp.c | 32 +++++++++++++++++++++++++-------
+ 1 file changed, 25 insertions(+), 7 deletions(-)
+
+diff --git a/kernel/seccomp.c b/kernel/seccomp.c
+index 267b00005eaf2..1eac0d2b8ecbe 100644
+--- a/kernel/seccomp.c
++++ b/kernel/seccomp.c
+@@ -733,6 +733,26 @@ seccomp_prepare_user_filter(const char __user *user_filter)
+ }
+ #ifdef SECCOMP_ARCH_NATIVE
++static bool seccomp_uprobe_exception(struct seccomp_data *sd)
++{
++#if defined __NR_uretprobe || defined __NR_uprobe
++#ifdef SECCOMP_ARCH_COMPAT
++      if (sd->arch == SECCOMP_ARCH_NATIVE)
++#endif
++      {
++#ifdef __NR_uretprobe
++              if (sd->nr == __NR_uretprobe)
++                      return true;
++#endif
++#ifdef __NR_uprobe
++              if (sd->nr == __NR_uprobe)
++                      return true;
++#endif
++      }
++#endif
++      return false;
++}
++
+ /**
+  * seccomp_is_const_allow - check if filter is constant allow with given data
+  * @fprog: The BPF programs
+@@ -750,13 +770,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
+               return false;
+       /* Our single exception to filtering. */
+-#ifdef __NR_uretprobe
+-#ifdef SECCOMP_ARCH_COMPAT
+-      if (sd->arch == SECCOMP_ARCH_NATIVE)
+-#endif
+-              if (sd->nr == __NR_uretprobe)
+-                      return true;
+-#endif
++      if (seccomp_uprobe_exception(sd))
++              return true;
+       for (pc = 0; pc < fprog->len; pc++) {
+               struct sock_filter *insn = &fprog->filter[pc];
+@@ -1034,6 +1049,9 @@ static const int mode1_syscalls[] = {
+       __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
+ #ifdef __NR_uretprobe
+       __NR_uretprobe,
++#endif
++#ifdef __NR_uprobe
++      __NR_uprobe,
+ #endif
+       -1, /* negative terminated */
+ };
+-- 
+2.51.0
+
index 64bc2c1eea0a4e8a555bb14ebe15152b3d8d903c..128ef58b138764d40f5018cb4df8d232233f8908 100644 (file)
@@ -1 +1,23 @@
 net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch
+audit-record-fanotify-event-regardless-of-presence-o.patch
+perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch
+perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch
+perf-have-get_perf_callchain-return-null-if-crosstas.patch
+perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch
+seccomp-passthrough-uprobe-systemcall-without-filter.patch
+x86-bugs-report-correct-retbleed-mitigation-status.patch
+x86-bugs-fix-reporting-of-lfence-retpoline.patch
+edac-mc_sysfs-increase-legacy-channel-support-to-16.patch
+cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch
+btrfs-abort-transaction-on-specific-error-places-whe.patch
+btrfs-abort-transaction-in-the-process_one_buffer-lo.patch
+btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch
+btrfs-zoned-refine-extent-allocator-hint-selection.patch
+btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch
+btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
+btrfs-use-level-argument-in-log-tree-walk-callback-r.patch
+btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch
+btrfs-tree-checker-add-inode-extref-checks.patch
+btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
+sched_ext-make-qmap-dump-operation-non-destructive.patch
+arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch
diff --git a/queue-6.12/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-6.12/x86-bugs-fix-reporting-of-lfence-retpoline.patch
new file mode 100644 (file)
index 0000000..73653ac
--- /dev/null
@@ -0,0 +1,51 @@
+From a036c27a3f7cf40ed8c2a1810edd5984a792ad7c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Sep 2025 08:47:05 -0500
+Subject: x86/bugs: Fix reporting of LFENCE retpoline
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ]
+
+The LFENCE retpoline mitigation is not secure but the kernel prints
+inconsistent messages about this fact.  The dmesg log says 'Mitigation:
+LFENCE', implying the system is mitigated.  But sysfs reports 'Vulnerable:
+LFENCE' implying the system (correctly) is not mitigated.
+
+Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere
+when this mitigation is selected.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index 0c16457e06543..939401b5d2ef0 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1598,7 +1598,7 @@ spectre_v2_user_select_mitigation(void)
+ static const char * const spectre_v2_strings[] = {
+       [SPECTRE_V2_NONE]                       = "Vulnerable",
+       [SPECTRE_V2_RETPOLINE]                  = "Mitigation: Retpolines",
+-      [SPECTRE_V2_LFENCE]                     = "Mitigation: LFENCE",
++      [SPECTRE_V2_LFENCE]                     = "Vulnerable: LFENCE",
+       [SPECTRE_V2_EIBRS]                      = "Mitigation: Enhanced / Automatic IBRS",
+       [SPECTRE_V2_EIBRS_LFENCE]               = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+       [SPECTRE_V2_EIBRS_RETPOLINE]            = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
+@@ -3251,9 +3251,6 @@ static const char *spectre_bhi_state(void)
+ static ssize_t spectre_v2_show_state(char *buf)
+ {
+-      if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+-              return sysfs_emit(buf, "Vulnerable: LFENCE\n");
+-
+       if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+               return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+-- 
+2.51.0
+
diff --git a/queue-6.12/x86-bugs-report-correct-retbleed-mitigation-status.patch b/queue-6.12/x86-bugs-report-correct-retbleed-mitigation-status.patch
new file mode 100644 (file)
index 0000000..b7db3ac
--- /dev/null
@@ -0,0 +1,46 @@
+From f6920d5581ee7b7a5f0f69ff94e67cf03510b89e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Sep 2025 08:47:06 -0500
+Subject: x86/bugs: Report correct retbleed mitigation status
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit 930f2361fe542a00de9ce6070b1b6edb976f1165 ]
+
+On Intel CPUs, the default retbleed mitigation is IBRS/eIBRS but this
+requires that a similar spectre_v2 mitigation is applied.  If the user
+selects a different spectre_v2 mitigation (like spectre_v2=retpoline) a
+warning is printed but sysfs will still report 'Mitigation: IBRS' or
+'Mitigation: Enhanced IBRS'.  This is incorrect because retbleed is not
+mitigated, and IBRS is not actually set.
+
+Fix this by choosing RETBLEED_MITIGATION_NONE in this scenario so the
+kernel correctly reports the system as vulnerable to retbleed.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index f3cb559a598df..0c16457e06543 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1186,8 +1186,10 @@ static void __init retbleed_select_mitigation(void)
+                       retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+                       break;
+               default:
+-                      if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
++                      if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) {
+                               pr_err(RETBLEED_INTEL_MSG);
++                              retbleed_mitigation = RETBLEED_MITIGATION_NONE;
++                      }
+               }
+       }
+-- 
+2.51.0
+
diff --git a/queue-6.17/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch b/queue-6.17/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch
new file mode 100644 (file)
index 0000000..51a10b9
--- /dev/null
@@ -0,0 +1,295 @@
+From 09755c95623ff205c25cd78c7f08e57b187626b1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Sep 2025 14:09:13 +0800
+Subject: arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c
+
+From: Menglong Dong <menglong8.dong@gmail.com>
+
+[ Upstream commit 35561bab768977c9e05f1f1a9bc00134c85f3e28 ]
+
+The include/generated/asm-offsets.h is generated in Kbuild during
+compiling from arch/SRCARCH/kernel/asm-offsets.c. When we want to
+generate another similar offset header file, a circular dependency can
+arise.
+
+For example, say we want to generate an offsets file
+include/generated/test.h, which is included in include/sched/sched.h. If
+we generate asm-offsets.h first, it will fail, as include/sched/sched.h
+is included in asm-offsets.c and include/generated/test.h doesn't exist
+yet; if we generate test.h first, it can't succeed either, as
+include/generated/asm-offsets.h is included by it.
+
+On x86_64, the macro COMPILE_OFFSETS is used to avoid such a circular
+dependency: we can generate asm-offsets.h first, and if COMPILE_OFFSETS
+is defined, we don't include "generated/test.h".
+
+So define the macro COMPILE_OFFSETS in all the asm-offsets.c files for
+this purpose.
+
+Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/alpha/kernel/asm-offsets.c      | 1 +
+ arch/arc/kernel/asm-offsets.c        | 1 +
+ arch/arm/kernel/asm-offsets.c        | 2 ++
+ arch/arm64/kernel/asm-offsets.c      | 1 +
+ arch/csky/kernel/asm-offsets.c       | 1 +
+ arch/hexagon/kernel/asm-offsets.c    | 1 +
+ arch/loongarch/kernel/asm-offsets.c  | 2 ++
+ arch/m68k/kernel/asm-offsets.c       | 1 +
+ arch/microblaze/kernel/asm-offsets.c | 1 +
+ arch/mips/kernel/asm-offsets.c       | 2 ++
+ arch/nios2/kernel/asm-offsets.c      | 1 +
+ arch/openrisc/kernel/asm-offsets.c   | 1 +
+ arch/parisc/kernel/asm-offsets.c     | 1 +
+ arch/powerpc/kernel/asm-offsets.c    | 1 +
+ arch/riscv/kernel/asm-offsets.c      | 1 +
+ arch/s390/kernel/asm-offsets.c       | 1 +
+ arch/sh/kernel/asm-offsets.c         | 1 +
+ arch/sparc/kernel/asm-offsets.c      | 1 +
+ arch/um/kernel/asm-offsets.c         | 2 ++
+ arch/xtensa/kernel/asm-offsets.c     | 1 +
+ 20 files changed, 24 insertions(+)
+
+diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c
+index e9dad60b147f3..1ebb058904992 100644
+--- a/arch/alpha/kernel/asm-offsets.c
++++ b/arch/alpha/kernel/asm-offsets.c
+@@ -4,6 +4,7 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/types.h>
+ #include <linux/stddef.h>
+diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c
+index f77deb7991757..2978da85fcb65 100644
+--- a/arch/arc/kernel/asm-offsets.c
++++ b/arch/arc/kernel/asm-offsets.c
+@@ -2,6 +2,7 @@
+ /*
+  * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
+  */
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
+index 123f4a8ef4466..2101938d27fcb 100644
+--- a/arch/arm/kernel/asm-offsets.c
++++ b/arch/arm/kernel/asm-offsets.c
+@@ -7,6 +7,8 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/compiler.h>
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
+index 30d4bbe68661f..b6367ff3a49ca 100644
+--- a/arch/arm64/kernel/asm-offsets.c
++++ b/arch/arm64/kernel/asm-offsets.c
+@@ -6,6 +6,7 @@
+  *               2001-2002 Keith Owens
+  * Copyright (C) 2012 ARM Ltd.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/arm_sdei.h>
+ #include <linux/sched.h>
+diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c
+index d1e9035794733..5525c8e7e1d9e 100644
+--- a/arch/csky/kernel/asm-offsets.c
++++ b/arch/csky/kernel/asm-offsets.c
+@@ -1,5 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0
+ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/kernel_stat.h>
+diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c
+index 03a7063f94561..50eea9fa6f137 100644
+--- a/arch/hexagon/kernel/asm-offsets.c
++++ b/arch/hexagon/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  *
+  * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/compat.h>
+ #include <linux/types.h>
+diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c
+index db1e4bb26b6a0..3017c71576009 100644
+--- a/arch/loongarch/kernel/asm-offsets.c
++++ b/arch/loongarch/kernel/asm-offsets.c
+@@ -4,6 +4,8 @@
+  *
+  * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/types.h>
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c
+index 906d732305374..67a1990f9d748 100644
+--- a/arch/m68k/kernel/asm-offsets.c
++++ b/arch/m68k/kernel/asm-offsets.c
+@@ -9,6 +9,7 @@
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #define ASM_OFFSETS_C
+ #include <linux/stddef.h>
+diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c
+index 104c3ac5f30c8..b4b67d58e7f6a 100644
+--- a/arch/microblaze/kernel/asm-offsets.c
++++ b/arch/microblaze/kernel/asm-offsets.c
+@@ -7,6 +7,7 @@
+  * License. See the file "COPYING" in the main directory of this archive
+  * for more details.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/init.h>
+ #include <linux/stddef.h>
+diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
+index 1e29efcba46e5..5debd9a3854a9 100644
+--- a/arch/mips/kernel/asm-offsets.c
++++ b/arch/mips/kernel/asm-offsets.c
+@@ -9,6 +9,8 @@
+  * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com
+  * Copyright (C) 2000 MIPS Technologies, Inc.
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/compat.h>
+ #include <linux/types.h>
+ #include <linux/sched.h>
+diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c
+index e3d9b7b6fb48a..88190b503ce5d 100644
+--- a/arch/nios2/kernel/asm-offsets.c
++++ b/arch/nios2/kernel/asm-offsets.c
+@@ -2,6 +2,7 @@
+ /*
+  * Copyright (C) 2011 Tobias Klauser <tklauser@distanz.ch>
+  */
++#define COMPILE_OFFSETS
+ #include <linux/stddef.h>
+ #include <linux/sched.h>
+diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c
+index 710651d5aaae1..3cc826f2216b1 100644
+--- a/arch/openrisc/kernel/asm-offsets.c
++++ b/arch/openrisc/kernel/asm-offsets.c
+@@ -18,6 +18,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/signal.h>
+ #include <linux/sched.h>
+diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c
+index 757816a7bd4b2..9abfe65492c65 100644
+--- a/arch/parisc/kernel/asm-offsets.c
++++ b/arch/parisc/kernel/asm-offsets.c
+@@ -13,6 +13,7 @@
+  *    Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org>
+  *    Copyright (C) 2003 James Bottomley <jejb at parisc-linux.org>
+  */
++#define COMPILE_OFFSETS
+ #include <linux/types.h>
+ #include <linux/sched.h>
+diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
+index b3048f6d3822c..a4bc80b30410a 100644
+--- a/arch/powerpc/kernel/asm-offsets.c
++++ b/arch/powerpc/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/compat.h>
+ #include <linux/signal.h>
+diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
+index 6e8c0d6feae9e..7d42d3b8a32a7 100644
+--- a/arch/riscv/kernel/asm-offsets.c
++++ b/arch/riscv/kernel/asm-offsets.c
+@@ -3,6 +3,7 @@
+  * Copyright (C) 2012 Regents of the University of California
+  * Copyright (C) 2017 SiFive
+  */
++#define COMPILE_OFFSETS
+ #include <linux/kbuild.h>
+ #include <linux/mm.h>
+diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
+index 95ecad9c7d7d2..a8915663e917f 100644
+--- a/arch/s390/kernel/asm-offsets.c
++++ b/arch/s390/kernel/asm-offsets.c
+@@ -4,6 +4,7 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/kbuild.h>
+ #include <linux/sched.h>
+diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c
+index a0322e8328456..429b6a7631468 100644
+--- a/arch/sh/kernel/asm-offsets.c
++++ b/arch/sh/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/stddef.h>
+ #include <linux/types.h>
+diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c
+index 3d9b9855dce91..6e660bde48dd8 100644
+--- a/arch/sparc/kernel/asm-offsets.c
++++ b/arch/sparc/kernel/asm-offsets.c
+@@ -10,6 +10,7 @@
+  *
+  * On sparc, thread_info data is static and TI_XXX offsets are computed by hand.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/mm_types.h>
+diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c
+index 1fb12235ab9c8..a69873aa697f4 100644
+--- a/arch/um/kernel/asm-offsets.c
++++ b/arch/um/kernel/asm-offsets.c
+@@ -1 +1,3 @@
++#define COMPILE_OFFSETS
++
+ #include <sysdep/kernel-offsets.h>
+diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c
+index da38de20ae598..cfbced95e944a 100644
+--- a/arch/xtensa/kernel/asm-offsets.c
++++ b/arch/xtensa/kernel/asm-offsets.c
+@@ -11,6 +11,7 @@
+  *
+  * Chris Zankel <chris@zankel.net>
+  */
++#define COMPILE_OFFSETS
+ #include <asm/processor.h>
+ #include <asm/coprocessor.h>
+-- 
+2.51.0
+
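A hedged, single-file illustration of the guard this macro enables (every name except COMPILE_OFFSETS is hypothetical, chosen to mirror the test.h example in the commit message): when COMPILE_OFFSETS is defined, code that would normally depend on a generated offsets header compiles without it, which is exactly what breaks the circular dependency during the offsets-generation pass.

/*
 * Compile twice to see both paths:
 *
 *   cc -o normal_build offsets_guard.c                  # normal build
 *   cc -DCOMPILE_OFFSETS -o offsets_pass offsets_guard.c  # generation pass
 */
#include <stdio.h>

#ifndef COMPILE_OFFSETS
/* In a real build this value would come from include/generated/test.h. */
#define TEST_FOO_OFFSET 8
#endif

int main(void)
{
#ifndef COMPILE_OFFSETS
	printf("normal build: TEST_FOO_OFFSET = %d\n", TEST_FOO_OFFSET);
#else
	printf("offsets-generation pass: generated header not needed\n");
#endif
	return 0;
}
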
diff --git a/queue-6.17/audit-record-fanotify-event-regardless-of-presence-o.patch b/queue-6.17/audit-record-fanotify-event-regardless-of-presence-o.patch
new file mode 100644 (file)
index 0000000..13f6234
--- /dev/null
@@ -0,0 +1,44 @@
+From a6297c943b52f9458b8ca489dfbe9bfdd26dce75 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 17:04:07 -0400
+Subject: audit: record fanotify event regardless of presence of rules
+
+From: Richard Guy Briggs <rgb@redhat.com>
+
+[ Upstream commit ce8370e2e62a903e18be7dd0e0be2eee079501e1 ]
+
+When no audit rules are in place, fanotify event results are
+unconditionally dropped due to an explicit check for the existence of
+any audit rules.  Given this is a report from another security
+sub-system, allow it to be recorded regardless of the existence of any
+audit rules.
+
+To test, install and run the fapolicyd daemon with default config.  Then
+as an unprivileged user, create and run a very simple binary that should
+be denied.  Then check for an event with
+       ausearch -m FANOTIFY -ts recent
+
+Link: https://issues.redhat.com/browse/RHEL-9065
+Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
+Signed-off-by: Paul Moore <paul@paul-moore.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/audit.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/linux/audit.h b/include/linux/audit.h
+index a394614ccd0b8..e3f06eba9c6e6 100644
+--- a/include/linux/audit.h
++++ b/include/linux/audit.h
+@@ -527,7 +527,7 @@ static inline void audit_log_kern_module(const char *name)
+ static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
+ {
+-      if (!audit_dummy_context())
++      if (audit_enabled)
+               __audit_fanotify(response, friar);
+ }
+-- 
+2.51.0
+
diff --git a/queue-6.17/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch b/queue-6.17/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch
new file mode 100644 (file)
index 0000000..c61e15c
--- /dev/null
@@ -0,0 +1,39 @@
+From 496727f1f5bd4315290c755db82f0460635f17b2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Sep 2025 17:43:04 +0100
+Subject: btrfs: abort transaction if we fail to update inode in log replay dir
+ fixup
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 5a0565cad3ef7cbf4cf43d1dd1e849b156205292 ]
+
+If we fail to update the inode at link_to_fixup_dir(), we don't abort the
+transaction and only propagate the error up the call chain, which makes it
+hard to pinpoint the error to the inode update. So abort the transaction if
+the inode update call fails, so that if it happens we know immediately.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-log.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 4f92aa15d9b1d..165d2ee500ca3 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -1796,6 +1796,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+               else
+                       inc_nlink(vfs_inode);
+               ret = btrfs_update_inode(trans, inode);
++              if (ret)
++                      btrfs_abort_transaction(trans, ret);
+       } else if (ret == -EEXIST) {
+               ret = 0;
+       }
+-- 
+2.51.0
+
diff --git a/queue-6.17/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch b/queue-6.17/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch
new file mode 100644 (file)
index 0000000..a8bb08e
--- /dev/null
@@ -0,0 +1,77 @@
+From b2c31af40dd6f88a468a8613c542de8306f31b47 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Jul 2025 15:49:31 +0100
+Subject: btrfs: abort transaction in the process_one_buffer() log tree walk
+ callback
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit e6dd405b6671b9753b98d8bdf76f8f0ed36c11cd ]
+
+In the process_one_buffer() log tree walk callback we return errors to the
+log tree walk caller and then the caller aborts the transaction, if we
+have one, or turns the fs into error state if we don't have one. While
+this reduces code it makes it harder to figure out where exactly an error
+came from. So add the transaction aborts after every failure inside the
+process_one_buffer() callback, so that it helps figuring out why failures
+happen.
+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-log.c | 20 ++++++++++++++++----
+ 1 file changed, 16 insertions(+), 4 deletions(-)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 6d92326a1a0c7..50ed84cb68a69 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -347,6 +347,7 @@ static int process_one_buffer(struct btrfs_root *log,
+                             struct extent_buffer *eb,
+                             struct walk_control *wc, u64 gen, int level)
+ {
++      struct btrfs_trans_handle *trans = wc->trans;
+       struct btrfs_fs_info *fs_info = log->fs_info;
+       int ret = 0;
+@@ -361,18 +362,29 @@ static int process_one_buffer(struct btrfs_root *log,
+               };
+               ret = btrfs_read_extent_buffer(eb, &check);
+-              if (ret)
++              if (ret) {
++                      if (trans)
++                              btrfs_abort_transaction(trans, ret);
++                      else
++                              btrfs_handle_fs_error(fs_info, ret, NULL);
+                       return ret;
++              }
+       }
+       if (wc->pin) {
+-              ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
+-              if (ret)
++              ASSERT(trans != NULL);
++              ret = btrfs_pin_extent_for_log_replay(trans, eb);
++              if (ret) {
++                      btrfs_abort_transaction(trans, ret);
+                       return ret;
++              }
+               if (btrfs_buffer_uptodate(eb, gen, 0) &&
+-                  btrfs_header_level(eb) == 0)
++                  btrfs_header_level(eb) == 0) {
+                       ret = btrfs_exclude_logged_extents(eb);
++                      if (ret)
++                              btrfs_abort_transaction(trans, ret);
++              }
+       }
+       return ret;
+ }
+-- 
+2.51.0
+
diff --git a/queue-6.17/btrfs-abort-transaction-on-specific-error-places-whe.patch b/queue-6.17/btrfs-abort-transaction-on-specific-error-places-whe.patch
new file mode 100644 (file)
index 0000000..71b71b0
--- /dev/null
@@ -0,0 +1,111 @@
+From b70c9bc307275209743b38f5f7c7507bef7b311d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Jul 2025 14:56:11 +0100
+Subject: btrfs: abort transaction on specific error places when walking log
+ tree
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 6ebd726b104fa99d47c0d45979e6a6109844ac18 ]
+
+We do several things while walking a log tree (for replaying and for
+freeing a log tree) like reading extent buffers and cleaning them up,
+but we don't immediately abort the transaction, or turn the fs into an
+error state, when one of these things fails. Instead we do the transaction
+abort or turn the fs into an error state in the caller of the entry point
+function that walks a log tree - walk_log_tree() - which means we don't
+get to know exactly where an error came from.
+
+Improve on this by doing a transaction abort / turning the fs into an error
+state after each such failure, so that when it happens we have a better
+understanding of where the failure comes from. This deliberately leaves
+the transaction abort / error state handling in the callers of
+walk_log_tree() in place, to ensure we don't get into an inconsistent state
+in case we forget to do it deeper in the call chain. It also deliberately
+does not do it after errors from the calls to the callback defined in
+struct walk_control::process_func(), as we will do that later in another
+patch.
+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-log.c | 33 ++++++++++++++++++++++++++++-----
+ 1 file changed, 28 insertions(+), 5 deletions(-)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 7a63afedd01e6..6d92326a1a0c7 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -2630,15 +2630,24 @@ static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+ static int clean_log_buffer(struct btrfs_trans_handle *trans,
+                           struct extent_buffer *eb)
+ {
++      int ret;
++
+       btrfs_tree_lock(eb);
+       btrfs_clear_buffer_dirty(trans, eb);
+       wait_on_extent_buffer_writeback(eb);
+       btrfs_tree_unlock(eb);
+-      if (trans)
+-              return btrfs_pin_reserved_extent(trans, eb);
++      if (trans) {
++              ret = btrfs_pin_reserved_extent(trans, eb);
++              if (ret)
++                      btrfs_abort_transaction(trans, ret);
++              return ret;
++      }
+-      return unaccount_log_buffer(eb->fs_info, eb->start);
++      ret = unaccount_log_buffer(eb->fs_info, eb->start);
++      if (ret)
++              btrfs_handle_fs_error(eb->fs_info, ret, NULL);
++      return ret;
+ }
+ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+@@ -2674,8 +2683,14 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+               next = btrfs_find_create_tree_block(fs_info, bytenr,
+                                                   btrfs_header_owner(cur),
+                                                   *level - 1);
+-              if (IS_ERR(next))
+-                      return PTR_ERR(next);
++              if (IS_ERR(next)) {
++                      ret = PTR_ERR(next);
++                      if (trans)
++                              btrfs_abort_transaction(trans, ret);
++                      else
++                              btrfs_handle_fs_error(fs_info, ret, NULL);
++                      return ret;
++              }
+               if (*level == 1) {
+                       ret = wc->process_func(root, next, wc, ptr_gen,
+@@ -2690,6 +2705,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+                               ret = btrfs_read_extent_buffer(next, &check);
+                               if (ret) {
+                                       free_extent_buffer(next);
++                                      if (trans)
++                                              btrfs_abort_transaction(trans, ret);
++                                      else
++                                              btrfs_handle_fs_error(fs_info, ret, NULL);
+                                       return ret;
+                               }
+@@ -2705,6 +2724,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+               ret = btrfs_read_extent_buffer(next, &check);
+               if (ret) {
+                       free_extent_buffer(next);
++                      if (trans)
++                              btrfs_abort_transaction(trans, ret);
++                      else
++                              btrfs_handle_fs_error(fs_info, ret, NULL);
+                       return ret;
+               }
+-- 
+2.51.0
+
diff --git a/queue-6.17/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-6.17/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
new file mode 100644 (file)
index 0000000..d35c4eb
--- /dev/null
@@ -0,0 +1,63 @@
+From 043347135f8442ac0b1bf5b6bfc12f2c35374b9c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Aug 2025 12:10:28 +0100
+Subject: btrfs: always drop log root tree reference in btrfs_replay_log()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ]
+
+Currently we have this odd behaviour:
+
+1) At btrfs_replay_log() we drop the reference of the log root tree if
+   the call to btrfs_recover_log_trees() failed;
+
+2) But if the call to btrfs_recover_log_trees() did not fail, we don't
+   drop the reference in btrfs_replay_log() - we expect that
+   btrfs_recover_log_trees() does it in case it returns success.
+
+Let's simplify this and make btrfs_replay_log() always drop the reference
+on the log root tree. Not only does this simplify the code, it's also what
+makes sense, since it's btrfs_replay_log() that grabbed the reference in
+the first place.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/disk-io.c  | 2 +-
+ fs/btrfs/tree-log.c | 1 -
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index 70fc4e7cc5a0e..0b02e36b30558 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2087,10 +2087,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+       /* returns with log_tree_root freed on success */
+       ret = btrfs_recover_log_trees(log_tree_root);
++      btrfs_put_root(log_tree_root);
+       if (ret) {
+               btrfs_handle_fs_error(fs_info, ret,
+                                     "Failed to recover log tree");
+-              btrfs_put_root(log_tree_root);
+               return ret;
+       }
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 50ed84cb68a69..518cd74191e77 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -7469,7 +7469,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+       log_root_tree->log_root = NULL;
+       clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
+-      btrfs_put_root(log_root_tree);
+       return 0;
+ error:
+-- 
+2.51.0
+
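The rule being applied is the usual reference-ownership convention: the function that grabbed the reference drops it, on every path. A toy user-space C sketch of the resulting caller shape (all types and helpers below are invented; only the structure mirrors btrfs_replay_log() after the patch):

#include <stdio.h>
#include <stdlib.h>

/* Toy refcounted object standing in for the log root tree. */
struct obj {
	int refs;
};

static struct obj *obj_get(struct obj *o) { o->refs++; return o; }

static void obj_put(struct obj *o)
{
	if (--o->refs == 0) {
		printf("freeing object\n");
		free(o);
	}
}

/*
 * Stand-in for btrfs_recover_log_trees(): it may succeed or fail, but it
 * never consumes the caller's reference.
 */
static int recover(struct obj *o, int simulate_error)
{
	(void)o;
	return simulate_error ? -5 : 0;	/* -EIO-style error code */
}

/* Caller shaped like btrfs_replay_log() after the patch: it took the
 * reference, so it drops it unconditionally. */
static int replay_log(struct obj *tree, int simulate_error)
{
	struct obj *ref = obj_get(tree);
	int ret = recover(ref, simulate_error);

	obj_put(ref);	/* dropped on both the success and the error path */
	return ret;
}

int main(void)
{
	struct obj *tree = calloc(1, sizeof(*tree));

	tree->refs = 1;	/* our own reference */
	printf("success path: %d\n", replay_log(tree, 0));
	printf("error path:   %d\n", replay_log(tree, 1));
	obj_put(tree);
	return 0;
}
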
diff --git a/queue-6.17/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch b/queue-6.17/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch
new file mode 100644 (file)
index 0000000..4de154a
--- /dev/null
@@ -0,0 +1,44 @@
+From 6961fd2310f25663e1cc6a8e7977438fa016289f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 1 Sep 2025 17:01:44 +0200
+Subject: btrfs: scrub: replace max_t()/min_t() with clamp() in
+ scrub_throttle_dev_io()
+
+From: Thorsten Blum <thorsten.blum@linux.dev>
+
+[ Upstream commit a7f3dfb8293c4cee99743132d69863a92e8f4875 ]
+
+Replace max_t() followed by min_t() with a single clamp().
+
+As was pointed out by David Laight in
+https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/
+the calculation may overflow u32 when the input value is too large, so
+clamp_t() is not used.  In practice the expected values are in the range of
+megabytes to gigabytes (the throughput limit), so the bug would not happen.
+
+Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ Use clamp() and add explanation. ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/scrub.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
+index 6776e6ab8d108..fd4c1ca34b5e4 100644
+--- a/fs/btrfs/scrub.c
++++ b/fs/btrfs/scrub.c
+@@ -1369,8 +1369,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
+        * Slice is divided into intervals when the IO is submitted, adjust by
+        * bwlimit and maximum of 64 intervals.
+        */
+-      div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+-      div = min_t(u32, 64, div);
++      div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);
+       /* Start new epoch, set deadline */
+       now = ktime_get();
+-- 
+2.51.0
+
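A short stand-alone C illustration of the overflow concern mentioned above (the helper macro below only approximates the kernel's clamp() semantics and evaluates its arguments more than once; it is not the kernel implementation): clamping the u64 quotient in the wide type yields the expected 64 for an absurdly large bwlimit, while a clamp_t(u32, ...)-style truncation of the quotient first would wrap it to a tiny value before the clamp is applied.

#include <stdio.h>
#include <stdint.h>

/* Rough user-space approximation of clamp(), for illustration only. */
#define clamp_u64(val, lo, hi) \
	((val) < (lo) ? (lo) : ((val) > (hi) ? (hi) : (val)))

static uint32_t div_with_clamp(uint64_t bwlimit)
{
	/* What the patch does: clamp in the wide type, then the result fits in u32. */
	return (uint32_t)clamp_u64(bwlimit / (16 * 1024 * 1024), 1, 64);
}

static uint32_t div_with_u32_cast(uint64_t bwlimit)
{
	/* clamp_t(u32, ...)-style: the quotient is truncated to u32 first. */
	uint32_t q = (uint32_t)(bwlimit / (16 * 1024 * 1024));

	return clamp_u64(q, 1, 64);
}

int main(void)
{
	/* A realistic limit (1 GiB/s) and an absurdly large one (~2^60 B/s). */
	uint64_t limits[] = { 1ULL << 30, 1ULL << 60 };

	for (int i = 0; i < 2; i++)
		printf("bwlimit=%llu  clamp()=%u  u32-cast=%u\n",
		       (unsigned long long)limits[i],
		       div_with_clamp(limits[i]),
		       div_with_u32_cast(limits[i]));
	return 0;
}
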
diff --git a/queue-6.17/btrfs-tree-checker-add-inode-extref-checks.patch b/queue-6.17/btrfs-tree-checker-add-inode-extref-checks.patch
new file mode 100644 (file)
index 0000000..1678a11
--- /dev/null
@@ -0,0 +1,90 @@
+From 5613ea5ed3366b504037789c8bb8cebb30a3524f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 16 Sep 2025 08:34:05 +0930
+Subject: btrfs: tree-checker: add inode extref checks
+
+From: Qu Wenruo <wqu@suse.com>
+
+[ Upstream commit aab9458b9f0019e97fae394c2d6d9d1a03addfb3 ]
+
+Like inode refs, inode extrefs have a variable length name, which means
+we have to do a proper check to make sure neither the header nor the name
+can exceed the item limits.
+
+The check itself is very similar to check_inode_ref(), just a different
+structure (btrfs_inode_extref vs btrfs_inode_ref).
+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-checker.c | 37 +++++++++++++++++++++++++++++++++++++
+ 1 file changed, 37 insertions(+)
+
+diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
+index a997c7cc35a26..a83e455f813bf 100644
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf,
+       /* Only these key->types needs to be checked */
+       ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
+              key->type == BTRFS_INODE_REF_KEY ||
++             key->type == BTRFS_INODE_EXTREF_KEY ||
+              key->type == BTRFS_DIR_INDEX_KEY ||
+              key->type == BTRFS_DIR_ITEM_KEY ||
+              key->type == BTRFS_EXTENT_DATA_KEY);
+@@ -1782,6 +1783,39 @@ static int check_inode_ref(struct extent_buffer *leaf,
+       return 0;
+ }
++static int check_inode_extref(struct extent_buffer *leaf,
++                            struct btrfs_key *key, struct btrfs_key *prev_key,
++                            int slot)
++{
++      unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
++      unsigned long end = ptr + btrfs_item_size(leaf, slot);
++
++      if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
++              return -EUCLEAN;
++
++      while (ptr < end) {
++              struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr;
++              u16 namelen;
++
++              if (unlikely(ptr + sizeof(*extref) > end)) {
++                      inode_ref_err(leaf, slot,
++                      "inode extref overflow, ptr %lu end %lu inode_extref size %zu",
++                                    ptr, end, sizeof(*extref));
++                      return -EUCLEAN;
++              }
++
++              namelen = btrfs_inode_extref_name_len(leaf, extref);
++              if (unlikely(ptr + sizeof(*extref) + namelen > end)) {
++                      inode_ref_err(leaf, slot,
++                              "inode extref overflow, ptr %lu end %lu namelen %u",
++                              ptr, end, namelen);
++                      return -EUCLEAN;
++              }
++              ptr += sizeof(*extref) + namelen;
++      }
++      return 0;
++}
++
+ static int check_raid_stripe_extent(const struct extent_buffer *leaf,
+                                   const struct btrfs_key *key, int slot)
+ {
+@@ -1893,6 +1927,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
+       case BTRFS_INODE_REF_KEY:
+               ret = check_inode_ref(leaf, key, prev_key, slot);
+               break;
++      case BTRFS_INODE_EXTREF_KEY:
++              ret = check_inode_extref(leaf, key, prev_key, slot);
++              break;
+       case BTRFS_BLOCK_GROUP_ITEM_KEY:
+               ret = check_block_group_item(leaf, key, slot);
+               break;
+-- 
+2.51.0
+
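As a general illustration of the bounds-checking shape that check_inode_extref() follows, here is a self-contained user-space C sketch over a buffer of variable-length records (the record layout is invented; only the "check the fixed header, then the header plus the name, before advancing" structure mirrors the patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HDR_SIZE sizeof(uint16_t)	/* invented fixed header: just a name length */

/*
 * Walk a buffer of variable-length records, rejecting any record whose
 * header or name would cross the end of the item.
 */
static bool check_records(const uint8_t *item, size_t end)
{
	size_t ptr = 0;

	while (ptr < end) {
		uint16_t name_len;

		/* The fixed header itself must not cross the item boundary. */
		if (ptr + HDR_SIZE > end) {
			fprintf(stderr, "header overflow: ptr %zu end %zu\n", ptr, end);
			return false;
		}
		memcpy(&name_len, item + ptr, HDR_SIZE);

		/* The header plus the variable-length name must fit as well. */
		if (ptr + HDR_SIZE + name_len > end) {
			fprintf(stderr, "name overflow: ptr %zu end %zu name_len %u\n",
				ptr, end, (unsigned)name_len);
			return false;
		}
		ptr += HDR_SIZE + name_len;
	}
	return true;
}

int main(void)
{
	uint8_t good[7], bad[4];
	uint16_t len;

	len = 3; memcpy(good, &len, 2); memcpy(good + 2, "foo", 3);
	len = 0; memcpy(good + 5, &len, 2);	/* second record: empty name */

	len = 200; memcpy(bad, &len, 2); memcpy(bad + 2, "xy", 2);	/* lies about its size */

	printf("good item: %s\n", check_records(good, sizeof(good)) ? "ok" : "corrupt");
	printf("bad item:  %s\n", check_records(bad, sizeof(bad)) ? "ok" : "corrupt");
	return 0;
}
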
diff --git a/queue-6.17/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch b/queue-6.17/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch
new file mode 100644 (file)
index 0000000..8340925
--- /dev/null
@@ -0,0 +1,50 @@
+From 17181f1cd33cfcd7024c3d0606e424d27ff2a1fe Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 28 Aug 2025 17:46:18 +0100
+Subject: btrfs: use level argument in log tree walk callback
+ replay_one_buffer()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 6cb7f0b8c9b0d6a35682335fea88bd26f089306f ]
+
+We already have the extent buffer's level as an argument, so there's no need
+to first ensure the extent buffer's data is loaded (by calling
+btrfs_read_extent_buffer()) and then call btrfs_header_level() to check
+the level. So use the level argument and do the check before calling
+btrfs_read_extent_buffer().
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-log.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 518cd74191e77..4f92aa15d9b1d 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -2461,15 +2461,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+       int i;
+       int ret;
++      if (level != 0)
++              return 0;
++
+       ret = btrfs_read_extent_buffer(eb, &check);
+       if (ret)
+               return ret;
+-      level = btrfs_header_level(eb);
+-
+-      if (level != 0)
+-              return 0;
+-
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+-- 
+2.51.0
+
diff --git a/queue-6.17/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-6.17/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
new file mode 100644 (file)
index 0000000..8cd5320
--- /dev/null
@@ -0,0 +1,58 @@
+From 2315af132a33b20e24b3a740bfc56993b3f29be5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Sep 2025 12:09:14 +0100
+Subject: btrfs: use smp_mb__after_atomic() when forcing COW in
+ create_pending_snapshot()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ]
+
+After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a
+full write barrier, smp_wmb(), but we don't need to; all we need is
+smp_mb__after_atomic().  The use of smp_wmb() is from the old days
+when we didn't use a bit and instead used an int field in the root to
+signal if cow is forced. After the int field was changed to a bit in
+the root's state (flags field), we forgot to update the memory barrier
+in create_pending_snapshot() to smp_mb__after_atomic(), but we did the
+change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That
+happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer
+data type for the some variants in btrfs_root"). On the reader side, in
+should_cow_block(), we also use the counterpart smp_mb__before_atomic()
+which generates further confusion.
+
+So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't
+even need any barrier at all since create_pending_snapshot() is called
+in the critical section of a transaction commit and therefore no one
+can concurrently join/attach the transaction, or start a new one, until
+the transaction is unblocked. By the time someone starts a new transaction
+and enters should_cow_block(), a lot of implicit memory barriers already
+took place by having acquired several locks such as fs_info->trans_lock
+and extent buffer locks on the root node at least. Nevertheless, for
+consistency use smp_mb__after_atomic() after setting the force cow bit
+in create_pending_snapshot().
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/transaction.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
+index c5c0d9cf1a808..a4e486a600bed 100644
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1806,7 +1806,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+       }
+       /* see comments in should_cow_block() */
+       set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+-      smp_wmb();
++      smp_mb__after_atomic();
+       btrfs_set_root_node(new_root_item, tmp);
+       /* record when the snapshot was created in key.offset */
+-- 
+2.51.0
+
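For reference, the pairing described in the message above looks roughly like the sketch below. This is an illustration with simplified placeholder names (FORCE_COW_BIT, root_state, force_cow()), not the btrfs code itself; only set_bit()/test_bit() and the smp_mb__{after,before}_atomic() calls mirror the pattern being discussed.

    /* writer side: publish the flag, then order it against later accesses */
    set_bit(FORCE_COW_BIT, &root_state);
    smp_mb__after_atomic();

    /* reader side: pair with the writer's barrier before testing the flag */
    smp_mb__before_atomic();
    if (test_bit(FORCE_COW_BIT, &root_state))
            force_cow();    /* placeholder for the COW decision */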
diff --git a/queue-6.17/btrfs-zoned-refine-extent-allocator-hint-selection.patch b/queue-6.17/btrfs-zoned-refine-extent-allocator-hint-selection.patch
new file mode 100644 (file)
index 0000000..4d5bbdd
--- /dev/null
@@ -0,0 +1,59 @@
+From e0264b290f1d5792d4664a0fe27c898716f36a81 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Jul 2025 11:13:15 +0900
+Subject: btrfs: zoned: refine extent allocator hint selection
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 0d703963d297964451783e1a0688ebdf74cd6151 ]
+
+The hint block group selection in the extent allocator is wrong in the
+first place, as it can select the dedicated data relocation block group for
+the normal data allocation.
+
+Since we separated the normal data space_info and the data relocation
+space_info, we can easily identify whether a block group is for data relocation or
+not. Do not choose it for the normal data allocation.
+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent-tree.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index 97d517cdf2df7..682d21a73a67a 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -4297,7 +4297,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+ }
+ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+-                                  struct find_free_extent_ctl *ffe_ctl)
++                                  struct find_free_extent_ctl *ffe_ctl,
++                                  struct btrfs_space_info *space_info)
+ {
+       if (ffe_ctl->for_treelog) {
+               spin_lock(&fs_info->treelog_bg_lock);
+@@ -4321,6 +4322,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+                       u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+                       if (block_group_bits(block_group, ffe_ctl->flags) &&
++                          block_group->space_info == space_info &&
+                           avail >= ffe_ctl->num_bytes) {
+                               ffe_ctl->hint_byte = block_group->start;
+                               break;
+@@ -4342,7 +4344,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
+               return prepare_allocation_clustered(fs_info, ffe_ctl,
+                                                   space_info, ins);
+       case BTRFS_EXTENT_ALLOC_ZONED:
+-              return prepare_allocation_zoned(fs_info, ffe_ctl);
++              return prepare_allocation_zoned(fs_info, ffe_ctl, space_info);
+       default:
+               BUG();
+       }
+-- 
+2.51.0
+
diff --git a/queue-6.17/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch b/queue-6.17/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch
new file mode 100644 (file)
index 0000000..970ceb0
--- /dev/null
@@ -0,0 +1,111 @@
+From 4230345a12b197f63729c55c765d44f98c2ca78d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 22 Jul 2025 13:39:11 +0200
+Subject: btrfs: zoned: return error from btrfs_zone_finish_endio()
+
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+
+[ Upstream commit 3c44cd3c79fcb38a86836dea6ff8fec322a9e68c ]
+
+Now that btrfs_zone_finish_endio_workfn() is directly calling
+do_zone_finish() the only caller of btrfs_zone_finish_endio() is
+btrfs_finish_one_ordered().
+
+btrfs_finish_one_ordered() already has error handling in-place so
+btrfs_zone_finish_endio() can return an error if the block group lookup
+fails.
+
+Also as btrfs_zone_finish_endio() already checks for zoned filesystems and
+returns early, there's no need to do this in the caller.
+
+Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/inode.c | 7 ++++---
+ fs/btrfs/zoned.c | 8 +++++---
+ fs/btrfs/zoned.h | 9 ++++++---
+ 3 files changed, 15 insertions(+), 9 deletions(-)
+
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 4031cbdea0740..41da405181b4f 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3107,9 +3107,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
+               goto out;
+       }
+-      if (btrfs_is_zoned(fs_info))
+-              btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+-                                      ordered_extent->disk_num_bytes);
++      ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
++                                    ordered_extent->disk_num_bytes);
++      if (ret)
++              goto out;
+       if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
+               truncated = true;
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index 87c5dd3ad016e..fcdf7b058a584 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -2464,16 +2464,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
+       return ret;
+ }
+-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+ {
+       struct btrfs_block_group *block_group;
+       u64 min_alloc_bytes;
+       if (!btrfs_is_zoned(fs_info))
+-              return;
++              return 0;
+       block_group = btrfs_lookup_block_group(fs_info, logical);
+-      ASSERT(block_group);
++      if (WARN_ON_ONCE(!block_group))
++              return -ENOENT;
+       /* No MIXED_BG on zoned btrfs. */
+       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+@@ -2490,6 +2491,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
+ out:
+       btrfs_put_block_group(block_group);
++      return 0;
+ }
+ static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
+index 6e11533b8e14c..17c5656580dd9 100644
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+ bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+ int btrfs_zone_finish(struct btrfs_block_group *block_group);
+ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
+-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+                            u64 length);
+ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+                                  struct extent_buffer *eb);
+@@ -234,8 +234,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+       return true;
+ }
+-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+-                                         u64 logical, u64 length) { }
++static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
++                                         u64 logical, u64 length)
++{
++      return 0;
++}
+ static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+                                                struct extent_buffer *eb) { }
+-- 
+2.51.0
+
diff --git a/queue-6.17/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch b/queue-6.17/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch
new file mode 100644 (file)
index 0000000..9bbca42
--- /dev/null
@@ -0,0 +1,47 @@
+From 1a27b5e454cf61ce28e3b82e30c4ca2f682381f8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Sep 2025 01:12:27 +0000
+Subject: cpuset: Use new excpus for nocpu error check when enabling root
+ partition
+
+From: Chen Ridong <chenridong@huawei.com>
+
+[ Upstream commit 59d5de3655698679ad8fd2cc82228de4679c4263 ]
+
+A previous patch fixed a bug where new_prs should be assigned before
+checking housekeeping conflicts. This patch addresses another potential
+issue: the nocpu error check currently uses the xcpus value, which is not updated.
+Although no issue has been observed so far, the check should be performed
+using the new effective exclusive cpus.
+
+The comment has been removed because the function returns an error if
+nocpu checking fails, which is unrelated to the parent.
+
+Signed-off-by: Chen Ridong <chenridong@huawei.com>
+Reviewed-by: Waiman Long <longman@redhat.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cpuset.c | 6 +-----
+ 1 file changed, 1 insertion(+), 5 deletions(-)
+
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index fef93032fe7e4..fd890b34a8403 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -1728,11 +1728,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
+               if (prstate_housekeeping_conflict(new_prs, xcpus))
+                       return PERR_HKEEPING;
+-              /*
+-               * A parent can be left with no CPU as long as there is no
+-               * task directly associated with the parent partition.
+-               */
+-              if (nocpu)
++              if (tasks_nocpu_error(parent, cs, xcpus))
+                       return PERR_NOCPUS;
+               /*
+-- 
+2.51.0
+
diff --git a/queue-6.17/edac-fix-wrong-executable-file-modes-for-c-source-fi.patch b/queue-6.17/edac-fix-wrong-executable-file-modes-for-c-source-fi.patch
new file mode 100644 (file)
index 0000000..0446052
--- /dev/null
@@ -0,0 +1,43 @@
+From d652aa2ed5235fd64ae767808908b000818e4502 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 29 Aug 2025 03:19:54 +0800
+Subject: EDAC: Fix wrong executable file modes for C source files
+
+From: Kuan-Wei Chiu <visitorckw@gmail.com>
+
+[ Upstream commit 71965cae7db394ff5ba3b2d2befe4e136ceec268 ]
+
+Three EDAC source files were mistakenly marked as executable when adding the
+EDAC scrub controls.
+
+These are plain C source files and should not carry the executable bit.
+Correcting their modes follows the principle of least privilege and avoids
+unnecessary execute permissions in the repository.
+
+  [ bp: Massage commit message. ]
+
+Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250828191954.903125-1-visitorckw@gmail.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/edac/ecs.c        | 0
+ drivers/edac/mem_repair.c | 0
+ drivers/edac/scrub.c      | 0
+ 3 files changed, 0 insertions(+), 0 deletions(-)
+ mode change 100755 => 100644 drivers/edac/ecs.c
+ mode change 100755 => 100644 drivers/edac/mem_repair.c
+ mode change 100755 => 100644 drivers/edac/scrub.c
+
+diff --git a/drivers/edac/ecs.c b/drivers/edac/ecs.c
+old mode 100755
+new mode 100644
+diff --git a/drivers/edac/mem_repair.c b/drivers/edac/mem_repair.c
+old mode 100755
+new mode 100644
+diff --git a/drivers/edac/scrub.c b/drivers/edac/scrub.c
+old mode 100755
+new mode 100644
+-- 
+2.51.0
+
diff --git a/queue-6.17/edac-ie31200-add-two-more-intel-alder-lake-s-socs-fo.patch b/queue-6.17/edac-ie31200-add-two-more-intel-alder-lake-s-socs-fo.patch
new file mode 100644 (file)
index 0000000..f587a96
--- /dev/null
@@ -0,0 +1,55 @@
+From 90175ae118a68e96eeb97a03511afe3d8cccbee3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 19 Aug 2025 09:17:39 -0700
+Subject: EDAC/ie31200: Add two more Intel Alder Lake-S SoCs for EDAC support
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Kyle Manna <kyle@kylemanna.com>
+
+[ Upstream commit 71b69f817e91b588030d7d47ddbdc4857a92eb4e ]
+
+Host Device IDs (DID0) correspond to:
+* Intel Core i7-12700K
+* Intel Core i5-12600K
+
+See documentation:
+* 12th Generation Intel® Core™ Processors Datasheet
+    * Volume 1 of 2, Doc. No.: 655258, Rev.: 011
+    * https://edc.intel.com/output/DownloadPdfDocument?id=8297 (PDF)
+
+Signed-off-by: Kyle Manna <kyle@kylemanna.com>
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
+Link: https://lore.kernel.org/r/20250819161739.3241152-1-kyle@kylemanna.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/edac/ie31200_edac.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
+index 5c1fa1c0d12e3..5a080ab65476d 100644
+--- a/drivers/edac/ie31200_edac.c
++++ b/drivers/edac/ie31200_edac.c
+@@ -99,6 +99,8 @@
+ /* Alder Lake-S */
+ #define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1   0x4660
++#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2   0x4668  /* 8P+4E, e.g. i7-12700K */
++#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3   0x4648  /* 6P+4E, e.g. i5-12600K */
+ /* Bartlett Lake-S */
+ #define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1   0x4639
+@@ -761,6 +763,8 @@ static const struct pci_device_id ie31200_pci_tbl[] = {
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_6), (kernel_ulong_t)&rpl_s_cfg},
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_HX_1), (kernel_ulong_t)&rpl_s_cfg},
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1), (kernel_ulong_t)&rpl_s_cfg},
++      { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2), (kernel_ulong_t)&rpl_s_cfg},
++      { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3), (kernel_ulong_t)&rpl_s_cfg},
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1), (kernel_ulong_t)&rpl_s_cfg},
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_2), (kernel_ulong_t)&rpl_s_cfg},
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_3), (kernel_ulong_t)&rpl_s_cfg},
+-- 
+2.51.0
+
diff --git a/queue-6.17/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch b/queue-6.17/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch
new file mode 100644 (file)
index 0000000..aa682b9
--- /dev/null
@@ -0,0 +1,89 @@
+From f47a7852e3030b5b0c360943fd302ba833f9999f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 16 Sep 2025 20:30:17 +0000
+Subject: EDAC/mc_sysfs: Increase legacy channel support to 16
+
+From: Avadhut Naik <avadhut.naik@amd.com>
+
+[ Upstream commit 6e1c2c6c2c40ce99e0d2633b212f43c702c1a002 ]
+
+Newer AMD systems can support up to 16 channels per EDAC "mc" device.
+These are detected by the EDAC module running on the device, and the
+current EDAC interface is appropriately enumerated.
+
+The legacy EDAC sysfs interface however, provides device attributes for
+channels 0 through 11 only. Consequently, the last four channels, 12
+through 15, will not be enumerated and will not be visible through the
+legacy sysfs interface.
+
+Add additional device attributes to ensure that all 16 channels, if
+present, are enumerated by and visible through the legacy EDAC sysfs
+interface.
+
+Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250916203242.1281036-1-avadhut.naik@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/edac/edac_mc_sysfs.c | 24 ++++++++++++++++++++++++
+ 1 file changed, 24 insertions(+)
+
+diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
+index 0f338adf7d937..8689631f19053 100644
+--- a/drivers/edac/edac_mc_sysfs.c
++++ b/drivers/edac/edac_mc_sysfs.c
+@@ -305,6 +305,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR,
+       channel_dimm_label_show, channel_dimm_label_store, 10);
+ DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR,
+       channel_dimm_label_show, channel_dimm_label_store, 11);
++DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 12);
++DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 13);
++DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 14);
++DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 15);
+ /* Total possible dynamic DIMM Label attribute file table */
+ static struct attribute *dynamic_csrow_dimm_attr[] = {
+@@ -320,6 +328,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = {
+       &dev_attr_legacy_ch9_dimm_label.attr.attr,
+       &dev_attr_legacy_ch10_dimm_label.attr.attr,
+       &dev_attr_legacy_ch11_dimm_label.attr.attr,
++      &dev_attr_legacy_ch12_dimm_label.attr.attr,
++      &dev_attr_legacy_ch13_dimm_label.attr.attr,
++      &dev_attr_legacy_ch14_dimm_label.attr.attr,
++      &dev_attr_legacy_ch15_dimm_label.attr.attr,
+       NULL
+ };
+@@ -348,6 +360,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO,
+                  channel_ce_count_show, NULL, 10);
+ DEVICE_CHANNEL(ch11_ce_count, S_IRUGO,
+                  channel_ce_count_show, NULL, 11);
++DEVICE_CHANNEL(ch12_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 12);
++DEVICE_CHANNEL(ch13_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 13);
++DEVICE_CHANNEL(ch14_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 14);
++DEVICE_CHANNEL(ch15_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 15);
+ /* Total possible dynamic ce_count attribute file table */
+ static struct attribute *dynamic_csrow_ce_count_attr[] = {
+@@ -363,6 +383,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = {
+       &dev_attr_legacy_ch9_ce_count.attr.attr,
+       &dev_attr_legacy_ch10_ce_count.attr.attr,
+       &dev_attr_legacy_ch11_ce_count.attr.attr,
++      &dev_attr_legacy_ch12_ce_count.attr.attr,
++      &dev_attr_legacy_ch13_ce_count.attr.attr,
++      &dev_attr_legacy_ch14_ce_count.attr.attr,
++      &dev_attr_legacy_ch15_ce_count.attr.attr,
+       NULL
+ };
+-- 
+2.51.0
+
diff --git a/queue-6.17/genirq-chip-add-buslock-back-in-to-irq_set_handler.patch b/queue-6.17/genirq-chip-add-buslock-back-in-to-irq_set_handler.patch
new file mode 100644 (file)
index 0000000..a0ddbc7
--- /dev/null
@@ -0,0 +1,38 @@
+From 7dc7cb31fbaf707d2d6237c28bc7a3e55e13048d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 23 Oct 2025 16:48:59 +0100
+Subject: genirq/chip: Add buslock back in to irq_set_handler()
+
+From: Charles Keepax <ckeepax@opensource.cirrus.com>
+
+[ Upstream commit 5d7e45dd670e42df4836afeaa9baf9d41ca4b434 ]
+
+The locking was changed from a buslock to a plain lock, but the patch
+description states there was no functional change. Assuming this was
+accidental, revert to using the buslock.
+
+Fixes: 5cd05f3e2315 ("genirq/chip: Rework irq_set_handler() variants")
+Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://patch.msgid.link/20251023154901.1333755-2-ckeepax@opensource.cirrus.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/irq/chip.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
+index 3ffa0d80ddd19..d1917b28761a3 100644
+--- a/kernel/irq/chip.c
++++ b/kernel/irq/chip.c
+@@ -1030,7 +1030,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
+ void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+                      const char *name)
+ {
+-      scoped_irqdesc_get_and_lock(irq, 0)
++      scoped_irqdesc_get_and_buslock(irq, 0)
+               __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name);
+ }
+ EXPORT_SYMBOL_GPL(__irq_set_handler);
+-- 
+2.51.0
+
diff --git a/queue-6.17/genirq-manage-add-buslock-back-in-to-__disable_irq_n.patch b/queue-6.17/genirq-manage-add-buslock-back-in-to-__disable_irq_n.patch
new file mode 100644 (file)
index 0000000..f916a50
--- /dev/null
@@ -0,0 +1,38 @@
+From 34c98b6e10f180a7abd2fbcca68ad9546c6625e5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 23 Oct 2025 16:49:00 +0100
+Subject: genirq/manage: Add buslock back in to __disable_irq_nosync()
+
+From: Charles Keepax <ckeepax@opensource.cirrus.com>
+
+[ Upstream commit 56363e25f79fe83e63039c5595b8cd9814173d37 ]
+
+The locking was changed from a buslock to a plain lock, but the patch
+description states there was no functional change. Assuming this was
+accidental, revert to using the buslock.
+
+Fixes: 1b7444446724 ("genirq/manage: Rework __disable_irq_nosync()")
+Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://patch.msgid.link/20251023154901.1333755-3-ckeepax@opensource.cirrus.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/irq/manage.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
+index c94837382037e..7d68fb5dc2428 100644
+--- a/kernel/irq/manage.c
++++ b/kernel/irq/manage.c
+@@ -659,7 +659,7 @@ void __disable_irq(struct irq_desc *desc)
+ static int __disable_irq_nosync(unsigned int irq)
+ {
+-      scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
++      scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+               __disable_irq(scoped_irqdesc);
+               return 0;
+       }
+-- 
+2.51.0
+
diff --git a/queue-6.17/genirq-manage-add-buslock-back-in-to-enable_irq.patch b/queue-6.17/genirq-manage-add-buslock-back-in-to-enable_irq.patch
new file mode 100644 (file)
index 0000000..e3a70ca
--- /dev/null
@@ -0,0 +1,38 @@
+From eefaa63d07aca4d44e91486f0a43039238559741 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 23 Oct 2025 16:49:01 +0100
+Subject: genirq/manage: Add buslock back in to enable_irq()
+
+From: Charles Keepax <ckeepax@opensource.cirrus.com>
+
+[ Upstream commit ef3330b99c01bda53f2a189b58bed8f6b7397f28 ]
+
+The locking was changed from a buslock to a plain lock, but the patch
+description states there was no functional change. Assuming this was
+accidental, revert to using the buslock.
+
+Fixes: bddd10c55407 ("genirq/manage: Rework enable_irq()")
+Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://patch.msgid.link/20251023154901.1333755-4-ckeepax@opensource.cirrus.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/irq/manage.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
+index 7d68fb5dc2428..400856abf6721 100644
+--- a/kernel/irq/manage.c
++++ b/kernel/irq/manage.c
+@@ -789,7 +789,7 @@ void __enable_irq(struct irq_desc *desc)
+  */
+ void enable_irq(unsigned int irq)
+ {
+-      scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
++      scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+               struct irq_desc *desc = scoped_irqdesc;
+               if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq))
+-- 
+2.51.0
+
diff --git a/queue-6.17/perf-have-get_perf_callchain-return-null-if-crosstas.patch b/queue-6.17/perf-have-get_perf_callchain-return-null-if-crosstas.patch
new file mode 100644 (file)
index 0000000..cd6edd3
--- /dev/null
@@ -0,0 +1,68 @@
+From 82f22876b00c320ed9c7d964eeffcd4e786655ad Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 14:03:40 -0400
+Subject: perf: Have get_perf_callchain() return NULL if crosstask and user are
+ set
+
+From: Josh Poimboeuf <jpoimboe@kernel.org>
+
+[ Upstream commit 153f9e74dec230f2e070e16fa061bc7adfd2c450 ]
+
+get_perf_callchain() doesn't support cross-task unwinding for user space
+stacks, so have it return NULL if both the crosstask and user arguments are
+set.
+
+Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20250820180428.426423415@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/callchain.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
+index decff7266cfbd..2609998ca07f1 100644
+--- a/kernel/events/callchain.c
++++ b/kernel/events/callchain.c
+@@ -224,6 +224,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+       struct perf_callchain_entry_ctx ctx;
+       int rctx, start_entry_idx;
++      /* crosstask is not supported for user stacks */
++      if (crosstask && user && !kernel)
++              return NULL;
++
+       entry = get_callchain_entry(&rctx);
+       if (!entry)
+               return NULL;
+@@ -240,7 +244,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               perf_callchain_kernel(&ctx, regs);
+       }
+-      if (user) {
++      if (user && !crosstask) {
+               if (!user_mode(regs)) {
+                       if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+                               regs = NULL;
+@@ -249,9 +253,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               }
+               if (regs) {
+-                      if (crosstask)
+-                              goto exit_put;
+-
+                       if (add_mark)
+                               perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+@@ -261,7 +262,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               }
+       }
+-exit_put:
+       put_callchain_entry(rctx);
+       return entry;
+-- 
+2.51.0
+
diff --git a/queue-6.17/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch b/queue-6.17/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch
new file mode 100644 (file)
index 0000000..70b4b11
--- /dev/null
@@ -0,0 +1,37 @@
+From 1642fd4e2d5f5e1dc02825acb53fa5f054b913fb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 14:03:43 -0400
+Subject: perf: Skip user unwind if the task is a kernel thread
+
+From: Josh Poimboeuf <jpoimboe@kernel.org>
+
+[ Upstream commit 16ed389227651330879e17bd83d43bd234006722 ]
+
+If the task is not a user thread, there's no user stack to unwind.
+
+Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20250820180428.930791978@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/core.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index a3dc79ec6f879..c0e938d28758f 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -8192,7 +8192,8 @@ struct perf_callchain_entry *
+ perf_callchain(struct perf_event *event, struct pt_regs *regs)
+ {
+       bool kernel = !event->attr.exclude_callchain_kernel;
+-      bool user   = !event->attr.exclude_callchain_user;
++      bool user   = !event->attr.exclude_callchain_user &&
++              !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
+       /* Disallow cross-task user callchains. */
+       bool crosstask = event->ctx->task && event->ctx->task != current;
+       const u32 max_stack = event->attr.sample_max_stack;
+-- 
+2.51.0
+
diff --git a/queue-6.17/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch b/queue-6.17/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch
new file mode 100644 (file)
index 0000000..649a998
--- /dev/null
@@ -0,0 +1,67 @@
+From 8a38e567c4058e466e5e1b0823fe7b9c902ff337 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 14:03:41 -0400
+Subject: perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of
+ current->mm == NULL
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+[ Upstream commit 90942f9fac05702065ff82ed0bade0d08168d4ea ]
+
+To determine if a task is a kernel thread or not, it is more reliable to
+use (current->flags & (PF_KTHREAD|PF_USER_WORKER)) than to rely on
+current->mm being NULL.  That is because some kernel tasks (io_uring
+helpers) may have a mm field.
+
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20250820180428.592367294@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/callchain.c | 6 +++---
+ kernel/events/core.c      | 4 ++--
+ 2 files changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
+index 6c83ad674d010..decff7266cfbd 100644
+--- a/kernel/events/callchain.c
++++ b/kernel/events/callchain.c
+@@ -242,10 +242,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+       if (user) {
+               if (!user_mode(regs)) {
+-                      if  (current->mm)
+-                              regs = task_pt_regs(current);
+-                      else
++                      if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+                               regs = NULL;
++                      else
++                              regs = task_pt_regs(current);
+               }
+               if (regs) {
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index 6e9427c4aaff7..a3dc79ec6f879 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -7440,7 +7440,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
+       if (user_mode(regs)) {
+               regs_user->abi = perf_reg_abi(current);
+               regs_user->regs = regs;
+-      } else if (!(current->flags & PF_KTHREAD)) {
++      } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+               perf_get_regs_user(regs_user, regs);
+       } else {
+               regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+@@ -8080,7 +8080,7 @@ static u64 perf_virt_to_phys(u64 virt)
+                * Try IRQ-safe get_user_page_fast_only first.
+                * If failed, leave phys_addr as 0.
+                */
+-              if (current->mm != NULL) {
++              if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+                       struct page *p;
+                       pagefault_disable();
+-- 
+2.51.0
+
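The distinction the message above draws can be summarized as a small predicate. The helper below is illustrative only (no_user_context() is not a name introduced by the patch): PF_KTHREAD covers kernel threads and PF_USER_WORKER covers user worker threads such as io_uring helpers, which may carry a non-NULL mm and would therefore slip past a current->mm == NULL test.

    /* Illustrative sketch, not part of the patch. */
    static inline bool no_user_context(struct task_struct *tsk)
    {
            /*
             * io_uring helper threads have PF_USER_WORKER set and may own
             * an mm, so checking tsk->mm == NULL would miss them; the flag
             * test catches both plain kernel threads and user workers.
             */
            return tsk->flags & (PF_KTHREAD | PF_USER_WORKER);
    }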
diff --git a/queue-6.17/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch b/queue-6.17/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch
new file mode 100644 (file)
index 0000000..d740688
--- /dev/null
@@ -0,0 +1,101 @@
+From ca9f460d3f2e517fd9f873da6a0d8f17baef1972 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 10:30:31 +0800
+Subject: perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into
+ INTEL_FIXED_BITS_MASK
+
+From: Dapeng Mi <dapeng1.mi@linux.intel.com>
+
+[ Upstream commit 2676dbf9f4fb7f6739d1207c0f1deaf63124642a ]
+
+ICL_FIXED_0_ADAPTIVE is missing from INTEL_FIXED_BITS_MASK, so add it.
+
+With the help of this new INTEL_FIXED_BITS_MASK, intel_pmu_enable_fixed()
+can be optimized. The old fixed counter control bits can be unconditionally
+cleared with INTEL_FIXED_BITS_MASK and the new control bits then set based
+on the new configuration.
+
+Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
+Tested-by: Yi Lai <yi1.lai@intel.com>
+Link: https://lore.kernel.org/r/20250820023032.17128-7-dapeng1.mi@linux.intel.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/events/intel/core.c      | 10 +++-------
+ arch/x86/include/asm/perf_event.h |  6 +++++-
+ arch/x86/kvm/pmu.h                |  2 +-
+ 3 files changed, 9 insertions(+), 9 deletions(-)
+
+diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
+index 15da60cf69f20..046d12281fd94 100644
+--- a/arch/x86/events/intel/core.c
++++ b/arch/x86/events/intel/core.c
+@@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
+ {
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+-      u64 mask, bits = 0;
+       int idx = hwc->idx;
++      u64 bits = 0;
+       if (is_topdown_idx(idx)) {
+               struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+@@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
+       idx -= INTEL_PMC_IDX_FIXED;
+       bits = intel_fixed_bits_by_idx(idx, bits);
+-      mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
+-
+-      if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
++      if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip)
+               bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
+-              mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
+-      }
+-      cpuc->fixed_ctrl_val &= ~mask;
++      cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
+       cpuc->fixed_ctrl_val |= bits;
+ }
+diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
+index 70d1d94aca7e6..ee943bd1595af 100644
+--- a/arch/x86/include/asm/perf_event.h
++++ b/arch/x86/include/asm/perf_event.h
+@@ -35,7 +35,6 @@
+ #define ARCH_PERFMON_EVENTSEL_EQ                      (1ULL << 36)
+ #define ARCH_PERFMON_EVENTSEL_UMASK2                  (0xFFULL << 40)
+-#define INTEL_FIXED_BITS_MASK                         0xFULL
+ #define INTEL_FIXED_BITS_STRIDE                       4
+ #define INTEL_FIXED_0_KERNEL                          (1ULL << 0)
+ #define INTEL_FIXED_0_USER                            (1ULL << 1)
+@@ -48,6 +47,11 @@
+ #define ICL_EVENTSEL_ADAPTIVE                         (1ULL << 34)
+ #define ICL_FIXED_0_ADAPTIVE                          (1ULL << 32)
++#define INTEL_FIXED_BITS_MASK                                 \
++      (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER |            \
++       INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI |   \
++       ICL_FIXED_0_ADAPTIVE)
++
+ #define intel_fixed_bits_by_idx(_idx, _bits)                  \
+       ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))
+diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
+index ad89d0bd60058..103604c4b33b5 100644
+--- a/arch/x86/kvm/pmu.h
++++ b/arch/x86/kvm/pmu.h
+@@ -13,7 +13,7 @@
+ #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
+                                         MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
+-/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
++/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */
+ #define fixed_ctrl_field(ctrl_reg, idx) \
+       (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
+-- 
+2.51.0
+
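As a worked example of the new composite mask (the KERNEL, USER and ADAPTIVE bit positions are visible in the hunk above; the ANYTHREAD and ENABLE_PMI positions are assumed here to be bits 2 and 3):

    /*
     * Illustration only, not part of the patch:
     *
     *   INTEL_FIXED_0_KERNEL     = 1ULL << 0
     *   INTEL_FIXED_0_USER       = 1ULL << 1
     *   INTEL_FIXED_0_ANYTHREAD  = 1ULL << 2   (assumed)
     *   INTEL_FIXED_0_ENABLE_PMI = 1ULL << 3   (assumed)
     *   ICL_FIXED_0_ADAPTIVE     = 1ULL << 32
     *
     * old mask: 0xFULL                 -> covers only bits 0-3
     * new mask: 0xFULL | (1ULL << 32)  -> also covers the adaptive bit,
     * so a single intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK)
     * clears every control bit of fixed counter idx, adaptive included.
     */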
diff --git a/queue-6.17/sched-fair-update_cfs_group-for-throttled-cfs_rqs.patch b/queue-6.17/sched-fair-update_cfs_group-for-throttled-cfs_rqs.patch
new file mode 100644 (file)
index 0000000..318b3d5
--- /dev/null
@@ -0,0 +1,55 @@
+From cf5b7d7a98bfe5768f519b0840be34ee7ef9d389 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 10 Sep 2025 17:50:42 +0800
+Subject: sched/fair: update_cfs_group() for throttled cfs_rqs
+
+From: Aaron Lu <ziqianlu@bytedance.com>
+
+[ Upstream commit fcd394866e3db344cbe0bb485d7e3f741ac07245 ]
+
+With task based throttle model, tasks in a throttled hierarchy are
+allowed to continue to run if they are running in kernel mode. For this
+reason, PELT clock is not stopped for these cfs_rqs in throttled
+hierarchy when they still have tasks running or queued.
+
+Since the PELT clock is not stopped, whether to allow update_cfs_group()
+to do its job for cfs_rqs which are in a throttled hierarchy but still
+have tasks running/queued is a question.
+
+The good side is that continuing to run update_cfs_group() gives these
+cfs_rq entities an up2date weight, and that up2date weight can be
+useful to derive an accurate load for the CPU as well as ensure fairness
+if multiple tasks of different cgroups are running on the same CPU.
+OTOH, as Benjamin Segall pointed out: when unthrottle comes around, the most
+likely correct distribution is the distribution we had at the time of
+throttle.
+
+In reality, either way may not matter that much if tasks in throttled
+hierarchy don't run in kernel mode for too long. But in case that
+happens, letting these cfs_rq entities have an up2date weight seems a good
+thing to do.
+
+Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/fair.c | 3 ---
+ 1 file changed, 3 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 4770d25ae2406..3e0d999e5ee2c 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se)
+       if (!gcfs_rq || !gcfs_rq->load.weight)
+               return;
+-      if (throttled_hierarchy(gcfs_rq))
+-              return;
+-
+       shares = calc_group_shares(gcfs_rq);
+       if (unlikely(se->load.weight != shares))
+               reweight_entity(cfs_rq_of(se), se, shares);
+-- 
+2.51.0
+
diff --git a/queue-6.17/sched_ext-keep-bypass-on-between-enable-failure-and-.patch b/queue-6.17/sched_ext-keep-bypass-on-between-enable-failure-and-.patch
new file mode 100644 (file)
index 0000000..2f82f07
--- /dev/null
@@ -0,0 +1,48 @@
+From 89d634457fa0b1abe8647e67fdc54d9c13669cb9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Sep 2025 11:33:28 -1000
+Subject: sched_ext: Keep bypass on between enable failure and
+ scx_disable_workfn()
+
+From: Tejun Heo <tj@kernel.org>
+
+[ Upstream commit 4a1d9d73aabc8f97f48c4f84f936de3b265ffd6f ]
+
+scx_enable() turns on the bypass mode while enable is in progress. If
+enabling fails, it turns off the bypass mode and then triggers scx_error().
+scx_error() will trigger scx_disable_workfn() which will turn on the bypass
+mode again and unload the failed scheduler.
+
+This moves the system out of bypass mode between the enable error path and
+the disable path, which is unnecessary and can be brittle - e.g. the thread
+running scx_enable() may already be on the failed scheduler and can be
+switched out before it triggers scx_error() leading to a stall. The watchdog
+would eventually kick in, so the situation isn't critical but is still
+suboptimal.
+
+There is nothing to be gained by turning off the bypass mode between
+scx_enable() failure and scx_disable_workfn(). Keep bypass on.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Acked-by: Andrea Righi <arighi@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/ext.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index f89894476e51f..14724dae0b795 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -4763,7 +4763,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+ err_disable_unlock_all:
+       scx_cgroup_unlock();
+       percpu_up_write(&scx_fork_rwsem);
+-      scx_bypass(false);
++      /* we'll soon enter disable path, keep bypass on */
+ err_disable:
+       mutex_unlock(&scx_enable_mutex);
+       /*
+-- 
+2.51.0
+
diff --git a/queue-6.17/sched_ext-make-qmap-dump-operation-non-destructive.patch b/queue-6.17/sched_ext-make-qmap-dump-operation-non-destructive.patch
new file mode 100644 (file)
index 0000000..d0d414a
--- /dev/null
@@ -0,0 +1,70 @@
+From d32b2132f0dc2b791769a353ae1bfafc2a0df0e2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Sep 2025 09:03:26 -1000
+Subject: sched_ext: Make qmap dump operation non-destructive
+
+From: Tejun Heo <tj@kernel.org>
+
+[ Upstream commit d452972858e5cfa4262320ab74fe8f016460b96f ]
+
+The qmap dump operation was destructively consuming queue entries while
+displaying them. As dump can be triggered anytime, this can easily lead to
+stalls. Add a temporary dump_store queue and modify the dump logic to pop
+entries, display them, and then restore them back to the original queue.
+This allows dump operations to be performed without affecting the
+scheduler's queue state.
+
+Note that if racing against new enqueues during dump, ordering can get
+mixed up, but this is acceptable for debugging purposes.
+
+Acked-by: Andrea Righi <arighi@nvidia.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/sched_ext/scx_qmap.bpf.c | 18 +++++++++++++++++-
+ 1 file changed, 17 insertions(+), 1 deletion(-)
+
+diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
+index 69d877501cb72..cd50a94326e3a 100644
+--- a/tools/sched_ext/scx_qmap.bpf.c
++++ b/tools/sched_ext/scx_qmap.bpf.c
+@@ -56,7 +56,8 @@ struct qmap {
+   queue1 SEC(".maps"),
+   queue2 SEC(".maps"),
+   queue3 SEC(".maps"),
+-  queue4 SEC(".maps");
++  queue4 SEC(".maps"),
++  dump_store SEC(".maps");
+ struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+@@ -578,11 +579,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
+                       return;
+               scx_bpf_dump("QMAP FIFO[%d]:", i);
++
++              /*
++               * Dump can be invoked anytime and there is no way to iterate in
++               * a non-destructive way. Pop and store in dump_store and then
++               * restore afterwards. If racing against new enqueues, ordering
++               * can get mixed up.
++               */
+               bpf_repeat(4096) {
+                       if (bpf_map_pop_elem(fifo, &pid))
+                               break;
++                      bpf_map_push_elem(&dump_store, &pid, 0);
+                       scx_bpf_dump(" %d", pid);
+               }
++
++              bpf_repeat(4096) {
++                      if (bpf_map_pop_elem(&dump_store, &pid))
++                              break;
++                      bpf_map_push_elem(fifo, &pid, 0);
++              }
++
+               scx_bpf_dump("\n");
+       }
+ }
+-- 
+2.51.0
+
diff --git a/queue-6.17/sched_ext-move-internal-type-and-accessor-definition.patch b/queue-6.17/sched_ext-move-internal-type-and-accessor-definition.patch
new file mode 100644 (file)
index 0000000..410bc23
--- /dev/null
@@ -0,0 +1,2189 @@
+From 63f4bd85d580e08409e9128b0715e253a2e0697f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Sep 2025 11:33:28 -1000
+Subject: sched_ext: Move internal type and accessor definitions to
+ ext_internal.h
+
+From: Tejun Heo <tj@kernel.org>
+
+[ Upstream commit 0c2b8356e430229efef42b03bd765a2a7ecf73fd ]
+
+There currently isn't a place to put SCX-internal types and accessors to
+be shared between ext.c and ext_idle.c. Create kernel/sched/ext_internal.h
+and move internal type and accessor definitions there. This trims ext.c a
+bit and makes future additions easier. Pure code reorganization. No
+functional changes.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Acked-by: Andrea Righi <arighi@nvidia.com>
+Stable-dep-of: efeeaac9ae97 ("sched_ext: Sync error_irq_work before freeing scx_sched")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/build_policy.c |    1 +
+ kernel/sched/ext.c          | 1034 ----------------------------------
+ kernel/sched/ext.h          |   23 -
+ kernel/sched/ext_internal.h | 1061 +++++++++++++++++++++++++++++++++++
+ 4 files changed, 1062 insertions(+), 1057 deletions(-)
+ create mode 100644 kernel/sched/ext_internal.h
+
+diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
+index c4a488e67aa7d..755883faf7518 100644
+--- a/kernel/sched/build_policy.c
++++ b/kernel/sched/build_policy.c
+@@ -58,6 +58,7 @@
+ #include "deadline.c"
+ #ifdef CONFIG_SCHED_CLASS_EXT
++# include "ext_internal.h"
+ # include "ext.c"
+ # include "ext_idle.c"
+ #endif
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index 088ceff38c8a4..8ecde1abb4e28 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -9,1040 +9,6 @@
+ #include <linux/btf_ids.h>
+ #include "ext_idle.h"
+-#define SCX_OP_IDX(op)                (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+-
+-enum scx_consts {
+-      SCX_DSP_DFL_MAX_BATCH           = 32,
+-      SCX_DSP_MAX_LOOPS               = 32,
+-      SCX_WATCHDOG_MAX_TIMEOUT        = 30 * HZ,
+-
+-      SCX_EXIT_BT_LEN                 = 64,
+-      SCX_EXIT_MSG_LEN                = 1024,
+-      SCX_EXIT_DUMP_DFL_LEN           = 32768,
+-
+-      SCX_CPUPERF_ONE                 = SCHED_CAPACITY_SCALE,
+-
+-      /*
+-       * Iterating all tasks may take a while. Periodically drop
+-       * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+-       */
+-      SCX_TASK_ITER_BATCH             = 32,
+-};
+-
+-enum scx_exit_kind {
+-      SCX_EXIT_NONE,
+-      SCX_EXIT_DONE,
+-
+-      SCX_EXIT_UNREG = 64,    /* user-space initiated unregistration */
+-      SCX_EXIT_UNREG_BPF,     /* BPF-initiated unregistration */
+-      SCX_EXIT_UNREG_KERN,    /* kernel-initiated unregistration */
+-      SCX_EXIT_SYSRQ,         /* requested by 'S' sysrq */
+-
+-      SCX_EXIT_ERROR = 1024,  /* runtime error, error msg contains details */
+-      SCX_EXIT_ERROR_BPF,     /* ERROR but triggered through scx_bpf_error() */
+-      SCX_EXIT_ERROR_STALL,   /* watchdog detected stalled runnable tasks */
+-};
+-
+-/*
+- * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+- * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+- * are 64bit of the format:
+- *
+- *   Bits: [63  ..  48 47   ..  32 31 .. 0]
+- *         [ SYS ACT ] [ SYS RSN ] [ USR  ]
+- *
+- *   SYS ACT: System-defined exit actions
+- *   SYS RSN: System-defined exit reasons
+- *   USR    : User-defined exit codes and reasons
+- *
+- * Using the above, users may communicate intention and context by ORing system
+- * actions and/or system reasons with a user-defined exit code.
+- */
+-enum scx_exit_code {
+-      /* Reasons */
+-      SCX_ECODE_RSN_HOTPLUG   = 1LLU << 32,
+-
+-      /* Actions */
+-      SCX_ECODE_ACT_RESTART   = 1LLU << 48,
+-};
+-
+-/*
+- * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+- * being disabled.
+- */
+-struct scx_exit_info {
+-      /* %SCX_EXIT_* - broad category of the exit reason */
+-      enum scx_exit_kind      kind;
+-
+-      /* exit code if gracefully exiting */
+-      s64                     exit_code;
+-
+-      /* textual representation of the above */
+-      const char              *reason;
+-
+-      /* backtrace if exiting due to an error */
+-      unsigned long           *bt;
+-      u32                     bt_len;
+-
+-      /* informational message */
+-      char                    *msg;
+-
+-      /* debug dump */
+-      char                    *dump;
+-};
+-
+-/* sched_ext_ops.flags */
+-enum scx_ops_flags {
+-      /*
+-       * Keep built-in idle tracking even if ops.update_idle() is implemented.
+-       */
+-      SCX_OPS_KEEP_BUILTIN_IDLE       = 1LLU << 0,
+-
+-      /*
+-       * By default, if there are no other task to run on the CPU, ext core
+-       * keeps running the current task even after its slice expires. If this
+-       * flag is specified, such tasks are passed to ops.enqueue() with
+-       * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+-       */
+-      SCX_OPS_ENQ_LAST                = 1LLU << 1,
+-
+-      /*
+-       * An exiting task may schedule after PF_EXITING is set. In such cases,
+-       * bpf_task_from_pid() may not be able to find the task and if the BPF
+-       * scheduler depends on pid lookup for dispatching, the task will be
+-       * lost leading to various issues including RCU grace period stalls.
+-       *
+-       * To mask this problem, by default, unhashed tasks are automatically
+-       * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+-       * depend on pid lookups and wants to handle these tasks directly, the
+-       * following flag can be used.
+-       */
+-      SCX_OPS_ENQ_EXITING             = 1LLU << 2,
+-
+-      /*
+-       * If set, only tasks with policy set to SCHED_EXT are attached to
+-       * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+-       */
+-      SCX_OPS_SWITCH_PARTIAL          = 1LLU << 3,
+-
+-      /*
+-       * A migration disabled task can only execute on its current CPU. By
+-       * default, such tasks are automatically put on the CPU's local DSQ with
+-       * the default slice on enqueue. If this ops flag is set, they also go
+-       * through ops.enqueue().
+-       *
+-       * A migration disabled task never invokes ops.select_cpu() as it can
+-       * only select the current CPU. Also, p->cpus_ptr will only contain its
+-       * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+-       * and thus may disagree with cpumask_weight(p->cpus_ptr).
+-       */
+-      SCX_OPS_ENQ_MIGRATION_DISABLED  = 1LLU << 4,
+-
+-      /*
+-       * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+-       * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+-       * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+-       * transfers. When this optimization is enabled, ops.select_cpu() is
+-       * skipped in some cases (when racing against the wakee switching out).
+-       * As the BPF scheduler may depend on ops.select_cpu() being invoked
+-       * during wakeups, queued wakeup is disabled by default.
+-       *
+-       * If this ops flag is set, queued wakeup optimization is enabled and
+-       * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+-       * wakee's CPU without preceding ops.select_cpu() even for tasks which
+-       * may be executed on multiple CPUs.
+-       */
+-      SCX_OPS_ALLOW_QUEUED_WAKEUP     = 1LLU << 5,
+-
+-      /*
+-       * If set, enable per-node idle cpumasks. If clear, use a single global
+-       * flat idle cpumask.
+-       */
+-      SCX_OPS_BUILTIN_IDLE_PER_NODE   = 1LLU << 6,
+-
+-      /*
+-       * CPU cgroup support flags
+-       */
+-      SCX_OPS_HAS_CGROUP_WEIGHT       = 1LLU << 16,   /* DEPRECATED, will be removed on 6.18 */
+-
+-      SCX_OPS_ALL_FLAGS               = SCX_OPS_KEEP_BUILTIN_IDLE |
+-                                        SCX_OPS_ENQ_LAST |
+-                                        SCX_OPS_ENQ_EXITING |
+-                                        SCX_OPS_ENQ_MIGRATION_DISABLED |
+-                                        SCX_OPS_ALLOW_QUEUED_WAKEUP |
+-                                        SCX_OPS_SWITCH_PARTIAL |
+-                                        SCX_OPS_BUILTIN_IDLE_PER_NODE |
+-                                        SCX_OPS_HAS_CGROUP_WEIGHT,
+-
+-      /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+-      __SCX_OPS_INTERNAL_MASK         = 0xffLLU << 56,
+-
+-      SCX_OPS_HAS_CPU_PREEMPT         = 1LLU << 56,
+-};
+-
+-/* argument container for ops.init_task() */
+-struct scx_init_task_args {
+-      /*
+-       * Set if ops.init_task() is being invoked on the fork path, as opposed
+-       * to the scheduler transition path.
+-       */
+-      bool                    fork;
+-#ifdef CONFIG_EXT_GROUP_SCHED
+-      /* the cgroup the task is joining */
+-      struct cgroup           *cgroup;
+-#endif
+-};
+-
+-/* argument container for ops.exit_task() */
+-struct scx_exit_task_args {
+-      /* Whether the task exited before running on sched_ext. */
+-      bool cancelled;
+-};
+-
+-/* argument container for ops->cgroup_init() */
+-struct scx_cgroup_init_args {
+-      /* the weight of the cgroup [1..10000] */
+-      u32                     weight;
+-
+-      /* bandwidth control parameters from cpu.max and cpu.max.burst */
+-      u64                     bw_period_us;
+-      u64                     bw_quota_us;
+-      u64                     bw_burst_us;
+-};
+-
+-enum scx_cpu_preempt_reason {
+-      /* next task is being scheduled by &sched_class_rt */
+-      SCX_CPU_PREEMPT_RT,
+-      /* next task is being scheduled by &sched_class_dl */
+-      SCX_CPU_PREEMPT_DL,
+-      /* next task is being scheduled by &sched_class_stop */
+-      SCX_CPU_PREEMPT_STOP,
+-      /* unknown reason for SCX being preempted */
+-      SCX_CPU_PREEMPT_UNKNOWN,
+-};
+-
+-/*
+- * Argument container for ops->cpu_acquire(). Currently empty, but may be
+- * expanded in the future.
+- */
+-struct scx_cpu_acquire_args {};
+-
+-/* argument container for ops->cpu_release() */
+-struct scx_cpu_release_args {
+-      /* the reason the CPU was preempted */
+-      enum scx_cpu_preempt_reason reason;
+-
+-      /* the task that's going to be scheduled on the CPU */
+-      struct task_struct      *task;
+-};
+-
+-/*
+- * Informational context provided to dump operations.
+- */
+-struct scx_dump_ctx {
+-      enum scx_exit_kind      kind;
+-      s64                     exit_code;
+-      const char              *reason;
+-      u64                     at_ns;
+-      u64                     at_jiffies;
+-};
+-
+-/**
+- * struct sched_ext_ops - Operation table for BPF scheduler implementation
+- *
+- * A BPF scheduler can implement an arbitrary scheduling policy by
+- * implementing and loading operations in this table. Note that a userland
+- * scheduling policy can also be implemented using the BPF scheduler
+- * as a shim layer.
+- */
+-struct sched_ext_ops {
+-      /**
+-       * @select_cpu: Pick the target CPU for a task which is being woken up
+-       * @p: task being woken up
+-       * @prev_cpu: the cpu @p was on before sleeping
+-       * @wake_flags: SCX_WAKE_*
+-       *
+-       * Decision made here isn't final. @p may be moved to any CPU while it
+-       * is getting dispatched for execution later. However, as @p is not on
+-       * the rq at this point, getting the eventual execution CPU right here
+-       * saves a small bit of overhead down the line.
+-       *
+-       * If an idle CPU is returned, the CPU is kicked and will try to
+-       * dispatch. While an explicit custom mechanism can be added,
+-       * select_cpu() serves as the default way to wake up idle CPUs.
+-       *
+-       * @p may be inserted into a DSQ directly by calling
+-       * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
+-       * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
+-       * of the CPU returned by this operation.
+-       *
+-       * Note that select_cpu() is never called for tasks that can only run
+-       * on a single CPU or tasks with migration disabled, as they don't have
+-       * the option to select a different CPU. See select_task_rq() for
+-       * details.
+-       */
+-      s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+-
+-      /**
+-       * @enqueue: Enqueue a task on the BPF scheduler
+-       * @p: task being enqueued
+-       * @enq_flags: %SCX_ENQ_*
+-       *
+-       * @p is ready to run. Insert directly into a DSQ by calling
+-       * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
+-       * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
+-       * the task will stall.
+-       *
+-       * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
+-       * skipped.
+-       */
+-      void (*enqueue)(struct task_struct *p, u64 enq_flags);
+-
+-      /**
+-       * @dequeue: Remove a task from the BPF scheduler
+-       * @p: task being dequeued
+-       * @deq_flags: %SCX_DEQ_*
+-       *
+-       * Remove @p from the BPF scheduler. This is usually called to isolate
+-       * the task while updating its scheduling properties (e.g. priority).
+-       *
+-       * The ext core keeps track of whether the BPF side owns a given task or
+-       * not and can gracefully ignore spurious dispatches from BPF side,
+-       * which makes it safe to not implement this method. However, depending
+-       * on the scheduling logic, this can lead to confusing behaviors - e.g.
+-       * scheduling position not being updated across a priority change.
+-       */
+-      void (*dequeue)(struct task_struct *p, u64 deq_flags);
+-
+-      /**
+-       * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
+-       * @cpu: CPU to dispatch tasks for
+-       * @prev: previous task being switched out
+-       *
+-       * Called when a CPU's local dsq is empty. The operation should dispatch
+-       * one or more tasks from the BPF scheduler into the DSQs using
+-       * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
+-       * using scx_bpf_dsq_move_to_local().
+-       *
+-       * The maximum number of times scx_bpf_dsq_insert() can be called
+-       * without an intervening scx_bpf_dsq_move_to_local() is specified by
+-       * ops.dispatch_max_batch. See the comments on top of the two functions
+-       * for more details.
+-       *
+-       * When not %NULL, @prev is an SCX task with its slice depleted. If
+-       * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
+-       * @prev->scx.flags, it is not enqueued yet and will be enqueued after
+-       * ops.dispatch() returns. To keep executing @prev, return without
+-       * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
+-       */
+-      void (*dispatch)(s32 cpu, struct task_struct *prev);
+-
+-      /**
+-       * @tick: Periodic tick
+-       * @p: task running currently
+-       *
+-       * This operation is called every 1/HZ seconds on CPUs which are
+-       * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+-       * immediate dispatch cycle on the CPU.
+-       */
+-      void (*tick)(struct task_struct *p);
+-
+-      /**
+-       * @runnable: A task is becoming runnable on its associated CPU
+-       * @p: task becoming runnable
+-       * @enq_flags: %SCX_ENQ_*
+-       *
+-       * This and the following three functions can be used to track a task's
+-       * execution state transitions. A task becomes ->runnable() on a CPU,
+-       * and then goes through one or more ->running() and ->stopping() pairs
+-       * as it runs on the CPU, and eventually becomes ->quiescent() when it's
+-       * done running on the CPU.
+-       *
+-       * @p is becoming runnable on the CPU because it's
+-       *
+-       * - waking up (%SCX_ENQ_WAKEUP)
+-       * - being moved from another CPU
+-       * - being restored after temporarily taken off the queue for an
+-       *   attribute change.
+-       *
+-       * This and ->enqueue() are related but not coupled. This operation
+-       * notifies @p's state transition and may not be followed by ->enqueue()
+-       * e.g. when @p is being dispatched to a remote CPU, or when @p is
+-       * being enqueued on a CPU experiencing a hotplug event. Likewise, a
+-       * task may be ->enqueue()'d without being preceded by this operation
+-       * e.g. after exhausting its slice.
+-       */
+-      void (*runnable)(struct task_struct *p, u64 enq_flags);
+-
+-      /**
+-       * @running: A task is starting to run on its associated CPU
+-       * @p: task starting to run
+-       *
+-       * Note that this callback may be called from a CPU other than the
+-       * one the task is going to run on. This can happen when a task
+-       * property is changed (e.g., affinity), since scx_next_task_scx(),
+-       * which triggers this callback, may run on a CPU different from
+-       * the task's assigned CPU.
+-       *
+-       * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+-       * target CPU the task is going to use.
+-       *
+-       * See ->runnable() for explanation on the task state notifiers.
+-       */
+-      void (*running)(struct task_struct *p);
+-
+-      /**
+-       * @stopping: A task is stopping execution
+-       * @p: task that is stopping
+-       * @runnable: is task @p still runnable?
+-       *
+-       * Note that this callback may be called from a CPU other than the
+-       * one the task was running on. This can happen when a task
+-       * property is changed (e.g., affinity), since dequeue_task_scx(),
+-       * which triggers this callback, may run on a CPU different from
+-       * the task's assigned CPU.
+-       *
+-       * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+-       * the task was running on.
+-       *
+-       * See ->runnable() for explanation on the task state notifiers. If
+-       * !@runnable, ->quiescent() will be invoked after this operation
+-       * returns.
+-       */
+-      void (*stopping)(struct task_struct *p, bool runnable);
+-
+-      /**
+-       * @quiescent: A task is becoming not runnable on its associated CPU
+-       * @p: task becoming not runnable
+-       * @deq_flags: %SCX_DEQ_*
+-       *
+-       * See ->runnable() for explanation on the task state notifiers.
+-       *
+-       * @p is becoming quiescent on the CPU because it's
+-       *
+-       * - sleeping (%SCX_DEQ_SLEEP)
+-       * - being moved to another CPU
+-       * - being temporarily taken off the queue for an attribute change
+-       *   (%SCX_DEQ_SAVE)
+-       *
+-       * This and ->dequeue() are related but not coupled. This operation
+-       * notifies @p's state transition and may not be preceded by ->dequeue()
+-       * e.g. when @p is being dispatched to a remote CPU.
+-       */
+-      void (*quiescent)(struct task_struct *p, u64 deq_flags);
+-
+-      /**
+-       * @yield: Yield CPU
+-       * @from: yielding task
+-       * @to: optional yield target task
+-       *
+-       * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+-       * The BPF scheduler should ensure that other available tasks are
+-       * dispatched before the yielding task. Return value is ignored in this
+-       * case.
+-       *
+-       * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
+-       * scheduler can implement the request, return %true; otherwise, %false.
+-       */
+-      bool (*yield)(struct task_struct *from, struct task_struct *to);
+-
+-      /**
+-       * @core_sched_before: Task ordering for core-sched
+-       * @a: task A
+-       * @b: task B
+-       *
+-       * Used by core-sched to determine the ordering between two tasks. See
+-       * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
+-       * core-sched.
+-       *
+-       * Both @a and @b are runnable and may or may not currently be queued on
+-       * the BPF scheduler. Should return %true if @a should run before @b.
+-       * %false if there's no required ordering or @b should run before @a.
+-       *
+-       * If not specified, the default is ordering them according to when they
+-       * became runnable.
+-       */
+-      bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
+-
+-      /**
+-       * @set_weight: Set task weight
+-       * @p: task to set weight for
+-       * @weight: new weight [1..10000]
+-       *
+-       * Update @p's weight to @weight.
+-       */
+-      void (*set_weight)(struct task_struct *p, u32 weight);
+-
+-      /**
+-       * @set_cpumask: Set CPU affinity
+-       * @p: task to set CPU affinity for
+-       * @cpumask: cpumask of cpus that @p can run on
+-       *
+-       * Update @p's CPU affinity to @cpumask.
+-       */
+-      void (*set_cpumask)(struct task_struct *p,
+-                          const struct cpumask *cpumask);
+-
+-      /**
+-       * @update_idle: Update the idle state of a CPU
+-       * @cpu: CPU to update the idle state for
+-       * @idle: whether entering or exiting the idle state
+-       *
+-       * This operation is called when @cpu enters or leaves the idle
+-       * state. By default, implementing this operation disables the built-in
+-       * idle CPU tracking and the following helpers become unavailable:
+-       *
+-       * - scx_bpf_select_cpu_dfl()
+-       * - scx_bpf_select_cpu_and()
+-       * - scx_bpf_test_and_clear_cpu_idle()
+-       * - scx_bpf_pick_idle_cpu()
+-       *
+-       * The user also must implement ops.select_cpu() as the default
+-       * implementation relies on scx_bpf_select_cpu_dfl().
+-       *
+-       * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+-       * tracking.
+-       */
+-      void (*update_idle)(s32 cpu, bool idle);
+-
+-      /**
+-       * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+-       * @cpu: The CPU being acquired by the BPF scheduler.
+-       * @args: Acquire arguments, see the struct definition.
+-       *
+-       * A CPU that was previously released from the BPF scheduler is now once
+-       * again under its control.
+-       */
+-      void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+-
+-      /**
+-       * @cpu_release: A CPU is taken away from the BPF scheduler
+-       * @cpu: The CPU being released by the BPF scheduler.
+-       * @args: Release arguments, see the struct definition.
+-       *
+-       * The specified CPU is no longer under the control of the BPF
+-       * scheduler. This could be because it was preempted by a higher
+-       * priority sched_class, though there may be other reasons as well. The
+-       * caller should consult @args->reason to determine the cause.
+-       */
+-      void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+-
+-      /**
+-       * @init_task: Initialize a task to run in a BPF scheduler
+-       * @p: task to initialize for BPF scheduling
+-       * @args: init arguments, see the struct definition
+-       *
+-       * Either we're loading a BPF scheduler or a new task is being forked.
+-       * Initialize @p for BPF scheduling. This operation may block and can
+-       * be used for allocations, and is called exactly once for a task.
+-       *
+-       * Return 0 for success, -errno for failure. An error return while
+-       * loading will abort loading of the BPF scheduler. During a fork, it
+-       * will abort that specific fork.
+-       */
+-      s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+-
+-      /**
+-       * @exit_task: Exit a previously-running task from the system
+-       * @p: task to exit
+-       * @args: exit arguments, see the struct definition
+-       *
+-       * @p is exiting or the BPF scheduler is being unloaded. Perform any
+-       * necessary cleanup for @p.
+-       */
+-      void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+-
+-      /**
+-       * @enable: Enable BPF scheduling for a task
+-       * @p: task to enable BPF scheduling for
+-       *
+-       * Enable @p for BPF scheduling. enable() is called on @p any time it
+-       * enters SCX, and is always paired with a matching disable().
+-       */
+-      void (*enable)(struct task_struct *p);
+-
+-      /**
+-       * @disable: Disable BPF scheduling for a task
+-       * @p: task to disable BPF scheduling for
+-       *
+-       * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+-       * Disable BPF scheduling for @p. A disable() call is always matched
+-       * with a prior enable() call.
+-       */
+-      void (*disable)(struct task_struct *p);
+-
+-      /**
+-       * @dump: Dump BPF scheduler state on error
+-       * @ctx: debug dump context
+-       *
+-       * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
+-       */
+-      void (*dump)(struct scx_dump_ctx *ctx);
+-
+-      /**
+-       * @dump_cpu: Dump BPF scheduler state for a CPU on error
+-       * @ctx: debug dump context
+-       * @cpu: CPU to generate debug dump for
+-       * @idle: @cpu is currently idle without any runnable tasks
+-       *
+-       * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+-       * @cpu. If @idle is %true and this operation doesn't produce any
+-       * output, @cpu is skipped for dump.
+-       */
+-      void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+-
+-      /**
+-       * @dump_task: Dump BPF scheduler state for a runnable task on error
+-       * @ctx: debug dump context
+-       * @p: runnable task to generate debug dump for
+-       *
+-       * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+-       * @p.
+-       */
+-      void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+-
+-#ifdef CONFIG_EXT_GROUP_SCHED
+-      /**
+-       * @cgroup_init: Initialize a cgroup
+-       * @cgrp: cgroup being initialized
+-       * @args: init arguments, see the struct definition
+-       *
+-       * Either the BPF scheduler is being loaded or @cgrp created, initialize
+-       * @cgrp for sched_ext. This operation may block.
+-       *
+-       * Return 0 for success, -errno for failure. An error return while
+-       * loading will abort loading of the BPF scheduler. During cgroup
+-       * creation, it will abort the specific cgroup creation.
+-       */
+-      s32 (*cgroup_init)(struct cgroup *cgrp,
+-                         struct scx_cgroup_init_args *args);
+-
+-      /**
+-       * @cgroup_exit: Exit a cgroup
+-       * @cgrp: cgroup being exited
+-       *
+-       * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
+-       * @cgrp for sched_ext. This operation may block.
+-       */
+-      void (*cgroup_exit)(struct cgroup *cgrp);
+-
+-      /**
+-       * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
+-       * @p: task being moved
+-       * @from: cgroup @p is being moved from
+-       * @to: cgroup @p is being moved to
+-       *
+-       * Prepare @p for move from cgroup @from to @to. This operation may
+-       * block and can be used for allocations.
+-       *
+-       * Return 0 for success, -errno for failure. An error return aborts the
+-       * migration.
+-       */
+-      s32 (*cgroup_prep_move)(struct task_struct *p,
+-                              struct cgroup *from, struct cgroup *to);
+-
+-      /**
+-       * @cgroup_move: Commit cgroup move
+-       * @p: task being moved
+-       * @from: cgroup @p is being moved from
+-       * @to: cgroup @p is being moved to
+-       *
+-       * Commit the move. @p is dequeued during this operation.
+-       */
+-      void (*cgroup_move)(struct task_struct *p,
+-                          struct cgroup *from, struct cgroup *to);
+-
+-      /**
+-       * @cgroup_cancel_move: Cancel cgroup move
+-       * @p: task whose cgroup move is being canceled
+-       * @from: cgroup @p was being moved from
+-       * @to: cgroup @p was being moved to
+-       *
+-       * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+-       * Undo the preparation.
+-       */
+-      void (*cgroup_cancel_move)(struct task_struct *p,
+-                                 struct cgroup *from, struct cgroup *to);
+-
+-      /**
+-       * @cgroup_set_weight: A cgroup's weight is being changed
+-       * @cgrp: cgroup whose weight is being updated
+-       * @weight: new weight [1..10000]
+-       *
+-       * Update @cgrp's weight to @weight.
+-       */
+-      void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+-
+-      /**
+-       * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+-       * @cgrp: cgroup whose bandwidth is being updated
+-       * @period_us: bandwidth control period
+-       * @quota_us: bandwidth control quota
+-       * @burst_us: bandwidth control burst
+-       *
+-       * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+-       * cgroup interface.
+-       *
+-       * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+-       * to. For example, if @period_us is 1_000_000 and @quota_us is
+-       * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+-       * interpreted in the same fashion and specifies how much @cgrp can
+-       * burst temporarily. The specific control mechanism and thus the
+-       * interpretation of @period_us and burstiness is up to the BPF
+-       * scheduler.
+-       */
+-      void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+-                                   u64 period_us, u64 quota_us, u64 burst_us);
+-
+-#endif        /* CONFIG_EXT_GROUP_SCHED */
+-
+-      /*
+-       * All online ops must come before ops.cpu_online().
+-       */
+-
+-      /**
+-       * @cpu_online: A CPU became online
+-       * @cpu: CPU which just came up
+-       *
+-       * @cpu just came online. @cpu will not call ops.enqueue() or
+-       * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+-       */
+-      void (*cpu_online)(s32 cpu);
+-
+-      /**
+-       * @cpu_offline: A CPU is going offline
+-       * @cpu: CPU which is going offline
+-       *
+-       * @cpu is going offline. @cpu will not call ops.enqueue() or
+-       * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+-       */
+-      void (*cpu_offline)(s32 cpu);
+-
+-      /*
+-       * All CPU hotplug ops must come before ops.init().
+-       */
+-
+-      /**
+-       * @init: Initialize the BPF scheduler
+-       */
+-      s32 (*init)(void);
+-
+-      /**
+-       * @exit: Clean up after the BPF scheduler
+-       * @info: Exit info
+-       *
+-       * ops.exit() is also called on ops.init() failure, which is a bit
+-       * unusual. This is to allow rich reporting through @info on how
+-       * ops.init() failed.
+-       */
+-      void (*exit)(struct scx_exit_info *info);
+-
+-      /**
+-       * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
+-       */
+-      u32 dispatch_max_batch;
+-
+-      /**
+-       * @flags: %SCX_OPS_* flags
+-       */
+-      u64 flags;
+-
+-      /**
+-       * @timeout_ms: The maximum amount of time, in milliseconds, that a
+-       * runnable task should be able to wait before being scheduled. The
+-       * maximum timeout may not exceed the default timeout of 30 seconds.
+-       *
+-       * Defaults to the maximum allowed timeout value of 30 seconds.
+-       */
+-      u32 timeout_ms;
+-
+-      /**
+-       * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
+-       * value of 32768 is used.
+-       */
+-      u32 exit_dump_len;
+-
+-      /**
+-       * @hotplug_seq: A sequence number that may be set by the scheduler to
+-       * detect when a hotplug event has occurred during the loading process.
+-       * If 0, no detection occurs. Otherwise, the scheduler will fail to
+-       * load if the sequence number does not match @scx_hotplug_seq on the
+-       * enable path.
+-       */
+-      u64 hotplug_seq;
+-
+-      /**
+-       * @name: BPF scheduler's name
+-       *
+-       * Must be a non-zero valid BPF object name including only isalnum(),
+-       * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+-       * BPF scheduler is enabled.
+-       */
+-      char name[SCX_OPS_NAME_LEN];
+-
+-      /* internal use only, must be NULL */
+-      void *priv;
+-};
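To make the ops table above concrete, here is a minimal sketch of a FIFO scheduler written against this interface, modeled on the in-tree tools/sched_ext examples. The <scx/common.bpf.h> include, the BPF_STRUCT_OPS()/BPF_STRUCT_OPS_SLEEPABLE() wrappers and SCX_OPS_DEFINE() are assumptions borrowed from those example headers, and SHARED_DSQ is a hypothetical DSQ id; only select_cpu, enqueue, dispatch and init are wired up, with everything else falling back to the defaults described in the member comments.

/* Minimal FIFO sketch, assuming the tools/sched_ext helper macros. */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

#define SHARED_DSQ 0	/* hypothetical custom DSQ id */

s32 BPF_STRUCT_OPS(minimal_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	/* Idle CPU found: insert directly so ops.enqueue() is skipped. */
	if (is_idle)
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	return cpu;
}

void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* FIFO: everything goes onto one shared DSQ. */
	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(minimal_dispatch, s32 cpu, struct task_struct *prev)
{
	/* Refill the local DSQ from the shared one when it runs dry. */
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(minimal_init)
{
	/* Custom DSQs must be created before they can be used. */
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

SCX_OPS_DEFINE(minimal_ops,
	       .select_cpu	= (void *)minimal_select_cpu,
	       .enqueue		= (void *)minimal_enqueue,
	       .dispatch	= (void *)minimal_dispatch,
	       .init		= (void *)minimal_init,
	       .name		= "minimal");

Loaded through libbpf's struct_ops support, such an object schedules SCHED_NORMAL as well as SCHED_EXT tasks unless SCX_OPS_SWITCH_PARTIAL is set.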
+-
+-enum scx_opi {
+-      SCX_OPI_BEGIN                   = 0,
+-      SCX_OPI_NORMAL_BEGIN            = 0,
+-      SCX_OPI_NORMAL_END              = SCX_OP_IDX(cpu_online),
+-      SCX_OPI_CPU_HOTPLUG_BEGIN       = SCX_OP_IDX(cpu_online),
+-      SCX_OPI_CPU_HOTPLUG_END         = SCX_OP_IDX(init),
+-      SCX_OPI_END                     = SCX_OP_IDX(init),
+-};
+-
+-/*
+- * Collection of event counters. Event types are placed in descending order.
+- */
+-struct scx_event_stats {
+-      /*
+-       * If ops.select_cpu() returns a CPU which can't be used by the task,
+-       * the core scheduler code silently picks a fallback CPU.
+-       */
+-      s64             SCX_EV_SELECT_CPU_FALLBACK;
+-
+-      /*
+-       * When dispatching to a local DSQ, the CPU may have gone offline in
+-       * the meantime. In this case, the task is bounced to the global DSQ.
+-       */
+-      s64             SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+-
+-      /*
+-       * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+-       * continued to run because there were no other tasks on the CPU.
+-       */
+-      s64             SCX_EV_DISPATCH_KEEP_LAST;
+-
+-      /*
+-       * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+-       * is dispatched to a local DSQ when exiting.
+-       */
+-      s64             SCX_EV_ENQ_SKIP_EXITING;
+-
+-      /*
+-       * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+-       * migration disabled task skips ops.enqueue() and is dispatched to its
+-       * local DSQ.
+-       */
+-      s64             SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+-
+-      /*
+-       * Total number of times a task's time slice was refilled with the
+-       * default value (SCX_SLICE_DFL).
+-       */
+-      s64             SCX_EV_REFILL_SLICE_DFL;
+-
+-      /*
+-       * The total duration of bypass modes in nanoseconds.
+-       */
+-      s64             SCX_EV_BYPASS_DURATION;
+-
+-      /*
+-       * The number of tasks dispatched in the bypassing mode.
+-       */
+-      s64             SCX_EV_BYPASS_DISPATCH;
+-
+-      /*
+-       * The number of times the bypassing mode has been activated.
+-       */
+-      s64             SCX_EV_BYPASS_ACTIVATE;
+-};
+-
+-struct scx_sched {
+-      struct sched_ext_ops    ops;
+-      DECLARE_BITMAP(has_op, SCX_OPI_END);
+-
+-      /*
+-       * Dispatch queues.
+-       *
+-       * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+-       * This is to avoid live-locking in bypass mode where all tasks are
+-       * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+-       * per-node split isn't sufficient, it can be further split.
+-       */
+-      struct rhashtable       dsq_hash;
+-      struct scx_dispatch_q   **global_dsqs;
+-
+-      /*
+-       * The event counters are in a per-CPU variable to minimize the
+-       * accounting overhead. A system-wide view on the event counter is
+-       * constructed when requested by scx_bpf_events().
+-       */
+-      struct scx_event_stats __percpu *event_stats_cpu;
+-
+-      bool                    warned_zero_slice;
+-
+-      atomic_t                exit_kind;
+-      struct scx_exit_info    *exit_info;
+-
+-      struct kobject          kobj;
+-
+-      struct kthread_worker   *helper;
+-      struct irq_work         error_irq_work;
+-      struct kthread_work     disable_work;
+-      struct rcu_work         rcu_work;
+-};
+-
+-enum scx_wake_flags {
+-      /* expose select WF_* flags as enums */
+-      SCX_WAKE_FORK           = WF_FORK,
+-      SCX_WAKE_TTWU           = WF_TTWU,
+-      SCX_WAKE_SYNC           = WF_SYNC,
+-};
+-
+-enum scx_enq_flags {
+-      /* expose select ENQUEUE_* flags as enums */
+-      SCX_ENQ_WAKEUP          = ENQUEUE_WAKEUP,
+-      SCX_ENQ_HEAD            = ENQUEUE_HEAD,
+-      SCX_ENQ_CPU_SELECTED    = ENQUEUE_RQ_SELECTED,
+-
+-      /* high 32bits are SCX specific */
+-
+-      /*
+-       * Set the following to trigger preemption when calling
+-       * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
+-       * current task is cleared to zero and the CPU is kicked into the
+-       * scheduling path. Implies %SCX_ENQ_HEAD.
+-       */
+-      SCX_ENQ_PREEMPT         = 1LLU << 32,
+-
+-      /*
+-       * The task being enqueued was previously enqueued on the current CPU's
+-       * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+-       * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
+-       * invoked in a ->cpu_release() callback, and the task is again
+-       * dispatched back to %SCX_DSQ_LOCAL by the current ->enqueue(), the
+-       * task will not be scheduled on the CPU until at least the next invocation
+-       * of the ->cpu_acquire() callback.
+-       */
+-      SCX_ENQ_REENQ           = 1LLU << 40,
+-
+-      /*
+-       * The task being enqueued is the only task available for the cpu. By
+-       * default, ext core keeps executing such tasks but when
+-       * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+-       * %SCX_ENQ_LAST flag set.
+-       *
+-       * The BPF scheduler is responsible for triggering a follow-up
+-       * scheduling event. Otherwise, execution may stall.
+-       */
+-      SCX_ENQ_LAST            = 1LLU << 41,
+-
+-      /* high 8 bits are internal */
+-      __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
+-
+-      SCX_ENQ_CLEAR_OPSS      = 1LLU << 56,
+-      SCX_ENQ_DSQ_PRIQ        = 1LLU << 57,
+-};
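As a brief illustration of the SCX-specific enqueue flags, the hedged sketch below (enqueue_preempting() is a hypothetical helper reusing the kfuncs already referenced in these comments) inserts a task at the head of the caller's local DSQ and forces an immediate reschedule:

/*
 * SCX_ENQ_PREEMPT implies SCX_ENQ_HEAD: @p goes to the front of the local
 * DSQ and the currently running task's slice is cleared so the CPU
 * re-enters the scheduling path right away.
 */
static void enqueue_preempting(struct task_struct *p, u64 enq_flags)
{
	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
			   enq_flags | SCX_ENQ_PREEMPT);
}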
+-
+-enum scx_deq_flags {
+-      /* expose select DEQUEUE_* flags as enums */
+-      SCX_DEQ_SLEEP           = DEQUEUE_SLEEP,
+-
+-      /* high 32bits are SCX specific */
+-
+-      /*
+-       * The generic core-sched layer decided to execute the task even though
+-       * it hasn't been dispatched yet. Dequeue from the BPF side.
+-       */
+-      SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
+-};
+-
+-enum scx_pick_idle_cpu_flags {
+-      SCX_PICK_IDLE_CORE      = 1LLU << 0,    /* pick a CPU whose SMT siblings are also idle */
+-      SCX_PICK_IDLE_IN_NODE   = 1LLU << 1,    /* pick a CPU in the same target NUMA node */
+-};
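These flags are consumed by the idle-CPU kfuncs listed in the ops.update_idle() comment above. A hedged sketch, assuming scx_bpf_pick_idle_cpu() takes an allowed cpumask plus these flags and returns a CPU number or a negative value (pick_idle_prefer_core() is a hypothetical helper):

/* Prefer a fully idle core, then fall back to any idle CPU in @p's mask. */
static s32 pick_idle_prefer_core(struct task_struct *p)
{
	s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);

	if (cpu >= 0)
		return cpu;
	return scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
}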
+-
+-enum scx_kick_flags {
+-      /*
+-       * Kick the target CPU if idle. Guarantees that the target CPU goes
+-       * through at least one full scheduling cycle before going idle. If the
+-       * target CPU can be determined to be currently not idle and going to go
+-       * through a scheduling cycle before going idle, noop.
+-       */
+-      SCX_KICK_IDLE           = 1LLU << 0,
+-
+-      /*
+-       * Preempt the current task and execute the dispatch path. If the
+-       * current task of the target CPU is an SCX task, its ->scx.slice is
+-       * cleared to zero before the scheduling path is invoked so that the
+-       * task expires and the dispatch path is invoked.
+-       */
+-      SCX_KICK_PREEMPT        = 1LLU << 1,
+-
+-      /*
+-       * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
+-       * return after the target CPU finishes picking the next task.
+-       */
+-      SCX_KICK_WAIT           = 1LLU << 2,
+-};
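A scheduler that has just queued work for another CPU typically pairs the insertion with a kick. A hedged sketch, assuming scx_bpf_kick_cpu() takes the target CPU and one of these flags (nudge_cpu() and preempt_cpu() are hypothetical helpers):

/* Wake @cpu only if it is idle so it notices newly queued work. */
static void nudge_cpu(s32 cpu)
{
	scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}

/* Force @cpu through the dispatch path even if something is running. */
static void preempt_cpu(s32 cpu)
{
	scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
}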
+-
+-enum scx_tg_flags {
+-      SCX_TG_ONLINE           = 1U << 0,
+-      SCX_TG_INITED           = 1U << 1,
+-};
+-
+-enum scx_enable_state {
+-      SCX_ENABLING,
+-      SCX_ENABLED,
+-      SCX_DISABLING,
+-      SCX_DISABLED,
+-};
+-
+-static const char *scx_enable_state_str[] = {
+-      [SCX_ENABLING]          = "enabling",
+-      [SCX_ENABLED]           = "enabled",
+-      [SCX_DISABLING]         = "disabling",
+-      [SCX_DISABLED]          = "disabled",
+-};
+-
+-/*
+- * sched_ext_entity->ops_state
+- *
+- * Used to track the task ownership between the SCX core and the BPF scheduler.
+- * State transitions look as follows:
+- *
+- * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+- *   ^              |                 |
+- *   |              v                 v
+- *   \-------------------------------/
+- *
+- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+- * sites for explanations on the conditions being waited upon and why they are
+- * safe. Transitions out of them into NONE or QUEUED must store_release and the
+- * waiters should load_acquire.
+- *
+- * Tracking scx_ops_state enables sched_ext core to reliably determine whether
+- * any given task can be dispatched by the BPF scheduler at all times and thus
+- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+- * to try to dispatch any task anytime regardless of its state as the SCX core
+- * can safely reject invalid dispatches.
+- */
+-enum scx_ops_state {
+-      SCX_OPSS_NONE,          /* owned by the SCX core */
+-      SCX_OPSS_QUEUEING,      /* in transit to the BPF scheduler */
+-      SCX_OPSS_QUEUED,        /* owned by the BPF scheduler */
+-      SCX_OPSS_DISPATCHING,   /* in transit back to the SCX core */
+-
+-      /*
+-       * QSEQ brands each QUEUED instance so that, when dispatch races
+-       * dequeue/requeue, the dispatcher can tell whether it still has a claim
+-       * on the task being dispatched.
+-       *
+-       * As some 32bit archs can't do 64bit store_release/load_acquire,
+-       * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+-       * 32bit machines. The dispatch race window QSEQ protects is very narrow
+-       * and runs with IRQ disabled. 30 bits should be sufficient.
+-       */
+-      SCX_OPSS_QSEQ_SHIFT     = 2,
+-};
+-
+-/* Use macros to ensure that the type is unsigned long for the masks */
+-#define SCX_OPSS_STATE_MASK   ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
+-#define SCX_OPSS_QSEQ_MASK    (~SCX_OPSS_STATE_MASK)
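Purely as an illustration of the packing, a snapshot of p->scx.ops_state splits into its two halves with the masks above (decode_ops_state() is a hypothetical helper, not kernel code):

static void decode_ops_state(unsigned long opss,
			     unsigned long *state, unsigned long *qseq)
{
	*state = opss & SCX_OPSS_STATE_MASK;	/* NONE/QUEUEING/QUEUED/DISPATCHING */
	*qseq  = opss & SCX_OPSS_QSEQ_MASK;	/* brand of this QUEUED instance */
}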
+-
+ /*
+  * NOTE: sched_ext is in the process of growing multiple scheduler support and
+  * scx_root usage is in a transitional state. Naked dereferences are safe if the
+diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
+index 292bb41a242ec..33858607bc97f 100644
+--- a/kernel/sched/ext.h
++++ b/kernel/sched/ext.h
+@@ -8,29 +8,6 @@
+  */
+ #ifdef CONFIG_SCHED_CLASS_EXT
+-static inline bool scx_kf_allowed_if_unlocked(void)
+-{
+-      return !current->scx.kf_mask;
+-}
+-
+-static inline bool scx_rq_bypassing(struct rq *rq)
+-{
+-      return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+-}
+-
+-DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
+-
+-DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+-
+-/*
+- * Return the rq currently locked from an scx callback, or NULL if no rq is
+- * locked.
+- */
+-static inline struct rq *scx_locked_rq(void)
+-{
+-      return __this_cpu_read(scx_locked_rq_state);
+-}
+-
+ void scx_tick(struct rq *rq);
+ void init_scx_entity(struct sched_ext_entity *scx);
+ void scx_pre_fork(struct task_struct *p);
+diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
+new file mode 100644
+index 0000000000000..76690ede8700f
+--- /dev/null
++++ b/kernel/sched/ext_internal.h
+@@ -0,0 +1,1061 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
++ *
++ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
++ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
++ */
++#define SCX_OP_IDX(op)                (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
++
++enum scx_consts {
++      SCX_DSP_DFL_MAX_BATCH           = 32,
++      SCX_DSP_MAX_LOOPS               = 32,
++      SCX_WATCHDOG_MAX_TIMEOUT        = 30 * HZ,
++
++      SCX_EXIT_BT_LEN                 = 64,
++      SCX_EXIT_MSG_LEN                = 1024,
++      SCX_EXIT_DUMP_DFL_LEN           = 32768,
++
++      SCX_CPUPERF_ONE                 = SCHED_CAPACITY_SCALE,
++
++      /*
++       * Iterating all tasks may take a while. Periodically drop
++       * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
++       */
++      SCX_TASK_ITER_BATCH             = 32,
++};
++
++enum scx_exit_kind {
++      SCX_EXIT_NONE,
++      SCX_EXIT_DONE,
++
++      SCX_EXIT_UNREG = 64,    /* user-space initiated unregistration */
++      SCX_EXIT_UNREG_BPF,     /* BPF-initiated unregistration */
++      SCX_EXIT_UNREG_KERN,    /* kernel-initiated unregistration */
++      SCX_EXIT_SYSRQ,         /* requested by 'S' sysrq */
++
++      SCX_EXIT_ERROR = 1024,  /* runtime error, error msg contains details */
++      SCX_EXIT_ERROR_BPF,     /* ERROR but triggered through scx_bpf_error() */
++      SCX_EXIT_ERROR_STALL,   /* watchdog detected stalled runnable tasks */
++};
++
++/*
++ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
++ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
++ * are 64bit of the format:
++ *
++ *   Bits: [63  ..  48 47   ..  32 31 .. 0]
++ *         [ SYS ACT ] [ SYS RSN ] [ USR  ]
++ *
++ *   SYS ACT: System-defined exit actions
++ *   SYS RSN: System-defined exit reasons
++ *   USR    : User-defined exit codes and reasons
++ *
++ * Using the above, users may communicate intention and context by ORing system
++ * actions and/or system reasons with a user-defined exit code.
++ */
++enum scx_exit_code {
++      /* Reasons */
++      SCX_ECODE_RSN_HOTPLUG   = 1LLU << 32,
++
++      /* Actions */
++      SCX_ECODE_ACT_RESTART   = 1LLU << 48,
++};
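For example, a scheduler reacting to a hotplug event might combine a system action, a system reason and its own code in the low 32 bits. The sketch below assumes the scx_bpf_exit() convenience macro from the tools/sched_ext headers; MY_ECODE_CPU_HOTPLUG is a hypothetical user-defined code:

enum { MY_ECODE_CPU_HOTPLUG = 1 };	/* hypothetical user-defined code */

/* Ask user space to restart the scheduler because of a hotplug event. */
static void restart_for_hotplug(s32 cpu)
{
	scx_bpf_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG |
		     MY_ECODE_CPU_HOTPLUG,
		     "CPU %d changed hotplug state, restarting", cpu);
}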
++
++/*
++ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
++ * being disabled.
++ */
++struct scx_exit_info {
++      /* %SCX_EXIT_* - broad category of the exit reason */
++      enum scx_exit_kind      kind;
++
++      /* exit code if gracefully exiting */
++      s64                     exit_code;
++
++      /* textual representation of the above */
++      const char              *reason;
++
++      /* backtrace if exiting due to an error */
++      unsigned long           *bt;
++      u32                     bt_len;
++
++      /* informational message */
++      char                    *msg;
++
++      /* debug dump */
++      char                    *dump;
++};
++
++/* sched_ext_ops.flags */
++enum scx_ops_flags {
++      /*
++       * Keep built-in idle tracking even if ops.update_idle() is implemented.
++       */
++      SCX_OPS_KEEP_BUILTIN_IDLE       = 1LLU << 0,
++
++      /*
++       * By default, if there are no other tasks to run on the CPU, ext core
++       * keeps running the current task even after its slice expires. If this
++       * flag is specified, such tasks are passed to ops.enqueue() with
++       * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
++       */
++      SCX_OPS_ENQ_LAST                = 1LLU << 1,
++
++      /*
++       * An exiting task may schedule after PF_EXITING is set. In such cases,
++       * bpf_task_from_pid() may not be able to find the task and if the BPF
++       * scheduler depends on pid lookup for dispatching, the task will be
++       * lost leading to various issues including RCU grace period stalls.
++       *
++       * To mask this problem, by default, unhashed tasks are automatically
++       * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
++       * depend on pid lookups and wants to handle these tasks directly, the
++       * following flag can be used.
++       */
++      SCX_OPS_ENQ_EXITING             = 1LLU << 2,
++
++      /*
++       * If set, only tasks with policy set to SCHED_EXT are attached to
++       * sched_ext. If clear, SCHED_NORMAL tasks are also included.
++       */
++      SCX_OPS_SWITCH_PARTIAL          = 1LLU << 3,
++
++      /*
++       * A migration disabled task can only execute on its current CPU. By
++       * default, such tasks are automatically put on the CPU's local DSQ with
++       * the default slice on enqueue. If this ops flag is set, they also go
++       * through ops.enqueue().
++       *
++       * A migration disabled task never invokes ops.select_cpu() as it can
++       * only select the current CPU. Also, p->cpus_ptr will only contain its
++       * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
++       * and thus may disagree with cpumask_weight(p->cpus_ptr).
++       */
++      SCX_OPS_ENQ_MIGRATION_DISABLED  = 1LLU << 4,
++
++      /*
++       * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
++       * ops.enqueue() on the ops.select_cpu() selected or the wakee's
++       * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
++       * transfers. When this optimization is enabled, ops.select_cpu() is
++       * skipped in some cases (when racing against the wakee switching out).
++       * As the BPF scheduler may depend on ops.select_cpu() being invoked
++       * during wakeups, queued wakeup is disabled by default.
++       *
++       * If this ops flag is set, queued wakeup optimization is enabled and
++       * the BPF scheduler must be able to handle ops.enqueue() invoked on the
++       * wakee's CPU without preceding ops.select_cpu() even for tasks which
++       * may be executed on multiple CPUs.
++       */
++      SCX_OPS_ALLOW_QUEUED_WAKEUP     = 1LLU << 5,
++
++      /*
++       * If set, enable per-node idle cpumasks. If clear, use a single global
++       * flat idle cpumask.
++       */
++      SCX_OPS_BUILTIN_IDLE_PER_NODE   = 1LLU << 6,
++
++      /*
++       * CPU cgroup support flags
++       */
++      SCX_OPS_HAS_CGROUP_WEIGHT       = 1LLU << 16,   /* DEPRECATED, will be removed on 6.18 */
++
++      SCX_OPS_ALL_FLAGS               = SCX_OPS_KEEP_BUILTIN_IDLE |
++                                        SCX_OPS_ENQ_LAST |
++                                        SCX_OPS_ENQ_EXITING |
++                                        SCX_OPS_ENQ_MIGRATION_DISABLED |
++                                        SCX_OPS_ALLOW_QUEUED_WAKEUP |
++                                        SCX_OPS_SWITCH_PARTIAL |
++                                        SCX_OPS_BUILTIN_IDLE_PER_NODE |
++                                        SCX_OPS_HAS_CGROUP_WEIGHT,
++
++      /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
++      __SCX_OPS_INTERNAL_MASK         = 0xffLLU << 56,
++
++      SCX_OPS_HAS_CPU_PREEMPT         = 1LLU << 56,
++};
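A hedged sketch of how a scheduler opts into some of these flags in its ops definition; SCX_OPS_DEFINE() is assumed from the tools/sched_ext helper headers and my_enqueue is a hypothetical ops.enqueue() callback:

SCX_OPS_DEFINE(flags_demo_ops,
	       .enqueue	= (void *)my_enqueue,
	       .flags	= SCX_OPS_KEEP_BUILTIN_IDLE |
			  SCX_OPS_ENQ_LAST |
			  SCX_OPS_ENQ_EXITING,
	       .name	= "flags_demo");

With SCX_OPS_ENQ_LAST set, the scheduler is responsible for triggering a follow-up scheduling event, as noted in the SCX_ENQ_LAST comment.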
++
++/* argument container for ops.init_task() */
++struct scx_init_task_args {
++      /*
++       * Set if ops.init_task() is being invoked on the fork path, as opposed
++       * to the scheduler transition path.
++       */
++      bool                    fork;
++#ifdef CONFIG_EXT_GROUP_SCHED
++      /* the cgroup the task is joining */
++      struct cgroup           *cgroup;
++#endif
++};
++
++/* argument container for ops.exit_task() */
++struct scx_exit_task_args {
++      /* Whether the task exited before running on sched_ext. */
++      bool cancelled;
++};
++
++/* argument container for ops->cgroup_init() */
++struct scx_cgroup_init_args {
++      /* the weight of the cgroup [1..10000] */
++      u32                     weight;
++
++      /* bandwidth control parameters from cpu.max and cpu.max.burst */
++      u64                     bw_period_us;
++      u64                     bw_quota_us;
++      u64                     bw_burst_us;
++};
++
++enum scx_cpu_preempt_reason {
++      /* next task is being scheduled by &sched_class_rt */
++      SCX_CPU_PREEMPT_RT,
++      /* next task is being scheduled by &sched_class_dl */
++      SCX_CPU_PREEMPT_DL,
++      /* next task is being scheduled by &sched_class_stop */
++      SCX_CPU_PREEMPT_STOP,
++      /* unknown reason for SCX being preempted */
++      SCX_CPU_PREEMPT_UNKNOWN,
++};
++
++/*
++ * Argument container for ops->cpu_acquire(). Currently empty, but may be
++ * expanded in the future.
++ */
++struct scx_cpu_acquire_args {};
++
++/* argument container for ops->cpu_release() */
++struct scx_cpu_release_args {
++      /* the reason the CPU was preempted */
++      enum scx_cpu_preempt_reason reason;
++
++      /* the task that's going to be scheduled on the CPU */
++      struct task_struct      *task;
++};
++
++/*
++ * Informational context provided to dump operations.
++ */
++struct scx_dump_ctx {
++      enum scx_exit_kind      kind;
++      s64                     exit_code;
++      const char              *reason;
++      u64                     at_ns;
++      u64                     at_jiffies;
++};
++
++/**
++ * struct sched_ext_ops - Operation table for BPF scheduler implementation
++ *
++ * A BPF scheduler can implement an arbitrary scheduling policy by
++ * implementing and loading operations in this table. Note that a userland
++ * scheduling policy can also be implemented using the BPF scheduler
++ * as a shim layer.
++ */
++struct sched_ext_ops {
++      /**
++       * @select_cpu: Pick the target CPU for a task which is being woken up
++       * @p: task being woken up
++       * @prev_cpu: the cpu @p was on before sleeping
++       * @wake_flags: SCX_WAKE_*
++       *
++       * Decision made here isn't final. @p may be moved to any CPU while it
++       * is getting dispatched for execution later. However, as @p is not on
++       * the rq at this point, getting the eventual execution CPU right here
++       * saves a small bit of overhead down the line.
++       *
++       * If an idle CPU is returned, the CPU is kicked and will try to
++       * dispatch. While an explicit custom mechanism can be added,
++       * select_cpu() serves as the default way to wake up idle CPUs.
++       *
++       * @p may be inserted into a DSQ directly by calling
++       * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
++       * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
++       * of the CPU returned by this operation.
++       *
++       * Note that select_cpu() is never called for tasks that can only run
++       * on a single CPU or tasks with migration disabled, as they don't have
++       * the option to select a different CPU. See select_task_rq() for
++       * details.
++       */
++      s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
++
++      /**
++       * @enqueue: Enqueue a task on the BPF scheduler
++       * @p: task being enqueued
++       * @enq_flags: %SCX_ENQ_*
++       *
++       * @p is ready to run. Insert directly into a DSQ by calling
++       * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
++       * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
++       * the task will stall.
++       *
++       * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
++       * skipped.
++       */
++      void (*enqueue)(struct task_struct *p, u64 enq_flags);
++
++      /**
++       * @dequeue: Remove a task from the BPF scheduler
++       * @p: task being dequeued
++       * @deq_flags: %SCX_DEQ_*
++       *
++       * Remove @p from the BPF scheduler. This is usually called to isolate
++       * the task while updating its scheduling properties (e.g. priority).
++       *
++       * The ext core keeps track of whether the BPF side owns a given task or
++       * not and can gracefully ignore spurious dispatches from BPF side,
++       * which makes it safe to not implement this method. However, depending
++       * on the scheduling logic, this can lead to confusing behaviors - e.g.
++       * scheduling position not being updated across a priority change.
++       */
++      void (*dequeue)(struct task_struct *p, u64 deq_flags);
++
++      /**
++       * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
++       * @cpu: CPU to dispatch tasks for
++       * @prev: previous task being switched out
++       *
++       * Called when a CPU's local dsq is empty. The operation should dispatch
++       * one or more tasks from the BPF scheduler into the DSQs using
++       * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
++       * using scx_bpf_dsq_move_to_local().
++       *
++       * The maximum number of times scx_bpf_dsq_insert() can be called
++       * without an intervening scx_bpf_dsq_move_to_local() is specified by
++       * ops.dispatch_max_batch. See the comments on top of the two functions
++       * for more details.
++       *
++       * When not %NULL, @prev is an SCX task with its slice depleted. If
++       * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
++       * @prev->scx.flags, it is not enqueued yet and will be enqueued after
++       * ops.dispatch() returns. To keep executing @prev, return without
++       * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
++       */
++      void (*dispatch)(s32 cpu, struct task_struct *prev);
++
++      /**
++       * @tick: Periodic tick
++       * @p: task running currently
++       *
++       * This operation is called every 1/HZ seconds on CPUs which are
++       * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
++       * immediate dispatch cycle on the CPU.
++       */
++      void (*tick)(struct task_struct *p);
++
++      /**
++       * @runnable: A task is becoming runnable on its associated CPU
++       * @p: task becoming runnable
++       * @enq_flags: %SCX_ENQ_*
++       *
++       * This and the following three functions can be used to track a task's
++       * execution state transitions. A task becomes ->runnable() on a CPU,
++       * and then goes through one or more ->running() and ->stopping() pairs
++       * as it runs on the CPU, and eventually becomes ->quiescent() when it's
++       * done running on the CPU.
++       *
++       * @p is becoming runnable on the CPU because it's
++       *
++       * - waking up (%SCX_ENQ_WAKEUP)
++       * - being moved from another CPU
++       * - being restored after temporarily taken off the queue for an
++       *   attribute change.
++       *
++       * This and ->enqueue() are related but not coupled. This operation
++       * notifies @p's state transition and may not be followed by ->enqueue()
++       * e.g. when @p is being dispatched to a remote CPU, or when @p is
++       * being enqueued on a CPU experiencing a hotplug event. Likewise, a
++       * task may be ->enqueue()'d without being preceded by this operation
++       * e.g. after exhausting its slice.
++       */
++      void (*runnable)(struct task_struct *p, u64 enq_flags);
++
++      /**
++       * @running: A task is starting to run on its associated CPU
++       * @p: task starting to run
++       *
++       * Note that this callback may be called from a CPU other than the
++       * one the task is going to run on. This can happen when a task
++       * property is changed (e.g., affinity), since scx_next_task_scx(),
++       * which triggers this callback, may run on a CPU different from
++       * the task's assigned CPU.
++       *
++       * Therefore, always use scx_bpf_task_cpu(@p) to determine the
++       * target CPU the task is going to use.
++       *
++       * See ->runnable() for explanation on the task state notifiers.
++       */
++      void (*running)(struct task_struct *p);
++
++      /**
++       * @stopping: A task is stopping execution
++       * @p: task that is stopping
++       * @runnable: is task @p still runnable?
++       *
++       * Note that this callback may be called from a CPU other than the
++       * one the task was running on. This can happen when a task
++       * property is changed (e.g., affinity), since dequeue_task_scx(),
++       * which triggers this callback, may run on a CPU different from
++       * the task's assigned CPU.
++       *
++       * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
++       * the task was running on.
++       *
++       * See ->runnable() for explanation on the task state notifiers. If
++       * !@runnable, ->quiescent() will be invoked after this operation
++       * returns.
++       */
++      void (*stopping)(struct task_struct *p, bool runnable);
++
++      /**
++       * @quiescent: A task is becoming not runnable on its associated CPU
++       * @p: task becoming not runnable
++       * @deq_flags: %SCX_DEQ_*
++       *
++       * See ->runnable() for explanation on the task state notifiers.
++       *
++       * @p is becoming quiescent on the CPU because it's
++       *
++       * - sleeping (%SCX_DEQ_SLEEP)
++       * - being moved to another CPU
++       * - being temporarily taken off the queue for an attribute change
++       *   (%SCX_DEQ_SAVE)
++       *
++       * This and ->dequeue() are related but not coupled. This operation
++       * notifies @p's state transition and may not be preceded by ->dequeue()
++       * e.g. when @p is being dispatched to a remote CPU.
++       */
++      void (*quiescent)(struct task_struct *p, u64 deq_flags);
++
++      /**
++       * @yield: Yield CPU
++       * @from: yielding task
++       * @to: optional yield target task
++       *
++       * If @to is NULL, @from is yielding the CPU to other runnable tasks.
++       * The BPF scheduler should ensure that other available tasks are
++       * dispatched before the yielding task. Return value is ignored in this
++       * case.
++       *
++       * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
++       * scheduler can implement the request, return %true; otherwise, %false.
++       */
++      bool (*yield)(struct task_struct *from, struct task_struct *to);
++
++      /**
++       * @core_sched_before: Task ordering for core-sched
++       * @a: task A
++       * @b: task B
++       *
++       * Used by core-sched to determine the ordering between two tasks. See
++       * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
++       * core-sched.
++       *
++       * Both @a and @b are runnable and may or may not currently be queued on
++       * the BPF scheduler. Should return %true if @a should run before @b.
++       * %false if there's no required ordering or @b should run before @a.
++       *
++       * If not specified, the default is ordering them according to when they
++       * became runnable.
++       */
++      bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
++
++      /**
++       * @set_weight: Set task weight
++       * @p: task to set weight for
++       * @weight: new weight [1..10000]
++       *
++       * Update @p's weight to @weight.
++       */
++      void (*set_weight)(struct task_struct *p, u32 weight);
++
++      /**
++       * @set_cpumask: Set CPU affinity
++       * @p: task to set CPU affinity for
++       * @cpumask: cpumask of cpus that @p can run on
++       *
++       * Update @p's CPU affinity to @cpumask.
++       */
++      void (*set_cpumask)(struct task_struct *p,
++                          const struct cpumask *cpumask);
++
++      /**
++       * @update_idle: Update the idle state of a CPU
++       * @cpu: CPU to update the idle state for
++       * @idle: whether entering or exiting the idle state
++       *
++       * This operation is called when @cpu enters or leaves the idle
++       * state. By default, implementing this operation disables the built-in
++       * idle CPU tracking and the following helpers become unavailable:
++       *
++       * - scx_bpf_select_cpu_dfl()
++       * - scx_bpf_select_cpu_and()
++       * - scx_bpf_test_and_clear_cpu_idle()
++       * - scx_bpf_pick_idle_cpu()
++       *
++       * The user also must implement ops.select_cpu() as the default
++       * implementation relies on scx_bpf_select_cpu_dfl().
++       *
++       * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
++       * tracking.
++       */
++      void (*update_idle)(s32 cpu, bool idle);
++
++      /**
++       * @cpu_acquire: A CPU is becoming available to the BPF scheduler
++       * @cpu: The CPU being acquired by the BPF scheduler.
++       * @args: Acquire arguments, see the struct definition.
++       *
++       * A CPU that was previously released from the BPF scheduler is now once
++       * again under its control.
++       */
++      void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
++
++      /**
++       * @cpu_release: A CPU is taken away from the BPF scheduler
++       * @cpu: The CPU being released by the BPF scheduler.
++       * @args: Release arguments, see the struct definition.
++       *
++       * The specified CPU is no longer under the control of the BPF
++       * scheduler. This could be because it was preempted by a higher
++       * priority sched_class, though there may be other reasons as well. The
++       * caller should consult @args->reason to determine the cause.
++       */
++      void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
++
++      /**
++       * @init_task: Initialize a task to run in a BPF scheduler
++       * @p: task to initialize for BPF scheduling
++       * @args: init arguments, see the struct definition
++       *
++       * Either we're loading a BPF scheduler or a new task is being forked.
++       * Initialize @p for BPF scheduling. This operation may block and can
++       * be used for allocations, and is called exactly once for a task.
++       *
++       * Return 0 for success, -errno for failure. An error return while
++       * loading will abort loading of the BPF scheduler. During a fork, it
++       * will abort that specific fork.
++       */
++      s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
++
++      /**
++       * @exit_task: Exit a previously-running task from the system
++       * @p: task to exit
++       * @args: exit arguments, see the struct definition
++       *
++       * @p is exiting or the BPF scheduler is being unloaded. Perform any
++       * necessary cleanup for @p.
++       */
++      void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
++
++      /**
++       * @enable: Enable BPF scheduling for a task
++       * @p: task to enable BPF scheduling for
++       *
++       * Enable @p for BPF scheduling. enable() is called on @p any time it
++       * enters SCX, and is always paired with a matching disable().
++       */
++      void (*enable)(struct task_struct *p);
++
++      /**
++       * @disable: Disable BPF scheduling for a task
++       * @p: task to disable BPF scheduling for
++       *
++       * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
++       * Disable BPF scheduling for @p. A disable() call is always matched
++       * with a prior enable() call.
++       */
++      void (*disable)(struct task_struct *p);
++
++      /**
++       * @dump: Dump BPF scheduler state on error
++       * @ctx: debug dump context
++       *
++       * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
++       */
++      void (*dump)(struct scx_dump_ctx *ctx);
++
++      /**
++       * @dump_cpu: Dump BPF scheduler state for a CPU on error
++       * @ctx: debug dump context
++       * @cpu: CPU to generate debug dump for
++       * @idle: @cpu is currently idle without any runnable tasks
++       *
++       * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
++       * @cpu. If @idle is %true and this operation doesn't produce any
++       * output, @cpu is skipped for dump.
++       */
++      void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
++
++      /**
++       * @dump_task: Dump BPF scheduler state for a runnable task on error
++       * @ctx: debug dump context
++       * @p: runnable task to generate debug dump for
++       *
++       * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
++       * @p.
++       */
++      void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
++
++#ifdef CONFIG_EXT_GROUP_SCHED
++      /**
++       * @cgroup_init: Initialize a cgroup
++       * @cgrp: cgroup being initialized
++       * @args: init arguments, see the struct definition
++       *
++       * Either the BPF scheduler is being loaded or @cgrp created, initialize
++       * @cgrp for sched_ext. This operation may block.
++       *
++       * Return 0 for success, -errno for failure. An error return while
++       * loading will abort loading of the BPF scheduler. During cgroup
++       * creation, it will abort the specific cgroup creation.
++       */
++      s32 (*cgroup_init)(struct cgroup *cgrp,
++                         struct scx_cgroup_init_args *args);
++
++      /**
++       * @cgroup_exit: Exit a cgroup
++       * @cgrp: cgroup being exited
++       *
++       * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
++       * @cgrp for sched_ext. This operation may block.
++       */
++      void (*cgroup_exit)(struct cgroup *cgrp);
++
++      /**
++       * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
++       * @p: task being moved
++       * @from: cgroup @p is being moved from
++       * @to: cgroup @p is being moved to
++       *
++       * Prepare @p for move from cgroup @from to @to. This operation may
++       * block and can be used for allocations.
++       *
++       * Return 0 for success, -errno for failure. An error return aborts the
++       * migration.
++       */
++      s32 (*cgroup_prep_move)(struct task_struct *p,
++                              struct cgroup *from, struct cgroup *to);
++
++      /**
++       * @cgroup_move: Commit cgroup move
++       * @p: task being moved
++       * @from: cgroup @p is being moved from
++       * @to: cgroup @p is being moved to
++       *
++       * Commit the move. @p is dequeued during this operation.
++       */
++      void (*cgroup_move)(struct task_struct *p,
++                          struct cgroup *from, struct cgroup *to);
++
++      /**
++       * @cgroup_cancel_move: Cancel cgroup move
++       * @p: task whose cgroup move is being canceled
++       * @from: cgroup @p was being moved from
++       * @to: cgroup @p was being moved to
++       *
++       * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
++       * Undo the preparation.
++       */
++      void (*cgroup_cancel_move)(struct task_struct *p,
++                                 struct cgroup *from, struct cgroup *to);
++
++      /**
++       * @cgroup_set_weight: A cgroup's weight is being changed
++       * @cgrp: cgroup whose weight is being updated
++       * @weight: new weight [1..10000]
++       *
++       * Update @cgrp's weight to @weight.
++       */
++      void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
++
++      /**
++       * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
++       * @cgrp: cgroup whose bandwidth is being updated
++       * @period_us: bandwidth control period
++       * @quota_us: bandwidth control quota
++       * @burst_us: bandwidth control burst
++       *
++       * Update @cgrp's bandwidth control parameters. This is from the cpu.max
++       * cgroup interface.
++       *
++       * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
++       * to. For example, if @period_us is 1_000_000 and @quota_us is
++       * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
++       * interpreted in the same fashion and specifies how much @cgrp can
++       * burst temporarily. The specific control mechanism and thus the
++       * interpretation of @period_us and burstiness is up to the BPF
++       * scheduler.
++       */
++      void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
++                                   u64 period_us, u64 quota_us, u64 burst_us);
++
++#endif        /* CONFIG_EXT_GROUP_SCHED */
++
++      /*
++       * All online ops must come before ops.cpu_online().
++       */
++
++      /**
++       * @cpu_online: A CPU became online
++       * @cpu: CPU which just came up
++       *
++       * @cpu just came online. @cpu will not call ops.enqueue() or
++       * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
++       */
++      void (*cpu_online)(s32 cpu);
++
++      /**
++       * @cpu_offline: A CPU is going offline
++       * @cpu: CPU which is going offline
++       *
++       * @cpu is going offline. @cpu will not call ops.enqueue() or
++       * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
++       */
++      void (*cpu_offline)(s32 cpu);
++
++      /*
++       * All CPU hotplug ops must come before ops.init().
++       */
++
++      /**
++       * @init: Initialize the BPF scheduler
++       */
++      s32 (*init)(void);
++
++      /**
++       * @exit: Clean up after the BPF scheduler
++       * @info: Exit info
++       *
++       * ops.exit() is also called on ops.init() failure, which is a bit
++       * unusual. This is to allow rich reporting through @info on how
++       * ops.init() failed.
++       */
++      void (*exit)(struct scx_exit_info *info);
++
++      /**
++       * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
++       */
++      u32 dispatch_max_batch;
++
++      /**
++       * @flags: %SCX_OPS_* flags
++       */
++      u64 flags;
++
++      /**
++       * @timeout_ms: The maximum amount of time, in milliseconds, that a
++       * runnable task should be able to wait before being scheduled. The
++       * maximum timeout may not exceed the default timeout of 30 seconds.
++       *
++       * Defaults to the maximum allowed timeout value of 30 seconds.
++       */
++      u32 timeout_ms;
++
++      /**
++       * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
++       * value of 32768 is used.
++       */
++      u32 exit_dump_len;
++
++      /**
++       * @hotplug_seq: A sequence number that may be set by the scheduler to
++       * detect when a hotplug event has occurred during the loading process.
++       * If 0, no detection occurs. Otherwise, the scheduler will fail to
++       * load if the sequence number does not match @scx_hotplug_seq on the
++       * enable path.
++       */
++      u64 hotplug_seq;
++
++      /**
++       * @name: BPF scheduler's name
++       *
++       * Must be a non-zero valid BPF object name including only isalnum(),
++       * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
++       * BPF scheduler is enabled.
++       */
++      char name[SCX_OPS_NAME_LEN];
++
++      /* internal use only, must be NULL */
++      void *priv;
++};
++
++enum scx_opi {
++      SCX_OPI_BEGIN                   = 0,
++      SCX_OPI_NORMAL_BEGIN            = 0,
++      SCX_OPI_NORMAL_END              = SCX_OP_IDX(cpu_online),
++      SCX_OPI_CPU_HOTPLUG_BEGIN       = SCX_OP_IDX(cpu_online),
++      SCX_OPI_CPU_HOTPLUG_END         = SCX_OP_IDX(init),
++      SCX_OPI_END                     = SCX_OP_IDX(init),
++};
++
++/*
++ * Collection of event counters. Event types are placed in descending order.
++ */
++struct scx_event_stats {
++      /*
++       * If ops.select_cpu() returns a CPU which can't be used by the task,
++       * the core scheduler code silently picks a fallback CPU.
++       */
++      s64             SCX_EV_SELECT_CPU_FALLBACK;
++
++      /*
++       * When dispatching to a local DSQ, the CPU may have gone offline in
++       * the meantime. In this case, the task is bounced to the global DSQ.
++       */
++      s64             SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
++
++      /*
++       * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
++       * continued to run because there were no other tasks on the CPU.
++       */
++      s64             SCX_EV_DISPATCH_KEEP_LAST;
++
++      /*
++       * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
++       * is dispatched to a local DSQ when exiting.
++       */
++      s64             SCX_EV_ENQ_SKIP_EXITING;
++
++      /*
++       * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
++       * migration disabled task skips ops.enqueue() and is dispatched to its
++       * local DSQ.
++       */
++      s64             SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
++
++      /*
++       * Total number of times a task's time slice was refilled with the
++       * default value (SCX_SLICE_DFL).
++       */
++      s64             SCX_EV_REFILL_SLICE_DFL;
++
++      /*
++       * The total duration of bypass modes in nanoseconds.
++       */
++      s64             SCX_EV_BYPASS_DURATION;
++
++      /*
++       * The number of tasks dispatched in the bypassing mode.
++       */
++      s64             SCX_EV_BYPASS_DISPATCH;
++
++      /*
++       * The number of times the bypassing mode has been activated.
++       */
++      s64             SCX_EV_BYPASS_ACTIVATE;
++};
++
++struct scx_sched {
++      struct sched_ext_ops    ops;
++      DECLARE_BITMAP(has_op, SCX_OPI_END);
++
++      /*
++       * Dispatch queues.
++       *
++       * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
++       * This is to avoid live-locking in bypass mode where all tasks are
++       * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
++       * per-node split isn't sufficient, it can be further split.
++       */
++      struct rhashtable       dsq_hash;
++      struct scx_dispatch_q   **global_dsqs;
++
++      /*
++       * The event counters are in a per-CPU variable to minimize the
++       * accounting overhead. A system-wide view on the event counter is
++       * constructed when requested by scx_bpf_events().
++       */
++      struct scx_event_stats __percpu *event_stats_cpu;
++
++      bool                    warned_zero_slice;
++
++      atomic_t                exit_kind;
++      struct scx_exit_info    *exit_info;
++
++      struct kobject          kobj;
++
++      struct kthread_worker   *helper;
++      struct irq_work         error_irq_work;
++      struct kthread_work     disable_work;
++      struct rcu_work         rcu_work;
++};
++
++enum scx_wake_flags {
++      /* expose select WF_* flags as enums */
++      SCX_WAKE_FORK           = WF_FORK,
++      SCX_WAKE_TTWU           = WF_TTWU,
++      SCX_WAKE_SYNC           = WF_SYNC,
++};
++
++enum scx_enq_flags {
++      /* expose select ENQUEUE_* flags as enums */
++      SCX_ENQ_WAKEUP          = ENQUEUE_WAKEUP,
++      SCX_ENQ_HEAD            = ENQUEUE_HEAD,
++      SCX_ENQ_CPU_SELECTED    = ENQUEUE_RQ_SELECTED,
++
++      /* high 32bits are SCX specific */
++
++      /*
++       * Set the following to trigger preemption when calling
++       * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
++       * current task is cleared to zero and the CPU is kicked into the
++       * scheduling path. Implies %SCX_ENQ_HEAD.
++       */
++      SCX_ENQ_PREEMPT         = 1LLU << 32,
++
++      /*
++       * The task being enqueued was previously enqueued on the current CPU's
++       * %SCX_DSQ_LOCAL, but was removed from it in a call to the
++       * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
++       * invoked in a ->cpu_release() callback, and the task is again
++       * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the
++       * task will not be scheduled on the CPU until at least the next invocation
++       * of the ->cpu_acquire() callback.
++       */
++      SCX_ENQ_REENQ           = 1LLU << 40,
++
++      /*
++       * The task being enqueued is the only task available for the CPU. By
++       * default, ext core keeps executing such tasks but when
++       * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
++       * %SCX_ENQ_LAST flag set.
++       *
++       * The BPF scheduler is responsible for triggering a follow-up
++       * scheduling event. Otherwise, execution may stall.
++       */
++      SCX_ENQ_LAST            = 1LLU << 41,
++
++      /* high 8 bits are internal */
++      __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
++
++      SCX_ENQ_CLEAR_OPSS      = 1LLU << 56,
++      SCX_ENQ_DSQ_PRIQ        = 1LLU << 57,
++};
++
++enum scx_deq_flags {
++      /* expose select DEQUEUE_* flags as enums */
++      SCX_DEQ_SLEEP           = DEQUEUE_SLEEP,
++
++      /* high 32bits are SCX specific */
++
++      /*
++       * The generic core-sched layer decided to execute the task even though
++       * it hasn't been dispatched yet. Dequeue from the BPF side.
++       */
++      SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
++};
++
++enum scx_pick_idle_cpu_flags {
++      SCX_PICK_IDLE_CORE      = 1LLU << 0,    /* pick a CPU whose SMT siblings are also idle */
++      SCX_PICK_IDLE_IN_NODE   = 1LLU << 1,    /* pick a CPU in the same target NUMA node */
++};
++
++enum scx_kick_flags {
++      /*
++       * Kick the target CPU if idle. Guarantees that the target CPU goes
++       * through at least one full scheduling cycle before going idle. If the
++       * target CPU can be determined to be currently not idle and going to go
++       * through a scheduling cycle before going idle, noop.
++       */
++      SCX_KICK_IDLE           = 1LLU << 0,
++
++      /*
++       * Preempt the current task and execute the dispatch path. If the
++       * current task of the target CPU is an SCX task, its ->scx.slice is
++       * cleared to zero before the scheduling path is invoked so that the
++       * task expires and the dispatch path is invoked.
++       */
++      SCX_KICK_PREEMPT        = 1LLU << 1,
++
++      /*
++       * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
++       * return after the target CPU finishes picking the next task.
++       */
++      SCX_KICK_WAIT           = 1LLU << 2,
++};
++
++enum scx_tg_flags {
++      SCX_TG_ONLINE           = 1U << 0,
++      SCX_TG_INITED           = 1U << 1,
++};
++
++enum scx_enable_state {
++      SCX_ENABLING,
++      SCX_ENABLED,
++      SCX_DISABLING,
++      SCX_DISABLED,
++};
++
++static const char *scx_enable_state_str[] = {
++      [SCX_ENABLING]          = "enabling",
++      [SCX_ENABLED]           = "enabled",
++      [SCX_DISABLING]         = "disabling",
++      [SCX_DISABLED]          = "disabled",
++};
++
++/*
++ * sched_ext_entity->ops_state
++ *
++ * Used to track the task ownership between the SCX core and the BPF scheduler.
++ * State transitions look as follows:
++ *
++ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
++ *   ^              |                 |
++ *   |              v                 v
++ *   \-------------------------------/
++ *
++ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
++ * sites for explanations on the conditions being waited upon and why they are
++ * safe. Transitions out of them into NONE or QUEUED must store_release and the
++ * waiters should load_acquire.
++ *
++ * Tracking scx_ops_state enables sched_ext core to reliably determine whether
++ * any given task can be dispatched by the BPF scheduler at all times and thus
++ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
++ * to try to dispatch any task anytime regardless of its state as the SCX core
++ * can safely reject invalid dispatches.
++ */
++enum scx_ops_state {
++      SCX_OPSS_NONE,          /* owned by the SCX core */
++      SCX_OPSS_QUEUEING,      /* in transit to the BPF scheduler */
++      SCX_OPSS_QUEUED,        /* owned by the BPF scheduler */
++      SCX_OPSS_DISPATCHING,   /* in transit back to the SCX core */
++
++      /*
++       * QSEQ brands each QUEUED instance so that, when dispatch races
++       * dequeue/requeue, the dispatcher can tell whether it still has a claim
++       * on the task being dispatched.
++       *
++       * As some 32bit archs can't do 64bit store_release/load_acquire,
++       * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
++       * 32bit machines. The dispatch race window QSEQ protects is very narrow
++       * and runs with IRQ disabled. 30 bits should be sufficient.
++       */
++      SCX_OPSS_QSEQ_SHIFT     = 2,
++};
++
++/* Use macros to ensure that the type is unsigned long for the masks */
++#define SCX_OPSS_STATE_MASK   ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
++#define SCX_OPSS_QSEQ_MASK    (~SCX_OPSS_STATE_MASK)
++
++DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
++
++/*
++ * Return the rq currently locked from an scx callback, or NULL if no rq is
++ * locked.
++ */
++static inline struct rq *scx_locked_rq(void)
++{
++      return __this_cpu_read(scx_locked_rq_state);
++}
++
++static inline bool scx_kf_allowed_if_unlocked(void)
++{
++      return !current->scx.kf_mask;
++}
++
++static inline bool scx_rq_bypassing(struct rq *rq)
++{
++      return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
++}
+-- 
+2.51.0
+
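The ops_state layout documented above (a 2-bit state in the low bits, with the
remaining bits acting as a QSEQ brand) boils down to mask-and-shift arithmetic.
The following stand-alone C sketch models only that packing, assuming a shift
of 2 as in the header; the helper names (opss_pack() and friends) are invented
for illustration and this is not kernel code:

#include <stdio.h>

#define QSEQ_SHIFT  2UL                        /* mirrors SCX_OPSS_QSEQ_SHIFT */
#define STATE_MASK  ((1UL << QSEQ_SHIFT) - 1)  /* low bits hold the state     */
#define QSEQ_MASK   (~STATE_MASK)              /* remaining bits hold the brand */

/* Pack a state and a queue sequence number into one word. */
static unsigned long opss_pack(unsigned long state, unsigned long qseq)
{
        return (qseq << QSEQ_SHIFT) | (state & STATE_MASK);
}

static unsigned long opss_state(unsigned long v) { return v & STATE_MASK; }
static unsigned long opss_qseq(unsigned long v)  { return (v & QSEQ_MASK) >> QSEQ_SHIFT; }

int main(void)
{
        unsigned long v = opss_pack(2 /* QUEUED */, 12345);

        /* A dispatcher would compare the stored qseq against its own copy to
         * detect a racing dequeue/requeue that re-branded the task. */
        printf("state=%lu qseq=%lu\n", opss_state(v), opss_qseq(v));
        return 0;
}

In the kernel the value lives in an atomic_long_t and state transitions use
store_release/load_acquire pairs, which this sketch deliberately leaves out.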
diff --git a/queue-6.17/sched_ext-put-event_stats_cpu-in-struct-scx_sched_pc.patch b/queue-6.17/sched_ext-put-event_stats_cpu-in-struct-scx_sched_pc.patch
new file mode 100644 (file)
index 0000000..a9b69c8
--- /dev/null
@@ -0,0 +1,128 @@
+From d30e5472caf956fd0d6267d20b2c9f45871ae70a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Sep 2025 11:33:28 -1000
+Subject: sched_ext: Put event_stats_cpu in struct scx_sched_pcpu
+
+From: Tejun Heo <tj@kernel.org>
+
+[ Upstream commit bcb7c2305682c77a8bfdbfe37106b314ac10110f ]
+
+scx_sched.event_stats_cpu holds the percpu counters that are used to track
+stats. Introduce struct scx_sched_pcpu and move the counters inside. This
+will ease adding more per-cpu fields. No functional changes.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Acked-by: Andrea Righi <arighi@nvidia.com>
+Stable-dep-of: efeeaac9ae97 ("sched_ext: Sync error_irq_work before freeing scx_sched")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/ext.c          | 18 +++++++++---------
+ kernel/sched/ext_internal.h | 17 ++++++++++-------
+ 2 files changed, 19 insertions(+), 16 deletions(-)
+
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index 8ecde1abb4e28..46029050b170f 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -630,7 +630,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
+  * This can be used when preemption is not disabled.
+  */
+ #define scx_add_event(sch, name, cnt) do {                                    \
+-      this_cpu_add((sch)->event_stats_cpu->name, (cnt));                      \
++      this_cpu_add((sch)->pcpu->event_stats.name, (cnt));                     \
+       trace_sched_ext_event(#name, (cnt));                                    \
+ } while(0)
+@@ -643,7 +643,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
+  * This should be used only when preemption is disabled.
+  */
+ #define __scx_add_event(sch, name, cnt) do {                                  \
+-      __this_cpu_add((sch)->event_stats_cpu->name, (cnt));                    \
++      __this_cpu_add((sch)->pcpu->event_stats.name, (cnt));                   \
+       trace_sched_ext_event(#name, cnt);                                      \
+ } while(0)
+@@ -3538,7 +3538,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
+       int node;
+       kthread_stop(sch->helper->task);
+-      free_percpu(sch->event_stats_cpu);
++      free_percpu(sch->pcpu);
+       for_each_node_state(node, N_POSSIBLE)
+               kfree(sch->global_dsqs[node]);
+@@ -4439,13 +4439,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
+               sch->global_dsqs[node] = dsq;
+       }
+-      sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
+-      if (!sch->event_stats_cpu)
++      sch->pcpu = alloc_percpu(struct scx_sched_pcpu);
++      if (!sch->pcpu)
+               goto err_free_gdsqs;
+       sch->helper = kthread_run_worker(0, "sched_ext_helper");
+       if (!sch->helper)
+-              goto err_free_event_stats;
++              goto err_free_pcpu;
+       sched_set_fifo(sch->helper->task);
+       atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+@@ -4463,8 +4463,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
+ err_stop_helper:
+       kthread_stop(sch->helper->task);
+-err_free_event_stats:
+-      free_percpu(sch->event_stats_cpu);
++err_free_pcpu:
++      free_percpu(sch->pcpu);
+ err_free_gdsqs:
+       for_each_node_state(node, N_POSSIBLE)
+               kfree(sch->global_dsqs[node]);
+@@ -6490,7 +6490,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event
+       /* Aggregate per-CPU event counters into @events. */
+       memset(events, 0, sizeof(*events));
+       for_each_possible_cpu(cpu) {
+-              e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
++              e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
+               scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+               scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+               scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
+index 76690ede8700f..af4c054fb6f85 100644
+--- a/kernel/sched/ext_internal.h
++++ b/kernel/sched/ext_internal.h
+@@ -846,6 +846,15 @@ struct scx_event_stats {
+       s64             SCX_EV_BYPASS_ACTIVATE;
+ };
++struct scx_sched_pcpu {
++      /*
++       * The event counters are in a per-CPU variable to minimize the
++       * accounting overhead. A system-wide view on the event counter is
++       * constructed when requested by scx_bpf_events().
++       */
++      struct scx_event_stats  event_stats;
++};
++
+ struct scx_sched {
+       struct sched_ext_ops    ops;
+       DECLARE_BITMAP(has_op, SCX_OPI_END);
+@@ -860,13 +869,7 @@ struct scx_sched {
+        */
+       struct rhashtable       dsq_hash;
+       struct scx_dispatch_q   **global_dsqs;
+-
+-      /*
+-       * The event counters are in a per-CPU variable to minimize the
+-       * accounting overhead. A system-wide view on the event counter is
+-       * constructed when requested by scx_bpf_events().
+-       */
+-      struct scx_event_stats __percpu *event_stats_cpu;
++      struct scx_sched_pcpu __percpu *pcpu;
+       bool                    warned_zero_slice;
+-- 
+2.51.0
+
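The refactoring above only wraps the per-CPU event counters in a container
struct so that more per-CPU fields can be added later without churning every
accessor. A rough stand-alone C analogue of the per-CPU-struct plus
aggregate-on-demand pattern, with invented names and a plain array standing in
for the kernel's percpu allocator:

#include <stdio.h>

#define NR_CPUS 4   /* stand-in for the set of possible CPUs */

struct event_stats { long select_cpu_fallback; long dispatch_keep_last; };
struct sched_pcpu  { struct event_stats event_stats; /* room for more fields */ };

static struct sched_pcpu pcpu[NR_CPUS];   /* models alloc_percpu() storage */

/* Aggregate all per-CPU counters into one system-wide view. */
static void read_events(struct event_stats *sum)
{
        *sum = (struct event_stats){ 0 };
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                sum->select_cpu_fallback += pcpu[cpu].event_stats.select_cpu_fallback;
                sum->dispatch_keep_last  += pcpu[cpu].event_stats.dispatch_keep_last;
        }
}

int main(void)
{
        pcpu[1].event_stats.select_cpu_fallback = 3;
        pcpu[2].event_stats.dispatch_keep_last  = 7;

        struct event_stats sum;
        read_events(&sum);
        printf("fallback=%ld keep_last=%ld\n",
               sum.select_cpu_fallback, sum.dispatch_keep_last);
        return 0;
}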
diff --git a/queue-6.17/sched_ext-sync-error_irq_work-before-freeing-scx_sch.patch b/queue-6.17/sched_ext-sync-error_irq_work-before-freeing-scx_sch.patch
new file mode 100644 (file)
index 0000000..d0013da
--- /dev/null
@@ -0,0 +1,38 @@
+From 03175244f8b0c9d1e7bae5998f3bcfb7f77c59c7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 9 Oct 2025 13:56:23 -1000
+Subject: sched_ext: Sync error_irq_work before freeing scx_sched
+
+From: Tejun Heo <tj@kernel.org>
+
+[ Upstream commit efeeaac9ae9763f9c953e69633c86bc3031e39b5 ]
+
+By the time scx_sched_free_rcu_work() runs, the scx_sched is no longer
+reachable. However, a previously queued error_irq_work may still be pending or
+running. Ensure it completes before proceeding with teardown.
+
+Fixes: bff3b5aec1b7 ("sched_ext: Move disable machinery into scx_sched")
+Acked-by: Andrea Righi <arighi@nvidia.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/ext.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index 46029050b170f..f89894476e51f 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -3537,7 +3537,9 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
+       struct scx_dispatch_q *dsq;
+       int node;
++      irq_work_sync(&sch->error_irq_work);
+       kthread_stop(sch->helper->task);
++
+       free_percpu(sch->pcpu);
+       for_each_node_state(node, N_POSSIBLE)
+-- 
+2.51.0
+
diff --git a/queue-6.17/seccomp-passthrough-uprobe-systemcall-without-filter.patch b/queue-6.17/seccomp-passthrough-uprobe-systemcall-without-filter.patch
new file mode 100644 (file)
index 0000000..7c7a311
--- /dev/null
@@ -0,0 +1,85 @@
+From fe915f3331ace294cf2bb31d41fdcb2842b01530 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 20 Jul 2025 13:21:30 +0200
+Subject: seccomp: passthrough uprobe systemcall without filtering
+
+From: Jiri Olsa <jolsa@kernel.org>
+
+[ Upstream commit 89d1d8434d246c96309a6068dfcf9e36dc61227b ]
+
+Adding uprobe as another exception to the seccomp filter, alongside
+the uretprobe syscall.
+
+Like the uretprobe, the uprobe syscall is installed by the kernel as a
+replacement for the breakpoint exception; it is limited to the x86_64
+arch and isn't expected to ever be supported on i386.
+
+Signed-off-by: Jiri Olsa <jolsa@kernel.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Kees Cook <kees@kernel.org>
+Link: https://lore.kernel.org/r/20250720112133.244369-21-jolsa@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/seccomp.c | 32 +++++++++++++++++++++++++-------
+ 1 file changed, 25 insertions(+), 7 deletions(-)
+
+diff --git a/kernel/seccomp.c b/kernel/seccomp.c
+index 3bbfba30a777a..25f62867a16d9 100644
+--- a/kernel/seccomp.c
++++ b/kernel/seccomp.c
+@@ -741,6 +741,26 @@ seccomp_prepare_user_filter(const char __user *user_filter)
+ }
+ #ifdef SECCOMP_ARCH_NATIVE
++static bool seccomp_uprobe_exception(struct seccomp_data *sd)
++{
++#if defined __NR_uretprobe || defined __NR_uprobe
++#ifdef SECCOMP_ARCH_COMPAT
++      if (sd->arch == SECCOMP_ARCH_NATIVE)
++#endif
++      {
++#ifdef __NR_uretprobe
++              if (sd->nr == __NR_uretprobe)
++                      return true;
++#endif
++#ifdef __NR_uprobe
++              if (sd->nr == __NR_uprobe)
++                      return true;
++#endif
++      }
++#endif
++      return false;
++}
++
+ /**
+  * seccomp_is_const_allow - check if filter is constant allow with given data
+  * @fprog: The BPF programs
+@@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
+               return false;
+       /* Our single exception to filtering. */
+-#ifdef __NR_uretprobe
+-#ifdef SECCOMP_ARCH_COMPAT
+-      if (sd->arch == SECCOMP_ARCH_NATIVE)
+-#endif
+-              if (sd->nr == __NR_uretprobe)
+-                      return true;
+-#endif
++      if (seccomp_uprobe_exception(sd))
++              return true;
+       for (pc = 0; pc < fprog->len; pc++) {
+               struct sock_filter *insn = &fprog->filter[pc];
+@@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = {
+       __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
+ #ifdef __NR_uretprobe
+       __NR_uretprobe,
++#endif
++#ifdef __NR_uprobe
++      __NR_uprobe,
+ #endif
+       -1, /* negative terminated */
+ };
+-- 
+2.51.0
+
diff --git a/queue-6.17/series b/queue-6.17/series
new file mode 100644 (file)
index 0000000..fc76b3a
--- /dev/null
@@ -0,0 +1,35 @@
+sched_ext-move-internal-type-and-accessor-definition.patch
+sched_ext-put-event_stats_cpu-in-struct-scx_sched_pc.patch
+sched_ext-sync-error_irq_work-before-freeing-scx_sch.patch
+timekeeping-fix-aux-clocks-sysfs-initialization-loop.patch
+x86-bugs-report-correct-retbleed-mitigation-status.patch
+x86-bugs-qualify-retbleed_intel_msg.patch
+genirq-chip-add-buslock-back-in-to-irq_set_handler.patch
+genirq-manage-add-buslock-back-in-to-__disable_irq_n.patch
+genirq-manage-add-buslock-back-in-to-enable_irq.patch
+audit-record-fanotify-event-regardless-of-presence-o.patch
+edac-ie31200-add-two-more-intel-alder-lake-s-socs-fo.patch
+perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch
+perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch
+perf-have-get_perf_callchain-return-null-if-crosstas.patch
+perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch
+edac-fix-wrong-executable-file-modes-for-c-source-fi.patch
+seccomp-passthrough-uprobe-systemcall-without-filter.patch
+sched_ext-keep-bypass-on-between-enable-failure-and-.patch
+x86-bugs-add-attack-vector-controls-for-vmscape.patch
+sched-fair-update_cfs_group-for-throttled-cfs_rqs.patch
+x86-bugs-fix-reporting-of-lfence-retpoline.patch
+edac-mc_sysfs-increase-legacy-channel-support-to-16.patch
+cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch
+btrfs-abort-transaction-on-specific-error-places-whe.patch
+btrfs-abort-transaction-in-the-process_one_buffer-lo.patch
+btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch
+btrfs-zoned-refine-extent-allocator-hint-selection.patch
+btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch
+btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
+btrfs-use-level-argument-in-log-tree-walk-callback-r.patch
+btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch
+btrfs-tree-checker-add-inode-extref-checks.patch
+btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
+sched_ext-make-qmap-dump-operation-non-destructive.patch
+arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch
diff --git a/queue-6.17/timekeeping-fix-aux-clocks-sysfs-initialization-loop.patch b/queue-6.17/timekeeping-fix-aux-clocks-sysfs-initialization-loop.patch
new file mode 100644 (file)
index 0000000..6f0931f
--- /dev/null
@@ -0,0 +1,45 @@
+From 03823cc4dcccf525a9b20bef586082b1dcc89adf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Oct 2025 14:17:53 +0800
+Subject: timekeeping: Fix aux clocks sysfs initialization loop bound
+
+From: Haofeng Li <lihaofeng@kylinos.cn>
+
+[ Upstream commit 39a9ed0fb6dac58547afdf9b6cb032d326a3698f ]
+
+The loop in tk_aux_sysfs_init() uses `i <= MAX_AUX_CLOCKS` as the
+termination condition, which results in 9 iterations (i=0 to 8) when
+MAX_AUX_CLOCKS is defined as 8. However, the kernel is designed to support
+only up to 8 auxiliary clocks.
+
+This off-by-one error causes the creation of a 9th sysfs entry that exceeds
+the intended auxiliary clock range.
+
+Fix the loop bound to use `i < MAX_AUX_CLOCKS` to ensure exactly 8
+auxiliary clock entries are created, matching the design specification.
+
+Fixes: 7b95663a3d96 ("timekeeping: Provide interface to control auxiliary clocks")
+Signed-off-by: Haofeng Li <lihaofeng@kylinos.cn>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://patch.msgid.link/tencent_2376993D9FC06A3616A4F981B3DE1C599607@qq.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/time/timekeeping.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
+index b6974fce800cd..3a4d3b2e3f740 100644
+--- a/kernel/time/timekeeping.c
++++ b/kernel/time/timekeeping.c
+@@ -3070,7 +3070,7 @@ static int __init tk_aux_sysfs_init(void)
+               return -ENOMEM;
+       }
+-      for (int i = 0; i <= MAX_AUX_CLOCKS; i++) {
++      for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
+               char id[2] = { [0] = '0' + i, };
+               struct kobject *clk = kobject_create_and_add(id, auxo);
+-- 
+2.51.0
+
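To make the off-by-one concrete: with MAX_AUX_CLOCKS assumed to be 8, the old
`i <= MAX_AUX_CLOCKS` bound produces nine ids ('0' through '8'), while the
fixed `i < MAX_AUX_CLOCKS` bound produces the intended eight. A trivial
stand-alone illustration (the constant is hard-coded here, not taken from
kernel headers):

#include <stdio.h>

#define MAX_AUX_CLOCKS 8   /* assumed value, matching the commit message */

int main(void)
{
        /* Fixed bound: creates exactly MAX_AUX_CLOCKS entries, "0".."7". */
        for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
                char id[2] = { [0] = '0' + i };   /* second byte is the NUL */
                printf("aux clock sysfs entry: %s\n", id);
        }
        return 0;
}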
diff --git a/queue-6.17/x86-bugs-add-attack-vector-controls-for-vmscape.patch b/queue-6.17/x86-bugs-add-attack-vector-controls-for-vmscape.patch
new file mode 100644 (file)
index 0000000..f2dbade
--- /dev/null
@@ -0,0 +1,72 @@
+From 60bd79a607d557eed0d51b5455016a10ea60aafc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 12 Sep 2025 10:24:28 -0500
+Subject: x86/bugs: Add attack vector controls for VMSCAPE
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit 5799d5d8a6c877f03ad5b5a640977053be45059a ]
+
+Use attack vector controls to select whether VMSCAPE requires mitigation,
+similar to other bugs.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../admin-guide/hw-vuln/attack_vector_controls.rst |  1 +
+ arch/x86/kernel/cpu/bugs.c                         | 14 ++++++++++----
+ 2 files changed, 11 insertions(+), 4 deletions(-)
+
+diff --git a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
+index 5964901d66e31..d0bdbd81dcf9f 100644
+--- a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
++++ b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
+@@ -218,6 +218,7 @@ SRSO                  X              X            X              X
+ SSB                                  X
+ TAA                   X              X            X              X            *       (Note 2)
+ TSA                   X              X            X              X
++VMSCAPE                                           X
+ =============== ============== ============ ============= ============== ============ ========
+ Notes:
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index 9750ce448e626..c6bb8e76eb984 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -434,6 +434,9 @@ static bool __init should_mitigate_vuln(unsigned int bug)
+       case X86_BUG_SPEC_STORE_BYPASS:
+               return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER);
++      case X86_BUG_VMSCAPE:
++              return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
++
+       default:
+               WARN(1, "Unknown bug %x\n", bug);
+               return false;
+@@ -3308,15 +3311,18 @@ early_param("vmscape", vmscape_parse_cmdline);
+ static void __init vmscape_select_mitigation(void)
+ {
+-      if (cpu_mitigations_off() ||
+-          !boot_cpu_has_bug(X86_BUG_VMSCAPE) ||
++      if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) ||
+           !boot_cpu_has(X86_FEATURE_IBPB)) {
+               vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+               return;
+       }
+-      if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO)
+-              vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
++      if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) {
++              if (should_mitigate_vuln(X86_BUG_VMSCAPE))
++                      vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
++              else
++                      vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
++      }
+ }
+ static void __init vmscape_update_mitigation(void)
+-- 
+2.51.0
+
diff --git a/queue-6.17/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-6.17/x86-bugs-fix-reporting-of-lfence-retpoline.patch
new file mode 100644 (file)
index 0000000..3176540
--- /dev/null
@@ -0,0 +1,51 @@
+From 705fc41b44d203b1500a524f3fb04ba1c63cd931 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Sep 2025 08:47:05 -0500
+Subject: x86/bugs: Fix reporting of LFENCE retpoline
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ]
+
+The LFENCE retpoline mitigation is not secure but the kernel prints
+inconsistent messages about this fact.  The dmesg log says 'Mitigation:
+LFENCE', implying the system is mitigated.  But sysfs reports 'Vulnerable:
+LFENCE' implying the system (correctly) is not mitigated.
+
+Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere
+when this mitigation is selected.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index c6bb8e76eb984..26ece97011fd7 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -2052,7 +2052,7 @@ static void __init spectre_v2_user_apply_mitigation(void)
+ static const char * const spectre_v2_strings[] = {
+       [SPECTRE_V2_NONE]                       = "Vulnerable",
+       [SPECTRE_V2_RETPOLINE]                  = "Mitigation: Retpolines",
+-      [SPECTRE_V2_LFENCE]                     = "Mitigation: LFENCE",
++      [SPECTRE_V2_LFENCE]                     = "Vulnerable: LFENCE",
+       [SPECTRE_V2_EIBRS]                      = "Mitigation: Enhanced / Automatic IBRS",
+       [SPECTRE_V2_EIBRS_LFENCE]               = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+       [SPECTRE_V2_EIBRS_RETPOLINE]            = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
+@@ -3636,9 +3636,6 @@ static const char *spectre_bhi_state(void)
+ static ssize_t spectre_v2_show_state(char *buf)
+ {
+-      if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+-              return sysfs_emit(buf, "Vulnerable: LFENCE\n");
+-
+       if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+               return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+-- 
+2.51.0
+
diff --git a/queue-6.17/x86-bugs-qualify-retbleed_intel_msg.patch b/queue-6.17/x86-bugs-qualify-retbleed_intel_msg.patch
new file mode 100644 (file)
index 0000000..40946f7
--- /dev/null
@@ -0,0 +1,47 @@
+From 9965d529966df68e304d4db15a0da58fce023b71 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 3 Oct 2025 12:19:36 -0500
+Subject: x86/bugs: Qualify RETBLEED_INTEL_MSG
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit 204ced4108f5d38f6804968fd9543cc69c3f8da6 ]
+
+When retbleed mitigation is disabled, the kernel already prints an info
+message that the system is vulnerable.  Recent code restructuring also
+inadvertently led to RETBLEED_INTEL_MSG being printed as an error, which is
+unnecessary as retbleed mitigation was already explicitly disabled (by config
+option, cmdline, etc.).
+
+Qualify this print statement so the warning is not printed unless an actual
+retbleed mitigation was selected and is being disabled due to incompatibility
+with spectre_v2.
+
+Fixes: e3b78a7ad5ea ("x86/bugs: Restructure retbleed mitigation")
+Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220624
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://patch.msgid.link/20251003171936.155391-1-david.kaplan@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index bf79ff6a1f662..9750ce448e626 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1461,7 +1461,9 @@ static void __init retbleed_update_mitigation(void)
+                       break;
+               default:
+                       if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) {
+-                              pr_err(RETBLEED_INTEL_MSG);
++                              if (retbleed_mitigation != RETBLEED_MITIGATION_NONE)
++                                      pr_err(RETBLEED_INTEL_MSG);
++
+                               retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+                       }
+               }
+-- 
+2.51.0
+
diff --git a/queue-6.17/x86-bugs-report-correct-retbleed-mitigation-status.patch b/queue-6.17/x86-bugs-report-correct-retbleed-mitigation-status.patch
new file mode 100644 (file)
index 0000000..c7daac2
--- /dev/null
@@ -0,0 +1,47 @@
+From 029a4346ea7f82d5882b314eca129b1591db28b4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Sep 2025 08:47:06 -0500
+Subject: x86/bugs: Report correct retbleed mitigation status
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit 930f2361fe542a00de9ce6070b1b6edb976f1165 ]
+
+On Intel CPUs, the default retbleed mitigation is IBRS/eIBRS but this
+requires that a similar spectre_v2 mitigation is applied.  If the user
+selects a different spectre_v2 mitigation (like spectre_v2=retpoline) a
+warning is printed but sysfs will still report 'Mitigation: IBRS' or
+'Mitigation: Enhanced IBRS'.  This is incorrect because retbleed is not
+mitigated, and IBRS is not actually set.
+
+Fix this by choosing RETBLEED_MITIGATION_NONE in this scenario so the
+kernel correctly reports the system as vulnerable to retbleed.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com
+Stable-dep-of: 204ced4108f5 ("x86/bugs: Qualify RETBLEED_INTEL_MSG")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index 36dcfc5105be9..bf79ff6a1f662 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1460,8 +1460,10 @@ static void __init retbleed_update_mitigation(void)
+                       retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+                       break;
+               default:
+-                      if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
++                      if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) {
+                               pr_err(RETBLEED_INTEL_MSG);
++                              retbleed_mitigation = RETBLEED_MITIGATION_NONE;
++                      }
+               }
+       }
+-- 
+2.51.0
+
diff --git a/queue-6.6/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch b/queue-6.6/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch
new file mode 100644 (file)
index 0000000..b8d4f9e
--- /dev/null
@@ -0,0 +1,295 @@
+From 54968164c79970c4670228c2de8fd262e28c5c2e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Sep 2025 14:09:13 +0800
+Subject: arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c
+
+From: Menglong Dong <menglong8.dong@gmail.com>
+
+[ Upstream commit 35561bab768977c9e05f1f1a9bc00134c85f3e28 ]
+
+The include/generated/asm-offsets.h is generated in Kbuild during
+compiling from arch/SRCARCH/kernel/asm-offsets.c. When we want to
+generate another similar offset header file, circular dependency can
+happen.
+
+For example, we want to generate an offset file include/generated/test.h,
+which is included in include/sched/sched.h. If we generate asm-offsets.h
+first, it will fail, as include/sched/sched.h is included in asm-offsets.c
+and include/generated/test.h doesn't exist; if we generate test.h first,
+it can't succeed either, as include/generated/asm-offsets.h is included
+by it.
+
+In x86_64, the macro COMPILE_OFFSETS is used to avoid such a circular
+dependency. We can generate asm-offsets.h first, and if COMPILE_OFFSETS
+is defined, we don't include "generated/test.h".
+
+So we define the macro COMPILE_OFFSETS in all the asm-offsets.c files for
+this purpose.
+
+Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/alpha/kernel/asm-offsets.c      | 1 +
+ arch/arc/kernel/asm-offsets.c        | 1 +
+ arch/arm/kernel/asm-offsets.c        | 2 ++
+ arch/arm64/kernel/asm-offsets.c      | 1 +
+ arch/csky/kernel/asm-offsets.c       | 1 +
+ arch/hexagon/kernel/asm-offsets.c    | 1 +
+ arch/loongarch/kernel/asm-offsets.c  | 2 ++
+ arch/m68k/kernel/asm-offsets.c       | 1 +
+ arch/microblaze/kernel/asm-offsets.c | 1 +
+ arch/mips/kernel/asm-offsets.c       | 2 ++
+ arch/nios2/kernel/asm-offsets.c      | 1 +
+ arch/openrisc/kernel/asm-offsets.c   | 1 +
+ arch/parisc/kernel/asm-offsets.c     | 1 +
+ arch/powerpc/kernel/asm-offsets.c    | 1 +
+ arch/riscv/kernel/asm-offsets.c      | 1 +
+ arch/s390/kernel/asm-offsets.c       | 1 +
+ arch/sh/kernel/asm-offsets.c         | 1 +
+ arch/sparc/kernel/asm-offsets.c      | 1 +
+ arch/um/kernel/asm-offsets.c         | 2 ++
+ arch/xtensa/kernel/asm-offsets.c     | 1 +
+ 20 files changed, 24 insertions(+)
+
+diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c
+index 11c35cf45b461..cb205f22096d7 100644
+--- a/arch/alpha/kernel/asm-offsets.c
++++ b/arch/alpha/kernel/asm-offsets.c
+@@ -4,6 +4,7 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/types.h>
+ #include <linux/stddef.h>
+diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c
+index f77deb7991757..2978da85fcb65 100644
+--- a/arch/arc/kernel/asm-offsets.c
++++ b/arch/arc/kernel/asm-offsets.c
+@@ -2,6 +2,7 @@
+ /*
+  * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
+  */
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
+index 219cbc7e5d134..3840e1e22b751 100644
+--- a/arch/arm/kernel/asm-offsets.c
++++ b/arch/arm/kernel/asm-offsets.c
+@@ -7,6 +7,8 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/compiler.h>
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
+index 5ff1942b04fcf..ea2d740db81c5 100644
+--- a/arch/arm64/kernel/asm-offsets.c
++++ b/arch/arm64/kernel/asm-offsets.c
+@@ -6,6 +6,7 @@
+  *               2001-2002 Keith Owens
+  * Copyright (C) 2012 ARM Ltd.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/arm_sdei.h>
+ #include <linux/sched.h>
+diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c
+index d1e9035794733..5525c8e7e1d9e 100644
+--- a/arch/csky/kernel/asm-offsets.c
++++ b/arch/csky/kernel/asm-offsets.c
+@@ -1,5 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0
+ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/kernel_stat.h>
+diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c
+index 03a7063f94561..50eea9fa6f137 100644
+--- a/arch/hexagon/kernel/asm-offsets.c
++++ b/arch/hexagon/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  *
+  * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/compat.h>
+ #include <linux/types.h>
+diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c
+index 8da0726777edb..110afd3cc8f34 100644
+--- a/arch/loongarch/kernel/asm-offsets.c
++++ b/arch/loongarch/kernel/asm-offsets.c
+@@ -4,6 +4,8 @@
+  *
+  * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/types.h>
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c
+index 906d732305374..67a1990f9d748 100644
+--- a/arch/m68k/kernel/asm-offsets.c
++++ b/arch/m68k/kernel/asm-offsets.c
+@@ -9,6 +9,7 @@
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #define ASM_OFFSETS_C
+ #include <linux/stddef.h>
+diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c
+index 104c3ac5f30c8..b4b67d58e7f6a 100644
+--- a/arch/microblaze/kernel/asm-offsets.c
++++ b/arch/microblaze/kernel/asm-offsets.c
+@@ -7,6 +7,7 @@
+  * License. See the file "COPYING" in the main directory of this archive
+  * for more details.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/init.h>
+ #include <linux/stddef.h>
+diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
+index cb1045ebab062..22c99a2cd5707 100644
+--- a/arch/mips/kernel/asm-offsets.c
++++ b/arch/mips/kernel/asm-offsets.c
+@@ -9,6 +9,8 @@
+  * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com
+  * Copyright (C) 2000 MIPS Technologies, Inc.
+  */
++#define COMPILE_OFFSETS
++
+ #include <linux/compat.h>
+ #include <linux/types.h>
+ #include <linux/sched.h>
+diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c
+index e3d9b7b6fb48a..88190b503ce5d 100644
+--- a/arch/nios2/kernel/asm-offsets.c
++++ b/arch/nios2/kernel/asm-offsets.c
+@@ -2,6 +2,7 @@
+ /*
+  * Copyright (C) 2011 Tobias Klauser <tklauser@distanz.ch>
+  */
++#define COMPILE_OFFSETS
+ #include <linux/stddef.h>
+ #include <linux/sched.h>
+diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c
+index 710651d5aaae1..3cc826f2216b1 100644
+--- a/arch/openrisc/kernel/asm-offsets.c
++++ b/arch/openrisc/kernel/asm-offsets.c
+@@ -18,6 +18,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/signal.h>
+ #include <linux/sched.h>
+diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c
+index 757816a7bd4b2..9abfe65492c65 100644
+--- a/arch/parisc/kernel/asm-offsets.c
++++ b/arch/parisc/kernel/asm-offsets.c
+@@ -13,6 +13,7 @@
+  *    Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org>
+  *    Copyright (C) 2003 James Bottomley <jejb at parisc-linux.org>
+  */
++#define COMPILE_OFFSETS
+ #include <linux/types.h>
+ #include <linux/sched.h>
+diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
+index 2affd30468bc4..e2cee2f2ededd 100644
+--- a/arch/powerpc/kernel/asm-offsets.c
++++ b/arch/powerpc/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/compat.h>
+ #include <linux/signal.h>
+diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
+index 6a992cba2f287..e4589457e6085 100644
+--- a/arch/riscv/kernel/asm-offsets.c
++++ b/arch/riscv/kernel/asm-offsets.c
+@@ -3,6 +3,7 @@
+  * Copyright (C) 2012 Regents of the University of California
+  * Copyright (C) 2017 SiFive
+  */
++#define COMPILE_OFFSETS
+ #include <linux/kbuild.h>
+ #include <linux/mm.h>
+diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
+index fa5f6885c74aa..73a989dcfe208 100644
+--- a/arch/s390/kernel/asm-offsets.c
++++ b/arch/s390/kernel/asm-offsets.c
+@@ -4,6 +4,7 @@
+  * This code generates raw asm output which is post-processed to extract
+  * and format the required data.
+  */
++#define COMPILE_OFFSETS
+ #define ASM_OFFSETS_C
+diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c
+index a0322e8328456..429b6a7631468 100644
+--- a/arch/sh/kernel/asm-offsets.c
++++ b/arch/sh/kernel/asm-offsets.c
+@@ -8,6 +8,7 @@
+  * compile this file to assembler, and then extract the
+  * #defines from the assembly-language output.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/stddef.h>
+ #include <linux/types.h>
+diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c
+index 5784f2df489a4..f1e27a7f800f4 100644
+--- a/arch/sparc/kernel/asm-offsets.c
++++ b/arch/sparc/kernel/asm-offsets.c
+@@ -10,6 +10,7 @@
+  *
+  * On sparc, thread_info data is static and TI_XXX offsets are computed by hand.
+  */
++#define COMPILE_OFFSETS
+ #include <linux/sched.h>
+ #include <linux/mm_types.h>
+diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c
+index 1fb12235ab9c8..a69873aa697f4 100644
+--- a/arch/um/kernel/asm-offsets.c
++++ b/arch/um/kernel/asm-offsets.c
+@@ -1 +1,3 @@
++#define COMPILE_OFFSETS
++
+ #include <sysdep/kernel-offsets.h>
+diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c
+index da38de20ae598..cfbced95e944a 100644
+--- a/arch/xtensa/kernel/asm-offsets.c
++++ b/arch/xtensa/kernel/asm-offsets.c
+@@ -11,6 +11,7 @@
+  *
+  * Chris Zankel <chris@zankel.net>
+  */
++#define COMPILE_OFFSETS
+ #include <asm/processor.h>
+ #include <asm/coprocessor.h>
+-- 
+2.51.0
+
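The guard described above can be sketched in a single stand-alone file: the
offsets translation unit defines COMPILE_OFFSETS before any shared header is
pulled in, and the shared header only includes the second generated file when
the macro is absent. The include name below is the hypothetical
generated/test.h-style header from the commit message, not a real kernel path:

#include <stdio.h>

/* asm-offsets.c defines this before including shared headers, so the
 * not-yet-generated header below is skipped during the offsets build. */
#define COMPILE_OFFSETS

/* This part would normally live in the shared header. */
#ifndef COMPILE_OFFSETS
#include "generated_test.h"     /* hypothetical generated header */
#define HAVE_GENERATED 1
#else
#define HAVE_GENERATED 0        /* offsets build: skip the generated file */
#endif

int main(void)
{
        printf("generated header included: %d\n", HAVE_GENERATED);
        return 0;
}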
diff --git a/queue-6.6/audit-record-fanotify-event-regardless-of-presence-o.patch b/queue-6.6/audit-record-fanotify-event-regardless-of-presence-o.patch
new file mode 100644 (file)
index 0000000..5c81f75
--- /dev/null
@@ -0,0 +1,44 @@
+From 7a103238fdb26a551efc3eec1a75f8d386103d02 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Aug 2025 17:04:07 -0400
+Subject: audit: record fanotify event regardless of presence of rules
+
+From: Richard Guy Briggs <rgb@redhat.com>
+
+[ Upstream commit ce8370e2e62a903e18be7dd0e0be2eee079501e1 ]
+
+When no audit rules are in place, fanotify event results are
+unconditionally dropped due to an explicit check for the existence of
+any audit rules.  Given this is a report from another security
+sub-system, allow it to be recorded regardless of the existence of any
+audit rules.
+
+To test, install and run the fapolicyd daemon with default config.  Then
+as an unprivileged user, create and run a very simple binary that should
+be denied.  Then check for an event with
+       ausearch -m FANOTIFY -ts recent
+
+Link: https://issues.redhat.com/browse/RHEL-9065
+Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
+Signed-off-by: Paul Moore <paul@paul-moore.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/audit.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/linux/audit.h b/include/linux/audit.h
+index 335e1ba5a2327..7ca75f8873799 100644
+--- a/include/linux/audit.h
++++ b/include/linux/audit.h
+@@ -526,7 +526,7 @@ static inline void audit_log_kern_module(const char *name)
+ static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
+ {
+-      if (!audit_dummy_context())
++      if (audit_enabled)
+               __audit_fanotify(response, friar);
+ }
+-- 
+2.51.0
+
diff --git a/queue-6.6/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-6.6/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
new file mode 100644 (file)
index 0000000..0b54c20
--- /dev/null
@@ -0,0 +1,63 @@
+From 012af0d8a5f2d9c3d7e993a07113cefeca540801 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Aug 2025 12:10:28 +0100
+Subject: btrfs: always drop log root tree reference in btrfs_replay_log()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ]
+
+Currently we have this odd behaviour:
+
+1) At btrfs_replay_log() we drop the reference of the log root tree if
+   the call to btrfs_recover_log_trees() failed;
+
+2) But if the call to btrfs_recover_log_trees() did not fail, we don't
+   drop the reference in btrfs_replay_log() - we expect that
+   btrfs_recover_log_trees() does it in case it returns success.
+
+Let's simplify this and make btrfs_replay_log() always drop the reference
+on the log root tree. Not only does this simplify the code, it is also what
+makes sense, since it's btrfs_replay_log() that grabbed the reference in the
+first place.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/disk-io.c  | 2 +-
+ fs/btrfs/tree-log.c | 1 -
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index bb5f7911d473c..7ad1734cbbfc9 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2080,10 +2080,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+       /* returns with log_tree_root freed on success */
+       ret = btrfs_recover_log_trees(log_tree_root);
++      btrfs_put_root(log_tree_root);
+       if (ret) {
+               btrfs_handle_fs_error(fs_info, ret,
+                                     "Failed to recover log tree");
+-              btrfs_put_root(log_tree_root);
+               return ret;
+       }
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 4b53e19f7520f..e00298c6c30a1 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -7422,7 +7422,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+       log_root_tree->log_root = NULL;
+       clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
+-      btrfs_put_root(log_root_tree);
+       return 0;
+ error:
+-- 
+2.51.0
+
diff --git a/queue-6.6/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch b/queue-6.6/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch
new file mode 100644 (file)
index 0000000..ebdb01f
--- /dev/null
@@ -0,0 +1,44 @@
+From 33914610d5e0981512b297973a9438c04ce73add Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 1 Sep 2025 17:01:44 +0200
+Subject: btrfs: scrub: replace max_t()/min_t() with clamp() in
+ scrub_throttle_dev_io()
+
+From: Thorsten Blum <thorsten.blum@linux.dev>
+
+[ Upstream commit a7f3dfb8293c4cee99743132d69863a92e8f4875 ]
+
+Replace max_t() followed by min_t() with a single clamp().
+
+As was pointed out by David Laight in
+https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/
+the calculation may overflow u32 when the input value is too large, so
+clamp_t() is not used.  In practice the expected values are in the range of
+megabytes to gigabytes (throughput limit) so the bug would not happen.
+
+Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ Use clamp() and add explanation. ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/scrub.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
+index 7632d652a1257..4a5a5ee360e57 100644
+--- a/fs/btrfs/scrub.c
++++ b/fs/btrfs/scrub.c
+@@ -1271,8 +1271,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
+        * Slice is divided into intervals when the IO is submitted, adjust by
+        * bwlimit and maximum of 64 intervals.
+        */
+-      div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+-      div = min_t(u32, 64, div);
++      div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);
+       /* Start new epoch, set deadline */
+       now = ktime_get();
+-- 
+2.51.0
+
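For illustration, a small userspace sketch of why the quotient is clamped at full width instead of being truncated to u32 first (the CLAMP macro and variable names are stand-ins, not the kernel's clamp()):

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for the kernel's clamp(); evaluates its arguments more than once. */
    #define CLAMP(v, lo, hi) ((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))

    int main(void)
    {
        /* Hypothetical throughput limit in bytes per second. */
        uint64_t bwlimit = 256ULL * 1024 * 1024;

        /*
         * Clamp the full 64-bit quotient to [1, 64] and only then narrow
         * to 32 bits.  Truncating the quotient to u32 before clamping, as
         * clamp_t(u32, ...) would, could wrap for an absurdly large limit.
         */
        uint32_t div = (uint32_t)CLAMP(bwlimit / (16 * 1024 * 1024), 1ULL, 64ULL);

        printf("intervals per slice: %u\n", div);
        return 0;
    }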
diff --git a/queue-6.6/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch b/queue-6.6/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch
new file mode 100644 (file)
index 0000000..f56907d
--- /dev/null
@@ -0,0 +1,50 @@
+From 2a84dc26e9fa0d7a677021843e9d860d72f6a485 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 28 Aug 2025 17:46:18 +0100
+Subject: btrfs: use level argument in log tree walk callback
+ replay_one_buffer()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 6cb7f0b8c9b0d6a35682335fea88bd26f089306f ]
+
+We already have the extent buffer's level in an argument, so there's no need
+to first ensure the extent buffer's data is loaded (by calling
+btrfs_read_extent_buffer()) and then call btrfs_header_level() to check
+the level. So use the level argument and do the check before calling
+btrfs_read_extent_buffer().
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-log.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index e00298c6c30a1..5512991b24faa 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -2493,15 +2493,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+       int i;
+       int ret;
++      if (level != 0)
++              return 0;
++
+       ret = btrfs_read_extent_buffer(eb, &check);
+       if (ret)
+               return ret;
+-      level = btrfs_header_level(eb);
+-
+-      if (level != 0)
+-              return 0;
+-
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+-- 
+2.51.0
+
diff --git a/queue-6.6/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-6.6/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
new file mode 100644 (file)
index 0000000..4c76fca
--- /dev/null
@@ -0,0 +1,58 @@
+From 4e2e37c8c157fbd155fedbe333bfb6f4e13941f1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Sep 2025 12:09:14 +0100
+Subject: btrfs: use smp_mb__after_atomic() when forcing COW in
+ create_pending_snapshot()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ]
+
+After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a
+full write barrier, smp_wmb(), but we don't need to; all we need is an
+smp_mb__after_atomic().  The use of the smp_wmb() is from the old days
+when we didn't use a bit and used instead an int field in the root to
+signal if cow is forced. After the int field was changed to a bit in
+the root's state (flags field), we forgot to update the memory barrier
+in create_pending_snapshot() to smp_mb__after_atomic(), but we did the
+change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That
+happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer
+data type for the some variants in btrfs_root"). On the reader side, in
+should_cow_block(), we also use the counterpart smp_mb__before_atomic()
+which generates further confusion.
+
+So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't
+even need any barrier at all since create_pending_snapshot() is called
+in the critical section of a transaction commit and therefore no one
+can concurrently join/attach the transaction, or start a new one, until
+the transaction is unblocked. By the time someone starts a new transaction
+and enters should_cow_block(), a lot of implicit memory barriers have
+already taken place by acquiring several locks such as fs_info->trans_lock
+and extent buffer locks on the root node, at least. Nevertheless, for
+consistency use smp_mb__after_atomic() after setting the force cow bit
+in create_pending_snapshot().
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/transaction.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
+index 3989cb19cdae7..20add63421b3d 100644
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1796,7 +1796,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+       }
+       /* see comments in should_cow_block() */
+       set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+-      smp_wmb();
++      smp_mb__after_atomic();
+       btrfs_set_root_node(new_root_item, tmp);
+       /* record when the snapshot was created in key.offset */
+-- 
+2.51.0
+
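As a rough userspace analogue of the barrier pairing the commit keeps consistent (assumption: C11 release/acquire atomics standing in for set_bit() plus smp_mb__after_atomic() on the writer side and smp_mb__before_atomic() plus the bit test in should_cow_block(); all names below are made up):

    #include <stdatomic.h>
    #include <stdio.h>

    #define FORCE_COW_BIT 0x1u

    static atomic_uint root_state;
    static int new_root_item;   /* state published before the bit is set */

    static void writer(void)    /* analogue of create_pending_snapshot() */
    {
        new_root_item = 42;
        /* set the bit, ordered after the store above */
        atomic_fetch_or_explicit(&root_state, FORCE_COW_BIT,
                                 memory_order_release);
    }

    static void reader(void)    /* analogue of should_cow_block() */
    {
        /* the paired acquire makes the published state visible */
        if (atomic_load_explicit(&root_state, memory_order_acquire) &
            FORCE_COW_BIT)
            printf("force COW, new_root_item=%d\n", new_root_item);
    }

    int main(void)
    {
        writer();
        reader();
        return 0;
    }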
diff --git a/queue-6.6/btrfs-zoned-refine-extent-allocator-hint-selection.patch b/queue-6.6/btrfs-zoned-refine-extent-allocator-hint-selection.patch
new file mode 100644 (file)
index 0000000..ac945a1
--- /dev/null
@@ -0,0 +1,59 @@
+From ffa3e67ee0a42fb8a270a866991cde00d27090a1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Jul 2025 11:13:15 +0900
+Subject: btrfs: zoned: refine extent allocator hint selection
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 0d703963d297964451783e1a0688ebdf74cd6151 ]
+
+The hint block group selection in the extent allocator is wrong in the
+first place, as it can select the dedicated data relocation block group for
+the normal data allocation.
+
+Since we separated the normal data space_info and the data relocation
+space_info, we can easily identify whether a block group is for data relocation or
+not. Do not choose it for the normal data allocation.
+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent-tree.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index 8248113eb067f..5e3d1a87b7e9d 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -4175,7 +4175,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+ }
+ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+-                                  struct find_free_extent_ctl *ffe_ctl)
++                                  struct find_free_extent_ctl *ffe_ctl,
++                                  struct btrfs_space_info *space_info)
+ {
+       if (ffe_ctl->for_treelog) {
+               spin_lock(&fs_info->treelog_bg_lock);
+@@ -4199,6 +4200,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+                       u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+                       if (block_group_bits(block_group, ffe_ctl->flags) &&
++                          block_group->space_info == space_info &&
+                           avail >= ffe_ctl->num_bytes) {
+                               ffe_ctl->hint_byte = block_group->start;
+                               break;
+@@ -4220,7 +4222,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
+               return prepare_allocation_clustered(fs_info, ffe_ctl,
+                                                   space_info, ins);
+       case BTRFS_EXTENT_ALLOC_ZONED:
+-              return prepare_allocation_zoned(fs_info, ffe_ctl);
++              return prepare_allocation_zoned(fs_info, ffe_ctl, space_info);
+       default:
+               BUG();
+       }
+-- 
+2.51.0
+
diff --git a/queue-6.6/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch b/queue-6.6/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch
new file mode 100644 (file)
index 0000000..592fdf9
--- /dev/null
@@ -0,0 +1,111 @@
+From 34e6b449c4d896095ab2e81088393f75b5995c52 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 22 Jul 2025 13:39:11 +0200
+Subject: btrfs: zoned: return error from btrfs_zone_finish_endio()
+
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+
+[ Upstream commit 3c44cd3c79fcb38a86836dea6ff8fec322a9e68c ]
+
+Now that btrfs_zone_finish_endio_workfn() is directly calling
+do_zone_finish(), the only caller of btrfs_zone_finish_endio() is
+btrfs_finish_one_ordered().
+
+btrfs_finish_one_ordered() already has error handling in-place so
+btrfs_zone_finish_endio() can return an error if the block group lookup
+fails.
+
+Also as btrfs_zone_finish_endio() already checks for zoned filesystems and
+returns early, there's no need to do this in the caller.
+
+Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/inode.c | 7 ++++---
+ fs/btrfs/zoned.c | 8 +++++---
+ fs/btrfs/zoned.h | 9 ++++++---
+ 3 files changed, 15 insertions(+), 9 deletions(-)
+
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index ee5ffeab85bb7..b1be3e0fe7282 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3051,9 +3051,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
+               goto out;
+       }
+-      if (btrfs_is_zoned(fs_info))
+-              btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+-                                      ordered_extent->disk_num_bytes);
++      ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
++                                    ordered_extent->disk_num_bytes);
++      if (ret)
++              goto out;
+       if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
+               truncated = true;
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index 3622ba1d8e09f..6e8b8c46ba18f 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -2263,16 +2263,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
+       return ret;
+ }
+-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+ {
+       struct btrfs_block_group *block_group;
+       u64 min_alloc_bytes;
+       if (!btrfs_is_zoned(fs_info))
+-              return;
++              return 0;
+       block_group = btrfs_lookup_block_group(fs_info, logical);
+-      ASSERT(block_group);
++      if (WARN_ON_ONCE(!block_group))
++              return -ENOENT;
+       /* No MIXED_BG on zoned btrfs. */
+       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+@@ -2289,6 +2290,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
+ out:
+       btrfs_put_block_group(block_group);
++      return 0;
+ }
+ static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
+index 448955641d114..c18f31d3dc25f 100644
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -71,7 +71,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+ bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+ int btrfs_zone_finish(struct btrfs_block_group *block_group);
+ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
+-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+                            u64 length);
+ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+                                  struct extent_buffer *eb);
+@@ -227,8 +227,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+       return true;
+ }
+-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+-                                         u64 logical, u64 length) { }
++static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
++                                         u64 logical, u64 length)
++{
++      return 0;
++}
+ static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+                                                struct extent_buffer *eb) { }
+-- 
+2.51.0
+
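A compact sketch of the conversion pattern described above, in plain userspace C with invented names: the helper reports a failed lookup instead of asserting, and the caller's existing error path picks it up.

    #include <errno.h>
    #include <stdio.h>

    struct block_group { int id; };

    /* Pretend lookup; returns NULL when nothing covers 'logical'. */
    static struct block_group *lookup_block_group(unsigned long long logical)
    {
        static struct block_group bg = { .id = 1 };

        return logical < 1024 ? &bg : NULL;
    }

    /* Before: void + assertion.  After: report the failure to the caller. */
    static int zone_finish_endio(unsigned long long logical)
    {
        struct block_group *bg = lookup_block_group(logical);

        if (!bg)
            return -ENOENT;
        /* ... zone accounting on bg ... */
        return 0;
    }

    static int finish_one_ordered(unsigned long long logical)
    {
        int ret = zone_finish_endio(logical);

        if (ret)        /* the caller's existing error handling */
            fprintf(stderr, "zone finish failed: %d\n", ret);
        return ret;
    }

    int main(void)
    {
        finish_one_ordered(4096);   /* exercises the error path */
        return 0;
    }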
diff --git a/queue-6.6/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch b/queue-6.6/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch
new file mode 100644 (file)
index 0000000..b07bb8b
--- /dev/null
@@ -0,0 +1,89 @@
+From 762f27a0e18a98a9747de23af1f4ad13e83ccecf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 16 Sep 2025 20:30:17 +0000
+Subject: EDAC/mc_sysfs: Increase legacy channel support to 16
+
+From: Avadhut Naik <avadhut.naik@amd.com>
+
+[ Upstream commit 6e1c2c6c2c40ce99e0d2633b212f43c702c1a002 ]
+
+Newer AMD systems can support up to 16 channels per EDAC "mc" device.
+These are detected by the EDAC module running on the device and are
+appropriately enumerated through the current EDAC interface.
+
+The legacy EDAC sysfs interface, however, provides device attributes for
+channels 0 through 11 only. Consequently, the last four channels, 12
+through 15, will not be enumerated and will not be visible through the
+legacy sysfs interface.
+
+Add additional device attributes to ensure that all 16 channels, if
+present, are enumerated by and visible through the legacy EDAC sysfs
+interface.
+
+Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250916203242.1281036-1-avadhut.naik@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/edac/edac_mc_sysfs.c | 24 ++++++++++++++++++++++++
+ 1 file changed, 24 insertions(+)
+
+diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
+index 15f63452a9bec..b01436d9ddaed 100644
+--- a/drivers/edac/edac_mc_sysfs.c
++++ b/drivers/edac/edac_mc_sysfs.c
+@@ -306,6 +306,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR,
+       channel_dimm_label_show, channel_dimm_label_store, 10);
+ DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR,
+       channel_dimm_label_show, channel_dimm_label_store, 11);
++DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 12);
++DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 13);
++DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 14);
++DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR,
++      channel_dimm_label_show, channel_dimm_label_store, 15);
+ /* Total possible dynamic DIMM Label attribute file table */
+ static struct attribute *dynamic_csrow_dimm_attr[] = {
+@@ -321,6 +329,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = {
+       &dev_attr_legacy_ch9_dimm_label.attr.attr,
+       &dev_attr_legacy_ch10_dimm_label.attr.attr,
+       &dev_attr_legacy_ch11_dimm_label.attr.attr,
++      &dev_attr_legacy_ch12_dimm_label.attr.attr,
++      &dev_attr_legacy_ch13_dimm_label.attr.attr,
++      &dev_attr_legacy_ch14_dimm_label.attr.attr,
++      &dev_attr_legacy_ch15_dimm_label.attr.attr,
+       NULL
+ };
+@@ -349,6 +361,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO,
+                  channel_ce_count_show, NULL, 10);
+ DEVICE_CHANNEL(ch11_ce_count, S_IRUGO,
+                  channel_ce_count_show, NULL, 11);
++DEVICE_CHANNEL(ch12_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 12);
++DEVICE_CHANNEL(ch13_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 13);
++DEVICE_CHANNEL(ch14_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 14);
++DEVICE_CHANNEL(ch15_ce_count, S_IRUGO,
++                 channel_ce_count_show, NULL, 15);
+ /* Total possible dynamic ce_count attribute file table */
+ static struct attribute *dynamic_csrow_ce_count_attr[] = {
+@@ -364,6 +384,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = {
+       &dev_attr_legacy_ch9_ce_count.attr.attr,
+       &dev_attr_legacy_ch10_ce_count.attr.attr,
+       &dev_attr_legacy_ch11_ce_count.attr.attr,
++      &dev_attr_legacy_ch12_ce_count.attr.attr,
++      &dev_attr_legacy_ch13_ce_count.attr.attr,
++      &dev_attr_legacy_ch14_ce_count.attr.attr,
++      &dev_attr_legacy_ch15_ce_count.attr.attr,
+       NULL
+ };
+-- 
+2.51.0
+
diff --git a/queue-6.6/perf-have-get_perf_callchain-return-null-if-crosstas.patch b/queue-6.6/perf-have-get_perf_callchain-return-null-if-crosstas.patch
new file mode 100644 (file)
index 0000000..56ee589
--- /dev/null
@@ -0,0 +1,68 @@
+From 802862b01265aaa81829f67331259dede4a3354b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 14:03:40 -0400
+Subject: perf: Have get_perf_callchain() return NULL if crosstask and user are
+ set
+
+From: Josh Poimboeuf <jpoimboe@kernel.org>
+
+[ Upstream commit 153f9e74dec230f2e070e16fa061bc7adfd2c450 ]
+
+get_perf_callchain() doesn't support cross-task unwinding for user space
+stacks, so have it return NULL if both the crosstask and user arguments are
+set.
+
+Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20250820180428.426423415@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/callchain.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
+index 65fea424874c5..ee01cfcc35064 100644
+--- a/kernel/events/callchain.c
++++ b/kernel/events/callchain.c
+@@ -184,6 +184,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+       struct perf_callchain_entry_ctx ctx;
+       int rctx;
++      /* crosstask is not supported for user stacks */
++      if (crosstask && user && !kernel)
++              return NULL;
++
+       entry = get_callchain_entry(&rctx);
+       if (!entry)
+               return NULL;
+@@ -200,7 +204,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               perf_callchain_kernel(&ctx, regs);
+       }
+-      if (user) {
++      if (user && !crosstask) {
+               if (!user_mode(regs)) {
+                       if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+                               regs = NULL;
+@@ -209,9 +213,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               }
+               if (regs) {
+-                      if (crosstask)
+-                              goto exit_put;
+-
+                       if (add_mark)
+                               perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+@@ -219,7 +220,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+               }
+       }
+-exit_put:
+       put_callchain_entry(rctx);
+       return entry;
+-- 
+2.51.0
+
diff --git a/queue-6.6/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch b/queue-6.6/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch
new file mode 100644 (file)
index 0000000..1e5cbfc
--- /dev/null
@@ -0,0 +1,37 @@
+From f7b7c12558c04bfe26dfb17cefd5143e4e8e98f9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 14:03:43 -0400
+Subject: perf: Skip user unwind if the task is a kernel thread
+
+From: Josh Poimboeuf <jpoimboe@kernel.org>
+
+[ Upstream commit 16ed389227651330879e17bd83d43bd234006722 ]
+
+If the task is not a user thread, there's no user stack to unwind.
+
+Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20250820180428.930791978@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/core.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index 3eb9125431b43..c9a3fb6fdb2f6 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -7724,7 +7724,8 @@ struct perf_callchain_entry *
+ perf_callchain(struct perf_event *event, struct pt_regs *regs)
+ {
+       bool kernel = !event->attr.exclude_callchain_kernel;
+-      bool user   = !event->attr.exclude_callchain_user;
++      bool user   = !event->attr.exclude_callchain_user &&
++              !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
+       /* Disallow cross-task user callchains. */
+       bool crosstask = event->ctx->task && event->ctx->task != current;
+       const u32 max_stack = event->attr.sample_max_stack;
+-- 
+2.51.0
+
diff --git a/queue-6.6/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch b/queue-6.6/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch
new file mode 100644 (file)
index 0000000..42659ad
--- /dev/null
@@ -0,0 +1,67 @@
+From a77239aceb868da27cb7d047c23b0e2c38130faf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Aug 2025 14:03:41 -0400
+Subject: perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of
+ current->mm == NULL
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+[ Upstream commit 90942f9fac05702065ff82ed0bade0d08168d4ea ]
+
+To determine if a task is a kernel thread or not, it is more reliable to
+use (current->flags & (PF_KTHREAD | PF_USER_WORKER)) than to rely on
+current->mm being NULL.  That is because some kernel tasks (io_uring
+helpers) may have a non-NULL mm field.
+
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20250820180428.592367294@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/callchain.c | 6 +++---
+ kernel/events/core.c      | 4 ++--
+ 2 files changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
+index 1273be84392cf..65fea424874c5 100644
+--- a/kernel/events/callchain.c
++++ b/kernel/events/callchain.c
+@@ -202,10 +202,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+       if (user) {
+               if (!user_mode(regs)) {
+-                      if  (current->mm)
+-                              regs = task_pt_regs(current);
+-                      else
++                      if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+                               regs = NULL;
++                      else
++                              regs = task_pt_regs(current);
+               }
+               if (regs) {
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index b73f5c44113d6..3eb9125431b43 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -6985,7 +6985,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
+       if (user_mode(regs)) {
+               regs_user->abi = perf_reg_abi(current);
+               regs_user->regs = regs;
+-      } else if (!(current->flags & PF_KTHREAD)) {
++      } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+               perf_get_regs_user(regs_user, regs);
+       } else {
+               regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+@@ -7612,7 +7612,7 @@ static u64 perf_virt_to_phys(u64 virt)
+                * Try IRQ-safe get_user_page_fast_only first.
+                * If failed, leave phys_addr as 0.
+                */
+-              if (current->mm != NULL) {
++              if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+                       struct page *p;
+                       pagefault_disable();
+-- 
+2.51.0
+
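A tiny userspace sketch of the predicate this commit standardizes on (the PF_* bit values below are illustrative placeholders, not the kernel's definitions): a non-NULL mm is not proof of a user thread, so the flags test is used instead.

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative placeholder bit values, not the kernel's definitions. */
    #define PF_KTHREAD      0x00200000u
    #define PF_USER_WORKER  0x00004000u

    struct task {
        unsigned int flags;
        void *mm;   /* may be non-NULL even for io_uring helper threads */
    };

    static bool no_user_context(const struct task *t)
    {
        /* Prefer the flags test: a non-NULL mm does not imply a user thread. */
        return t->flags & (PF_KTHREAD | PF_USER_WORKER);
    }

    int main(void)
    {
        struct task io_helper = { .flags = PF_USER_WORKER, .mm = (void *)&io_helper };

        printf("skip user unwind: %d\n", no_user_context(&io_helper));
        return 0;
    }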
diff --git a/queue-6.6/series b/queue-6.6/series
index 64bc2c1eea0a4e8a555bb14ebe15152b3d8d903c..00661666af6de7a20a8b2d15dcd4496e3007b5c0 100644 (file)
@@ -1 +1,15 @@
 net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch
+audit-record-fanotify-event-regardless-of-presence-o.patch
+perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch
+perf-have-get_perf_callchain-return-null-if-crosstas.patch
+perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch
+x86-bugs-report-correct-retbleed-mitigation-status.patch
+x86-bugs-fix-reporting-of-lfence-retpoline.patch
+edac-mc_sysfs-increase-legacy-channel-support-to-16.patch
+btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch
+btrfs-zoned-refine-extent-allocator-hint-selection.patch
+btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch
+btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch
+btrfs-use-level-argument-in-log-tree-walk-callback-r.patch
+btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch
+arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch
diff --git a/queue-6.6/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-6.6/x86-bugs-fix-reporting-of-lfence-retpoline.patch
new file mode 100644 (file)
index 0000000..cdc1afc
--- /dev/null
@@ -0,0 +1,51 @@
+From 9e19126d42b14a90eced153d573608f84adf3db2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Sep 2025 08:47:05 -0500
+Subject: x86/bugs: Fix reporting of LFENCE retpoline
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ]
+
+The LFENCE retpoline mitigation is not secure but the kernel prints
+inconsistent messages about this fact.  The dmesg log says 'Mitigation:
+LFENCE', implying the system is mitigated.  But sysfs reports 'Vulnerable:
+LFENCE', implying the system (correctly) is not mitigated.
+
+Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere
+when this mitigation is selected.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index f66e0e5b49eb1..ef1d3a5024ed4 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1594,7 +1594,7 @@ spectre_v2_user_select_mitigation(void)
+ static const char * const spectre_v2_strings[] = {
+       [SPECTRE_V2_NONE]                       = "Vulnerable",
+       [SPECTRE_V2_RETPOLINE]                  = "Mitigation: Retpolines",
+-      [SPECTRE_V2_LFENCE]                     = "Mitigation: LFENCE",
++      [SPECTRE_V2_LFENCE]                     = "Vulnerable: LFENCE",
+       [SPECTRE_V2_EIBRS]                      = "Mitigation: Enhanced / Automatic IBRS",
+       [SPECTRE_V2_EIBRS_LFENCE]               = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+       [SPECTRE_V2_EIBRS_RETPOLINE]            = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
+@@ -3222,9 +3222,6 @@ static const char *spectre_bhi_state(void)
+ static ssize_t spectre_v2_show_state(char *buf)
+ {
+-      if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+-              return sysfs_emit(buf, "Vulnerable: LFENCE\n");
+-
+       if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+               return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+-- 
+2.51.0
+
diff --git a/queue-6.6/x86-bugs-report-correct-retbleed-mitigation-status.patch b/queue-6.6/x86-bugs-report-correct-retbleed-mitigation-status.patch
new file mode 100644 (file)
index 0000000..cece479
--- /dev/null
@@ -0,0 +1,46 @@
+From c0f1ee31d227222ce9307b53bc63615f5c5fc2b6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Sep 2025 08:47:06 -0500
+Subject: x86/bugs: Report correct retbleed mitigation status
+
+From: David Kaplan <david.kaplan@amd.com>
+
+[ Upstream commit 930f2361fe542a00de9ce6070b1b6edb976f1165 ]
+
+On Intel CPUs, the default retbleed mitigation is IBRS/eIBRS but this
+requires that a similar spectre_v2 mitigation is applied.  If the user
+selects a different spectre_v2 mitigation (like spectre_v2=retpoline), a
+warning is printed but sysfs will still report 'Mitigation: IBRS' or
+'Mitigation: Enhanced IBRS'.  This is incorrect because retbleed is not
+mitigated, and IBRS is not actually set.
+
+Fix this by choosing RETBLEED_MITIGATION_NONE in this scenario so the
+kernel correctly reports the system as vulnerable to retbleed.
+
+Signed-off-by: David Kaplan <david.kaplan@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index 315926ccea0fa..f66e0e5b49eb1 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1185,8 +1185,10 @@ static void __init retbleed_select_mitigation(void)
+                       retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+                       break;
+               default:
+-                      if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
++                      if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) {
+                               pr_err(RETBLEED_INTEL_MSG);
++                              retbleed_mitigation = RETBLEED_MITIGATION_NONE;
++                      }
+               }
+       }
+-- 
+2.51.0
+