From 3e97359aa27271d651fa9b91136a645f261d89db Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 29 Oct 2025 21:57:59 -0400 Subject: [PATCH] Fixes for all trees Signed-off-by: Sasha Levin --- ...p-log-root-tree-reference-in-btrfs_r.patch | 63 + ...__after_atomic-when-forcing-cow-in-c.patch | 58 + queue-5.10/series | 3 + ...gs-fix-reporting-of-lfence-retpoline.patch | 51 + ...p-log-root-tree-reference-in-btrfs_r.patch | 63 + ...ace-max_t-min_t-with-clamp-in-scrub_.patch | 44 + ...__after_atomic-when-forcing-cow-in-c.patch | 58 + queue-5.15/series | 4 + ...gs-fix-reporting-of-lfence-retpoline.patch | 51 + ...__after_atomic-when-forcing-cow-in-c.patch | 58 + queue-5.4/series | 2 + ...gs-fix-reporting-of-lfence-retpoline.patch | 51 + ...ro-compile_offsets-to-all-the-asm-of.patch | 295 +++ ...p-log-root-tree-reference-in-btrfs_r.patch | 63 + ...ace-max_t-min_t-with-clamp-in-scrub_.patch | 44 + ...__after_atomic-when-forcing-cow-in-c.patch | 58 + ...fine-extent-allocator-hint-selection.patch | 59 + ...ncrease-legacy-channel-support-to-16.patch | 89 + ...rf_callchain-return-null-if-crosstas.patch | 68 + queue-6.1/series | 8 + ...gs-fix-reporting-of-lfence-retpoline.patch | 51 + ...ro-compile_offsets-to-all-the-asm-of.patch | 295 +++ ...otify-event-regardless-of-presence-o.patch | 44 + ...saction-if-we-fail-to-update-inode-i.patch | 39 + ...saction-in-the-process_one_buffer-lo.patch | 77 + ...saction-on-specific-error-places-whe.patch | 111 + ...p-log-root-tree-reference-in-btrfs_r.patch | 63 + ...ace-max_t-min_t-with-clamp-in-scrub_.patch | 44 + ...tree-checker-add-inode-extref-checks.patch | 90 + ...argument-in-log-tree-walk-callback-r.patch | 50 + ...__after_atomic-when-forcing-cow-in-c.patch | 58 + ...fine-extent-allocator-hint-selection.patch | 59 + ...rn-error-from-btrfs_zone_finish_endi.patch | 111 + ...xcpus-for-nocpu-error-check-when-ena.patch | 47 + ...ncrease-legacy-channel-support-to-16.patch | 89 + ...rf_callchain-return-null-if-crosstas.patch | 68 + ...nwind-if-the-task-is-a-kernel-thread.patch | 37 + ...-flags-pf_kthread-pf_user_worker-ins.patch | 67 + ...dd-icl_fixed_0_adaptive-bit-into-int.patch | 101 + ...-qmap-dump-operation-non-destructive.patch | 70 + ...ugh-uprobe-systemcall-without-filter.patch | 85 + queue-6.12/series | 22 + ...gs-fix-reporting-of-lfence-retpoline.patch | 51 + ...t-correct-retbleed-mitigation-status.patch | 46 + ...ro-compile_offsets-to-all-the-asm-of.patch | 295 +++ ...otify-event-regardless-of-presence-o.patch | 44 + ...saction-if-we-fail-to-update-inode-i.patch | 39 + ...saction-in-the-process_one_buffer-lo.patch | 77 + ...saction-on-specific-error-places-whe.patch | 111 + ...p-log-root-tree-reference-in-btrfs_r.patch | 63 + ...ace-max_t-min_t-with-clamp-in-scrub_.patch | 44 + ...tree-checker-add-inode-extref-checks.patch | 90 + ...argument-in-log-tree-walk-callback-r.patch | 50 + ...__after_atomic-when-forcing-cow-in-c.patch | 58 + ...fine-extent-allocator-hint-selection.patch | 59 + ...rn-error-from-btrfs_zone_finish_endi.patch | 111 + ...xcpus-for-nocpu-error-check-when-ena.patch | 47 + ...xecutable-file-modes-for-c-source-fi.patch | 43 + ...-two-more-intel-alder-lake-s-socs-fo.patch | 55 + ...ncrease-legacy-channel-support-to-16.patch | 89 + ...d-buslock-back-in-to-irq_set_handler.patch | 38 + ...d-buslock-back-in-to-__disable_irq_n.patch | 38 + ...ge-add-buslock-back-in-to-enable_irq.patch | 38 + ...rf_callchain-return-null-if-crosstas.patch | 68 + ...nwind-if-the-task-is-a-kernel-thread.patch | 37 + ...-flags-pf_kthread-pf_user_worker-ins.patch 
| 67 + ...dd-icl_fixed_0_adaptive-bit-into-int.patch | 101 + ...date_cfs_group-for-throttled-cfs_rqs.patch | 55 + ...ypass-on-between-enable-failure-and-.patch | 48 + ...-qmap-dump-operation-non-destructive.patch | 70 + ...nternal-type-and-accessor-definition.patch | 2189 +++++++++++++++++ ...ent_stats_cpu-in-struct-scx_sched_pc.patch | 128 + ...rror_irq_work-before-freeing-scx_sch.patch | 38 + ...ugh-uprobe-systemcall-without-filter.patch | 85 + queue-6.17/series | 35 + ...aux-clocks-sysfs-initialization-loop.patch | 45 + ...d-attack-vector-controls-for-vmscape.patch | 72 + ...gs-fix-reporting-of-lfence-retpoline.patch | 51 + .../x86-bugs-qualify-retbleed_intel_msg.patch | 47 + ...t-correct-retbleed-mitigation-status.patch | 47 + ...ro-compile_offsets-to-all-the-asm-of.patch | 295 +++ ...otify-event-regardless-of-presence-o.patch | 44 + ...p-log-root-tree-reference-in-btrfs_r.patch | 63 + ...ace-max_t-min_t-with-clamp-in-scrub_.patch | 44 + ...argument-in-log-tree-walk-callback-r.patch | 50 + ...__after_atomic-when-forcing-cow-in-c.patch | 58 + ...fine-extent-allocator-hint-selection.patch | 59 + ...rn-error-from-btrfs_zone_finish_endi.patch | 111 + ...ncrease-legacy-channel-support-to-16.patch | 89 + ...rf_callchain-return-null-if-crosstas.patch | 68 + ...nwind-if-the-task-is-a-kernel-thread.patch | 37 + ...-flags-pf_kthread-pf_user_worker-ins.patch | 67 + queue-6.6/series | 14 + ...gs-fix-reporting-of-lfence-retpoline.patch | 51 + ...t-correct-retbleed-mitigation-status.patch | 46 + 95 files changed, 8633 insertions(+) create mode 100644 queue-5.10/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch create mode 100644 queue-5.10/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch create mode 100644 queue-5.10/x86-bugs-fix-reporting-of-lfence-retpoline.patch create mode 100644 queue-5.15/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch create mode 100644 queue-5.15/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch create mode 100644 queue-5.15/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch create mode 100644 queue-5.15/x86-bugs-fix-reporting-of-lfence-retpoline.patch create mode 100644 queue-5.4/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch create mode 100644 queue-5.4/x86-bugs-fix-reporting-of-lfence-retpoline.patch create mode 100644 queue-6.1/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch create mode 100644 queue-6.1/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch create mode 100644 queue-6.1/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch create mode 100644 queue-6.1/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch create mode 100644 queue-6.1/btrfs-zoned-refine-extent-allocator-hint-selection.patch create mode 100644 queue-6.1/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch create mode 100644 queue-6.1/perf-have-get_perf_callchain-return-null-if-crosstas.patch create mode 100644 queue-6.1/x86-bugs-fix-reporting-of-lfence-retpoline.patch create mode 100644 queue-6.12/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch create mode 100644 queue-6.12/audit-record-fanotify-event-regardless-of-presence-o.patch create mode 100644 queue-6.12/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch create mode 100644 queue-6.12/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch create mode 100644 queue-6.12/btrfs-abort-transaction-on-specific-error-places-whe.patch create mode 100644 queue-6.12/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch create mode 
100644 queue-6.12/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch create mode 100644 queue-6.12/btrfs-tree-checker-add-inode-extref-checks.patch create mode 100644 queue-6.12/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch create mode 100644 queue-6.12/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch create mode 100644 queue-6.12/btrfs-zoned-refine-extent-allocator-hint-selection.patch create mode 100644 queue-6.12/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch create mode 100644 queue-6.12/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch create mode 100644 queue-6.12/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch create mode 100644 queue-6.12/perf-have-get_perf_callchain-return-null-if-crosstas.patch create mode 100644 queue-6.12/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch create mode 100644 queue-6.12/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch create mode 100644 queue-6.12/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch create mode 100644 queue-6.12/sched_ext-make-qmap-dump-operation-non-destructive.patch create mode 100644 queue-6.12/seccomp-passthrough-uprobe-systemcall-without-filter.patch create mode 100644 queue-6.12/x86-bugs-fix-reporting-of-lfence-retpoline.patch create mode 100644 queue-6.12/x86-bugs-report-correct-retbleed-mitigation-status.patch create mode 100644 queue-6.17/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch create mode 100644 queue-6.17/audit-record-fanotify-event-regardless-of-presence-o.patch create mode 100644 queue-6.17/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch create mode 100644 queue-6.17/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch create mode 100644 queue-6.17/btrfs-abort-transaction-on-specific-error-places-whe.patch create mode 100644 queue-6.17/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch create mode 100644 queue-6.17/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch create mode 100644 queue-6.17/btrfs-tree-checker-add-inode-extref-checks.patch create mode 100644 queue-6.17/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch create mode 100644 queue-6.17/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch create mode 100644 queue-6.17/btrfs-zoned-refine-extent-allocator-hint-selection.patch create mode 100644 queue-6.17/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch create mode 100644 queue-6.17/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch create mode 100644 queue-6.17/edac-fix-wrong-executable-file-modes-for-c-source-fi.patch create mode 100644 queue-6.17/edac-ie31200-add-two-more-intel-alder-lake-s-socs-fo.patch create mode 100644 queue-6.17/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch create mode 100644 queue-6.17/genirq-chip-add-buslock-back-in-to-irq_set_handler.patch create mode 100644 queue-6.17/genirq-manage-add-buslock-back-in-to-__disable_irq_n.patch create mode 100644 queue-6.17/genirq-manage-add-buslock-back-in-to-enable_irq.patch create mode 100644 queue-6.17/perf-have-get_perf_callchain-return-null-if-crosstas.patch create mode 100644 queue-6.17/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch create mode 100644 queue-6.17/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch create mode 100644 queue-6.17/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch create mode 100644 queue-6.17/sched-fair-update_cfs_group-for-throttled-cfs_rqs.patch create mode 100644 
queue-6.17/sched_ext-keep-bypass-on-between-enable-failure-and-.patch create mode 100644 queue-6.17/sched_ext-make-qmap-dump-operation-non-destructive.patch create mode 100644 queue-6.17/sched_ext-move-internal-type-and-accessor-definition.patch create mode 100644 queue-6.17/sched_ext-put-event_stats_cpu-in-struct-scx_sched_pc.patch create mode 100644 queue-6.17/sched_ext-sync-error_irq_work-before-freeing-scx_sch.patch create mode 100644 queue-6.17/seccomp-passthrough-uprobe-systemcall-without-filter.patch create mode 100644 queue-6.17/series create mode 100644 queue-6.17/timekeeping-fix-aux-clocks-sysfs-initialization-loop.patch create mode 100644 queue-6.17/x86-bugs-add-attack-vector-controls-for-vmscape.patch create mode 100644 queue-6.17/x86-bugs-fix-reporting-of-lfence-retpoline.patch create mode 100644 queue-6.17/x86-bugs-qualify-retbleed_intel_msg.patch create mode 100644 queue-6.17/x86-bugs-report-correct-retbleed-mitigation-status.patch create mode 100644 queue-6.6/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch create mode 100644 queue-6.6/audit-record-fanotify-event-regardless-of-presence-o.patch create mode 100644 queue-6.6/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch create mode 100644 queue-6.6/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch create mode 100644 queue-6.6/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch create mode 100644 queue-6.6/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch create mode 100644 queue-6.6/btrfs-zoned-refine-extent-allocator-hint-selection.patch create mode 100644 queue-6.6/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch create mode 100644 queue-6.6/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch create mode 100644 queue-6.6/perf-have-get_perf_callchain-return-null-if-crosstas.patch create mode 100644 queue-6.6/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch create mode 100644 queue-6.6/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch create mode 100644 queue-6.6/x86-bugs-fix-reporting-of-lfence-retpoline.patch create mode 100644 queue-6.6/x86-bugs-report-correct-retbleed-mitigation-status.patch diff --git a/queue-5.10/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-5.10/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch new file mode 100644 index 0000000000..9ccc4f8b06 --- /dev/null +++ b/queue-5.10/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch @@ -0,0 +1,63 @@ +From 7f87f0b776b9a3722815a1bc1b527e3d1c90f646 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 27 Aug 2025 12:10:28 +0100 +Subject: btrfs: always drop log root tree reference in btrfs_replay_log() + +From: Filipe Manana + +[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ] + +Currently we have this odd behaviour: + +1) At btrfs_replay_log() we drop the reference of the log root tree if + the call to btrfs_recover_log_trees() failed; + +2) But if the call to btrfs_recover_log_trees() did not fail, we don't + drop the reference in btrfs_replay_log() - we expect that + btrfs_recover_log_trees() does it in case it returns success. + +Let's simplify this and make btrfs_replay_log() always drop the reference +on the log root tree, not only this simplifies code as it's what makes +sense since it's btrfs_replay_log() who grabbed the reference in the first +place. 
+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/tree-log.c | 1 - + 2 files changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 91475cb7d568b..29f0ba4adfbce 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2309,10 +2309,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + } + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); ++ btrfs_put_root(log_tree_root); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, + "Failed to recover log tree"); +- btrfs_put_root(log_tree_root); + return ret; + } + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 6d715bb773643..cdb5a2770faf3 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -6432,7 +6432,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + + log_root_tree->log_root = NULL; + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); +- btrfs_put_root(log_root_tree); + + return 0; + error: +-- +2.51.0 + diff --git a/queue-5.10/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-5.10/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch new file mode 100644 index 0000000000..d21460e3e2 --- /dev/null +++ b/queue-5.10/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch @@ -0,0 +1,58 @@ +From 581461587a3316a3c933c4e90962e0a37cad44a6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Sep 2025 12:09:14 +0100 +Subject: btrfs: use smp_mb__after_atomic() when forcing COW in + create_pending_snapshot() + +From: Filipe Manana + +[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ] + +After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a +full write barrier, smp_wmb(), but we don't need to, all we need is a +smp_mb__after_atomic(). The use of the smp_wmb() is from the old days +when we didn't use a bit and used instead an int field in the root to +signal if cow is forced. After the int field was changed to a bit in +the root's state (flags field), we forgot to update the memory barrier +in create_pending_snapshot() to smp_mb__after_atomic(), but we did the +change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That +happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer +data type for the some variants in btrfs_root"). On the reader side, in +should_cow_block(), we also use the counterpart smp_mb__before_atomic() +which generates further confusion. + +So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't +even need any barrier at all since create_pending_snapshot() is called +in the critical section of a transaction commit and therefore no one +can concurrently join/attach the transaction, or start a new one, until +the transaction is unblocked. By the time someone starts a new transaction +and enters should_cow_block(), a lot of implicit memory barriers already +took place by having acquired several locks such as fs_info->trans_lock +and extent buffer locks on the root node at least. Nevertlheless, for +consistency use smp_mb__after_atomic() after setting the force cow bit +in create_pending_snapshot(). 
+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/transaction.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index f68cfcc1f8300..d558f354b8b82 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1660,7 +1660,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + } + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); +- smp_wmb(); ++ smp_mb__after_atomic(); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ +-- +2.51.0 + diff --git a/queue-5.10/series b/queue-5.10/series index 64bc2c1eea..a95895e839 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -1 +1,4 @@ net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch +x86-bugs-fix-reporting-of-lfence-retpoline.patch +btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch +btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch diff --git a/queue-5.10/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-5.10/x86-bugs-fix-reporting-of-lfence-retpoline.patch new file mode 100644 index 0000000000..b7fb2b5346 --- /dev/null +++ b/queue-5.10/x86-bugs-fix-reporting-of-lfence-retpoline.patch @@ -0,0 +1,51 @@ +From b65ea0af0079f32b432165b33550d4e95b18c8ed Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Sep 2025 08:47:05 -0500 +Subject: x86/bugs: Fix reporting of LFENCE retpoline + +From: David Kaplan + +[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ] + +The LFENCE retpoline mitigation is not secure but the kernel prints +inconsistent messages about this fact. The dmesg log says 'Mitigation: +LFENCE', implying the system is mitigated. But sysfs reports 'Vulnerable: +LFENCE' implying the system (correctly) is not mitigated. + +Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere +when this mitigation is selected. 
+ +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 8794e3f4974b3..57ba697e29180 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1508,7 +1508,7 @@ spectre_v2_user_select_mitigation(void) + static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", +- [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", ++ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", +@@ -3011,9 +3011,6 @@ static char *pbrsb_eibrs_state(void) + + static ssize_t spectre_v2_show_state(char *buf) + { +- if (spectre_v2_enabled == SPECTRE_V2_LFENCE) +- return sysfs_emit(buf, "Vulnerable: LFENCE\n"); +- + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + +-- +2.51.0 + diff --git a/queue-5.15/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-5.15/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch new file mode 100644 index 0000000000..9528641145 --- /dev/null +++ b/queue-5.15/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch @@ -0,0 +1,63 @@ +From 8b977547797ba2015b11f98dbf944df250edf3c0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 27 Aug 2025 12:10:28 +0100 +Subject: btrfs: always drop log root tree reference in btrfs_replay_log() + +From: Filipe Manana + +[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ] + +Currently we have this odd behaviour: + +1) At btrfs_replay_log() we drop the reference of the log root tree if + the call to btrfs_recover_log_trees() failed; + +2) But if the call to btrfs_recover_log_trees() did not fail, we don't + drop the reference in btrfs_replay_log() - we expect that + btrfs_recover_log_trees() does it in case it returns success. + +Let's simplify this and make btrfs_replay_log() always drop the reference +on the log root tree, not only this simplifies code as it's what makes +sense since it's btrfs_replay_log() who grabbed the reference in the first +place. 
+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/tree-log.c | 1 - + 2 files changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 9c2d6f96f46da..136902f27e441 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2387,10 +2387,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + } + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); ++ btrfs_put_root(log_tree_root); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, + "Failed to recover log tree"); +- btrfs_put_root(log_tree_root); + return ret; + } + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 34fedac4e1864..445c7a5641b62 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -6513,7 +6513,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + + log_root_tree->log_root = NULL; + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); +- btrfs_put_root(log_root_tree); + + return 0; + error: +-- +2.51.0 + diff --git a/queue-5.15/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch b/queue-5.15/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch new file mode 100644 index 0000000000..4f329bf42f --- /dev/null +++ b/queue-5.15/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch @@ -0,0 +1,44 @@ +From f476a1c6de77130f5290f607f2fd2094dd813876 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Sep 2025 17:01:44 +0200 +Subject: btrfs: scrub: replace max_t()/min_t() with clamp() in + scrub_throttle_dev_io() + +From: Thorsten Blum + +[ Upstream commit a7f3dfb8293c4cee99743132d69863a92e8f4875 ] + +Replace max_t() followed by min_t() with a single clamp(). + +As was pointed by David Laight in +https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/ +the calculation may overflow u32 when the input value is too large, so +clamp_t() is not used. In practice the expected values are in range of +megabytes to gigabytes (throughput limit) so the bug would not happen. + +Signed-off-by: Thorsten Blum +Reviewed-by: David Sterba +[ Use clamp() and add explanation. ] +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/scrub.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 6ffd34d39e992..aac4ee5880952 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -2017,8 +2017,7 @@ static void scrub_throttle(struct scrub_ctx *sctx) + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. 
+ */ +- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); +- div = min_t(u32, 64, div); ++ div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); + + /* Start new epoch, set deadline */ + now = ktime_get(); +-- +2.51.0 + diff --git a/queue-5.15/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-5.15/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch new file mode 100644 index 0000000000..2c6a645bbe --- /dev/null +++ b/queue-5.15/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch @@ -0,0 +1,58 @@ +From 9bd34bcc56053310ef3ea6b6c0255bd75c8227be Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Sep 2025 12:09:14 +0100 +Subject: btrfs: use smp_mb__after_atomic() when forcing COW in + create_pending_snapshot() + +From: Filipe Manana + +[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ] + +After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a +full write barrier, smp_wmb(), but we don't need to, all we need is a +smp_mb__after_atomic(). The use of the smp_wmb() is from the old days +when we didn't use a bit and used instead an int field in the root to +signal if cow is forced. After the int field was changed to a bit in +the root's state (flags field), we forgot to update the memory barrier +in create_pending_snapshot() to smp_mb__after_atomic(), but we did the +change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That +happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer +data type for the some variants in btrfs_root"). On the reader side, in +should_cow_block(), we also use the counterpart smp_mb__before_atomic() +which generates further confusion. + +So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't +even need any barrier at all since create_pending_snapshot() is called +in the critical section of a transaction commit and therefore no one +can concurrently join/attach the transaction, or start a new one, until +the transaction is unblocked. By the time someone starts a new transaction +and enters should_cow_block(), a lot of implicit memory barriers already +took place by having acquired several locks such as fs_info->trans_lock +and extent buffer locks on the root node at least. Nevertlheless, for +consistency use smp_mb__after_atomic() after setting the force cow bit +in create_pending_snapshot(). 
+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/transaction.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index 4fb5e12c87d1b..d96221ed835e9 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1765,7 +1765,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + } + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); +- smp_wmb(); ++ smp_mb__after_atomic(); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ +-- +2.51.0 + diff --git a/queue-5.15/series b/queue-5.15/series index 64bc2c1eea..447d39e94a 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -1 +1,5 @@ net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch +x86-bugs-fix-reporting-of-lfence-retpoline.patch +btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch +btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch +btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch diff --git a/queue-5.15/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-5.15/x86-bugs-fix-reporting-of-lfence-retpoline.patch new file mode 100644 index 0000000000..3f37fa6a9a --- /dev/null +++ b/queue-5.15/x86-bugs-fix-reporting-of-lfence-retpoline.patch @@ -0,0 +1,51 @@ +From 28fcb9a170c8be4e2920ecb17a4b5f15e5681b81 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Sep 2025 08:47:05 -0500 +Subject: x86/bugs: Fix reporting of LFENCE retpoline + +From: David Kaplan + +[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ] + +The LFENCE retpoline mitigation is not secure but the kernel prints +inconsistent messages about this fact. The dmesg log says 'Mitigation: +LFENCE', implying the system is mitigated. But sysfs reports 'Vulnerable: +LFENCE' implying the system (correctly) is not mitigated. + +Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere +when this mitigation is selected. 
+ +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 1628c00145892..8df48691f4910 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1539,7 +1539,7 @@ spectre_v2_user_select_mitigation(void) + static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", +- [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", ++ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", +@@ -3168,9 +3168,6 @@ static const char *spectre_bhi_state(void) + + static ssize_t spectre_v2_show_state(char *buf) + { +- if (spectre_v2_enabled == SPECTRE_V2_LFENCE) +- return sysfs_emit(buf, "Vulnerable: LFENCE\n"); +- + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + +-- +2.51.0 + diff --git a/queue-5.4/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-5.4/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch new file mode 100644 index 0000000000..438a7aa2d2 --- /dev/null +++ b/queue-5.4/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch @@ -0,0 +1,58 @@ +From 2524d3603db07d1acaf0af4f49c597e74dcd7b07 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Sep 2025 12:09:14 +0100 +Subject: btrfs: use smp_mb__after_atomic() when forcing COW in + create_pending_snapshot() + +From: Filipe Manana + +[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ] + +After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a +full write barrier, smp_wmb(), but we don't need to, all we need is a +smp_mb__after_atomic(). The use of the smp_wmb() is from the old days +when we didn't use a bit and used instead an int field in the root to +signal if cow is forced. After the int field was changed to a bit in +the root's state (flags field), we forgot to update the memory barrier +in create_pending_snapshot() to smp_mb__after_atomic(), but we did the +change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That +happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer +data type for the some variants in btrfs_root"). On the reader side, in +should_cow_block(), we also use the counterpart smp_mb__before_atomic() +which generates further confusion. + +So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't +even need any barrier at all since create_pending_snapshot() is called +in the critical section of a transaction commit and therefore no one +can concurrently join/attach the transaction, or start a new one, until +the transaction is unblocked. By the time someone starts a new transaction +and enters should_cow_block(), a lot of implicit memory barriers already +took place by having acquired several locks such as fs_info->trans_lock +and extent buffer locks on the root node at least. Nevertlheless, for +consistency use smp_mb__after_atomic() after setting the force cow bit +in create_pending_snapshot(). 
+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/transaction.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index 094b024bbf0cf..6618b42defed7 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1546,7 +1546,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + } + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); +- smp_wmb(); ++ smp_mb__after_atomic(); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ +-- +2.51.0 + diff --git a/queue-5.4/series b/queue-5.4/series index 64bc2c1eea..342fc0f590 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -1 +1,3 @@ net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch +x86-bugs-fix-reporting-of-lfence-retpoline.patch +btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch diff --git a/queue-5.4/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-5.4/x86-bugs-fix-reporting-of-lfence-retpoline.patch new file mode 100644 index 0000000000..b03e77fea6 --- /dev/null +++ b/queue-5.4/x86-bugs-fix-reporting-of-lfence-retpoline.patch @@ -0,0 +1,51 @@ +From e39da45843868dd70b02049cb59d491ab47105f3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Sep 2025 08:47:05 -0500 +Subject: x86/bugs: Fix reporting of LFENCE retpoline + +From: David Kaplan + +[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ] + +The LFENCE retpoline mitigation is not secure but the kernel prints +inconsistent messages about this fact. The dmesg log says 'Mitigation: +LFENCE', implying the system is mitigated. But sysfs reports 'Vulnerable: +LFENCE' implying the system (correctly) is not mitigated. + +Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere +when this mitigation is selected. 
+ +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 4f803aed2ef0e..b10e257799c16 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1188,7 +1188,7 @@ spectre_v2_user_select_mitigation(void) + static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", +- [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", ++ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", +@@ -2280,9 +2280,6 @@ static char *pbrsb_eibrs_state(void) + + static ssize_t spectre_v2_show_state(char *buf) + { +- if (spectre_v2_enabled == SPECTRE_V2_LFENCE) +- return sysfs_emit(buf, "Vulnerable: LFENCE\n"); +- + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + +-- +2.51.0 + diff --git a/queue-6.1/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch b/queue-6.1/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch new file mode 100644 index 0000000000..60ccf17b8a --- /dev/null +++ b/queue-6.1/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch @@ -0,0 +1,295 @@ +From ffbe1930ee87e820eb9bd3809625807a5535f61c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Sep 2025 14:09:13 +0800 +Subject: arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c + +From: Menglong Dong + +[ Upstream commit 35561bab768977c9e05f1f1a9bc00134c85f3e28 ] + +The include/generated/asm-offsets.h is generated in Kbuild during +compiling from arch/SRCARCH/kernel/asm-offsets.c. When we want to +generate another similar offset header file, circular dependency can +happen. + +For example, we want to generate a offset file include/generated/test.h, +which is included in include/sched/sched.h. If we generate asm-offsets.h +first, it will fail, as include/sched/sched.h is included in asm-offsets.c +and include/generated/test.h doesn't exist; If we generate test.h first, +it can't success neither, as include/generated/asm-offsets.h is included +by it. + +In x86_64, the macro COMPILE_OFFSETS is used to avoid such circular +dependency. We can generate asm-offsets.h first, and if the +COMPILE_OFFSETS is defined, we don't include the "generated/test.h". + +And we define the macro COMPILE_OFFSETS for all the asm-offsets.c for this +purpose. 
+ +Signed-off-by: Menglong Dong +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Sasha Levin +--- + arch/alpha/kernel/asm-offsets.c | 1 + + arch/arc/kernel/asm-offsets.c | 1 + + arch/arm/kernel/asm-offsets.c | 2 ++ + arch/arm64/kernel/asm-offsets.c | 1 + + arch/csky/kernel/asm-offsets.c | 1 + + arch/hexagon/kernel/asm-offsets.c | 1 + + arch/loongarch/kernel/asm-offsets.c | 2 ++ + arch/m68k/kernel/asm-offsets.c | 1 + + arch/microblaze/kernel/asm-offsets.c | 1 + + arch/mips/kernel/asm-offsets.c | 2 ++ + arch/nios2/kernel/asm-offsets.c | 1 + + arch/openrisc/kernel/asm-offsets.c | 1 + + arch/parisc/kernel/asm-offsets.c | 1 + + arch/powerpc/kernel/asm-offsets.c | 1 + + arch/riscv/kernel/asm-offsets.c | 1 + + arch/s390/kernel/asm-offsets.c | 1 + + arch/sh/kernel/asm-offsets.c | 1 + + arch/sparc/kernel/asm-offsets.c | 1 + + arch/um/kernel/asm-offsets.c | 2 ++ + arch/xtensa/kernel/asm-offsets.c | 1 + + 20 files changed, 24 insertions(+) + +diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c +index 05d9296af5ea6..a251f1bc74acf 100644 +--- a/arch/alpha/kernel/asm-offsets.c ++++ b/arch/alpha/kernel/asm-offsets.c +@@ -4,6 +4,7 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c +index 0e884036ab743..897dcfc7c9fa0 100644 +--- a/arch/arc/kernel/asm-offsets.c ++++ b/arch/arc/kernel/asm-offsets.c +@@ -2,6 +2,7 @@ + /* + * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c +index 2c8d76fd7c662..820bc05685bab 100644 +--- a/arch/arm/kernel/asm-offsets.c ++++ b/arch/arm/kernel/asm-offsets.c +@@ -7,6 +7,8 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c +index 1197e7679882e..4785e8947f520 100644 +--- a/arch/arm64/kernel/asm-offsets.c ++++ b/arch/arm64/kernel/asm-offsets.c +@@ -6,6 +6,7 @@ + * 2001-2002 Keith Owens + * Copyright (C) 2012 ARM Ltd. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c +index d1e9035794733..5525c8e7e1d9e 100644 +--- a/arch/csky/kernel/asm-offsets.c ++++ b/arch/csky/kernel/asm-offsets.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c +index 03a7063f94561..50eea9fa6f137 100644 +--- a/arch/hexagon/kernel/asm-offsets.c ++++ b/arch/hexagon/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * + * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. 
+ */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c +index bdd88eda9513f..91b3eae9414f7 100644 +--- a/arch/loongarch/kernel/asm-offsets.c ++++ b/arch/loongarch/kernel/asm-offsets.c +@@ -4,6 +4,8 @@ + * + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c +index 906d732305374..67a1990f9d748 100644 +--- a/arch/m68k/kernel/asm-offsets.c ++++ b/arch/m68k/kernel/asm-offsets.c +@@ -9,6 +9,7 @@ + * #defines from the assembly-language output. + */ + ++#define COMPILE_OFFSETS + #define ASM_OFFSETS_C + + #include +diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c +index 104c3ac5f30c8..b4b67d58e7f6a 100644 +--- a/arch/microblaze/kernel/asm-offsets.c ++++ b/arch/microblaze/kernel/asm-offsets.c +@@ -7,6 +7,7 @@ + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c +index 08342b9eccdbd..0f9ed454faf19 100644 +--- a/arch/mips/kernel/asm-offsets.c ++++ b/arch/mips/kernel/asm-offsets.c +@@ -9,6 +9,8 @@ + * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com + * Copyright (C) 2000 MIPS Technologies, Inc. + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c +index e3d9b7b6fb48a..88190b503ce5d 100644 +--- a/arch/nios2/kernel/asm-offsets.c ++++ b/arch/nios2/kernel/asm-offsets.c +@@ -2,6 +2,7 @@ + /* + * Copyright (C) 2011 Tobias Klauser + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c +index 710651d5aaae1..3cc826f2216b1 100644 +--- a/arch/openrisc/kernel/asm-offsets.c ++++ b/arch/openrisc/kernel/asm-offsets.c +@@ -18,6 +18,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c +index 94652e13c2603..21e900c0aa958 100644 +--- a/arch/parisc/kernel/asm-offsets.c ++++ b/arch/parisc/kernel/asm-offsets.c +@@ -13,6 +13,7 @@ + * Copyright (C) 2002 Randolph Chung + * Copyright (C) 2003 James Bottomley + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c +index 65d79dd0c92ce..5a4edc1e5504f 100644 +--- a/arch/powerpc/kernel/asm-offsets.c ++++ b/arch/powerpc/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. 
+ */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c +index 1ecafbcee9a0a..21f034b3fdbeb 100644 +--- a/arch/riscv/kernel/asm-offsets.c ++++ b/arch/riscv/kernel/asm-offsets.c +@@ -3,6 +3,7 @@ + * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2017 SiFive + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c +index d8ce965c0a97c..9ff68c7f61cc0 100644 +--- a/arch/s390/kernel/asm-offsets.c ++++ b/arch/s390/kernel/asm-offsets.c +@@ -4,6 +4,7 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS + + #define ASM_OFFSETS_C + +diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c +index a0322e8328456..429b6a7631468 100644 +--- a/arch/sh/kernel/asm-offsets.c ++++ b/arch/sh/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c +index 5784f2df489a4..f1e27a7f800f4 100644 +--- a/arch/sparc/kernel/asm-offsets.c ++++ b/arch/sparc/kernel/asm-offsets.c +@@ -10,6 +10,7 @@ + * + * On sparc, thread_info data is static and TI_XXX offsets are computed by hand. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c +index 1fb12235ab9c8..a69873aa697f4 100644 +--- a/arch/um/kernel/asm-offsets.c ++++ b/arch/um/kernel/asm-offsets.c +@@ -1 +1,3 @@ ++#define COMPILE_OFFSETS ++ + #include +diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c +index da38de20ae598..cfbced95e944a 100644 +--- a/arch/xtensa/kernel/asm-offsets.c ++++ b/arch/xtensa/kernel/asm-offsets.c +@@ -11,6 +11,7 @@ + * + * Chris Zankel + */ ++#define COMPILE_OFFSETS + + #include + #include +-- +2.51.0 + diff --git a/queue-6.1/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-6.1/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch new file mode 100644 index 0000000000..e7cdf7f455 --- /dev/null +++ b/queue-6.1/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch @@ -0,0 +1,63 @@ +From 1a4b6afa3244441ca0aea69a7dde7f080c6686da Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 27 Aug 2025 12:10:28 +0100 +Subject: btrfs: always drop log root tree reference in btrfs_replay_log() + +From: Filipe Manana + +[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ] + +Currently we have this odd behaviour: + +1) At btrfs_replay_log() we drop the reference of the log root tree if + the call to btrfs_recover_log_trees() failed; + +2) But if the call to btrfs_recover_log_trees() did not fail, we don't + drop the reference in btrfs_replay_log() - we expect that + btrfs_recover_log_trees() does it in case it returns success. + +Let's simplify this and make btrfs_replay_log() always drop the reference +on the log root tree, not only this simplifies code as it's what makes +sense since it's btrfs_replay_log() who grabbed the reference in the first +place. 
+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/tree-log.c | 1 - + 2 files changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 76a261cbf39d6..8576ba4aa0b7d 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2413,10 +2413,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); ++ btrfs_put_root(log_tree_root); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, + "Failed to recover log tree"); +- btrfs_put_root(log_tree_root); + return ret; + } + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index e4cc287eee993..fdcf66ba318ad 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -7366,7 +7366,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + + log_root_tree->log_root = NULL; + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); +- btrfs_put_root(log_root_tree); + + return 0; + error: +-- +2.51.0 + diff --git a/queue-6.1/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch b/queue-6.1/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch new file mode 100644 index 0000000000..87c2bbfa97 --- /dev/null +++ b/queue-6.1/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch @@ -0,0 +1,44 @@ +From 6afe9f968816990cae616be5a5ce679304c90cec Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Sep 2025 17:01:44 +0200 +Subject: btrfs: scrub: replace max_t()/min_t() with clamp() in + scrub_throttle_dev_io() + +From: Thorsten Blum + +[ Upstream commit a7f3dfb8293c4cee99743132d69863a92e8f4875 ] + +Replace max_t() followed by min_t() with a single clamp(). + +As was pointed by David Laight in +https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/ +the calculation may overflow u32 when the input value is too large, so +clamp_t() is not used. In practice the expected values are in range of +megabytes to gigabytes (throughput limit) so the bug would not happen. + +Signed-off-by: Thorsten Blum +Reviewed-by: David Sterba +[ Use clamp() and add explanation. ] +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/scrub.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index f48895a9b165e..ce8a9c226534f 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -2191,8 +2191,7 @@ static void scrub_throttle(struct scrub_ctx *sctx) + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. 
+ */ +- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); +- div = min_t(u32, 64, div); ++ div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); + + /* Start new epoch, set deadline */ + now = ktime_get(); +-- +2.51.0 + diff --git a/queue-6.1/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-6.1/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch new file mode 100644 index 0000000000..01d02c987c --- /dev/null +++ b/queue-6.1/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch @@ -0,0 +1,58 @@ +From db12f5ea1c6e8ad6962fd524fdc61c9a6f3158ba Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Sep 2025 12:09:14 +0100 +Subject: btrfs: use smp_mb__after_atomic() when forcing COW in + create_pending_snapshot() + +From: Filipe Manana + +[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ] + +After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a +full write barrier, smp_wmb(), but we don't need to, all we need is a +smp_mb__after_atomic(). The use of the smp_wmb() is from the old days +when we didn't use a bit and used instead an int field in the root to +signal if cow is forced. After the int field was changed to a bit in +the root's state (flags field), we forgot to update the memory barrier +in create_pending_snapshot() to smp_mb__after_atomic(), but we did the +change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That +happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer +data type for the some variants in btrfs_root"). On the reader side, in +should_cow_block(), we also use the counterpart smp_mb__before_atomic() +which generates further confusion. + +So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't +even need any barrier at all since create_pending_snapshot() is called +in the critical section of a transaction commit and therefore no one +can concurrently join/attach the transaction, or start a new one, until +the transaction is unblocked. By the time someone starts a new transaction +and enters should_cow_block(), a lot of implicit memory barriers already +took place by having acquired several locks such as fs_info->trans_lock +and extent buffer locks on the root node at least. Nevertlheless, for +consistency use smp_mb__after_atomic() after setting the force cow bit +in create_pending_snapshot(). 
+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/transaction.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index ff3e0d4cf4b48..54894a950c6f7 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1787,7 +1787,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + } + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); +- smp_wmb(); ++ smp_mb__after_atomic(); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ +-- +2.51.0 + diff --git a/queue-6.1/btrfs-zoned-refine-extent-allocator-hint-selection.patch b/queue-6.1/btrfs-zoned-refine-extent-allocator-hint-selection.patch new file mode 100644 index 0000000000..20c909b5bd --- /dev/null +++ b/queue-6.1/btrfs-zoned-refine-extent-allocator-hint-selection.patch @@ -0,0 +1,59 @@ +From 138ae4c24306cd63476f082697a728b825b23160 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Jul 2025 11:13:15 +0900 +Subject: btrfs: zoned: refine extent allocator hint selection + +From: Naohiro Aota + +[ Upstream commit 0d703963d297964451783e1a0688ebdf74cd6151 ] + +The hint block group selection in the extent allocator is wrong in the +first place, as it can select the dedicated data relocation block group for +the normal data allocation. + +Since we separated the normal data space_info and the data relocation +space_info, we can easily identify a block group is for data relocation or +not. Do not choose it for the normal data allocation. + +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent-tree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 5395e27f9e89a..7985ca56f6b70 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -4224,7 +4224,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + } + + static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, +- struct find_free_extent_ctl *ffe_ctl) ++ struct find_free_extent_ctl *ffe_ctl, ++ struct btrfs_space_info *space_info) + { + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); +@@ -4248,6 +4249,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && ++ block_group->space_info == space_info && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; +@@ -4269,7 +4271,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, + return prepare_allocation_clustered(fs_info, ffe_ctl, + space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: +- return prepare_allocation_zoned(fs_info, ffe_ctl); ++ return prepare_allocation_zoned(fs_info, ffe_ctl, space_info); + default: + BUG(); + } +-- +2.51.0 + diff --git a/queue-6.1/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch b/queue-6.1/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch new file mode 100644 index 0000000000..2923101afa --- /dev/null +++ b/queue-6.1/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch @@ -0,0 +1,89 @@ +From 0eb6e482475c0c284a317ac5c506a4b8996c084e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 16 Sep 2025 
20:30:17 +0000 +Subject: EDAC/mc_sysfs: Increase legacy channel support to 16 + +From: Avadhut Naik + +[ Upstream commit 6e1c2c6c2c40ce99e0d2633b212f43c702c1a002 ] + +Newer AMD systems can support up to 16 channels per EDAC "mc" device. +These are detected by the EDAC module running on the device, and the +current EDAC interface is appropriately enumerated. + +The legacy EDAC sysfs interface however, provides device attributes for +channels 0 through 11 only. Consequently, the last four channels, 12 +through 15, will not be enumerated and will not be visible through the +legacy sysfs interface. + +Add additional device attributes to ensure that all 16 channels, if +present, are enumerated by and visible through the legacy EDAC sysfs +interface. + +Signed-off-by: Avadhut Naik +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250916203242.1281036-1-avadhut.naik@amd.com +Signed-off-by: Sasha Levin +--- + drivers/edac/edac_mc_sysfs.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c +index 15f63452a9bec..b01436d9ddaed 100644 +--- a/drivers/edac/edac_mc_sysfs.c ++++ b/drivers/edac/edac_mc_sysfs.c +@@ -306,6 +306,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 10); + DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 11); ++DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 12); ++DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 13); ++DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 14); ++DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 15); + + /* Total possible dynamic DIMM Label attribute file table */ + static struct attribute *dynamic_csrow_dimm_attr[] = { +@@ -321,6 +329,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = { + &dev_attr_legacy_ch9_dimm_label.attr.attr, + &dev_attr_legacy_ch10_dimm_label.attr.attr, + &dev_attr_legacy_ch11_dimm_label.attr.attr, ++ &dev_attr_legacy_ch12_dimm_label.attr.attr, ++ &dev_attr_legacy_ch13_dimm_label.attr.attr, ++ &dev_attr_legacy_ch14_dimm_label.attr.attr, ++ &dev_attr_legacy_ch15_dimm_label.attr.attr, + NULL + }; + +@@ -349,6 +361,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 10); + DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 11); ++DEVICE_CHANNEL(ch12_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 12); ++DEVICE_CHANNEL(ch13_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 13); ++DEVICE_CHANNEL(ch14_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 14); ++DEVICE_CHANNEL(ch15_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 15); + + /* Total possible dynamic ce_count attribute file table */ + static struct attribute *dynamic_csrow_ce_count_attr[] = { +@@ -364,6 +384,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { + &dev_attr_legacy_ch9_ce_count.attr.attr, + &dev_attr_legacy_ch10_ce_count.attr.attr, + &dev_attr_legacy_ch11_ce_count.attr.attr, ++ &dev_attr_legacy_ch12_ce_count.attr.attr, ++ &dev_attr_legacy_ch13_ce_count.attr.attr, ++ &dev_attr_legacy_ch14_ce_count.attr.attr, ++ &dev_attr_legacy_ch15_ce_count.attr.attr, + NULL + }; + +-- +2.51.0 + diff --git 
a/queue-6.1/perf-have-get_perf_callchain-return-null-if-crosstas.patch b/queue-6.1/perf-have-get_perf_callchain-return-null-if-crosstas.patch new file mode 100644 index 0000000000..4a19fd1f75 --- /dev/null +++ b/queue-6.1/perf-have-get_perf_callchain-return-null-if-crosstas.patch @@ -0,0 +1,68 @@ +From db01bc1334e68bb784336c7c6a17f5330fe8bd7a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 14:03:40 -0400 +Subject: perf: Have get_perf_callchain() return NULL if crosstask and user are + set + +From: Josh Poimboeuf + +[ Upstream commit 153f9e74dec230f2e070e16fa061bc7adfd2c450 ] + +get_perf_callchain() doesn't support cross-task unwinding for user space +stacks, have it return NULL if both the crosstask and user arguments are +set. + +Signed-off-by: Josh Poimboeuf +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20250820180428.426423415@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/events/callchain.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c +index 1273be84392cf..ce5534c97cd1d 100644 +--- a/kernel/events/callchain.c ++++ b/kernel/events/callchain.c +@@ -184,6 +184,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + struct perf_callchain_entry_ctx ctx; + int rctx; + ++ /* crosstask is not supported for user stacks */ ++ if (crosstask && user && !kernel) ++ return NULL; ++ + entry = get_callchain_entry(&rctx); + if (!entry) + return NULL; +@@ -200,7 +204,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + perf_callchain_kernel(&ctx, regs); + } + +- if (user) { ++ if (user && !crosstask) { + if (!user_mode(regs)) { + if (current->mm) + regs = task_pt_regs(current); +@@ -209,9 +213,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + } + + if (regs) { +- if (crosstask) +- goto exit_put; +- + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + +@@ -219,7 +220,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + } + } + +-exit_put: + put_callchain_entry(rctx); + + return entry; +-- +2.51.0 + diff --git a/queue-6.1/series b/queue-6.1/series index 64bc2c1eea..3c8a42c3f4 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -1 +1,9 @@ net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch +perf-have-get_perf_callchain-return-null-if-crosstas.patch +x86-bugs-fix-reporting-of-lfence-retpoline.patch +edac-mc_sysfs-increase-legacy-channel-support-to-16.patch +btrfs-zoned-refine-extent-allocator-hint-selection.patch +btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch +btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch +btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch +arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch diff --git a/queue-6.1/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-6.1/x86-bugs-fix-reporting-of-lfence-retpoline.patch new file mode 100644 index 0000000000..93e2653ffc --- /dev/null +++ b/queue-6.1/x86-bugs-fix-reporting-of-lfence-retpoline.patch @@ -0,0 +1,51 @@ +From 83bed2b0363895cb49a0657a2060b85a29fbee20 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Sep 2025 08:47:05 -0500 +Subject: x86/bugs: Fix reporting of LFENCE retpoline + +From: David Kaplan + +[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ] + +The LFENCE retpoline mitigation is not secure but the kernel prints +inconsistent 
messages about this fact. The dmesg log says 'Mitigation: +LFENCE', implying the system is mitigated. But sysfs reports 'Vulnerable: +LFENCE' implying the system (correctly) is not mitigated. + +Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere +when this mitigation is selected. + +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index ff8965bce6c90..a0b362ac50a1b 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1539,7 +1539,7 @@ spectre_v2_user_select_mitigation(void) + static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", +- [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", ++ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", +@@ -3169,9 +3169,6 @@ static const char *spectre_bhi_state(void) + + static ssize_t spectre_v2_show_state(char *buf) + { +- if (spectre_v2_enabled == SPECTRE_V2_LFENCE) +- return sysfs_emit(buf, "Vulnerable: LFENCE\n"); +- + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + +-- +2.51.0 + diff --git a/queue-6.12/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch b/queue-6.12/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch new file mode 100644 index 0000000000..f6370bd6f3 --- /dev/null +++ b/queue-6.12/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch @@ -0,0 +1,295 @@ +From 8905c9ee0afca1bccc75668f38209611ee29903d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Sep 2025 14:09:13 +0800 +Subject: arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c + +From: Menglong Dong + +[ Upstream commit 35561bab768977c9e05f1f1a9bc00134c85f3e28 ] + +The include/generated/asm-offsets.h is generated in Kbuild during +compiling from arch/SRCARCH/kernel/asm-offsets.c. When we want to +generate another similar offset header file, circular dependency can +happen. + +For example, we want to generate a offset file include/generated/test.h, +which is included in include/sched/sched.h. If we generate asm-offsets.h +first, it will fail, as include/sched/sched.h is included in asm-offsets.c +and include/generated/test.h doesn't exist; If we generate test.h first, +it can't success neither, as include/generated/asm-offsets.h is included +by it. + +In x86_64, the macro COMPILE_OFFSETS is used to avoid such circular +dependency. We can generate asm-offsets.h first, and if the +COMPILE_OFFSETS is defined, we don't include the "generated/test.h". + +And we define the macro COMPILE_OFFSETS for all the asm-offsets.c for this +purpose. 
+ +Signed-off-by: Menglong Dong +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Sasha Levin +--- + arch/alpha/kernel/asm-offsets.c | 1 + + arch/arc/kernel/asm-offsets.c | 1 + + arch/arm/kernel/asm-offsets.c | 2 ++ + arch/arm64/kernel/asm-offsets.c | 1 + + arch/csky/kernel/asm-offsets.c | 1 + + arch/hexagon/kernel/asm-offsets.c | 1 + + arch/loongarch/kernel/asm-offsets.c | 2 ++ + arch/m68k/kernel/asm-offsets.c | 1 + + arch/microblaze/kernel/asm-offsets.c | 1 + + arch/mips/kernel/asm-offsets.c | 2 ++ + arch/nios2/kernel/asm-offsets.c | 1 + + arch/openrisc/kernel/asm-offsets.c | 1 + + arch/parisc/kernel/asm-offsets.c | 1 + + arch/powerpc/kernel/asm-offsets.c | 1 + + arch/riscv/kernel/asm-offsets.c | 1 + + arch/s390/kernel/asm-offsets.c | 1 + + arch/sh/kernel/asm-offsets.c | 1 + + arch/sparc/kernel/asm-offsets.c | 1 + + arch/um/kernel/asm-offsets.c | 2 ++ + arch/xtensa/kernel/asm-offsets.c | 1 + + 20 files changed, 24 insertions(+) + +diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c +index e9dad60b147f3..1ebb058904992 100644 +--- a/arch/alpha/kernel/asm-offsets.c ++++ b/arch/alpha/kernel/asm-offsets.c +@@ -4,6 +4,7 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c +index f77deb7991757..2978da85fcb65 100644 +--- a/arch/arc/kernel/asm-offsets.c ++++ b/arch/arc/kernel/asm-offsets.c +@@ -2,6 +2,7 @@ + /* + * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c +index 4853875740d0f..d9f129c584b1d 100644 +--- a/arch/arm/kernel/asm-offsets.c ++++ b/arch/arm/kernel/asm-offsets.c +@@ -7,6 +7,8 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c +index b21dd24b8efc3..020e01181a0f1 100644 +--- a/arch/arm64/kernel/asm-offsets.c ++++ b/arch/arm64/kernel/asm-offsets.c +@@ -6,6 +6,7 @@ + * 2001-2002 Keith Owens + * Copyright (C) 2012 ARM Ltd. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c +index d1e9035794733..5525c8e7e1d9e 100644 +--- a/arch/csky/kernel/asm-offsets.c ++++ b/arch/csky/kernel/asm-offsets.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c +index 03a7063f94561..50eea9fa6f137 100644 +--- a/arch/hexagon/kernel/asm-offsets.c ++++ b/arch/hexagon/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * + * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. 
+ */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c +index bee9f7a3108f0..d20d71d4bcae6 100644 +--- a/arch/loongarch/kernel/asm-offsets.c ++++ b/arch/loongarch/kernel/asm-offsets.c +@@ -4,6 +4,8 @@ + * + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c +index 906d732305374..67a1990f9d748 100644 +--- a/arch/m68k/kernel/asm-offsets.c ++++ b/arch/m68k/kernel/asm-offsets.c +@@ -9,6 +9,7 @@ + * #defines from the assembly-language output. + */ + ++#define COMPILE_OFFSETS + #define ASM_OFFSETS_C + + #include +diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c +index 104c3ac5f30c8..b4b67d58e7f6a 100644 +--- a/arch/microblaze/kernel/asm-offsets.c ++++ b/arch/microblaze/kernel/asm-offsets.c +@@ -7,6 +7,7 @@ + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c +index cb1045ebab062..22c99a2cd5707 100644 +--- a/arch/mips/kernel/asm-offsets.c ++++ b/arch/mips/kernel/asm-offsets.c +@@ -9,6 +9,8 @@ + * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com + * Copyright (C) 2000 MIPS Technologies, Inc. + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c +index e3d9b7b6fb48a..88190b503ce5d 100644 +--- a/arch/nios2/kernel/asm-offsets.c ++++ b/arch/nios2/kernel/asm-offsets.c +@@ -2,6 +2,7 @@ + /* + * Copyright (C) 2011 Tobias Klauser + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c +index 710651d5aaae1..3cc826f2216b1 100644 +--- a/arch/openrisc/kernel/asm-offsets.c ++++ b/arch/openrisc/kernel/asm-offsets.c +@@ -18,6 +18,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c +index 757816a7bd4b2..9abfe65492c65 100644 +--- a/arch/parisc/kernel/asm-offsets.c ++++ b/arch/parisc/kernel/asm-offsets.c +@@ -13,6 +13,7 @@ + * Copyright (C) 2002 Randolph Chung + * Copyright (C) 2003 James Bottomley + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c +index 131a8cc10dbe8..cbeeda45c00a2 100644 +--- a/arch/powerpc/kernel/asm-offsets.c ++++ b/arch/powerpc/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. 
+ */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c +index c2f3129a8e5cf..05c6152a65310 100644 +--- a/arch/riscv/kernel/asm-offsets.c ++++ b/arch/riscv/kernel/asm-offsets.c +@@ -3,6 +3,7 @@ + * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2017 SiFive + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c +index 5529248d84fb8..3cfc4939033c9 100644 +--- a/arch/s390/kernel/asm-offsets.c ++++ b/arch/s390/kernel/asm-offsets.c +@@ -4,6 +4,7 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS + + #define ASM_OFFSETS_C + +diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c +index a0322e8328456..429b6a7631468 100644 +--- a/arch/sh/kernel/asm-offsets.c ++++ b/arch/sh/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c +index 3d9b9855dce91..6e660bde48dd8 100644 +--- a/arch/sparc/kernel/asm-offsets.c ++++ b/arch/sparc/kernel/asm-offsets.c +@@ -10,6 +10,7 @@ + * + * On sparc, thread_info data is static and TI_XXX offsets are computed by hand. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c +index 1fb12235ab9c8..a69873aa697f4 100644 +--- a/arch/um/kernel/asm-offsets.c ++++ b/arch/um/kernel/asm-offsets.c +@@ -1 +1,3 @@ ++#define COMPILE_OFFSETS ++ + #include +diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c +index da38de20ae598..cfbced95e944a 100644 +--- a/arch/xtensa/kernel/asm-offsets.c ++++ b/arch/xtensa/kernel/asm-offsets.c +@@ -11,6 +11,7 @@ + * + * Chris Zankel + */ ++#define COMPILE_OFFSETS + + #include + #include +-- +2.51.0 + diff --git a/queue-6.12/audit-record-fanotify-event-regardless-of-presence-o.patch b/queue-6.12/audit-record-fanotify-event-regardless-of-presence-o.patch new file mode 100644 index 0000000000..d2d02b83ac --- /dev/null +++ b/queue-6.12/audit-record-fanotify-event-regardless-of-presence-o.patch @@ -0,0 +1,44 @@ +From 9c66e5dad6997a22b7ffbbecaed782d5db5c2542 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 17:04:07 -0400 +Subject: audit: record fanotify event regardless of presence of rules + +From: Richard Guy Briggs + +[ Upstream commit ce8370e2e62a903e18be7dd0e0be2eee079501e1 ] + +When no audit rules are in place, fanotify event results are +unconditionally dropped due to an explicit check for the existence of +any audit rules. Given this is a report from another security +sub-system, allow it to be recorded regardless of the existence of any +audit rules. + +To test, install and run the fapolicyd daemon with default config. Then +as an unprivileged user, create and run a very simple binary that should +be denied. 
Then check for an event with + ausearch -m FANOTIFY -ts recent + +Link: https://issues.redhat.com/browse/RHEL-9065 +Signed-off-by: Richard Guy Briggs +Signed-off-by: Paul Moore +Signed-off-by: Sasha Levin +--- + include/linux/audit.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/audit.h b/include/linux/audit.h +index a394614ccd0b8..e3f06eba9c6e6 100644 +--- a/include/linux/audit.h ++++ b/include/linux/audit.h +@@ -527,7 +527,7 @@ static inline void audit_log_kern_module(const char *name) + + static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) + { +- if (!audit_dummy_context()) ++ if (audit_enabled) + __audit_fanotify(response, friar); + } + +-- +2.51.0 + diff --git a/queue-6.12/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch b/queue-6.12/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch new file mode 100644 index 0000000000..c2f5abe713 --- /dev/null +++ b/queue-6.12/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch @@ -0,0 +1,39 @@ +From 88d6ab1b772266a93ddd13ce9c15485bedc4322c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Sep 2025 17:43:04 +0100 +Subject: btrfs: abort transaction if we fail to update inode in log replay dir + fixup + +From: Filipe Manana + +[ Upstream commit 5a0565cad3ef7cbf4cf43d1dd1e849b156205292 ] + +If we fail to update the inode at link_to_fixup_dir(), we don't abort the +transaction and propagate the error up the call chain, which makes it hard +to pinpoint the error to the inode update. So abort the transaction if the +inode update call fails, so that if it happens we known immediately. + +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index b43a7c0c7cb7a..173e13e1d5b88 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -1778,6 +1778,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + else + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); + } else if (ret == -EEXIST) { + ret = 0; + } +-- +2.51.0 + diff --git a/queue-6.12/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch b/queue-6.12/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch new file mode 100644 index 0000000000..2157590d3d --- /dev/null +++ b/queue-6.12/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch @@ -0,0 +1,77 @@ +From 9bdeac056de0e4eb2dfe729346d08037d6e5c175 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Jul 2025 15:49:31 +0100 +Subject: btrfs: abort transaction in the process_one_buffer() log tree walk + callback + +From: Filipe Manana + +[ Upstream commit e6dd405b6671b9753b98d8bdf76f8f0ed36c11cd ] + +In the process_one_buffer() log tree walk callback we return errors to the +log tree walk caller and then the caller aborts the transaction, if we +have one, or turns the fs into error state if we don't have one. While +this reduces code it makes it harder to figure out where exactly an error +came from. So add the transaction aborts after every failure inside the +process_one_buffer() callback, so that it helps figuring out why failures +happen. 
+ +Reviewed-by: Boris Burkov +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 20 ++++++++++++++++---- + 1 file changed, 16 insertions(+), 4 deletions(-) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index f3ca530f032df..1c207a6d71ecf 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -350,6 +350,7 @@ static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen, int level) + { ++ struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_fs_info *fs_info = log->fs_info; + int ret = 0; + +@@ -364,18 +365,29 @@ static int process_one_buffer(struct btrfs_root *log, + }; + + ret = btrfs_read_extent_buffer(eb, &check); +- if (ret) ++ if (ret) { ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; ++ } + } + + if (wc->pin) { +- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); +- if (ret) ++ ASSERT(trans != NULL); ++ ret = btrfs_pin_extent_for_log_replay(trans, eb); ++ if (ret) { ++ btrfs_abort_transaction(trans, ret); + return ret; ++ } + + if (btrfs_buffer_uptodate(eb, gen, 0) && +- btrfs_header_level(eb) == 0) ++ btrfs_header_level(eb) == 0) { + ret = btrfs_exclude_logged_extents(eb); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ } + } + return ret; + } +-- +2.51.0 + diff --git a/queue-6.12/btrfs-abort-transaction-on-specific-error-places-whe.patch b/queue-6.12/btrfs-abort-transaction-on-specific-error-places-whe.patch new file mode 100644 index 0000000000..fc41af4e11 --- /dev/null +++ b/queue-6.12/btrfs-abort-transaction-on-specific-error-places-whe.patch @@ -0,0 +1,111 @@ +From 81186743181cc300f669cb0bfb781e773fa6ea6b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Jul 2025 14:56:11 +0100 +Subject: btrfs: abort transaction on specific error places when walking log + tree + +From: Filipe Manana + +[ Upstream commit 6ebd726b104fa99d47c0d45979e6a6109844ac18 ] + +We do several things while walking a log tree (for replaying and for +freeing a log tree) like reading extent buffers and cleaning them up, +but we don't immediately abort the transaction, or turn the fs into an +error state, when one of these things fails. Instead we the transaction +abort or turn the fs into error state in the caller of the entry point +function that walks a log tree - walk_log_tree() - which means we don't +get to know exactly where an error came from. + +Improve on this by doing a transaction abort / turn fs into error state +after each such failure so that when it happens we have a better +understanding where the failure comes from. This deliberately leaves +the transaction abort / turn fs into error state in the callers of +walk_log_tree() as to ensure we don't get into an inconsistent state in +case we forget to do it deeper in call chain. It also deliberately does +not do it after errors from the calls to the callback defined in +struct walk_control::process_func(), as we will do it later on another +patch. 
+ +Reviewed-by: Boris Burkov +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 33 ++++++++++++++++++++++++++++----- + 1 file changed, 28 insertions(+), 5 deletions(-) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 0022ad003791f..f3ca530f032df 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -2612,15 +2612,24 @@ static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) + static int clean_log_buffer(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) + { ++ int ret; ++ + btrfs_tree_lock(eb); + btrfs_clear_buffer_dirty(trans, eb); + wait_on_extent_buffer_writeback(eb); + btrfs_tree_unlock(eb); + +- if (trans) +- return btrfs_pin_reserved_extent(trans, eb); ++ if (trans) { ++ ret = btrfs_pin_reserved_extent(trans, eb); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ return ret; ++ } + +- return unaccount_log_buffer(eb->fs_info, eb->start); ++ ret = unaccount_log_buffer(eb->fs_info, eb->start); ++ if (ret) ++ btrfs_handle_fs_error(eb->fs_info, ret, NULL); ++ return ret; + } + + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, +@@ -2656,8 +2665,14 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + next = btrfs_find_create_tree_block(fs_info, bytenr, + btrfs_header_owner(cur), + *level - 1); +- if (IS_ERR(next)) +- return PTR_ERR(next); ++ if (IS_ERR(next)) { ++ ret = PTR_ERR(next); ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); ++ return ret; ++ } + + if (*level == 1) { + ret = wc->process_func(root, next, wc, ptr_gen, +@@ -2672,6 +2687,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; + } + +@@ -2687,6 +2706,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; + } + +-- +2.51.0 + diff --git a/queue-6.12/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-6.12/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch new file mode 100644 index 0000000000..15c93c8f7d --- /dev/null +++ b/queue-6.12/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch @@ -0,0 +1,63 @@ +From c6fd6b1021ea066aa250a87df9a586cacfc01851 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 27 Aug 2025 12:10:28 +0100 +Subject: btrfs: always drop log root tree reference in btrfs_replay_log() + +From: Filipe Manana + +[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ] + +Currently we have this odd behaviour: + +1) At btrfs_replay_log() we drop the reference of the log root tree if + the call to btrfs_recover_log_trees() failed; + +2) But if the call to btrfs_recover_log_trees() did not fail, we don't + drop the reference in btrfs_replay_log() - we expect that + btrfs_recover_log_trees() does it in case it returns success. + +Let's simplify this and make btrfs_replay_log() always drop the reference +on the log root tree, not only this simplifies code as it's what makes +sense since it's btrfs_replay_log() who grabbed the reference in the first +place. 
+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/tree-log.c | 1 - + 2 files changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index e655fa3bfd9be..3a73d218af464 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2100,10 +2100,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); ++ btrfs_put_root(log_tree_root); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, + "Failed to recover log tree"); +- btrfs_put_root(log_tree_root); + return ret; + } + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 1c207a6d71ecf..63b14005f5066 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -7457,7 +7457,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + + log_root_tree->log_root = NULL; + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); +- btrfs_put_root(log_root_tree); + + return 0; + error: +-- +2.51.0 + diff --git a/queue-6.12/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch b/queue-6.12/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch new file mode 100644 index 0000000000..4104ceca36 --- /dev/null +++ b/queue-6.12/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch @@ -0,0 +1,44 @@ +From 8110fe15ebe327b95bcf726f399a81011b0fa47e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Sep 2025 17:01:44 +0200 +Subject: btrfs: scrub: replace max_t()/min_t() with clamp() in + scrub_throttle_dev_io() + +From: Thorsten Blum + +[ Upstream commit a7f3dfb8293c4cee99743132d69863a92e8f4875 ] + +Replace max_t() followed by min_t() with a single clamp(). + +As was pointed by David Laight in +https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/ +the calculation may overflow u32 when the input value is too large, so +clamp_t() is not used. In practice the expected values are in range of +megabytes to gigabytes (throughput limit) so the bug would not happen. + +Signed-off-by: Thorsten Blum +Reviewed-by: David Sterba +[ Use clamp() and add explanation. ] +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/scrub.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 3fcc7c092c5ec..9a6e0b047d3b6 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -1270,8 +1270,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. 
+ */ +- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); +- div = min_t(u32, 64, div); ++ div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); + + /* Start new epoch, set deadline */ + now = ktime_get(); +-- +2.51.0 + diff --git a/queue-6.12/btrfs-tree-checker-add-inode-extref-checks.patch b/queue-6.12/btrfs-tree-checker-add-inode-extref-checks.patch new file mode 100644 index 0000000000..030d2d5b05 --- /dev/null +++ b/queue-6.12/btrfs-tree-checker-add-inode-extref-checks.patch @@ -0,0 +1,90 @@ +From 277716e473637786bb6a577d628c45e2e3465378 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 16 Sep 2025 08:34:05 +0930 +Subject: btrfs: tree-checker: add inode extref checks + +From: Qu Wenruo + +[ Upstream commit aab9458b9f0019e97fae394c2d6d9d1a03addfb3 ] + +Like inode refs, inode extrefs have a variable length name, which means +we have to do a proper check to make sure no header nor name can exceed +the item limits. + +The check itself is very similar to check_inode_ref(), just a different +structure (btrfs_inode_extref vs btrfs_inode_ref). + +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-checker.c | 37 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 37 insertions(+) + +diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c +index 14f96d217e6e1..986b1612d5b04 100644 +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, + /* Only these key->types needs to be checked */ + ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || + key->type == BTRFS_INODE_REF_KEY || ++ key->type == BTRFS_INODE_EXTREF_KEY || + key->type == BTRFS_DIR_INDEX_KEY || + key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_EXTENT_DATA_KEY); +@@ -1770,6 +1771,39 @@ static int check_inode_ref(struct extent_buffer *leaf, + return 0; + } + ++static int check_inode_extref(struct extent_buffer *leaf, ++ struct btrfs_key *key, struct btrfs_key *prev_key, ++ int slot) ++{ ++ unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); ++ unsigned long end = ptr + btrfs_item_size(leaf, slot); ++ ++ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) ++ return -EUCLEAN; ++ ++ while (ptr < end) { ++ struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr; ++ u16 namelen; ++ ++ if (unlikely(ptr + sizeof(*extref)) > end) { ++ inode_ref_err(leaf, slot, ++ "inode extref overflow, ptr %lu end %lu inode_extref size %zu", ++ ptr, end, sizeof(*extref)); ++ return -EUCLEAN; ++ } ++ ++ namelen = btrfs_inode_extref_name_len(leaf, extref); ++ if (unlikely(ptr + sizeof(*extref) + namelen > end)) { ++ inode_ref_err(leaf, slot, ++ "inode extref overflow, ptr %lu end %lu namelen %u", ++ ptr, end, namelen); ++ return -EUCLEAN; ++ } ++ ptr += sizeof(*extref) + namelen; ++ } ++ return 0; ++} ++ + static int check_raid_stripe_extent(const struct extent_buffer *leaf, + const struct btrfs_key *key, int slot) + { +@@ -1881,6 +1915,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, + case BTRFS_INODE_REF_KEY: + ret = check_inode_ref(leaf, key, prev_key, slot); + break; ++ case BTRFS_INODE_EXTREF_KEY: ++ ret = check_inode_extref(leaf, key, prev_key, slot); ++ break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + ret = check_block_group_item(leaf, key, slot); + break; +-- +2.51.0 + diff --git a/queue-6.12/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch 
b/queue-6.12/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch new file mode 100644 index 0000000000..011f9ea57b --- /dev/null +++ b/queue-6.12/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch @@ -0,0 +1,50 @@ +From bbe02836089d717f5f340b95731ecc35b7434a60 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 28 Aug 2025 17:46:18 +0100 +Subject: btrfs: use level argument in log tree walk callback + replay_one_buffer() + +From: Filipe Manana + +[ Upstream commit 6cb7f0b8c9b0d6a35682335fea88bd26f089306f ] + +We already have the extent buffer's level in an argument, there's no need +to first ensure the extent buffer's data is loaded (by calling +btrfs_read_extent_buffer()) and then call btrfs_header_level() to check +the level. So use the level argument and do the check before calling +btrfs_read_extent_buffer(). + +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 63b14005f5066..b43a7c0c7cb7a 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -2443,15 +2443,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + int i; + int ret; + ++ if (level != 0) ++ return 0; ++ + ret = btrfs_read_extent_buffer(eb, &check); + if (ret) + return ret; + +- level = btrfs_header_level(eb); +- +- if (level != 0) +- return 0; +- + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +-- +2.51.0 + diff --git a/queue-6.12/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-6.12/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch new file mode 100644 index 0000000000..16b2efd495 --- /dev/null +++ b/queue-6.12/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch @@ -0,0 +1,58 @@ +From b6543dc88730987fe194ba4d74a9fb0de649075c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Sep 2025 12:09:14 +0100 +Subject: btrfs: use smp_mb__after_atomic() when forcing COW in + create_pending_snapshot() + +From: Filipe Manana + +[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ] + +After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a +full write barrier, smp_wmb(), but we don't need to, all we need is a +smp_mb__after_atomic(). The use of the smp_wmb() is from the old days +when we didn't use a bit and used instead an int field in the root to +signal if cow is forced. After the int field was changed to a bit in +the root's state (flags field), we forgot to update the memory barrier +in create_pending_snapshot() to smp_mb__after_atomic(), but we did the +change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That +happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer +data type for the some variants in btrfs_root"). On the reader side, in +should_cow_block(), we also use the counterpart smp_mb__before_atomic() +which generates further confusion. + +So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't +even need any barrier at all since create_pending_snapshot() is called +in the critical section of a transaction commit and therefore no one +can concurrently join/attach the transaction, or start a new one, until +the transaction is unblocked. 
By the time someone starts a new transaction +and enters should_cow_block(), a lot of implicit memory barriers already +took place by having acquired several locks such as fs_info->trans_lock +and extent buffer locks on the root node at least. Nevertlheless, for +consistency use smp_mb__after_atomic() after setting the force cow bit +in create_pending_snapshot(). + +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/transaction.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index 1a029392eac52..f4dda72491feb 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1810,7 +1810,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + } + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); +- smp_wmb(); ++ smp_mb__after_atomic(); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ +-- +2.51.0 + diff --git a/queue-6.12/btrfs-zoned-refine-extent-allocator-hint-selection.patch b/queue-6.12/btrfs-zoned-refine-extent-allocator-hint-selection.patch new file mode 100644 index 0000000000..0a8157714b --- /dev/null +++ b/queue-6.12/btrfs-zoned-refine-extent-allocator-hint-selection.patch @@ -0,0 +1,59 @@ +From b99faec603c7d9385177f7858dac67ed1d08c110 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Jul 2025 11:13:15 +0900 +Subject: btrfs: zoned: refine extent allocator hint selection + +From: Naohiro Aota + +[ Upstream commit 0d703963d297964451783e1a0688ebdf74cd6151 ] + +The hint block group selection in the extent allocator is wrong in the +first place, as it can select the dedicated data relocation block group for +the normal data allocation. + +Since we separated the normal data space_info and the data relocation +space_info, we can easily identify a block group is for data relocation or +not. Do not choose it for the normal data allocation. 
+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent-tree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index bb3602059906d..7bab2512468d5 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -4299,7 +4299,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + } + + static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, +- struct find_free_extent_ctl *ffe_ctl) ++ struct find_free_extent_ctl *ffe_ctl, ++ struct btrfs_space_info *space_info) + { + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); +@@ -4323,6 +4324,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && ++ block_group->space_info == space_info && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; +@@ -4344,7 +4346,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, + return prepare_allocation_clustered(fs_info, ffe_ctl, + space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: +- return prepare_allocation_zoned(fs_info, ffe_ctl); ++ return prepare_allocation_zoned(fs_info, ffe_ctl, space_info); + default: + BUG(); + } +-- +2.51.0 + diff --git a/queue-6.12/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch b/queue-6.12/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch new file mode 100644 index 0000000000..97d2401604 --- /dev/null +++ b/queue-6.12/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch @@ -0,0 +1,111 @@ +From 4a48cb581a19341097dba34421a1cb023b4d9e32 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 22 Jul 2025 13:39:11 +0200 +Subject: btrfs: zoned: return error from btrfs_zone_finish_endio() + +From: Johannes Thumshirn + +[ Upstream commit 3c44cd3c79fcb38a86836dea6ff8fec322a9e68c ] + +Now that btrfs_zone_finish_endio_workfn() is directly calling +do_zone_finish() the only caller of btrfs_zone_finish_endio() is +btrfs_finish_one_ordered(). + +btrfs_finish_one_ordered() already has error handling in-place so +btrfs_zone_finish_endio() can return an error if the block group lookup +fails. + +Also as btrfs_zone_finish_endio() already checks for zoned filesystems and +returns early, there's no need to do this in the caller. 
+ +Reviewed-by: Damien Le Moal +Signed-off-by: Johannes Thumshirn +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/inode.c | 7 ++++--- + fs/btrfs/zoned.c | 8 +++++--- + fs/btrfs/zoned.h | 9 ++++++--- + 3 files changed, 15 insertions(+), 9 deletions(-) + +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 19c0ec9c327c1..e32dd4193aea1 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3174,9 +3174,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) + goto out; + } + +- if (btrfs_is_zoned(fs_info)) +- btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, +- ordered_extent->disk_num_bytes); ++ ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ++ ordered_extent->disk_num_bytes); ++ if (ret) ++ goto out; + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 4966b4f5a7d24..64e0a5bf5f9a5 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2384,16 +2384,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) + return ret; + } + +-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) ++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) + { + struct btrfs_block_group *block_group; + u64 min_alloc_bytes; + + if (!btrfs_is_zoned(fs_info)) +- return; ++ return 0; + + block_group = btrfs_lookup_block_group(fs_info, logical); +- ASSERT(block_group); ++ if (WARN_ON_ONCE(!block_group)) ++ return -ENOENT; + + /* No MIXED_BG on zoned btrfs. */ + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) +@@ -2410,6 +2411,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len + + out: + btrfs_put_block_group(block_group); ++ return 0; + } + + static void btrfs_zone_finish_endio_workfn(struct work_struct *work) +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h +index 7612e65726053..f7171ab6ed71e 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + bool btrfs_zone_activate(struct btrfs_block_group *block_group); + int btrfs_zone_finish(struct btrfs_block_group *block_group); + bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); +-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, ++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); + void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb); +@@ -232,8 +232,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, + return true; + } + +-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, +- u64 logical, u64 length) { } ++static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, ++ u64 logical, u64 length) ++{ ++ return 0; ++} + + static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) { } +-- +2.51.0 + diff --git a/queue-6.12/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch b/queue-6.12/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch new file mode 100644 index 0000000000..146f79215b --- /dev/null +++ b/queue-6.12/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch @@ -0,0 +1,47 @@ +From e566c5390297c67166b64ac35818038e9c63be55 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Sep 2025 01:12:27 +0000 
+Subject: cpuset: Use new excpus for nocpu error check when enabling root + partition + +From: Chen Ridong + +[ Upstream commit 59d5de3655698679ad8fd2cc82228de4679c4263 ] + +A previous patch fixed a bug where new_prs should be assigned before +checking housekeeping conflicts. This patch addresses another potential +issue: the nocpu error check currently uses the xcpus which is not updated. +Although no issue has been observed so far, the check should be performed +using the new effective exclusive cpus. + +The comment has been removed because the function returns an error if +nocpu checking fails, which is unrelated to the parent. + +Signed-off-by: Chen Ridong +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +Signed-off-by: Sasha Levin +--- + kernel/cgroup/cpuset.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 25f9565f798d4..13eb986172499 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -1679,11 +1679,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, + if (prstate_housekeeping_conflict(new_prs, xcpus)) + return PERR_HKEEPING; + +- /* +- * A parent can be left with no CPU as long as there is no +- * task directly associated with the parent partition. +- */ +- if (nocpu) ++ if (tasks_nocpu_error(parent, cs, xcpus)) + return PERR_NOCPUS; + + deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus); +-- +2.51.0 + diff --git a/queue-6.12/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch b/queue-6.12/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch new file mode 100644 index 0000000000..be8aab992b --- /dev/null +++ b/queue-6.12/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch @@ -0,0 +1,89 @@ +From 71bc3080d5e3c82d57a7d368802c0eeb3fe35796 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 16 Sep 2025 20:30:17 +0000 +Subject: EDAC/mc_sysfs: Increase legacy channel support to 16 + +From: Avadhut Naik + +[ Upstream commit 6e1c2c6c2c40ce99e0d2633b212f43c702c1a002 ] + +Newer AMD systems can support up to 16 channels per EDAC "mc" device. +These are detected by the EDAC module running on the device, and the +current EDAC interface is appropriately enumerated. + +The legacy EDAC sysfs interface however, provides device attributes for +channels 0 through 11 only. Consequently, the last four channels, 12 +through 15, will not be enumerated and will not be visible through the +legacy sysfs interface. + +Add additional device attributes to ensure that all 16 channels, if +present, are enumerated by and visible through the legacy EDAC sysfs +interface. 
+ +Signed-off-by: Avadhut Naik +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250916203242.1281036-1-avadhut.naik@amd.com +Signed-off-by: Sasha Levin +--- + drivers/edac/edac_mc_sysfs.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c +index 4200aec048318..70dc0ee1cc08f 100644 +--- a/drivers/edac/edac_mc_sysfs.c ++++ b/drivers/edac/edac_mc_sysfs.c +@@ -305,6 +305,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 10); + DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 11); ++DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 12); ++DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 13); ++DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 14); ++DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 15); + + /* Total possible dynamic DIMM Label attribute file table */ + static struct attribute *dynamic_csrow_dimm_attr[] = { +@@ -320,6 +328,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = { + &dev_attr_legacy_ch9_dimm_label.attr.attr, + &dev_attr_legacy_ch10_dimm_label.attr.attr, + &dev_attr_legacy_ch11_dimm_label.attr.attr, ++ &dev_attr_legacy_ch12_dimm_label.attr.attr, ++ &dev_attr_legacy_ch13_dimm_label.attr.attr, ++ &dev_attr_legacy_ch14_dimm_label.attr.attr, ++ &dev_attr_legacy_ch15_dimm_label.attr.attr, + NULL + }; + +@@ -348,6 +360,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 10); + DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 11); ++DEVICE_CHANNEL(ch12_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 12); ++DEVICE_CHANNEL(ch13_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 13); ++DEVICE_CHANNEL(ch14_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 14); ++DEVICE_CHANNEL(ch15_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 15); + + /* Total possible dynamic ce_count attribute file table */ + static struct attribute *dynamic_csrow_ce_count_attr[] = { +@@ -363,6 +383,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { + &dev_attr_legacy_ch9_ce_count.attr.attr, + &dev_attr_legacy_ch10_ce_count.attr.attr, + &dev_attr_legacy_ch11_ce_count.attr.attr, ++ &dev_attr_legacy_ch12_ce_count.attr.attr, ++ &dev_attr_legacy_ch13_ce_count.attr.attr, ++ &dev_attr_legacy_ch14_ce_count.attr.attr, ++ &dev_attr_legacy_ch15_ce_count.attr.attr, + NULL + }; + +-- +2.51.0 + diff --git a/queue-6.12/perf-have-get_perf_callchain-return-null-if-crosstas.patch b/queue-6.12/perf-have-get_perf_callchain-return-null-if-crosstas.patch new file mode 100644 index 0000000000..b781a482a8 --- /dev/null +++ b/queue-6.12/perf-have-get_perf_callchain-return-null-if-crosstas.patch @@ -0,0 +1,68 @@ +From 52f1a40706a3d0a7fe1f00e045735ecd4d752fa9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 14:03:40 -0400 +Subject: perf: Have get_perf_callchain() return NULL if crosstask and user are + set + +From: Josh Poimboeuf + +[ Upstream commit 153f9e74dec230f2e070e16fa061bc7adfd2c450 ] + +get_perf_callchain() doesn't support cross-task unwinding for user space +stacks, have it return NULL if both the crosstask and user arguments are +set. 
+ +Signed-off-by: Josh Poimboeuf +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20250820180428.426423415@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/events/callchain.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c +index d1a09e6f514c9..49d87e6db553f 100644 +--- a/kernel/events/callchain.c ++++ b/kernel/events/callchain.c +@@ -223,6 +223,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + struct perf_callchain_entry_ctx ctx; + int rctx, start_entry_idx; + ++ /* crosstask is not supported for user stacks */ ++ if (crosstask && user && !kernel) ++ return NULL; ++ + entry = get_callchain_entry(&rctx); + if (!entry) + return NULL; +@@ -239,7 +243,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + perf_callchain_kernel(&ctx, regs); + } + +- if (user) { ++ if (user && !crosstask) { + if (!user_mode(regs)) { + if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) + regs = NULL; +@@ -248,9 +252,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + } + + if (regs) { +- if (crosstask) +- goto exit_put; +- + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + +@@ -260,7 +261,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + } + } + +-exit_put: + put_callchain_entry(rctx); + + return entry; +-- +2.51.0 + diff --git a/queue-6.12/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch b/queue-6.12/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch new file mode 100644 index 0000000000..8cf342baad --- /dev/null +++ b/queue-6.12/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch @@ -0,0 +1,37 @@ +From a41a16b8fdc8a8c7396811aa98fdc15b7d36235f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 14:03:43 -0400 +Subject: perf: Skip user unwind if the task is a kernel thread + +From: Josh Poimboeuf + +[ Upstream commit 16ed389227651330879e17bd83d43bd234006722 ] + +If the task is not a user thread, there's no user stack to unwind. + +Signed-off-by: Josh Poimboeuf +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20250820180428.930791978@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/events/core.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 0339f60e34981..d6a86d8e9e59b 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -7847,7 +7847,8 @@ struct perf_callchain_entry * + perf_callchain(struct perf_event *event, struct pt_regs *regs) + { + bool kernel = !event->attr.exclude_callchain_kernel; +- bool user = !event->attr.exclude_callchain_user; ++ bool user = !event->attr.exclude_callchain_user && ++ !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); + /* Disallow cross-task user callchains. 
*/ + bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; +-- +2.51.0 + diff --git a/queue-6.12/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch b/queue-6.12/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch new file mode 100644 index 0000000000..459f745386 --- /dev/null +++ b/queue-6.12/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch @@ -0,0 +1,67 @@ +From 5513a16de55c2fc4c77969e1300f73c3962cc296 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 14:03:41 -0400 +Subject: perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of + current->mm == NULL + +From: Steven Rostedt + +[ Upstream commit 90942f9fac05702065ff82ed0bade0d08168d4ea ] + +To determine if a task is a kernel thread or not, it is more reliable to +use (current->flags & (PF_KTHREAD|PF_USER_WORKERi)) than to rely on +current->mm being NULL. That is because some kernel tasks (io_uring +helpers) may have a mm field. + +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20250820180428.592367294@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/events/callchain.c | 6 +++--- + kernel/events/core.c | 4 ++-- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c +index 8a47e52a454f4..d1a09e6f514c9 100644 +--- a/kernel/events/callchain.c ++++ b/kernel/events/callchain.c +@@ -241,10 +241,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + + if (user) { + if (!user_mode(regs)) { +- if (current->mm) +- regs = task_pt_regs(current); +- else ++ if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) + regs = NULL; ++ else ++ regs = task_pt_regs(current); + } + + if (regs) { +diff --git a/kernel/events/core.c b/kernel/events/core.c +index d60d48d482b01..0339f60e34981 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -7095,7 +7095,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, + if (user_mode(regs)) { + regs_user->abi = perf_reg_abi(current); + regs_user->regs = regs; +- } else if (!(current->flags & PF_KTHREAD)) { ++ } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + perf_get_regs_user(regs_user, regs); + } else { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; +@@ -7735,7 +7735,7 @@ static u64 perf_virt_to_phys(u64 virt) + * Try IRQ-safe get_user_page_fast_only first. + * If failed, leave phys_addr as 0. + */ +- if (current->mm != NULL) { ++ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + struct page *p; + + pagefault_disable(); +-- +2.51.0 + diff --git a/queue-6.12/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch b/queue-6.12/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch new file mode 100644 index 0000000000..e778d5c1ee --- /dev/null +++ b/queue-6.12/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch @@ -0,0 +1,101 @@ +From 5d4fbce76c5677e241e63851a259064ad6435df3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 10:30:31 +0800 +Subject: perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into + INTEL_FIXED_BITS_MASK + +From: Dapeng Mi + +[ Upstream commit 2676dbf9f4fb7f6739d1207c0f1deaf63124642a ] + +ICL_FIXED_0_ADAPTIVE is missed to be added into INTEL_FIXED_BITS_MASK, +add it. + +With help of this new INTEL_FIXED_BITS_MASK, intel_pmu_enable_fixed() can +be optimized. 
The old fixed counter control bits can be unconditionally +cleared with INTEL_FIXED_BITS_MASK and then set new control bits base on +new configuration. + +Signed-off-by: Dapeng Mi +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Kan Liang +Tested-by: Yi Lai +Link: https://lore.kernel.org/r/20250820023032.17128-7-dapeng1.mi@linux.intel.com +Signed-off-by: Sasha Levin +--- + arch/x86/events/intel/core.c | 10 +++------- + arch/x86/include/asm/perf_event.h | 6 +++++- + arch/x86/kvm/pmu.h | 2 +- + 3 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c +index 36d8404f406de..acc0774519ce2 100644 +--- a/arch/x86/events/intel/core.c ++++ b/arch/x86/events/intel/core.c +@@ -2812,8 +2812,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) + { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; +- u64 mask, bits = 0; + int idx = hwc->idx; ++ u64 bits = 0; + + if (is_topdown_idx(idx)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); +@@ -2849,14 +2849,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event) + + idx -= INTEL_PMC_IDX_FIXED; + bits = intel_fixed_bits_by_idx(idx, bits); +- mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); +- +- if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { ++ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) + bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); +- mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); +- } + +- cpuc->fixed_ctrl_val &= ~mask; ++ cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); + cpuc->fixed_ctrl_val |= bits; + } + +diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h +index aa351c4a20eee..c69b6498f6eaa 100644 +--- a/arch/x86/include/asm/perf_event.h ++++ b/arch/x86/include/asm/perf_event.h +@@ -35,7 +35,6 @@ + #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) + #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) + +-#define INTEL_FIXED_BITS_MASK 0xFULL + #define INTEL_FIXED_BITS_STRIDE 4 + #define INTEL_FIXED_0_KERNEL (1ULL << 0) + #define INTEL_FIXED_0_USER (1ULL << 1) +@@ -47,6 +46,11 @@ + #define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) + #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) + ++#define INTEL_FIXED_BITS_MASK \ ++ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ ++ INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ ++ ICL_FIXED_0_ADAPTIVE) ++ + #define intel_fixed_bits_by_idx(_idx, _bits) \ + ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) + +diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h +index ad89d0bd60058..103604c4b33b5 100644 +--- a/arch/x86/kvm/pmu.h ++++ b/arch/x86/kvm/pmu.h +@@ -13,7 +13,7 @@ + #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) + +-/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ ++/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */ + #define fixed_ctrl_field(ctrl_reg, idx) \ + (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) + +-- +2.51.0 + diff --git a/queue-6.12/sched_ext-make-qmap-dump-operation-non-destructive.patch b/queue-6.12/sched_ext-make-qmap-dump-operation-non-destructive.patch new file mode 100644 index 0000000000..55de656edf --- /dev/null +++ b/queue-6.12/sched_ext-make-qmap-dump-operation-non-destructive.patch @@ -0,0 +1,70 @@ +From cbcae6872c4865b170a63648115d9a0c9d6b8783 Mon Sep 17 00:00:00 2001 +From: 
Sasha Levin +Date: Tue, 23 Sep 2025 09:03:26 -1000 +Subject: sched_ext: Make qmap dump operation non-destructive + +From: Tejun Heo + +[ Upstream commit d452972858e5cfa4262320ab74fe8f016460b96f ] + +The qmap dump operation was destructively consuming queue entries while +displaying them. As dump can be triggered anytime, this can easily lead to +stalls. Add a temporary dump_store queue and modify the dump logic to pop +entries, display them, and then restore them back to the original queue. +This allows dump operations to be performed without affecting the +scheduler's queue state. + +Note that if racing against new enqueues during dump, ordering can get +mixed up, but this is acceptable for debugging purposes. + +Acked-by: Andrea Righi +Signed-off-by: Tejun Heo +Signed-off-by: Sasha Levin +--- + tools/sched_ext/scx_qmap.bpf.c | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c +index 5d1f880d1149e..e952f525599bd 100644 +--- a/tools/sched_ext/scx_qmap.bpf.c ++++ b/tools/sched_ext/scx_qmap.bpf.c +@@ -56,7 +56,8 @@ struct qmap { + queue1 SEC(".maps"), + queue2 SEC(".maps"), + queue3 SEC(".maps"), +- queue4 SEC(".maps"); ++ queue4 SEC(".maps"), ++ dump_store SEC(".maps"); + + struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); +@@ -578,11 +579,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) + return; + + scx_bpf_dump("QMAP FIFO[%d]:", i); ++ ++ /* ++ * Dump can be invoked anytime and there is no way to iterate in ++ * a non-destructive way. Pop and store in dump_store and then ++ * restore afterwards. If racing against new enqueues, ordering ++ * can get mixed up. ++ */ + bpf_repeat(4096) { + if (bpf_map_pop_elem(fifo, &pid)) + break; ++ bpf_map_push_elem(&dump_store, &pid, 0); + scx_bpf_dump(" %d", pid); + } ++ ++ bpf_repeat(4096) { ++ if (bpf_map_pop_elem(&dump_store, &pid)) ++ break; ++ bpf_map_push_elem(fifo, &pid, 0); ++ } ++ + scx_bpf_dump("\n"); + } + } +-- +2.51.0 + diff --git a/queue-6.12/seccomp-passthrough-uprobe-systemcall-without-filter.patch b/queue-6.12/seccomp-passthrough-uprobe-systemcall-without-filter.patch new file mode 100644 index 0000000000..ba43d7f475 --- /dev/null +++ b/queue-6.12/seccomp-passthrough-uprobe-systemcall-without-filter.patch @@ -0,0 +1,85 @@ +From 5d620bd1214b6de59a55396c50bcce763e0e69e6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 20 Jul 2025 13:21:30 +0200 +Subject: seccomp: passthrough uprobe systemcall without filtering + +From: Jiri Olsa + +[ Upstream commit 89d1d8434d246c96309a6068dfcf9e36dc61227b ] + +Adding uprobe as another exception to the seccomp filter alongside +with the uretprobe syscall. + +Same as the uretprobe the uprobe syscall is installed by kernel as +replacement for the breakpoint exception and is limited to x86_64 +arch and isn't expected to ever be supported in i386. 
+ +Signed-off-by: Jiri Olsa +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Kees Cook +Link: https://lore.kernel.org/r/20250720112133.244369-21-jolsa@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/seccomp.c | 32 +++++++++++++++++++++++++------- + 1 file changed, 25 insertions(+), 7 deletions(-) + +diff --git a/kernel/seccomp.c b/kernel/seccomp.c +index 267b00005eaf2..1eac0d2b8ecbe 100644 +--- a/kernel/seccomp.c ++++ b/kernel/seccomp.c +@@ -733,6 +733,26 @@ seccomp_prepare_user_filter(const char __user *user_filter) + } + + #ifdef SECCOMP_ARCH_NATIVE ++static bool seccomp_uprobe_exception(struct seccomp_data *sd) ++{ ++#if defined __NR_uretprobe || defined __NR_uprobe ++#ifdef SECCOMP_ARCH_COMPAT ++ if (sd->arch == SECCOMP_ARCH_NATIVE) ++#endif ++ { ++#ifdef __NR_uretprobe ++ if (sd->nr == __NR_uretprobe) ++ return true; ++#endif ++#ifdef __NR_uprobe ++ if (sd->nr == __NR_uprobe) ++ return true; ++#endif ++ } ++#endif ++ return false; ++} ++ + /** + * seccomp_is_const_allow - check if filter is constant allow with given data + * @fprog: The BPF programs +@@ -750,13 +770,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, + return false; + + /* Our single exception to filtering. */ +-#ifdef __NR_uretprobe +-#ifdef SECCOMP_ARCH_COMPAT +- if (sd->arch == SECCOMP_ARCH_NATIVE) +-#endif +- if (sd->nr == __NR_uretprobe) +- return true; +-#endif ++ if (seccomp_uprobe_exception(sd)) ++ return true; + + for (pc = 0; pc < fprog->len; pc++) { + struct sock_filter *insn = &fprog->filter[pc]; +@@ -1034,6 +1049,9 @@ static const int mode1_syscalls[] = { + __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, + #ifdef __NR_uretprobe + __NR_uretprobe, ++#endif ++#ifdef __NR_uprobe ++ __NR_uprobe, + #endif + -1, /* negative terminated */ + }; +-- +2.51.0 + diff --git a/queue-6.12/series b/queue-6.12/series index 64bc2c1eea..128ef58b13 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -1 +1,23 @@ net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch +audit-record-fanotify-event-regardless-of-presence-o.patch +perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch +perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch +perf-have-get_perf_callchain-return-null-if-crosstas.patch +perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch +seccomp-passthrough-uprobe-systemcall-without-filter.patch +x86-bugs-report-correct-retbleed-mitigation-status.patch +x86-bugs-fix-reporting-of-lfence-retpoline.patch +edac-mc_sysfs-increase-legacy-channel-support-to-16.patch +cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch +btrfs-abort-transaction-on-specific-error-places-whe.patch +btrfs-abort-transaction-in-the-process_one_buffer-lo.patch +btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch +btrfs-zoned-refine-extent-allocator-hint-selection.patch +btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch +btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch +btrfs-use-level-argument-in-log-tree-walk-callback-r.patch +btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch +btrfs-tree-checker-add-inode-extref-checks.patch +btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch +sched_ext-make-qmap-dump-operation-non-destructive.patch +arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch diff --git a/queue-6.12/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-6.12/x86-bugs-fix-reporting-of-lfence-retpoline.patch new file mode 100644 index 0000000000..73653acf18 --- /dev/null +++ 
b/queue-6.12/x86-bugs-fix-reporting-of-lfence-retpoline.patch @@ -0,0 +1,51 @@ +From a036c27a3f7cf40ed8c2a1810edd5984a792ad7c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Sep 2025 08:47:05 -0500 +Subject: x86/bugs: Fix reporting of LFENCE retpoline + +From: David Kaplan + +[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ] + +The LFENCE retpoline mitigation is not secure but the kernel prints +inconsistent messages about this fact. The dmesg log says 'Mitigation: +LFENCE', implying the system is mitigated. But sysfs reports 'Vulnerable: +LFENCE' implying the system (correctly) is not mitigated. + +Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere +when this mitigation is selected. + +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 0c16457e06543..939401b5d2ef0 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1598,7 +1598,7 @@ spectre_v2_user_select_mitigation(void) + static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", +- [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", ++ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", +@@ -3251,9 +3251,6 @@ static const char *spectre_bhi_state(void) + + static ssize_t spectre_v2_show_state(char *buf) + { +- if (spectre_v2_enabled == SPECTRE_V2_LFENCE) +- return sysfs_emit(buf, "Vulnerable: LFENCE\n"); +- + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + +-- +2.51.0 + diff --git a/queue-6.12/x86-bugs-report-correct-retbleed-mitigation-status.patch b/queue-6.12/x86-bugs-report-correct-retbleed-mitigation-status.patch new file mode 100644 index 0000000000..b7db3acefb --- /dev/null +++ b/queue-6.12/x86-bugs-report-correct-retbleed-mitigation-status.patch @@ -0,0 +1,46 @@ +From f6920d5581ee7b7a5f0f69ff94e67cf03510b89e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Sep 2025 08:47:06 -0500 +Subject: x86/bugs: Report correct retbleed mitigation status + +From: David Kaplan + +[ Upstream commit 930f2361fe542a00de9ce6070b1b6edb976f1165 ] + +On Intel CPUs, the default retbleed mitigation is IBRS/eIBRS but this +requires that a similar spectre_v2 mitigation is applied. If the user +selects a different spectre_v2 mitigation (like spectre_v2=retpoline) a +warning is printed but sysfs will still report 'Mitigation: IBRS' or +'Mitigation: Enhanced IBRS'. This is incorrect because retbleed is not +mitigated, and IBRS is not actually set. + +Fix this by choosing RETBLEED_MITIGATION_NONE in this scenario so the +kernel correctly reports the system as vulnerable to retbleed. 
+ +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index f3cb559a598df..0c16457e06543 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1186,8 +1186,10 @@ static void __init retbleed_select_mitigation(void) + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: +- if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) ++ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { + pr_err(RETBLEED_INTEL_MSG); ++ retbleed_mitigation = RETBLEED_MITIGATION_NONE; ++ } + } + } + +-- +2.51.0 + diff --git a/queue-6.17/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch b/queue-6.17/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch new file mode 100644 index 0000000000..51a10b9752 --- /dev/null +++ b/queue-6.17/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch @@ -0,0 +1,295 @@ +From 09755c95623ff205c25cd78c7f08e57b187626b1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Sep 2025 14:09:13 +0800 +Subject: arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c + +From: Menglong Dong + +[ Upstream commit 35561bab768977c9e05f1f1a9bc00134c85f3e28 ] + +The include/generated/asm-offsets.h is generated in Kbuild during +compiling from arch/SRCARCH/kernel/asm-offsets.c. When we want to +generate another similar offset header file, circular dependency can +happen. + +For example, we want to generate a offset file include/generated/test.h, +which is included in include/sched/sched.h. If we generate asm-offsets.h +first, it will fail, as include/sched/sched.h is included in asm-offsets.c +and include/generated/test.h doesn't exist; If we generate test.h first, +it can't success neither, as include/generated/asm-offsets.h is included +by it. + +In x86_64, the macro COMPILE_OFFSETS is used to avoid such circular +dependency. We can generate asm-offsets.h first, and if the +COMPILE_OFFSETS is defined, we don't include the "generated/test.h". + +And we define the macro COMPILE_OFFSETS for all the asm-offsets.c for this +purpose. 
+ +Signed-off-by: Menglong Dong +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Sasha Levin +--- + arch/alpha/kernel/asm-offsets.c | 1 + + arch/arc/kernel/asm-offsets.c | 1 + + arch/arm/kernel/asm-offsets.c | 2 ++ + arch/arm64/kernel/asm-offsets.c | 1 + + arch/csky/kernel/asm-offsets.c | 1 + + arch/hexagon/kernel/asm-offsets.c | 1 + + arch/loongarch/kernel/asm-offsets.c | 2 ++ + arch/m68k/kernel/asm-offsets.c | 1 + + arch/microblaze/kernel/asm-offsets.c | 1 + + arch/mips/kernel/asm-offsets.c | 2 ++ + arch/nios2/kernel/asm-offsets.c | 1 + + arch/openrisc/kernel/asm-offsets.c | 1 + + arch/parisc/kernel/asm-offsets.c | 1 + + arch/powerpc/kernel/asm-offsets.c | 1 + + arch/riscv/kernel/asm-offsets.c | 1 + + arch/s390/kernel/asm-offsets.c | 1 + + arch/sh/kernel/asm-offsets.c | 1 + + arch/sparc/kernel/asm-offsets.c | 1 + + arch/um/kernel/asm-offsets.c | 2 ++ + arch/xtensa/kernel/asm-offsets.c | 1 + + 20 files changed, 24 insertions(+) + +diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c +index e9dad60b147f3..1ebb058904992 100644 +--- a/arch/alpha/kernel/asm-offsets.c ++++ b/arch/alpha/kernel/asm-offsets.c +@@ -4,6 +4,7 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c +index f77deb7991757..2978da85fcb65 100644 +--- a/arch/arc/kernel/asm-offsets.c ++++ b/arch/arc/kernel/asm-offsets.c +@@ -2,6 +2,7 @@ + /* + * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c +index 123f4a8ef4466..2101938d27fcb 100644 +--- a/arch/arm/kernel/asm-offsets.c ++++ b/arch/arm/kernel/asm-offsets.c +@@ -7,6 +7,8 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c +index 30d4bbe68661f..b6367ff3a49ca 100644 +--- a/arch/arm64/kernel/asm-offsets.c ++++ b/arch/arm64/kernel/asm-offsets.c +@@ -6,6 +6,7 @@ + * 2001-2002 Keith Owens + * Copyright (C) 2012 ARM Ltd. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c +index d1e9035794733..5525c8e7e1d9e 100644 +--- a/arch/csky/kernel/asm-offsets.c ++++ b/arch/csky/kernel/asm-offsets.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c +index 03a7063f94561..50eea9fa6f137 100644 +--- a/arch/hexagon/kernel/asm-offsets.c ++++ b/arch/hexagon/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * + * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. 
+ */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c +index db1e4bb26b6a0..3017c71576009 100644 +--- a/arch/loongarch/kernel/asm-offsets.c ++++ b/arch/loongarch/kernel/asm-offsets.c +@@ -4,6 +4,8 @@ + * + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c +index 906d732305374..67a1990f9d748 100644 +--- a/arch/m68k/kernel/asm-offsets.c ++++ b/arch/m68k/kernel/asm-offsets.c +@@ -9,6 +9,7 @@ + * #defines from the assembly-language output. + */ + ++#define COMPILE_OFFSETS + #define ASM_OFFSETS_C + + #include +diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c +index 104c3ac5f30c8..b4b67d58e7f6a 100644 +--- a/arch/microblaze/kernel/asm-offsets.c ++++ b/arch/microblaze/kernel/asm-offsets.c +@@ -7,6 +7,7 @@ + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c +index 1e29efcba46e5..5debd9a3854a9 100644 +--- a/arch/mips/kernel/asm-offsets.c ++++ b/arch/mips/kernel/asm-offsets.c +@@ -9,6 +9,8 @@ + * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com + * Copyright (C) 2000 MIPS Technologies, Inc. + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c +index e3d9b7b6fb48a..88190b503ce5d 100644 +--- a/arch/nios2/kernel/asm-offsets.c ++++ b/arch/nios2/kernel/asm-offsets.c +@@ -2,6 +2,7 @@ + /* + * Copyright (C) 2011 Tobias Klauser + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c +index 710651d5aaae1..3cc826f2216b1 100644 +--- a/arch/openrisc/kernel/asm-offsets.c ++++ b/arch/openrisc/kernel/asm-offsets.c +@@ -18,6 +18,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c +index 757816a7bd4b2..9abfe65492c65 100644 +--- a/arch/parisc/kernel/asm-offsets.c ++++ b/arch/parisc/kernel/asm-offsets.c +@@ -13,6 +13,7 @@ + * Copyright (C) 2002 Randolph Chung + * Copyright (C) 2003 James Bottomley + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c +index b3048f6d3822c..a4bc80b30410a 100644 +--- a/arch/powerpc/kernel/asm-offsets.c ++++ b/arch/powerpc/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. 
+ */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c +index 6e8c0d6feae9e..7d42d3b8a32a7 100644 +--- a/arch/riscv/kernel/asm-offsets.c ++++ b/arch/riscv/kernel/asm-offsets.c +@@ -3,6 +3,7 @@ + * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2017 SiFive + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c +index 95ecad9c7d7d2..a8915663e917f 100644 +--- a/arch/s390/kernel/asm-offsets.c ++++ b/arch/s390/kernel/asm-offsets.c +@@ -4,6 +4,7 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c +index a0322e8328456..429b6a7631468 100644 +--- a/arch/sh/kernel/asm-offsets.c ++++ b/arch/sh/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c +index 3d9b9855dce91..6e660bde48dd8 100644 +--- a/arch/sparc/kernel/asm-offsets.c ++++ b/arch/sparc/kernel/asm-offsets.c +@@ -10,6 +10,7 @@ + * + * On sparc, thread_info data is static and TI_XXX offsets are computed by hand. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c +index 1fb12235ab9c8..a69873aa697f4 100644 +--- a/arch/um/kernel/asm-offsets.c ++++ b/arch/um/kernel/asm-offsets.c +@@ -1 +1,3 @@ ++#define COMPILE_OFFSETS ++ + #include +diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c +index da38de20ae598..cfbced95e944a 100644 +--- a/arch/xtensa/kernel/asm-offsets.c ++++ b/arch/xtensa/kernel/asm-offsets.c +@@ -11,6 +11,7 @@ + * + * Chris Zankel + */ ++#define COMPILE_OFFSETS + + #include + #include +-- +2.51.0 + diff --git a/queue-6.17/audit-record-fanotify-event-regardless-of-presence-o.patch b/queue-6.17/audit-record-fanotify-event-regardless-of-presence-o.patch new file mode 100644 index 0000000000..13f6234479 --- /dev/null +++ b/queue-6.17/audit-record-fanotify-event-regardless-of-presence-o.patch @@ -0,0 +1,44 @@ +From a6297c943b52f9458b8ca489dfbe9bfdd26dce75 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 17:04:07 -0400 +Subject: audit: record fanotify event regardless of presence of rules + +From: Richard Guy Briggs + +[ Upstream commit ce8370e2e62a903e18be7dd0e0be2eee079501e1 ] + +When no audit rules are in place, fanotify event results are +unconditionally dropped due to an explicit check for the existence of +any audit rules. Given this is a report from another security +sub-system, allow it to be recorded regardless of the existence of any +audit rules. + +To test, install and run the fapolicyd daemon with default config. Then +as an unprivileged user, create and run a very simple binary that should +be denied. 
Then check for an event with + ausearch -m FANOTIFY -ts recent + +Link: https://issues.redhat.com/browse/RHEL-9065 +Signed-off-by: Richard Guy Briggs +Signed-off-by: Paul Moore +Signed-off-by: Sasha Levin +--- + include/linux/audit.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/audit.h b/include/linux/audit.h +index a394614ccd0b8..e3f06eba9c6e6 100644 +--- a/include/linux/audit.h ++++ b/include/linux/audit.h +@@ -527,7 +527,7 @@ static inline void audit_log_kern_module(const char *name) + + static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) + { +- if (!audit_dummy_context()) ++ if (audit_enabled) + __audit_fanotify(response, friar); + } + +-- +2.51.0 + diff --git a/queue-6.17/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch b/queue-6.17/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch new file mode 100644 index 0000000000..c61e15c232 --- /dev/null +++ b/queue-6.17/btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch @@ -0,0 +1,39 @@ +From 496727f1f5bd4315290c755db82f0460635f17b2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Sep 2025 17:43:04 +0100 +Subject: btrfs: abort transaction if we fail to update inode in log replay dir + fixup + +From: Filipe Manana + +[ Upstream commit 5a0565cad3ef7cbf4cf43d1dd1e849b156205292 ] + +If we fail to update the inode at link_to_fixup_dir(), we don't abort the +transaction and propagate the error up the call chain, which makes it hard +to pinpoint the error to the inode update. So abort the transaction if the +inode update call fails, so that if it happens we known immediately. + +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 4f92aa15d9b1d..165d2ee500ca3 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -1796,6 +1796,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + else + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); + } else if (ret == -EEXIST) { + ret = 0; + } +-- +2.51.0 + diff --git a/queue-6.17/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch b/queue-6.17/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch new file mode 100644 index 0000000000..a8bb08e4c7 --- /dev/null +++ b/queue-6.17/btrfs-abort-transaction-in-the-process_one_buffer-lo.patch @@ -0,0 +1,77 @@ +From b2c31af40dd6f88a468a8613c542de8306f31b47 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Jul 2025 15:49:31 +0100 +Subject: btrfs: abort transaction in the process_one_buffer() log tree walk + callback + +From: Filipe Manana + +[ Upstream commit e6dd405b6671b9753b98d8bdf76f8f0ed36c11cd ] + +In the process_one_buffer() log tree walk callback we return errors to the +log tree walk caller and then the caller aborts the transaction, if we +have one, or turns the fs into error state if we don't have one. While +this reduces code it makes it harder to figure out where exactly an error +came from. So add the transaction aborts after every failure inside the +process_one_buffer() callback, so that it helps figuring out why failures +happen. 
+ +Reviewed-by: Boris Burkov +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 20 ++++++++++++++++---- + 1 file changed, 16 insertions(+), 4 deletions(-) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 6d92326a1a0c7..50ed84cb68a69 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -347,6 +347,7 @@ static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen, int level) + { ++ struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_fs_info *fs_info = log->fs_info; + int ret = 0; + +@@ -361,18 +362,29 @@ static int process_one_buffer(struct btrfs_root *log, + }; + + ret = btrfs_read_extent_buffer(eb, &check); +- if (ret) ++ if (ret) { ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; ++ } + } + + if (wc->pin) { +- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); +- if (ret) ++ ASSERT(trans != NULL); ++ ret = btrfs_pin_extent_for_log_replay(trans, eb); ++ if (ret) { ++ btrfs_abort_transaction(trans, ret); + return ret; ++ } + + if (btrfs_buffer_uptodate(eb, gen, 0) && +- btrfs_header_level(eb) == 0) ++ btrfs_header_level(eb) == 0) { + ret = btrfs_exclude_logged_extents(eb); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ } + } + return ret; + } +-- +2.51.0 + diff --git a/queue-6.17/btrfs-abort-transaction-on-specific-error-places-whe.patch b/queue-6.17/btrfs-abort-transaction-on-specific-error-places-whe.patch new file mode 100644 index 0000000000..71b71b020b --- /dev/null +++ b/queue-6.17/btrfs-abort-transaction-on-specific-error-places-whe.patch @@ -0,0 +1,111 @@ +From b70c9bc307275209743b38f5f7c7507bef7b311d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Jul 2025 14:56:11 +0100 +Subject: btrfs: abort transaction on specific error places when walking log + tree + +From: Filipe Manana + +[ Upstream commit 6ebd726b104fa99d47c0d45979e6a6109844ac18 ] + +We do several things while walking a log tree (for replaying and for +freeing a log tree) like reading extent buffers and cleaning them up, +but we don't immediately abort the transaction, or turn the fs into an +error state, when one of these things fails. Instead we the transaction +abort or turn the fs into error state in the caller of the entry point +function that walks a log tree - walk_log_tree() - which means we don't +get to know exactly where an error came from. + +Improve on this by doing a transaction abort / turn fs into error state +after each such failure so that when it happens we have a better +understanding where the failure comes from. This deliberately leaves +the transaction abort / turn fs into error state in the callers of +walk_log_tree() as to ensure we don't get into an inconsistent state in +case we forget to do it deeper in call chain. It also deliberately does +not do it after errors from the calls to the callback defined in +struct walk_control::process_func(), as we will do it later on another +patch. 
+ +Reviewed-by: Boris Burkov +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 33 ++++++++++++++++++++++++++++----- + 1 file changed, 28 insertions(+), 5 deletions(-) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 7a63afedd01e6..6d92326a1a0c7 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -2630,15 +2630,24 @@ static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) + static int clean_log_buffer(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) + { ++ int ret; ++ + btrfs_tree_lock(eb); + btrfs_clear_buffer_dirty(trans, eb); + wait_on_extent_buffer_writeback(eb); + btrfs_tree_unlock(eb); + +- if (trans) +- return btrfs_pin_reserved_extent(trans, eb); ++ if (trans) { ++ ret = btrfs_pin_reserved_extent(trans, eb); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ return ret; ++ } + +- return unaccount_log_buffer(eb->fs_info, eb->start); ++ ret = unaccount_log_buffer(eb->fs_info, eb->start); ++ if (ret) ++ btrfs_handle_fs_error(eb->fs_info, ret, NULL); ++ return ret; + } + + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, +@@ -2674,8 +2683,14 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + next = btrfs_find_create_tree_block(fs_info, bytenr, + btrfs_header_owner(cur), + *level - 1); +- if (IS_ERR(next)) +- return PTR_ERR(next); ++ if (IS_ERR(next)) { ++ ret = PTR_ERR(next); ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); ++ return ret; ++ } + + if (*level == 1) { + ret = wc->process_func(root, next, wc, ptr_gen, +@@ -2690,6 +2705,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; + } + +@@ -2705,6 +2724,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; + } + +-- +2.51.0 + diff --git a/queue-6.17/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-6.17/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch new file mode 100644 index 0000000000..d35c4eb7bc --- /dev/null +++ b/queue-6.17/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch @@ -0,0 +1,63 @@ +From 043347135f8442ac0b1bf5b6bfc12f2c35374b9c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 27 Aug 2025 12:10:28 +0100 +Subject: btrfs: always drop log root tree reference in btrfs_replay_log() + +From: Filipe Manana + +[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ] + +Currently we have this odd behaviour: + +1) At btrfs_replay_log() we drop the reference of the log root tree if + the call to btrfs_recover_log_trees() failed; + +2) But if the call to btrfs_recover_log_trees() did not fail, we don't + drop the reference in btrfs_replay_log() - we expect that + btrfs_recover_log_trees() does it in case it returns success. + +Let's simplify this and make btrfs_replay_log() always drop the reference +on the log root tree, not only this simplifies code as it's what makes +sense since it's btrfs_replay_log() who grabbed the reference in the first +place. 
+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/tree-log.c | 1 - + 2 files changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 70fc4e7cc5a0e..0b02e36b30558 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2087,10 +2087,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); ++ btrfs_put_root(log_tree_root); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, + "Failed to recover log tree"); +- btrfs_put_root(log_tree_root); + return ret; + } + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 50ed84cb68a69..518cd74191e77 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -7469,7 +7469,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + + log_root_tree->log_root = NULL; + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); +- btrfs_put_root(log_root_tree); + + return 0; + error: +-- +2.51.0 + diff --git a/queue-6.17/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch b/queue-6.17/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch new file mode 100644 index 0000000000..4de154a42e --- /dev/null +++ b/queue-6.17/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch @@ -0,0 +1,44 @@ +From 6961fd2310f25663e1cc6a8e7977438fa016289f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Sep 2025 17:01:44 +0200 +Subject: btrfs: scrub: replace max_t()/min_t() with clamp() in + scrub_throttle_dev_io() + +From: Thorsten Blum + +[ Upstream commit a7f3dfb8293c4cee99743132d69863a92e8f4875 ] + +Replace max_t() followed by min_t() with a single clamp(). + +As was pointed by David Laight in +https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/ +the calculation may overflow u32 when the input value is too large, so +clamp_t() is not used. In practice the expected values are in range of +megabytes to gigabytes (throughput limit) so the bug would not happen. + +Signed-off-by: Thorsten Blum +Reviewed-by: David Sterba +[ Use clamp() and add explanation. ] +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/scrub.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 6776e6ab8d108..fd4c1ca34b5e4 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -1369,8 +1369,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. 
+ */ +- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); +- div = min_t(u32, 64, div); ++ div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); + + /* Start new epoch, set deadline */ + now = ktime_get(); +-- +2.51.0 + diff --git a/queue-6.17/btrfs-tree-checker-add-inode-extref-checks.patch b/queue-6.17/btrfs-tree-checker-add-inode-extref-checks.patch new file mode 100644 index 0000000000..1678a118df --- /dev/null +++ b/queue-6.17/btrfs-tree-checker-add-inode-extref-checks.patch @@ -0,0 +1,90 @@ +From 5613ea5ed3366b504037789c8bb8cebb30a3524f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 16 Sep 2025 08:34:05 +0930 +Subject: btrfs: tree-checker: add inode extref checks + +From: Qu Wenruo + +[ Upstream commit aab9458b9f0019e97fae394c2d6d9d1a03addfb3 ] + +Like inode refs, inode extrefs have a variable length name, which means +we have to do a proper check to make sure no header nor name can exceed +the item limits. + +The check itself is very similar to check_inode_ref(), just a different +structure (btrfs_inode_extref vs btrfs_inode_ref). + +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-checker.c | 37 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 37 insertions(+) + +diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c +index a997c7cc35a26..a83e455f813bf 100644 +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, + /* Only these key->types needs to be checked */ + ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || + key->type == BTRFS_INODE_REF_KEY || ++ key->type == BTRFS_INODE_EXTREF_KEY || + key->type == BTRFS_DIR_INDEX_KEY || + key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_EXTENT_DATA_KEY); +@@ -1782,6 +1783,39 @@ static int check_inode_ref(struct extent_buffer *leaf, + return 0; + } + ++static int check_inode_extref(struct extent_buffer *leaf, ++ struct btrfs_key *key, struct btrfs_key *prev_key, ++ int slot) ++{ ++ unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); ++ unsigned long end = ptr + btrfs_item_size(leaf, slot); ++ ++ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) ++ return -EUCLEAN; ++ ++ while (ptr < end) { ++ struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr; ++ u16 namelen; ++ ++ if (unlikely(ptr + sizeof(*extref)) > end) { ++ inode_ref_err(leaf, slot, ++ "inode extref overflow, ptr %lu end %lu inode_extref size %zu", ++ ptr, end, sizeof(*extref)); ++ return -EUCLEAN; ++ } ++ ++ namelen = btrfs_inode_extref_name_len(leaf, extref); ++ if (unlikely(ptr + sizeof(*extref) + namelen > end)) { ++ inode_ref_err(leaf, slot, ++ "inode extref overflow, ptr %lu end %lu namelen %u", ++ ptr, end, namelen); ++ return -EUCLEAN; ++ } ++ ptr += sizeof(*extref) + namelen; ++ } ++ return 0; ++} ++ + static int check_raid_stripe_extent(const struct extent_buffer *leaf, + const struct btrfs_key *key, int slot) + { +@@ -1893,6 +1927,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, + case BTRFS_INODE_REF_KEY: + ret = check_inode_ref(leaf, key, prev_key, slot); + break; ++ case BTRFS_INODE_EXTREF_KEY: ++ ret = check_inode_extref(leaf, key, prev_key, slot); ++ break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + ret = check_block_group_item(leaf, key, slot); + break; +-- +2.51.0 + diff --git a/queue-6.17/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch 
b/queue-6.17/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch new file mode 100644 index 0000000000..83409255d2 --- /dev/null +++ b/queue-6.17/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch @@ -0,0 +1,50 @@ +From 17181f1cd33cfcd7024c3d0606e424d27ff2a1fe Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 28 Aug 2025 17:46:18 +0100 +Subject: btrfs: use level argument in log tree walk callback + replay_one_buffer() + +From: Filipe Manana + +[ Upstream commit 6cb7f0b8c9b0d6a35682335fea88bd26f089306f ] + +We already have the extent buffer's level in an argument, there's no need +to first ensure the extent buffer's data is loaded (by calling +btrfs_read_extent_buffer()) and then call btrfs_header_level() to check +the level. So use the level argument and do the check before calling +btrfs_read_extent_buffer(). + +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 518cd74191e77..4f92aa15d9b1d 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -2461,15 +2461,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + int i; + int ret; + ++ if (level != 0) ++ return 0; ++ + ret = btrfs_read_extent_buffer(eb, &check); + if (ret) + return ret; + +- level = btrfs_header_level(eb); +- +- if (level != 0) +- return 0; +- + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +-- +2.51.0 + diff --git a/queue-6.17/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-6.17/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch new file mode 100644 index 0000000000..8cd5320996 --- /dev/null +++ b/queue-6.17/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch @@ -0,0 +1,58 @@ +From 2315af132a33b20e24b3a740bfc56993b3f29be5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Sep 2025 12:09:14 +0100 +Subject: btrfs: use smp_mb__after_atomic() when forcing COW in + create_pending_snapshot() + +From: Filipe Manana + +[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ] + +After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a +full write barrier, smp_wmb(), but we don't need to, all we need is a +smp_mb__after_atomic(). The use of the smp_wmb() is from the old days +when we didn't use a bit and used instead an int field in the root to +signal if cow is forced. After the int field was changed to a bit in +the root's state (flags field), we forgot to update the memory barrier +in create_pending_snapshot() to smp_mb__after_atomic(), but we did the +change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That +happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer +data type for the some variants in btrfs_root"). On the reader side, in +should_cow_block(), we also use the counterpart smp_mb__before_atomic() +which generates further confusion. + +So change the smp_wmb() to smp_mb__after_atomic(). In fact we don't +even need any barrier at all since create_pending_snapshot() is called +in the critical section of a transaction commit and therefore no one +can concurrently join/attach the transaction, or start a new one, until +the transaction is unblocked. 
By the time someone starts a new transaction +and enters should_cow_block(), a lot of implicit memory barriers already +took place by having acquired several locks such as fs_info->trans_lock +and extent buffer locks on the root node at least. Nevertlheless, for +consistency use smp_mb__after_atomic() after setting the force cow bit +in create_pending_snapshot(). + +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/transaction.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index c5c0d9cf1a808..a4e486a600bed 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1806,7 +1806,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + } + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); +- smp_wmb(); ++ smp_mb__after_atomic(); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ +-- +2.51.0 + diff --git a/queue-6.17/btrfs-zoned-refine-extent-allocator-hint-selection.patch b/queue-6.17/btrfs-zoned-refine-extent-allocator-hint-selection.patch new file mode 100644 index 0000000000..4d5bbddf32 --- /dev/null +++ b/queue-6.17/btrfs-zoned-refine-extent-allocator-hint-selection.patch @@ -0,0 +1,59 @@ +From e0264b290f1d5792d4664a0fe27c898716f36a81 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Jul 2025 11:13:15 +0900 +Subject: btrfs: zoned: refine extent allocator hint selection + +From: Naohiro Aota + +[ Upstream commit 0d703963d297964451783e1a0688ebdf74cd6151 ] + +The hint block group selection in the extent allocator is wrong in the +first place, as it can select the dedicated data relocation block group for +the normal data allocation. + +Since we separated the normal data space_info and the data relocation +space_info, we can easily identify a block group is for data relocation or +not. Do not choose it for the normal data allocation. 
+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent-tree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 97d517cdf2df7..682d21a73a67a 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -4297,7 +4297,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + } + + static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, +- struct find_free_extent_ctl *ffe_ctl) ++ struct find_free_extent_ctl *ffe_ctl, ++ struct btrfs_space_info *space_info) + { + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); +@@ -4321,6 +4322,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && ++ block_group->space_info == space_info && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; +@@ -4342,7 +4344,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, + return prepare_allocation_clustered(fs_info, ffe_ctl, + space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: +- return prepare_allocation_zoned(fs_info, ffe_ctl); ++ return prepare_allocation_zoned(fs_info, ffe_ctl, space_info); + default: + BUG(); + } +-- +2.51.0 + diff --git a/queue-6.17/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch b/queue-6.17/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch new file mode 100644 index 0000000000..970ceb08bf --- /dev/null +++ b/queue-6.17/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch @@ -0,0 +1,111 @@ +From 4230345a12b197f63729c55c765d44f98c2ca78d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 22 Jul 2025 13:39:11 +0200 +Subject: btrfs: zoned: return error from btrfs_zone_finish_endio() + +From: Johannes Thumshirn + +[ Upstream commit 3c44cd3c79fcb38a86836dea6ff8fec322a9e68c ] + +Now that btrfs_zone_finish_endio_workfn() is directly calling +do_zone_finish() the only caller of btrfs_zone_finish_endio() is +btrfs_finish_one_ordered(). + +btrfs_finish_one_ordered() already has error handling in-place so +btrfs_zone_finish_endio() can return an error if the block group lookup +fails. + +Also as btrfs_zone_finish_endio() already checks for zoned filesystems and +returns early, there's no need to do this in the caller. 
+ +Reviewed-by: Damien Le Moal +Signed-off-by: Johannes Thumshirn +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/inode.c | 7 ++++--- + fs/btrfs/zoned.c | 8 +++++--- + fs/btrfs/zoned.h | 9 ++++++--- + 3 files changed, 15 insertions(+), 9 deletions(-) + +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 4031cbdea0740..41da405181b4f 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3107,9 +3107,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) + goto out; + } + +- if (btrfs_is_zoned(fs_info)) +- btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, +- ordered_extent->disk_num_bytes); ++ ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ++ ordered_extent->disk_num_bytes); ++ if (ret) ++ goto out; + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 87c5dd3ad016e..fcdf7b058a584 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2464,16 +2464,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) + return ret; + } + +-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) ++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) + { + struct btrfs_block_group *block_group; + u64 min_alloc_bytes; + + if (!btrfs_is_zoned(fs_info)) +- return; ++ return 0; + + block_group = btrfs_lookup_block_group(fs_info, logical); +- ASSERT(block_group); ++ if (WARN_ON_ONCE(!block_group)) ++ return -ENOENT; + + /* No MIXED_BG on zoned btrfs. */ + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) +@@ -2490,6 +2491,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len + + out: + btrfs_put_block_group(block_group); ++ return 0; + } + + static void btrfs_zone_finish_endio_workfn(struct work_struct *work) +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h +index 6e11533b8e14c..17c5656580dd9 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + bool btrfs_zone_activate(struct btrfs_block_group *block_group); + int btrfs_zone_finish(struct btrfs_block_group *block_group); + bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); +-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, ++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); + void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb); +@@ -234,8 +234,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, + return true; + } + +-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, +- u64 logical, u64 length) { } ++static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, ++ u64 logical, u64 length) ++{ ++ return 0; ++} + + static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) { } +-- +2.51.0 + diff --git a/queue-6.17/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch b/queue-6.17/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch new file mode 100644 index 0000000000..9bbca42c19 --- /dev/null +++ b/queue-6.17/cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch @@ -0,0 +1,47 @@ +From 1a27b5e454cf61ce28e3b82e30c4ca2f682381f8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Sep 2025 01:12:27 +0000 
+Subject: cpuset: Use new excpus for nocpu error check when enabling root + partition + +From: Chen Ridong + +[ Upstream commit 59d5de3655698679ad8fd2cc82228de4679c4263 ] + +A previous patch fixed a bug where new_prs should be assigned before +checking housekeeping conflicts. This patch addresses another potential +issue: the nocpu error check currently uses the xcpus which is not updated. +Although no issue has been observed so far, the check should be performed +using the new effective exclusive cpus. + +The comment has been removed because the function returns an error if +nocpu checking fails, which is unrelated to the parent. + +Signed-off-by: Chen Ridong +Reviewed-by: Waiman Long +Signed-off-by: Tejun Heo +Signed-off-by: Sasha Levin +--- + kernel/cgroup/cpuset.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index fef93032fe7e4..fd890b34a8403 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -1728,11 +1728,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, + if (prstate_housekeeping_conflict(new_prs, xcpus)) + return PERR_HKEEPING; + +- /* +- * A parent can be left with no CPU as long as there is no +- * task directly associated with the parent partition. +- */ +- if (nocpu) ++ if (tasks_nocpu_error(parent, cs, xcpus)) + return PERR_NOCPUS; + + /* +-- +2.51.0 + diff --git a/queue-6.17/edac-fix-wrong-executable-file-modes-for-c-source-fi.patch b/queue-6.17/edac-fix-wrong-executable-file-modes-for-c-source-fi.patch new file mode 100644 index 0000000000..04460529c1 --- /dev/null +++ b/queue-6.17/edac-fix-wrong-executable-file-modes-for-c-source-fi.patch @@ -0,0 +1,43 @@ +From d652aa2ed5235fd64ae767808908b000818e4502 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 29 Aug 2025 03:19:54 +0800 +Subject: EDAC: Fix wrong executable file modes for C source files + +From: Kuan-Wei Chiu + +[ Upstream commit 71965cae7db394ff5ba3b2d2befe4e136ceec268 ] + +Three EDAC source files were mistakenly marked as executable when adding the +EDAC scrub controls. + +These are plain C source files and should not carry the executable bit. +Correcting their modes follows the principle of least privilege and avoids +unnecessary execute permissions in the repository. + + [ bp: Massage commit message. 
] + +Signed-off-by: Kuan-Wei Chiu +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250828191954.903125-1-visitorckw@gmail.com +Signed-off-by: Sasha Levin +--- + drivers/edac/ecs.c | 0 + drivers/edac/mem_repair.c | 0 + drivers/edac/scrub.c | 0 + 3 files changed, 0 insertions(+), 0 deletions(-) + mode change 100755 => 100644 drivers/edac/ecs.c + mode change 100755 => 100644 drivers/edac/mem_repair.c + mode change 100755 => 100644 drivers/edac/scrub.c + +diff --git a/drivers/edac/ecs.c b/drivers/edac/ecs.c +old mode 100755 +new mode 100644 +diff --git a/drivers/edac/mem_repair.c b/drivers/edac/mem_repair.c +old mode 100755 +new mode 100644 +diff --git a/drivers/edac/scrub.c b/drivers/edac/scrub.c +old mode 100755 +new mode 100644 +-- +2.51.0 + diff --git a/queue-6.17/edac-ie31200-add-two-more-intel-alder-lake-s-socs-fo.patch b/queue-6.17/edac-ie31200-add-two-more-intel-alder-lake-s-socs-fo.patch new file mode 100644 index 0000000000..f587a96b02 --- /dev/null +++ b/queue-6.17/edac-ie31200-add-two-more-intel-alder-lake-s-socs-fo.patch @@ -0,0 +1,55 @@ +From 90175ae118a68e96eeb97a03511afe3d8cccbee3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 19 Aug 2025 09:17:39 -0700 +Subject: EDAC/ie31200: Add two more Intel Alder Lake-S SoCs for EDAC support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Kyle Manna + +[ Upstream commit 71b69f817e91b588030d7d47ddbdc4857a92eb4e ] + +Host Device IDs (DID0) correspond to: +* Intel Core i7-12700K +* Intel Core i5-12600K + +See documentation: +* 12th Generation Intel® Core™ Processors Datasheet + * Volume 1 of 2, Doc. No.: 655258, Rev.: 011 + * https://edc.intel.com/output/DownloadPdfDocument?id=8297 (PDF) + +Signed-off-by: Kyle Manna +Signed-off-by: Tony Luck +Reviewed-by: Qiuxu Zhuo +Link: https://lore.kernel.org/r/20250819161739.3241152-1-kyle@kylemanna.com +Signed-off-by: Sasha Levin +--- + drivers/edac/ie31200_edac.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c +index 5c1fa1c0d12e3..5a080ab65476d 100644 +--- a/drivers/edac/ie31200_edac.c ++++ b/drivers/edac/ie31200_edac.c +@@ -99,6 +99,8 @@ + + /* Alder Lake-S */ + #define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1 0x4660 ++#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2 0x4668 /* 8P+4E, e.g. i7-12700K */ ++#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3 0x4648 /* 6P+4E, e.g. 
i5-12600K */ + + /* Bartlett Lake-S */ + #define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1 0x4639 +@@ -761,6 +763,8 @@ static const struct pci_device_id ie31200_pci_tbl[] = { + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_6), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_HX_1), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1), (kernel_ulong_t)&rpl_s_cfg}, ++ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2), (kernel_ulong_t)&rpl_s_cfg}, ++ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_2), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_3), (kernel_ulong_t)&rpl_s_cfg}, +-- +2.51.0 + diff --git a/queue-6.17/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch b/queue-6.17/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch new file mode 100644 index 0000000000..aa682b9c87 --- /dev/null +++ b/queue-6.17/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch @@ -0,0 +1,89 @@ +From f47a7852e3030b5b0c360943fd302ba833f9999f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 16 Sep 2025 20:30:17 +0000 +Subject: EDAC/mc_sysfs: Increase legacy channel support to 16 + +From: Avadhut Naik + +[ Upstream commit 6e1c2c6c2c40ce99e0d2633b212f43c702c1a002 ] + +Newer AMD systems can support up to 16 channels per EDAC "mc" device. +These are detected by the EDAC module running on the device, and the +current EDAC interface is appropriately enumerated. + +The legacy EDAC sysfs interface however, provides device attributes for +channels 0 through 11 only. Consequently, the last four channels, 12 +through 15, will not be enumerated and will not be visible through the +legacy sysfs interface. + +Add additional device attributes to ensure that all 16 channels, if +present, are enumerated by and visible through the legacy EDAC sysfs +interface. 
+ +Signed-off-by: Avadhut Naik +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250916203242.1281036-1-avadhut.naik@amd.com +Signed-off-by: Sasha Levin +--- + drivers/edac/edac_mc_sysfs.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c +index 0f338adf7d937..8689631f19053 100644 +--- a/drivers/edac/edac_mc_sysfs.c ++++ b/drivers/edac/edac_mc_sysfs.c +@@ -305,6 +305,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 10); + DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 11); ++DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 12); ++DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 13); ++DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 14); ++DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 15); + + /* Total possible dynamic DIMM Label attribute file table */ + static struct attribute *dynamic_csrow_dimm_attr[] = { +@@ -320,6 +328,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = { + &dev_attr_legacy_ch9_dimm_label.attr.attr, + &dev_attr_legacy_ch10_dimm_label.attr.attr, + &dev_attr_legacy_ch11_dimm_label.attr.attr, ++ &dev_attr_legacy_ch12_dimm_label.attr.attr, ++ &dev_attr_legacy_ch13_dimm_label.attr.attr, ++ &dev_attr_legacy_ch14_dimm_label.attr.attr, ++ &dev_attr_legacy_ch15_dimm_label.attr.attr, + NULL + }; + +@@ -348,6 +360,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 10); + DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 11); ++DEVICE_CHANNEL(ch12_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 12); ++DEVICE_CHANNEL(ch13_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 13); ++DEVICE_CHANNEL(ch14_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 14); ++DEVICE_CHANNEL(ch15_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 15); + + /* Total possible dynamic ce_count attribute file table */ + static struct attribute *dynamic_csrow_ce_count_attr[] = { +@@ -363,6 +383,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { + &dev_attr_legacy_ch9_ce_count.attr.attr, + &dev_attr_legacy_ch10_ce_count.attr.attr, + &dev_attr_legacy_ch11_ce_count.attr.attr, ++ &dev_attr_legacy_ch12_ce_count.attr.attr, ++ &dev_attr_legacy_ch13_ce_count.attr.attr, ++ &dev_attr_legacy_ch14_ce_count.attr.attr, ++ &dev_attr_legacy_ch15_ce_count.attr.attr, + NULL + }; + +-- +2.51.0 + diff --git a/queue-6.17/genirq-chip-add-buslock-back-in-to-irq_set_handler.patch b/queue-6.17/genirq-chip-add-buslock-back-in-to-irq_set_handler.patch new file mode 100644 index 0000000000..a0ddbc7ec3 --- /dev/null +++ b/queue-6.17/genirq-chip-add-buslock-back-in-to-irq_set_handler.patch @@ -0,0 +1,38 @@ +From 7dc7cb31fbaf707d2d6237c28bc7a3e55e13048d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 23 Oct 2025 16:48:59 +0100 +Subject: genirq/chip: Add buslock back in to irq_set_handler() + +From: Charles Keepax + +[ Upstream commit 5d7e45dd670e42df4836afeaa9baf9d41ca4b434 ] + +The locking was changed from a buslock to a plain lock, but the patch +description states there was no functional change. Assuming this was +accidental so reverting to using the buslock. 
+ +Fixes: 5cd05f3e2315 ("genirq/chip: Rework irq_set_handler() variants") +Signed-off-by: Charles Keepax +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251023154901.1333755-2-ckeepax@opensource.cirrus.com +Signed-off-by: Sasha Levin +--- + kernel/irq/chip.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c +index 3ffa0d80ddd19..d1917b28761a3 100644 +--- a/kernel/irq/chip.c ++++ b/kernel/irq/chip.c +@@ -1030,7 +1030,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, + void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) + { +- scoped_irqdesc_get_and_lock(irq, 0) ++ scoped_irqdesc_get_and_buslock(irq, 0) + __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name); + } + EXPORT_SYMBOL_GPL(__irq_set_handler); +-- +2.51.0 + diff --git a/queue-6.17/genirq-manage-add-buslock-back-in-to-__disable_irq_n.patch b/queue-6.17/genirq-manage-add-buslock-back-in-to-__disable_irq_n.patch new file mode 100644 index 0000000000..f916a50635 --- /dev/null +++ b/queue-6.17/genirq-manage-add-buslock-back-in-to-__disable_irq_n.patch @@ -0,0 +1,38 @@ +From 34c98b6e10f180a7abd2fbcca68ad9546c6625e5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 23 Oct 2025 16:49:00 +0100 +Subject: genirq/manage: Add buslock back in to __disable_irq_nosync() + +From: Charles Keepax + +[ Upstream commit 56363e25f79fe83e63039c5595b8cd9814173d37 ] + +The locking was changed from a buslock to a plain lock, but the patch +description states there was no functional change. Assuming this was +accidental so reverting to using the buslock. + +Fixes: 1b7444446724 ("genirq/manage: Rework __disable_irq_nosync()") +Signed-off-by: Charles Keepax +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251023154901.1333755-3-ckeepax@opensource.cirrus.com +Signed-off-by: Sasha Levin +--- + kernel/irq/manage.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index c94837382037e..7d68fb5dc2428 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -659,7 +659,7 @@ void __disable_irq(struct irq_desc *desc) + + static int __disable_irq_nosync(unsigned int irq) + { +- scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { ++ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + __disable_irq(scoped_irqdesc); + return 0; + } +-- +2.51.0 + diff --git a/queue-6.17/genirq-manage-add-buslock-back-in-to-enable_irq.patch b/queue-6.17/genirq-manage-add-buslock-back-in-to-enable_irq.patch new file mode 100644 index 0000000000..e3a70ca210 --- /dev/null +++ b/queue-6.17/genirq-manage-add-buslock-back-in-to-enable_irq.patch @@ -0,0 +1,38 @@ +From eefaa63d07aca4d44e91486f0a43039238559741 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 23 Oct 2025 16:49:01 +0100 +Subject: genirq/manage: Add buslock back in to enable_irq() + +From: Charles Keepax + +[ Upstream commit ef3330b99c01bda53f2a189b58bed8f6b7397f28 ] + +The locking was changed from a buslock to a plain lock, but the patch +description states there was no functional change. Assuming this was +accidental so reverting to using the buslock. 
+ +Fixes: bddd10c55407 ("genirq/manage: Rework enable_irq()") +Signed-off-by: Charles Keepax +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251023154901.1333755-4-ckeepax@opensource.cirrus.com +Signed-off-by: Sasha Levin +--- + kernel/irq/manage.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 7d68fb5dc2428..400856abf6721 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -789,7 +789,7 @@ void __enable_irq(struct irq_desc *desc) + */ + void enable_irq(unsigned int irq) + { +- scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { ++ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; + + if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) +-- +2.51.0 + diff --git a/queue-6.17/perf-have-get_perf_callchain-return-null-if-crosstas.patch b/queue-6.17/perf-have-get_perf_callchain-return-null-if-crosstas.patch new file mode 100644 index 0000000000..cd6edd3f7a --- /dev/null +++ b/queue-6.17/perf-have-get_perf_callchain-return-null-if-crosstas.patch @@ -0,0 +1,68 @@ +From 82f22876b00c320ed9c7d964eeffcd4e786655ad Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 14:03:40 -0400 +Subject: perf: Have get_perf_callchain() return NULL if crosstask and user are + set + +From: Josh Poimboeuf + +[ Upstream commit 153f9e74dec230f2e070e16fa061bc7adfd2c450 ] + +get_perf_callchain() doesn't support cross-task unwinding for user space +stacks, have it return NULL if both the crosstask and user arguments are +set. + +Signed-off-by: Josh Poimboeuf +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20250820180428.426423415@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/events/callchain.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c +index decff7266cfbd..2609998ca07f1 100644 +--- a/kernel/events/callchain.c ++++ b/kernel/events/callchain.c +@@ -224,6 +224,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + struct perf_callchain_entry_ctx ctx; + int rctx, start_entry_idx; + ++ /* crosstask is not supported for user stacks */ ++ if (crosstask && user && !kernel) ++ return NULL; ++ + entry = get_callchain_entry(&rctx); + if (!entry) + return NULL; +@@ -240,7 +244,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + perf_callchain_kernel(&ctx, regs); + } + +- if (user) { ++ if (user && !crosstask) { + if (!user_mode(regs)) { + if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) + regs = NULL; +@@ -249,9 +253,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + } + + if (regs) { +- if (crosstask) +- goto exit_put; +- + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + +@@ -261,7 +262,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + } + } + +-exit_put: + put_callchain_entry(rctx); + + return entry; +-- +2.51.0 + diff --git a/queue-6.17/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch b/queue-6.17/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch new file mode 100644 index 0000000000..70b4b11ba2 --- /dev/null +++ b/queue-6.17/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch @@ -0,0 +1,37 @@ +From 1642fd4e2d5f5e1dc02825acb53fa5f054b913fb Mon Sep 17 00:00:00 2001 +From: 
Sasha Levin +Date: Wed, 20 Aug 2025 14:03:43 -0400 +Subject: perf: Skip user unwind if the task is a kernel thread + +From: Josh Poimboeuf + +[ Upstream commit 16ed389227651330879e17bd83d43bd234006722 ] + +If the task is not a user thread, there's no user stack to unwind. + +Signed-off-by: Josh Poimboeuf +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20250820180428.930791978@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/events/core.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/kernel/events/core.c b/kernel/events/core.c +index a3dc79ec6f879..c0e938d28758f 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -8192,7 +8192,8 @@ struct perf_callchain_entry * + perf_callchain(struct perf_event *event, struct pt_regs *regs) + { + bool kernel = !event->attr.exclude_callchain_kernel; +- bool user = !event->attr.exclude_callchain_user; ++ bool user = !event->attr.exclude_callchain_user && ++ !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); + /* Disallow cross-task user callchains. */ + bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; +-- +2.51.0 + diff --git a/queue-6.17/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch b/queue-6.17/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch new file mode 100644 index 0000000000..649a9985d7 --- /dev/null +++ b/queue-6.17/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch @@ -0,0 +1,67 @@ +From 8a38e567c4058e466e5e1b0823fe7b9c902ff337 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 14:03:41 -0400 +Subject: perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of + current->mm == NULL + +From: Steven Rostedt + +[ Upstream commit 90942f9fac05702065ff82ed0bade0d08168d4ea ] + +To determine if a task is a kernel thread or not, it is more reliable to +use (current->flags & (PF_KTHREAD|PF_USER_WORKERi)) than to rely on +current->mm being NULL. That is because some kernel tasks (io_uring +helpers) may have a mm field. 
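+
+A minimal illustration of the check (the helper name below is made up for
+this note; the flag test itself is the one this patch switches to):
+
+  #include <linux/sched.h>
+
+  /*
+   * True for kernel threads and user workers (e.g. io_uring helpers),
+   * which may carry a valid ->mm even though perf should not try to
+   * sample a user context for them.
+   */
+  static inline bool task_has_no_user_context(struct task_struct *p)
+  {
+      return p->flags & (PF_KTHREAD | PF_USER_WORKER);
+  }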
+ +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20250820180428.592367294@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/events/callchain.c | 6 +++--- + kernel/events/core.c | 4 ++-- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c +index 6c83ad674d010..decff7266cfbd 100644 +--- a/kernel/events/callchain.c ++++ b/kernel/events/callchain.c +@@ -242,10 +242,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + + if (user) { + if (!user_mode(regs)) { +- if (current->mm) +- regs = task_pt_regs(current); +- else ++ if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) + regs = NULL; ++ else ++ regs = task_pt_regs(current); + } + + if (regs) { +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 6e9427c4aaff7..a3dc79ec6f879 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -7440,7 +7440,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, + if (user_mode(regs)) { + regs_user->abi = perf_reg_abi(current); + regs_user->regs = regs; +- } else if (!(current->flags & PF_KTHREAD)) { ++ } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + perf_get_regs_user(regs_user, regs); + } else { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; +@@ -8080,7 +8080,7 @@ static u64 perf_virt_to_phys(u64 virt) + * Try IRQ-safe get_user_page_fast_only first. + * If failed, leave phys_addr as 0. + */ +- if (current->mm != NULL) { ++ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + struct page *p; + + pagefault_disable(); +-- +2.51.0 + diff --git a/queue-6.17/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch b/queue-6.17/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch new file mode 100644 index 0000000000..d740688f95 --- /dev/null +++ b/queue-6.17/perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch @@ -0,0 +1,101 @@ +From ca9f460d3f2e517fd9f873da6a0d8f17baef1972 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 10:30:31 +0800 +Subject: perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into + INTEL_FIXED_BITS_MASK + +From: Dapeng Mi + +[ Upstream commit 2676dbf9f4fb7f6739d1207c0f1deaf63124642a ] + +ICL_FIXED_0_ADAPTIVE is missed to be added into INTEL_FIXED_BITS_MASK, +add it. + +With help of this new INTEL_FIXED_BITS_MASK, intel_pmu_enable_fixed() can +be optimized. The old fixed counter control bits can be unconditionally +cleared with INTEL_FIXED_BITS_MASK and then set new control bits base on +new configuration. 
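+
+Worked example (illustrative only, not part of the change): with the new
+definition, clearing the old control bits for fixed counter 1 expands to
+
+  intel_fixed_bits_by_idx(1, INTEL_FIXED_BITS_MASK)
+      == (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER |
+          INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI |
+          ICL_FIXED_0_ADAPTIVE) << (1 * INTEL_FIXED_BITS_STRIDE)
+      == 0x00000010000000f0ULL
+
+i.e. the counter's four control bits in IA32_FIXED_CTR_CTRL plus its
+per-counter adaptive bit, so a previously set ICL_FIXED_0_ADAPTIVE bit
+cannot survive a reconfiguration.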
+ +Signed-off-by: Dapeng Mi +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Kan Liang +Tested-by: Yi Lai +Link: https://lore.kernel.org/r/20250820023032.17128-7-dapeng1.mi@linux.intel.com +Signed-off-by: Sasha Levin +--- + arch/x86/events/intel/core.c | 10 +++------- + arch/x86/include/asm/perf_event.h | 6 +++++- + arch/x86/kvm/pmu.h | 2 +- + 3 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c +index 15da60cf69f20..046d12281fd94 100644 +--- a/arch/x86/events/intel/core.c ++++ b/arch/x86/events/intel/core.c +@@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) + { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; +- u64 mask, bits = 0; + int idx = hwc->idx; ++ u64 bits = 0; + + if (is_topdown_idx(idx)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); +@@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event) + + idx -= INTEL_PMC_IDX_FIXED; + bits = intel_fixed_bits_by_idx(idx, bits); +- mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); +- +- if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { ++ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) + bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); +- mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); +- } + +- cpuc->fixed_ctrl_val &= ~mask; ++ cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); + cpuc->fixed_ctrl_val |= bits; + } + +diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h +index 70d1d94aca7e6..ee943bd1595af 100644 +--- a/arch/x86/include/asm/perf_event.h ++++ b/arch/x86/include/asm/perf_event.h +@@ -35,7 +35,6 @@ + #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) + #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) + +-#define INTEL_FIXED_BITS_MASK 0xFULL + #define INTEL_FIXED_BITS_STRIDE 4 + #define INTEL_FIXED_0_KERNEL (1ULL << 0) + #define INTEL_FIXED_0_USER (1ULL << 1) +@@ -48,6 +47,11 @@ + #define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) + #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) + ++#define INTEL_FIXED_BITS_MASK \ ++ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ ++ INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ ++ ICL_FIXED_0_ADAPTIVE) ++ + #define intel_fixed_bits_by_idx(_idx, _bits) \ + ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) + +diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h +index ad89d0bd60058..103604c4b33b5 100644 +--- a/arch/x86/kvm/pmu.h ++++ b/arch/x86/kvm/pmu.h +@@ -13,7 +13,7 @@ + #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) + +-/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ ++/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */ + #define fixed_ctrl_field(ctrl_reg, idx) \ + (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) + +-- +2.51.0 + diff --git a/queue-6.17/sched-fair-update_cfs_group-for-throttled-cfs_rqs.patch b/queue-6.17/sched-fair-update_cfs_group-for-throttled-cfs_rqs.patch new file mode 100644 index 0000000000..318b3d523b --- /dev/null +++ b/queue-6.17/sched-fair-update_cfs_group-for-throttled-cfs_rqs.patch @@ -0,0 +1,55 @@ +From cf5b7d7a98bfe5768f519b0840be34ee7ef9d389 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 10 Sep 2025 17:50:42 +0800 +Subject: sched/fair: update_cfs_group() for throttled cfs_rqs + +From: Aaron Lu + +[ Upstream commit 
fcd394866e3db344cbe0bb485d7e3f741ac07245 ] + +With task based throttle model, tasks in a throttled hierarchy are +allowed to continue to run if they are running in kernel mode. For this +reason, PELT clock is not stopped for these cfs_rqs in throttled +hierarchy when they still have tasks running or queued. + +Since PELT clock is not stopped, whether to allow update_cfs_group() +doing its job for cfs_rqs which are in throttled hierarchy but still +have tasks running/queued is a question. + +The good side is, continue to run update_cfs_group() can get these +cfs_rq entities with an up2date weight and that up2date weight can be +useful to derive an accurate load for the CPU as well as ensure fairness +if multiple tasks of different cgroups are running on the same CPU. +OTOH, as Benjamin Segall pointed: when unthrottle comes around the most +likely correct distribution is the distribution we had at the time of +throttle. + +In reality, either way may not matter that much if tasks in throttled +hierarchy don't run in kernel mode for too long. But in case that +happens, let these cfs_rq entities have an up2date weight seems a good +thing to do. + +Signed-off-by: Aaron Lu +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Sasha Levin +--- + kernel/sched/fair.c | 3 --- + 1 file changed, 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 4770d25ae2406..3e0d999e5ee2c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se) + if (!gcfs_rq || !gcfs_rq->load.weight) + return; + +- if (throttled_hierarchy(gcfs_rq)) +- return; +- + shares = calc_group_shares(gcfs_rq); + if (unlikely(se->load.weight != shares)) + reweight_entity(cfs_rq_of(se), se, shares); +-- +2.51.0 + diff --git a/queue-6.17/sched_ext-keep-bypass-on-between-enable-failure-and-.patch b/queue-6.17/sched_ext-keep-bypass-on-between-enable-failure-and-.patch new file mode 100644 index 0000000000..2f82f070a4 --- /dev/null +++ b/queue-6.17/sched_ext-keep-bypass-on-between-enable-failure-and-.patch @@ -0,0 +1,48 @@ +From 89d634457fa0b1abe8647e67fdc54d9c13669cb9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Sep 2025 11:33:28 -1000 +Subject: sched_ext: Keep bypass on between enable failure and + scx_disable_workfn() + +From: Tejun Heo + +[ Upstream commit 4a1d9d73aabc8f97f48c4f84f936de3b265ffd6f ] + +scx_enable() turns on the bypass mode while enable is in progress. If +enabling fails, it turns off the bypass mode and then triggers scx_error(). +scx_error() will trigger scx_disable_workfn() which will turn on the bypass +mode again and unload the failed scheduler. + +This moves the system out of bypass mode between the enable error path and +the disable path, which is unnecessary and can be brittle - e.g. the thread +running scx_enable() may already be on the failed scheduler and can be +switched out before it triggers scx_error() leading to a stall. The watchdog +would eventually kick in, so the situation isn't critical but is still +suboptimal. + +There is nothing to be gained by turning off the bypass mode between +scx_enable() failure and scx_disable_workfn(). Keep bypass on. 
+ +Signed-off-by: Tejun Heo +Acked-by: Andrea Righi +Signed-off-by: Sasha Levin +--- + kernel/sched/ext.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index f89894476e51f..14724dae0b795 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -4763,7 +4763,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) + err_disable_unlock_all: + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); +- scx_bypass(false); ++ /* we'll soon enter disable path, keep bypass on */ + err_disable: + mutex_unlock(&scx_enable_mutex); + /* +-- +2.51.0 + diff --git a/queue-6.17/sched_ext-make-qmap-dump-operation-non-destructive.patch b/queue-6.17/sched_ext-make-qmap-dump-operation-non-destructive.patch new file mode 100644 index 0000000000..d0d414ae46 --- /dev/null +++ b/queue-6.17/sched_ext-make-qmap-dump-operation-non-destructive.patch @@ -0,0 +1,70 @@ +From d32b2132f0dc2b791769a353ae1bfafc2a0df0e2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Sep 2025 09:03:26 -1000 +Subject: sched_ext: Make qmap dump operation non-destructive + +From: Tejun Heo + +[ Upstream commit d452972858e5cfa4262320ab74fe8f016460b96f ] + +The qmap dump operation was destructively consuming queue entries while +displaying them. As dump can be triggered anytime, this can easily lead to +stalls. Add a temporary dump_store queue and modify the dump logic to pop +entries, display them, and then restore them back to the original queue. +This allows dump operations to be performed without affecting the +scheduler's queue state. + +Note that if racing against new enqueues during dump, ordering can get +mixed up, but this is acceptable for debugging purposes. + +Acked-by: Andrea Righi +Signed-off-by: Tejun Heo +Signed-off-by: Sasha Levin +--- + tools/sched_ext/scx_qmap.bpf.c | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c +index 69d877501cb72..cd50a94326e3a 100644 +--- a/tools/sched_ext/scx_qmap.bpf.c ++++ b/tools/sched_ext/scx_qmap.bpf.c +@@ -56,7 +56,8 @@ struct qmap { + queue1 SEC(".maps"), + queue2 SEC(".maps"), + queue3 SEC(".maps"), +- queue4 SEC(".maps"); ++ queue4 SEC(".maps"), ++ dump_store SEC(".maps"); + + struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); +@@ -578,11 +579,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) + return; + + scx_bpf_dump("QMAP FIFO[%d]:", i); ++ ++ /* ++ * Dump can be invoked anytime and there is no way to iterate in ++ * a non-destructive way. Pop and store in dump_store and then ++ * restore afterwards. If racing against new enqueues, ordering ++ * can get mixed up. 
++ */ + bpf_repeat(4096) { + if (bpf_map_pop_elem(fifo, &pid)) + break; ++ bpf_map_push_elem(&dump_store, &pid, 0); + scx_bpf_dump(" %d", pid); + } ++ ++ bpf_repeat(4096) { ++ if (bpf_map_pop_elem(&dump_store, &pid)) ++ break; ++ bpf_map_push_elem(fifo, &pid, 0); ++ } ++ + scx_bpf_dump("\n"); + } + } +-- +2.51.0 + diff --git a/queue-6.17/sched_ext-move-internal-type-and-accessor-definition.patch b/queue-6.17/sched_ext-move-internal-type-and-accessor-definition.patch new file mode 100644 index 0000000000..410bc23308 --- /dev/null +++ b/queue-6.17/sched_ext-move-internal-type-and-accessor-definition.patch @@ -0,0 +1,2189 @@ +From 63f4bd85d580e08409e9128b0715e253a2e0697f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Sep 2025 11:33:28 -1000 +Subject: sched_ext: Move internal type and accessor definitions to + ext_internal.h + +From: Tejun Heo + +[ Upstream commit 0c2b8356e430229efef42b03bd765a2a7ecf73fd ] + +There currently isn't a place to place SCX-internal types and accessors to +be shared between ext.c and ext_idle.c. Create kernel/sched/ext_internal.h +and move internal type and accessor definitions there. This trims ext.c a +bit and makes future additions easier. Pure code reorganization. No +functional changes. + +Signed-off-by: Tejun Heo +Acked-by: Andrea Righi +Stable-dep-of: efeeaac9ae97 ("sched_ext: Sync error_irq_work before freeing scx_sched") +Signed-off-by: Sasha Levin +--- + kernel/sched/build_policy.c | 1 + + kernel/sched/ext.c | 1034 ---------------------------------- + kernel/sched/ext.h | 23 - + kernel/sched/ext_internal.h | 1061 +++++++++++++++++++++++++++++++++++ + 4 files changed, 1062 insertions(+), 1057 deletions(-) + create mode 100644 kernel/sched/ext_internal.h + +diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c +index c4a488e67aa7d..755883faf7518 100644 +--- a/kernel/sched/build_policy.c ++++ b/kernel/sched/build_policy.c +@@ -58,6 +58,7 @@ + #include "deadline.c" + + #ifdef CONFIG_SCHED_CLASS_EXT ++# include "ext_internal.h" + # include "ext.c" + # include "ext_idle.c" + #endif +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 088ceff38c8a4..8ecde1abb4e28 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -9,1040 +9,6 @@ + #include + #include "ext_idle.h" + +-#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) +- +-enum scx_consts { +- SCX_DSP_DFL_MAX_BATCH = 32, +- SCX_DSP_MAX_LOOPS = 32, +- SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, +- +- SCX_EXIT_BT_LEN = 64, +- SCX_EXIT_MSG_LEN = 1024, +- SCX_EXIT_DUMP_DFL_LEN = 32768, +- +- SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, +- +- /* +- * Iterating all tasks may take a while. Periodically drop +- * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. +- */ +- SCX_TASK_ITER_BATCH = 32, +-}; +- +-enum scx_exit_kind { +- SCX_EXIT_NONE, +- SCX_EXIT_DONE, +- +- SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ +- SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ +- SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ +- SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ +- +- SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ +- SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ +- SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +-}; +- +-/* +- * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), +- * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes +- * are 64bit of the format: +- * +- * Bits: [63 .. 48 47 .. 
32 31 .. 0] +- * [ SYS ACT ] [ SYS RSN ] [ USR ] +- * +- * SYS ACT: System-defined exit actions +- * SYS RSN: System-defined exit reasons +- * USR : User-defined exit codes and reasons +- * +- * Using the above, users may communicate intention and context by ORing system +- * actions and/or system reasons with a user-defined exit code. +- */ +-enum scx_exit_code { +- /* Reasons */ +- SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, +- +- /* Actions */ +- SCX_ECODE_ACT_RESTART = 1LLU << 48, +-}; +- +-/* +- * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is +- * being disabled. +- */ +-struct scx_exit_info { +- /* %SCX_EXIT_* - broad category of the exit reason */ +- enum scx_exit_kind kind; +- +- /* exit code if gracefully exiting */ +- s64 exit_code; +- +- /* textual representation of the above */ +- const char *reason; +- +- /* backtrace if exiting due to an error */ +- unsigned long *bt; +- u32 bt_len; +- +- /* informational message */ +- char *msg; +- +- /* debug dump */ +- char *dump; +-}; +- +-/* sched_ext_ops.flags */ +-enum scx_ops_flags { +- /* +- * Keep built-in idle tracking even if ops.update_idle() is implemented. +- */ +- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, +- +- /* +- * By default, if there are no other task to run on the CPU, ext core +- * keeps running the current task even after its slice expires. If this +- * flag is specified, such tasks are passed to ops.enqueue() with +- * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. +- */ +- SCX_OPS_ENQ_LAST = 1LLU << 1, +- +- /* +- * An exiting task may schedule after PF_EXITING is set. In such cases, +- * bpf_task_from_pid() may not be able to find the task and if the BPF +- * scheduler depends on pid lookup for dispatching, the task will be +- * lost leading to various issues including RCU grace period stalls. +- * +- * To mask this problem, by default, unhashed tasks are automatically +- * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't +- * depend on pid lookups and wants to handle these tasks directly, the +- * following flag can be used. +- */ +- SCX_OPS_ENQ_EXITING = 1LLU << 2, +- +- /* +- * If set, only tasks with policy set to SCHED_EXT are attached to +- * sched_ext. If clear, SCHED_NORMAL tasks are also included. +- */ +- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, +- +- /* +- * A migration disabled task can only execute on its current CPU. By +- * default, such tasks are automatically put on the CPU's local DSQ with +- * the default slice on enqueue. If this ops flag is set, they also go +- * through ops.enqueue(). +- * +- * A migration disabled task never invokes ops.select_cpu() as it can +- * only select the current CPU. Also, p->cpus_ptr will only contain its +- * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr +- * and thus may disagree with cpumask_weight(p->cpus_ptr). +- */ +- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, +- +- /* +- * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes +- * ops.enqueue() on the ops.select_cpu() selected or the wakee's +- * previous CPU via IPI (inter-processor interrupt) to reduce cacheline +- * transfers. When this optimization is enabled, ops.select_cpu() is +- * skipped in some cases (when racing against the wakee switching out). +- * As the BPF scheduler may depend on ops.select_cpu() being invoked +- * during wakeups, queued wakeup is disabled by default. 
+- * +- * If this ops flag is set, queued wakeup optimization is enabled and +- * the BPF scheduler must be able to handle ops.enqueue() invoked on the +- * wakee's CPU without preceding ops.select_cpu() even for tasks which +- * may be executed on multiple CPUs. +- */ +- SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, +- +- /* +- * If set, enable per-node idle cpumasks. If clear, use a single global +- * flat idle cpumask. +- */ +- SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, +- +- /* +- * CPU cgroup support flags +- */ +- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ +- +- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | +- SCX_OPS_ENQ_LAST | +- SCX_OPS_ENQ_EXITING | +- SCX_OPS_ENQ_MIGRATION_DISABLED | +- SCX_OPS_ALLOW_QUEUED_WAKEUP | +- SCX_OPS_SWITCH_PARTIAL | +- SCX_OPS_BUILTIN_IDLE_PER_NODE | +- SCX_OPS_HAS_CGROUP_WEIGHT, +- +- /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ +- __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, +- +- SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, +-}; +- +-/* argument container for ops.init_task() */ +-struct scx_init_task_args { +- /* +- * Set if ops.init_task() is being invoked on the fork path, as opposed +- * to the scheduler transition path. +- */ +- bool fork; +-#ifdef CONFIG_EXT_GROUP_SCHED +- /* the cgroup the task is joining */ +- struct cgroup *cgroup; +-#endif +-}; +- +-/* argument container for ops.exit_task() */ +-struct scx_exit_task_args { +- /* Whether the task exited before running on sched_ext. */ +- bool cancelled; +-}; +- +-/* argument container for ops->cgroup_init() */ +-struct scx_cgroup_init_args { +- /* the weight of the cgroup [1..10000] */ +- u32 weight; +- +- /* bandwidth control parameters from cpu.max and cpu.max.burst */ +- u64 bw_period_us; +- u64 bw_quota_us; +- u64 bw_burst_us; +-}; +- +-enum scx_cpu_preempt_reason { +- /* next task is being scheduled by &sched_class_rt */ +- SCX_CPU_PREEMPT_RT, +- /* next task is being scheduled by &sched_class_dl */ +- SCX_CPU_PREEMPT_DL, +- /* next task is being scheduled by &sched_class_stop */ +- SCX_CPU_PREEMPT_STOP, +- /* unknown reason for SCX being preempted */ +- SCX_CPU_PREEMPT_UNKNOWN, +-}; +- +-/* +- * Argument container for ops->cpu_acquire(). Currently empty, but may be +- * expanded in the future. +- */ +-struct scx_cpu_acquire_args {}; +- +-/* argument container for ops->cpu_release() */ +-struct scx_cpu_release_args { +- /* the reason the CPU was preempted */ +- enum scx_cpu_preempt_reason reason; +- +- /* the task that's going to be scheduled on the CPU */ +- struct task_struct *task; +-}; +- +-/* +- * Informational context provided to dump operations. +- */ +-struct scx_dump_ctx { +- enum scx_exit_kind kind; +- s64 exit_code; +- const char *reason; +- u64 at_ns; +- u64 at_jiffies; +-}; +- +-/** +- * struct sched_ext_ops - Operation table for BPF scheduler implementation +- * +- * A BPF scheduler can implement an arbitrary scheduling policy by +- * implementing and loading operations in this table. Note that a userland +- * scheduling policy can also be implemented using the BPF scheduler +- * as a shim layer. +- */ +-struct sched_ext_ops { +- /** +- * @select_cpu: Pick the target CPU for a task which is being woken up +- * @p: task being woken up +- * @prev_cpu: the cpu @p was on before sleeping +- * @wake_flags: SCX_WAKE_* +- * +- * Decision made here isn't final. @p may be moved to any CPU while it +- * is getting dispatched for execution later. 
However, as @p is not on +- * the rq at this point, getting the eventual execution CPU right here +- * saves a small bit of overhead down the line. +- * +- * If an idle CPU is returned, the CPU is kicked and will try to +- * dispatch. While an explicit custom mechanism can be added, +- * select_cpu() serves as the default way to wake up idle CPUs. +- * +- * @p may be inserted into a DSQ directly by calling +- * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. +- * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ +- * of the CPU returned by this operation. +- * +- * Note that select_cpu() is never called for tasks that can only run +- * on a single CPU or tasks with migration disabled, as they don't have +- * the option to select a different CPU. See select_task_rq() for +- * details. +- */ +- s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); +- +- /** +- * @enqueue: Enqueue a task on the BPF scheduler +- * @p: task being enqueued +- * @enq_flags: %SCX_ENQ_* +- * +- * @p is ready to run. Insert directly into a DSQ by calling +- * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly +- * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, +- * the task will stall. +- * +- * If @p was inserted into a DSQ from ops.select_cpu(), this callback is +- * skipped. +- */ +- void (*enqueue)(struct task_struct *p, u64 enq_flags); +- +- /** +- * @dequeue: Remove a task from the BPF scheduler +- * @p: task being dequeued +- * @deq_flags: %SCX_DEQ_* +- * +- * Remove @p from the BPF scheduler. This is usually called to isolate +- * the task while updating its scheduling properties (e.g. priority). +- * +- * The ext core keeps track of whether the BPF side owns a given task or +- * not and can gracefully ignore spurious dispatches from BPF side, +- * which makes it safe to not implement this method. However, depending +- * on the scheduling logic, this can lead to confusing behaviors - e.g. +- * scheduling position not being updated across a priority change. +- */ +- void (*dequeue)(struct task_struct *p, u64 deq_flags); +- +- /** +- * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs +- * @cpu: CPU to dispatch tasks for +- * @prev: previous task being switched out +- * +- * Called when a CPU's local dsq is empty. The operation should dispatch +- * one or more tasks from the BPF scheduler into the DSQs using +- * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ +- * using scx_bpf_dsq_move_to_local(). +- * +- * The maximum number of times scx_bpf_dsq_insert() can be called +- * without an intervening scx_bpf_dsq_move_to_local() is specified by +- * ops.dispatch_max_batch. See the comments on top of the two functions +- * for more details. +- * +- * When not %NULL, @prev is an SCX task with its slice depleted. If +- * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in +- * @prev->scx.flags, it is not enqueued yet and will be enqueued after +- * ops.dispatch() returns. To keep executing @prev, return without +- * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. +- */ +- void (*dispatch)(s32 cpu, struct task_struct *prev); +- +- /** +- * @tick: Periodic tick +- * @p: task running currently +- * +- * This operation is called every 1/HZ seconds on CPUs which are +- * executing an SCX task. Setting @p->scx.slice to 0 will trigger an +- * immediate dispatch cycle on the CPU. 
+- */ +- void (*tick)(struct task_struct *p); +- +- /** +- * @runnable: A task is becoming runnable on its associated CPU +- * @p: task becoming runnable +- * @enq_flags: %SCX_ENQ_* +- * +- * This and the following three functions can be used to track a task's +- * execution state transitions. A task becomes ->runnable() on a CPU, +- * and then goes through one or more ->running() and ->stopping() pairs +- * as it runs on the CPU, and eventually becomes ->quiescent() when it's +- * done running on the CPU. +- * +- * @p is becoming runnable on the CPU because it's +- * +- * - waking up (%SCX_ENQ_WAKEUP) +- * - being moved from another CPU +- * - being restored after temporarily taken off the queue for an +- * attribute change. +- * +- * This and ->enqueue() are related but not coupled. This operation +- * notifies @p's state transition and may not be followed by ->enqueue() +- * e.g. when @p is being dispatched to a remote CPU, or when @p is +- * being enqueued on a CPU experiencing a hotplug event. Likewise, a +- * task may be ->enqueue()'d without being preceded by this operation +- * e.g. after exhausting its slice. +- */ +- void (*runnable)(struct task_struct *p, u64 enq_flags); +- +- /** +- * @running: A task is starting to run on its associated CPU +- * @p: task starting to run +- * +- * Note that this callback may be called from a CPU other than the +- * one the task is going to run on. This can happen when a task +- * property is changed (i.e., affinity), since scx_next_task_scx(), +- * which triggers this callback, may run on a CPU different from +- * the task's assigned CPU. +- * +- * Therefore, always use scx_bpf_task_cpu(@p) to determine the +- * target CPU the task is going to use. +- * +- * See ->runnable() for explanation on the task state notifiers. +- */ +- void (*running)(struct task_struct *p); +- +- /** +- * @stopping: A task is stopping execution +- * @p: task stopping to run +- * @runnable: is task @p still runnable? +- * +- * Note that this callback may be called from a CPU other than the +- * one the task was running on. This can happen when a task +- * property is changed (i.e., affinity), since dequeue_task_scx(), +- * which triggers this callback, may run on a CPU different from +- * the task's assigned CPU. +- * +- * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU +- * the task was running on. +- * +- * See ->runnable() for explanation on the task state notifiers. If +- * !@runnable, ->quiescent() will be invoked after this operation +- * returns. +- */ +- void (*stopping)(struct task_struct *p, bool runnable); +- +- /** +- * @quiescent: A task is becoming not runnable on its associated CPU +- * @p: task becoming not runnable +- * @deq_flags: %SCX_DEQ_* +- * +- * See ->runnable() for explanation on the task state notifiers. +- * +- * @p is becoming quiescent on the CPU because it's +- * +- * - sleeping (%SCX_DEQ_SLEEP) +- * - being moved to another CPU +- * - being temporarily taken off the queue for an attribute change +- * (%SCX_DEQ_SAVE) +- * +- * This and ->dequeue() are related but not coupled. This operation +- * notifies @p's state transition and may not be preceded by ->dequeue() +- * e.g. when @p is being dispatched to a remote CPU. +- */ +- void (*quiescent)(struct task_struct *p, u64 deq_flags); +- +- /** +- * @yield: Yield CPU +- * @from: yielding task +- * @to: optional yield target task +- * +- * If @to is NULL, @from is yielding the CPU to other runnable tasks. 
+- * The BPF scheduler should ensure that other available tasks are +- * dispatched before the yielding task. Return value is ignored in this +- * case. +- * +- * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf +- * scheduler can implement the request, return %true; otherwise, %false. +- */ +- bool (*yield)(struct task_struct *from, struct task_struct *to); +- +- /** +- * @core_sched_before: Task ordering for core-sched +- * @a: task A +- * @b: task B +- * +- * Used by core-sched to determine the ordering between two tasks. See +- * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on +- * core-sched. +- * +- * Both @a and @b are runnable and may or may not currently be queued on +- * the BPF scheduler. Should return %true if @a should run before @b. +- * %false if there's no required ordering or @b should run before @a. +- * +- * If not specified, the default is ordering them according to when they +- * became runnable. +- */ +- bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); +- +- /** +- * @set_weight: Set task weight +- * @p: task to set weight for +- * @weight: new weight [1..10000] +- * +- * Update @p's weight to @weight. +- */ +- void (*set_weight)(struct task_struct *p, u32 weight); +- +- /** +- * @set_cpumask: Set CPU affinity +- * @p: task to set CPU affinity for +- * @cpumask: cpumask of cpus that @p can run on +- * +- * Update @p's CPU affinity to @cpumask. +- */ +- void (*set_cpumask)(struct task_struct *p, +- const struct cpumask *cpumask); +- +- /** +- * @update_idle: Update the idle state of a CPU +- * @cpu: CPU to update the idle state for +- * @idle: whether entering or exiting the idle state +- * +- * This operation is called when @rq's CPU goes or leaves the idle +- * state. By default, implementing this operation disables the built-in +- * idle CPU tracking and the following helpers become unavailable: +- * +- * - scx_bpf_select_cpu_dfl() +- * - scx_bpf_select_cpu_and() +- * - scx_bpf_test_and_clear_cpu_idle() +- * - scx_bpf_pick_idle_cpu() +- * +- * The user also must implement ops.select_cpu() as the default +- * implementation relies on scx_bpf_select_cpu_dfl(). +- * +- * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle +- * tracking. +- */ +- void (*update_idle)(s32 cpu, bool idle); +- +- /** +- * @cpu_acquire: A CPU is becoming available to the BPF scheduler +- * @cpu: The CPU being acquired by the BPF scheduler. +- * @args: Acquire arguments, see the struct definition. +- * +- * A CPU that was previously released from the BPF scheduler is now once +- * again under its control. +- */ +- void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); +- +- /** +- * @cpu_release: A CPU is taken away from the BPF scheduler +- * @cpu: The CPU being released by the BPF scheduler. +- * @args: Release arguments, see the struct definition. +- * +- * The specified CPU is no longer under the control of the BPF +- * scheduler. This could be because it was preempted by a higher +- * priority sched_class, though there may be other reasons as well. The +- * caller should consult @args->reason to determine the cause. +- */ +- void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); +- +- /** +- * @init_task: Initialize a task to run in a BPF scheduler +- * @p: task to initialize for BPF scheduling +- * @args: init arguments, see the struct definition +- * +- * Either we're loading a BPF scheduler or a new task is being forked. +- * Initialize @p for BPF scheduling. 
This operation may block and can +- * be used for allocations, and is called exactly once for a task. +- * +- * Return 0 for success, -errno for failure. An error return while +- * loading will abort loading of the BPF scheduler. During a fork, it +- * will abort that specific fork. +- */ +- s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); +- +- /** +- * @exit_task: Exit a previously-running task from the system +- * @p: task to exit +- * @args: exit arguments, see the struct definition +- * +- * @p is exiting or the BPF scheduler is being unloaded. Perform any +- * necessary cleanup for @p. +- */ +- void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); +- +- /** +- * @enable: Enable BPF scheduling for a task +- * @p: task to enable BPF scheduling for +- * +- * Enable @p for BPF scheduling. enable() is called on @p any time it +- * enters SCX, and is always paired with a matching disable(). +- */ +- void (*enable)(struct task_struct *p); +- +- /** +- * @disable: Disable BPF scheduling for a task +- * @p: task to disable BPF scheduling for +- * +- * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. +- * Disable BPF scheduling for @p. A disable() call is always matched +- * with a prior enable() call. +- */ +- void (*disable)(struct task_struct *p); +- +- /** +- * @dump: Dump BPF scheduler state on error +- * @ctx: debug dump context +- * +- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. +- */ +- void (*dump)(struct scx_dump_ctx *ctx); +- +- /** +- * @dump_cpu: Dump BPF scheduler state for a CPU on error +- * @ctx: debug dump context +- * @cpu: CPU to generate debug dump for +- * @idle: @cpu is currently idle without any runnable tasks +- * +- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for +- * @cpu. If @idle is %true and this operation doesn't produce any +- * output, @cpu is skipped for dump. +- */ +- void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); +- +- /** +- * @dump_task: Dump BPF scheduler state for a runnable task on error +- * @ctx: debug dump context +- * @p: runnable task to generate debug dump for +- * +- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for +- * @p. +- */ +- void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); +- +-#ifdef CONFIG_EXT_GROUP_SCHED +- /** +- * @cgroup_init: Initialize a cgroup +- * @cgrp: cgroup being initialized +- * @args: init arguments, see the struct definition +- * +- * Either the BPF scheduler is being loaded or @cgrp created, initialize +- * @cgrp for sched_ext. This operation may block. +- * +- * Return 0 for success, -errno for failure. An error return while +- * loading will abort loading of the BPF scheduler. During cgroup +- * creation, it will abort the specific cgroup creation. +- */ +- s32 (*cgroup_init)(struct cgroup *cgrp, +- struct scx_cgroup_init_args *args); +- +- /** +- * @cgroup_exit: Exit a cgroup +- * @cgrp: cgroup being exited +- * +- * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit +- * @cgrp for sched_ext. This operation my block. +- */ +- void (*cgroup_exit)(struct cgroup *cgrp); +- +- /** +- * @cgroup_prep_move: Prepare a task to be moved to a different cgroup +- * @p: task being moved +- * @from: cgroup @p is being moved from +- * @to: cgroup @p is being moved to +- * +- * Prepare @p for move from cgroup @from to @to. This operation may +- * block and can be used for allocations. +- * +- * Return 0 for success, -errno for failure. 
An error return aborts the +- * migration. +- */ +- s32 (*cgroup_prep_move)(struct task_struct *p, +- struct cgroup *from, struct cgroup *to); +- +- /** +- * @cgroup_move: Commit cgroup move +- * @p: task being moved +- * @from: cgroup @p is being moved from +- * @to: cgroup @p is being moved to +- * +- * Commit the move. @p is dequeued during this operation. +- */ +- void (*cgroup_move)(struct task_struct *p, +- struct cgroup *from, struct cgroup *to); +- +- /** +- * @cgroup_cancel_move: Cancel cgroup move +- * @p: task whose cgroup move is being canceled +- * @from: cgroup @p was being moved from +- * @to: cgroup @p was being moved to +- * +- * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). +- * Undo the preparation. +- */ +- void (*cgroup_cancel_move)(struct task_struct *p, +- struct cgroup *from, struct cgroup *to); +- +- /** +- * @cgroup_set_weight: A cgroup's weight is being changed +- * @cgrp: cgroup whose weight is being updated +- * @weight: new weight [1..10000] +- * +- * Update @cgrp's weight to @weight. +- */ +- void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); +- +- /** +- * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed +- * @cgrp: cgroup whose bandwidth is being updated +- * @period_us: bandwidth control period +- * @quota_us: bandwidth control quota +- * @burst_us: bandwidth control burst +- * +- * Update @cgrp's bandwidth control parameters. This is from the cpu.max +- * cgroup interface. +- * +- * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled +- * to. For example, if @period_us is 1_000_000 and @quota_us is +- * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be +- * interpreted in the same fashion and specifies how much @cgrp can +- * burst temporarily. The specific control mechanism and thus the +- * interpretation of @period_us and burstiness is upto to the BPF +- * scheduler. +- */ +- void (*cgroup_set_bandwidth)(struct cgroup *cgrp, +- u64 period_us, u64 quota_us, u64 burst_us); +- +-#endif /* CONFIG_EXT_GROUP_SCHED */ +- +- /* +- * All online ops must come before ops.cpu_online(). +- */ +- +- /** +- * @cpu_online: A CPU became online +- * @cpu: CPU which just came up +- * +- * @cpu just came online. @cpu will not call ops.enqueue() or +- * ops.dispatch(), nor run tasks associated with other CPUs beforehand. +- */ +- void (*cpu_online)(s32 cpu); +- +- /** +- * @cpu_offline: A CPU is going offline +- * @cpu: CPU which is going offline +- * +- * @cpu is going offline. @cpu will not call ops.enqueue() or +- * ops.dispatch(), nor run tasks associated with other CPUs afterwards. +- */ +- void (*cpu_offline)(s32 cpu); +- +- /* +- * All CPU hotplug ops must come before ops.init(). +- */ +- +- /** +- * @init: Initialize the BPF scheduler +- */ +- s32 (*init)(void); +- +- /** +- * @exit: Clean up after the BPF scheduler +- * @info: Exit info +- * +- * ops.exit() is also called on ops.init() failure, which is a bit +- * unusual. This is to allow rich reporting through @info on how +- * ops.init() failed. +- */ +- void (*exit)(struct scx_exit_info *info); +- +- /** +- * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch +- */ +- u32 dispatch_max_batch; +- +- /** +- * @flags: %SCX_OPS_* flags +- */ +- u64 flags; +- +- /** +- * @timeout_ms: The maximum amount of time, in milliseconds, that a +- * runnable task should be able to wait before being scheduled. The +- * maximum timeout may not exceed the default timeout of 30 seconds. 
+- * +- * Defaults to the maximum allowed timeout value of 30 seconds. +- */ +- u32 timeout_ms; +- +- /** +- * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default +- * value of 32768 is used. +- */ +- u32 exit_dump_len; +- +- /** +- * @hotplug_seq: A sequence number that may be set by the scheduler to +- * detect when a hotplug event has occurred during the loading process. +- * If 0, no detection occurs. Otherwise, the scheduler will fail to +- * load if the sequence number does not match @scx_hotplug_seq on the +- * enable path. +- */ +- u64 hotplug_seq; +- +- /** +- * @name: BPF scheduler's name +- * +- * Must be a non-zero valid BPF object name including only isalnum(), +- * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the +- * BPF scheduler is enabled. +- */ +- char name[SCX_OPS_NAME_LEN]; +- +- /* internal use only, must be NULL */ +- void *priv; +-}; +- +-enum scx_opi { +- SCX_OPI_BEGIN = 0, +- SCX_OPI_NORMAL_BEGIN = 0, +- SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), +- SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), +- SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), +- SCX_OPI_END = SCX_OP_IDX(init), +-}; +- +-/* +- * Collection of event counters. Event types are placed in descending order. +- */ +-struct scx_event_stats { +- /* +- * If ops.select_cpu() returns a CPU which can't be used by the task, +- * the core scheduler code silently picks a fallback CPU. +- */ +- s64 SCX_EV_SELECT_CPU_FALLBACK; +- +- /* +- * When dispatching to a local DSQ, the CPU may have gone offline in +- * the meantime. In this case, the task is bounced to the global DSQ. +- */ +- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; +- +- /* +- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task +- * continued to run because there were no other tasks on the CPU. +- */ +- s64 SCX_EV_DISPATCH_KEEP_LAST; +- +- /* +- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task +- * is dispatched to a local DSQ when exiting. +- */ +- s64 SCX_EV_ENQ_SKIP_EXITING; +- +- /* +- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a +- * migration disabled task skips ops.enqueue() and is dispatched to its +- * local DSQ. +- */ +- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; +- +- /* +- * Total number of times a task's time slice was refilled with the +- * default value (SCX_SLICE_DFL). +- */ +- s64 SCX_EV_REFILL_SLICE_DFL; +- +- /* +- * The total duration of bypass modes in nanoseconds. +- */ +- s64 SCX_EV_BYPASS_DURATION; +- +- /* +- * The number of tasks dispatched in the bypassing mode. +- */ +- s64 SCX_EV_BYPASS_DISPATCH; +- +- /* +- * The number of times the bypassing mode has been activated. +- */ +- s64 SCX_EV_BYPASS_ACTIVATE; +-}; +- +-struct scx_sched { +- struct sched_ext_ops ops; +- DECLARE_BITMAP(has_op, SCX_OPI_END); +- +- /* +- * Dispatch queues. +- * +- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. +- * This is to avoid live-locking in bypass mode where all tasks are +- * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If +- * per-node split isn't sufficient, it can be further split. +- */ +- struct rhashtable dsq_hash; +- struct scx_dispatch_q **global_dsqs; +- +- /* +- * The event counters are in a per-CPU variable to minimize the +- * accounting overhead. A system-wide view on the event counter is +- * constructed when requested by scx_bpf_events(). 
+- */ +- struct scx_event_stats __percpu *event_stats_cpu; +- +- bool warned_zero_slice; +- +- atomic_t exit_kind; +- struct scx_exit_info *exit_info; +- +- struct kobject kobj; +- +- struct kthread_worker *helper; +- struct irq_work error_irq_work; +- struct kthread_work disable_work; +- struct rcu_work rcu_work; +-}; +- +-enum scx_wake_flags { +- /* expose select WF_* flags as enums */ +- SCX_WAKE_FORK = WF_FORK, +- SCX_WAKE_TTWU = WF_TTWU, +- SCX_WAKE_SYNC = WF_SYNC, +-}; +- +-enum scx_enq_flags { +- /* expose select ENQUEUE_* flags as enums */ +- SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, +- SCX_ENQ_HEAD = ENQUEUE_HEAD, +- SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, +- +- /* high 32bits are SCX specific */ +- +- /* +- * Set the following to trigger preemption when calling +- * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the +- * current task is cleared to zero and the CPU is kicked into the +- * scheduling path. Implies %SCX_ENQ_HEAD. +- */ +- SCX_ENQ_PREEMPT = 1LLU << 32, +- +- /* +- * The task being enqueued was previously enqueued on the current CPU's +- * %SCX_DSQ_LOCAL, but was removed from it in a call to the +- * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was +- * invoked in a ->cpu_release() callback, and the task is again +- * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the +- * task will not be scheduled on the CPU until at least the next invocation +- * of the ->cpu_acquire() callback. +- */ +- SCX_ENQ_REENQ = 1LLU << 40, +- +- /* +- * The task being enqueued is the only task available for the cpu. By +- * default, ext core keeps executing such tasks but when +- * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the +- * %SCX_ENQ_LAST flag set. +- * +- * The BPF scheduler is responsible for triggering a follow-up +- * scheduling event. Otherwise, Execution may stall. +- */ +- SCX_ENQ_LAST = 1LLU << 41, +- +- /* high 8 bits are internal */ +- __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, +- +- SCX_ENQ_CLEAR_OPSS = 1LLU << 56, +- SCX_ENQ_DSQ_PRIQ = 1LLU << 57, +-}; +- +-enum scx_deq_flags { +- /* expose select DEQUEUE_* flags as enums */ +- SCX_DEQ_SLEEP = DEQUEUE_SLEEP, +- +- /* high 32bits are SCX specific */ +- +- /* +- * The generic core-sched layer decided to execute the task even though +- * it hasn't been dispatched yet. Dequeue from the BPF side. +- */ +- SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +-}; +- +-enum scx_pick_idle_cpu_flags { +- SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ +- SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ +-}; +- +-enum scx_kick_flags { +- /* +- * Kick the target CPU if idle. Guarantees that the target CPU goes +- * through at least one full scheduling cycle before going idle. If the +- * target CPU can be determined to be currently not idle and going to go +- * through a scheduling cycle before going idle, noop. +- */ +- SCX_KICK_IDLE = 1LLU << 0, +- +- /* +- * Preempt the current task and execute the dispatch path. If the +- * current task of the target CPU is an SCX task, its ->scx.slice is +- * cleared to zero before the scheduling path is invoked so that the +- * task expires and the dispatch path is invoked. +- */ +- SCX_KICK_PREEMPT = 1LLU << 1, +- +- /* +- * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will +- * return after the target CPU finishes picking the next task. 
+- */ +- SCX_KICK_WAIT = 1LLU << 2, +-}; +- +-enum scx_tg_flags { +- SCX_TG_ONLINE = 1U << 0, +- SCX_TG_INITED = 1U << 1, +-}; +- +-enum scx_enable_state { +- SCX_ENABLING, +- SCX_ENABLED, +- SCX_DISABLING, +- SCX_DISABLED, +-}; +- +-static const char *scx_enable_state_str[] = { +- [SCX_ENABLING] = "enabling", +- [SCX_ENABLED] = "enabled", +- [SCX_DISABLING] = "disabling", +- [SCX_DISABLED] = "disabled", +-}; +- +-/* +- * sched_ext_entity->ops_state +- * +- * Used to track the task ownership between the SCX core and the BPF scheduler. +- * State transitions look as follows: +- * +- * NONE -> QUEUEING -> QUEUED -> DISPATCHING +- * ^ | | +- * | v v +- * \-------------------------------/ +- * +- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call +- * sites for explanations on the conditions being waited upon and why they are +- * safe. Transitions out of them into NONE or QUEUED must store_release and the +- * waiters should load_acquire. +- * +- * Tracking scx_ops_state enables sched_ext core to reliably determine whether +- * any given task can be dispatched by the BPF scheduler at all times and thus +- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler +- * to try to dispatch any task anytime regardless of its state as the SCX core +- * can safely reject invalid dispatches. +- */ +-enum scx_ops_state { +- SCX_OPSS_NONE, /* owned by the SCX core */ +- SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ +- SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ +- SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ +- +- /* +- * QSEQ brands each QUEUED instance so that, when dispatch races +- * dequeue/requeue, the dispatcher can tell whether it still has a claim +- * on the task being dispatched. +- * +- * As some 32bit archs can't do 64bit store_release/load_acquire, +- * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on +- * 32bit machines. The dispatch race window QSEQ protects is very narrow +- * and runs with IRQ disabled. 30 bits should be sufficient. +- */ +- SCX_OPSS_QSEQ_SHIFT = 2, +-}; +- +-/* Use macros to ensure that the type is unsigned long for the masks */ +-#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +-#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) +- + /* + * NOTE: sched_ext is in the process of growing multiple scheduler support and + * scx_root usage is in a transitional state. Naked dereferences are safe if the +diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h +index 292bb41a242ec..33858607bc97f 100644 +--- a/kernel/sched/ext.h ++++ b/kernel/sched/ext.h +@@ -8,29 +8,6 @@ + */ + #ifdef CONFIG_SCHED_CLASS_EXT + +-static inline bool scx_kf_allowed_if_unlocked(void) +-{ +- return !current->scx.kf_mask; +-} +- +-static inline bool scx_rq_bypassing(struct rq *rq) +-{ +- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +-} +- +-DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); +- +-DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); +- +-/* +- * Return the rq currently locked from an scx callback, or NULL if no rq is +- * locked. 
+- */ +-static inline struct rq *scx_locked_rq(void) +-{ +- return __this_cpu_read(scx_locked_rq_state); +-} +- + void scx_tick(struct rq *rq); + void init_scx_entity(struct sched_ext_entity *scx); + void scx_pre_fork(struct task_struct *p); +diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h +new file mode 100644 +index 0000000000000..76690ede8700f +--- /dev/null ++++ b/kernel/sched/ext_internal.h +@@ -0,0 +1,1061 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst ++ * ++ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2025 Tejun Heo ++ */ ++#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) ++ ++enum scx_consts { ++ SCX_DSP_DFL_MAX_BATCH = 32, ++ SCX_DSP_MAX_LOOPS = 32, ++ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, ++ ++ SCX_EXIT_BT_LEN = 64, ++ SCX_EXIT_MSG_LEN = 1024, ++ SCX_EXIT_DUMP_DFL_LEN = 32768, ++ ++ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, ++ ++ /* ++ * Iterating all tasks may take a while. Periodically drop ++ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. ++ */ ++ SCX_TASK_ITER_BATCH = 32, ++}; ++ ++enum scx_exit_kind { ++ SCX_EXIT_NONE, ++ SCX_EXIT_DONE, ++ ++ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ ++ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ ++ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ ++ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ ++ ++ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ ++ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ ++ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ ++}; ++ ++/* ++ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), ++ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes ++ * are 64bit of the format: ++ * ++ * Bits: [63 .. 48 47 .. 32 31 .. 0] ++ * [ SYS ACT ] [ SYS RSN ] [ USR ] ++ * ++ * SYS ACT: System-defined exit actions ++ * SYS RSN: System-defined exit reasons ++ * USR : User-defined exit codes and reasons ++ * ++ * Using the above, users may communicate intention and context by ORing system ++ * actions and/or system reasons with a user-defined exit code. ++ */ ++enum scx_exit_code { ++ /* Reasons */ ++ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, ++ ++ /* Actions */ ++ SCX_ECODE_ACT_RESTART = 1LLU << 48, ++}; ++ ++/* ++ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is ++ * being disabled. ++ */ ++struct scx_exit_info { ++ /* %SCX_EXIT_* - broad category of the exit reason */ ++ enum scx_exit_kind kind; ++ ++ /* exit code if gracefully exiting */ ++ s64 exit_code; ++ ++ /* textual representation of the above */ ++ const char *reason; ++ ++ /* backtrace if exiting due to an error */ ++ unsigned long *bt; ++ u32 bt_len; ++ ++ /* informational message */ ++ char *msg; ++ ++ /* debug dump */ ++ char *dump; ++}; ++ ++/* sched_ext_ops.flags */ ++enum scx_ops_flags { ++ /* ++ * Keep built-in idle tracking even if ops.update_idle() is implemented. ++ */ ++ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, ++ ++ /* ++ * By default, if there are no other task to run on the CPU, ext core ++ * keeps running the current task even after its slice expires. If this ++ * flag is specified, such tasks are passed to ops.enqueue() with ++ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. ++ */ ++ SCX_OPS_ENQ_LAST = 1LLU << 1, ++ ++ /* ++ * An exiting task may schedule after PF_EXITING is set. 
In such cases, ++ * bpf_task_from_pid() may not be able to find the task and if the BPF ++ * scheduler depends on pid lookup for dispatching, the task will be ++ * lost leading to various issues including RCU grace period stalls. ++ * ++ * To mask this problem, by default, unhashed tasks are automatically ++ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't ++ * depend on pid lookups and wants to handle these tasks directly, the ++ * following flag can be used. ++ */ ++ SCX_OPS_ENQ_EXITING = 1LLU << 2, ++ ++ /* ++ * If set, only tasks with policy set to SCHED_EXT are attached to ++ * sched_ext. If clear, SCHED_NORMAL tasks are also included. ++ */ ++ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, ++ ++ /* ++ * A migration disabled task can only execute on its current CPU. By ++ * default, such tasks are automatically put on the CPU's local DSQ with ++ * the default slice on enqueue. If this ops flag is set, they also go ++ * through ops.enqueue(). ++ * ++ * A migration disabled task never invokes ops.select_cpu() as it can ++ * only select the current CPU. Also, p->cpus_ptr will only contain its ++ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr ++ * and thus may disagree with cpumask_weight(p->cpus_ptr). ++ */ ++ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, ++ ++ /* ++ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes ++ * ops.enqueue() on the ops.select_cpu() selected or the wakee's ++ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline ++ * transfers. When this optimization is enabled, ops.select_cpu() is ++ * skipped in some cases (when racing against the wakee switching out). ++ * As the BPF scheduler may depend on ops.select_cpu() being invoked ++ * during wakeups, queued wakeup is disabled by default. ++ * ++ * If this ops flag is set, queued wakeup optimization is enabled and ++ * the BPF scheduler must be able to handle ops.enqueue() invoked on the ++ * wakee's CPU without preceding ops.select_cpu() even for tasks which ++ * may be executed on multiple CPUs. ++ */ ++ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, ++ ++ /* ++ * If set, enable per-node idle cpumasks. If clear, use a single global ++ * flat idle cpumask. ++ */ ++ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, ++ ++ /* ++ * CPU cgroup support flags ++ */ ++ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ ++ ++ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | ++ SCX_OPS_ENQ_LAST | ++ SCX_OPS_ENQ_EXITING | ++ SCX_OPS_ENQ_MIGRATION_DISABLED | ++ SCX_OPS_ALLOW_QUEUED_WAKEUP | ++ SCX_OPS_SWITCH_PARTIAL | ++ SCX_OPS_BUILTIN_IDLE_PER_NODE | ++ SCX_OPS_HAS_CGROUP_WEIGHT, ++ ++ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ ++ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, ++ ++ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, ++}; ++ ++/* argument container for ops.init_task() */ ++struct scx_init_task_args { ++ /* ++ * Set if ops.init_task() is being invoked on the fork path, as opposed ++ * to the scheduler transition path. ++ */ ++ bool fork; ++#ifdef CONFIG_EXT_GROUP_SCHED ++ /* the cgroup the task is joining */ ++ struct cgroup *cgroup; ++#endif ++}; ++ ++/* argument container for ops.exit_task() */ ++struct scx_exit_task_args { ++ /* Whether the task exited before running on sched_ext. 
*/ ++ bool cancelled; ++}; ++ ++/* argument container for ops->cgroup_init() */ ++struct scx_cgroup_init_args { ++ /* the weight of the cgroup [1..10000] */ ++ u32 weight; ++ ++ /* bandwidth control parameters from cpu.max and cpu.max.burst */ ++ u64 bw_period_us; ++ u64 bw_quota_us; ++ u64 bw_burst_us; ++}; ++ ++enum scx_cpu_preempt_reason { ++ /* next task is being scheduled by &sched_class_rt */ ++ SCX_CPU_PREEMPT_RT, ++ /* next task is being scheduled by &sched_class_dl */ ++ SCX_CPU_PREEMPT_DL, ++ /* next task is being scheduled by &sched_class_stop */ ++ SCX_CPU_PREEMPT_STOP, ++ /* unknown reason for SCX being preempted */ ++ SCX_CPU_PREEMPT_UNKNOWN, ++}; ++ ++/* ++ * Argument container for ops->cpu_acquire(). Currently empty, but may be ++ * expanded in the future. ++ */ ++struct scx_cpu_acquire_args {}; ++ ++/* argument container for ops->cpu_release() */ ++struct scx_cpu_release_args { ++ /* the reason the CPU was preempted */ ++ enum scx_cpu_preempt_reason reason; ++ ++ /* the task that's going to be scheduled on the CPU */ ++ struct task_struct *task; ++}; ++ ++/* ++ * Informational context provided to dump operations. ++ */ ++struct scx_dump_ctx { ++ enum scx_exit_kind kind; ++ s64 exit_code; ++ const char *reason; ++ u64 at_ns; ++ u64 at_jiffies; ++}; ++ ++/** ++ * struct sched_ext_ops - Operation table for BPF scheduler implementation ++ * ++ * A BPF scheduler can implement an arbitrary scheduling policy by ++ * implementing and loading operations in this table. Note that a userland ++ * scheduling policy can also be implemented using the BPF scheduler ++ * as a shim layer. ++ */ ++struct sched_ext_ops { ++ /** ++ * @select_cpu: Pick the target CPU for a task which is being woken up ++ * @p: task being woken up ++ * @prev_cpu: the cpu @p was on before sleeping ++ * @wake_flags: SCX_WAKE_* ++ * ++ * Decision made here isn't final. @p may be moved to any CPU while it ++ * is getting dispatched for execution later. However, as @p is not on ++ * the rq at this point, getting the eventual execution CPU right here ++ * saves a small bit of overhead down the line. ++ * ++ * If an idle CPU is returned, the CPU is kicked and will try to ++ * dispatch. While an explicit custom mechanism can be added, ++ * select_cpu() serves as the default way to wake up idle CPUs. ++ * ++ * @p may be inserted into a DSQ directly by calling ++ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. ++ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ ++ * of the CPU returned by this operation. ++ * ++ * Note that select_cpu() is never called for tasks that can only run ++ * on a single CPU or tasks with migration disabled, as they don't have ++ * the option to select a different CPU. See select_task_rq() for ++ * details. ++ */ ++ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); ++ ++ /** ++ * @enqueue: Enqueue a task on the BPF scheduler ++ * @p: task being enqueued ++ * @enq_flags: %SCX_ENQ_* ++ * ++ * @p is ready to run. Insert directly into a DSQ by calling ++ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly ++ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, ++ * the task will stall. ++ * ++ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is ++ * skipped. ++ */ ++ void (*enqueue)(struct task_struct *p, u64 enq_flags); ++ ++ /** ++ * @dequeue: Remove a task from the BPF scheduler ++ * @p: task being dequeued ++ * @deq_flags: %SCX_DEQ_* ++ * ++ * Remove @p from the BPF scheduler. 
This is usually called to isolate ++ * the task while updating its scheduling properties (e.g. priority). ++ * ++ * The ext core keeps track of whether the BPF side owns a given task or ++ * not and can gracefully ignore spurious dispatches from BPF side, ++ * which makes it safe to not implement this method. However, depending ++ * on the scheduling logic, this can lead to confusing behaviors - e.g. ++ * scheduling position not being updated across a priority change. ++ */ ++ void (*dequeue)(struct task_struct *p, u64 deq_flags); ++ ++ /** ++ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs ++ * @cpu: CPU to dispatch tasks for ++ * @prev: previous task being switched out ++ * ++ * Called when a CPU's local dsq is empty. The operation should dispatch ++ * one or more tasks from the BPF scheduler into the DSQs using ++ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ ++ * using scx_bpf_dsq_move_to_local(). ++ * ++ * The maximum number of times scx_bpf_dsq_insert() can be called ++ * without an intervening scx_bpf_dsq_move_to_local() is specified by ++ * ops.dispatch_max_batch. See the comments on top of the two functions ++ * for more details. ++ * ++ * When not %NULL, @prev is an SCX task with its slice depleted. If ++ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in ++ * @prev->scx.flags, it is not enqueued yet and will be enqueued after ++ * ops.dispatch() returns. To keep executing @prev, return without ++ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. ++ */ ++ void (*dispatch)(s32 cpu, struct task_struct *prev); ++ ++ /** ++ * @tick: Periodic tick ++ * @p: task running currently ++ * ++ * This operation is called every 1/HZ seconds on CPUs which are ++ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an ++ * immediate dispatch cycle on the CPU. ++ */ ++ void (*tick)(struct task_struct *p); ++ ++ /** ++ * @runnable: A task is becoming runnable on its associated CPU ++ * @p: task becoming runnable ++ * @enq_flags: %SCX_ENQ_* ++ * ++ * This and the following three functions can be used to track a task's ++ * execution state transitions. A task becomes ->runnable() on a CPU, ++ * and then goes through one or more ->running() and ->stopping() pairs ++ * as it runs on the CPU, and eventually becomes ->quiescent() when it's ++ * done running on the CPU. ++ * ++ * @p is becoming runnable on the CPU because it's ++ * ++ * - waking up (%SCX_ENQ_WAKEUP) ++ * - being moved from another CPU ++ * - being restored after temporarily taken off the queue for an ++ * attribute change. ++ * ++ * This and ->enqueue() are related but not coupled. This operation ++ * notifies @p's state transition and may not be followed by ->enqueue() ++ * e.g. when @p is being dispatched to a remote CPU, or when @p is ++ * being enqueued on a CPU experiencing a hotplug event. Likewise, a ++ * task may be ->enqueue()'d without being preceded by this operation ++ * e.g. after exhausting its slice. ++ */ ++ void (*runnable)(struct task_struct *p, u64 enq_flags); ++ ++ /** ++ * @running: A task is starting to run on its associated CPU ++ * @p: task starting to run ++ * ++ * Note that this callback may be called from a CPU other than the ++ * one the task is going to run on. This can happen when a task ++ * property is changed (i.e., affinity), since scx_next_task_scx(), ++ * which triggers this callback, may run on a CPU different from ++ * the task's assigned CPU. 
++ * ++ * Therefore, always use scx_bpf_task_cpu(@p) to determine the ++ * target CPU the task is going to use. ++ * ++ * See ->runnable() for explanation on the task state notifiers. ++ */ ++ void (*running)(struct task_struct *p); ++ ++ /** ++ * @stopping: A task is stopping execution ++ * @p: task stopping to run ++ * @runnable: is task @p still runnable? ++ * ++ * Note that this callback may be called from a CPU other than the ++ * one the task was running on. This can happen when a task ++ * property is changed (i.e., affinity), since dequeue_task_scx(), ++ * which triggers this callback, may run on a CPU different from ++ * the task's assigned CPU. ++ * ++ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU ++ * the task was running on. ++ * ++ * See ->runnable() for explanation on the task state notifiers. If ++ * !@runnable, ->quiescent() will be invoked after this operation ++ * returns. ++ */ ++ void (*stopping)(struct task_struct *p, bool runnable); ++ ++ /** ++ * @quiescent: A task is becoming not runnable on its associated CPU ++ * @p: task becoming not runnable ++ * @deq_flags: %SCX_DEQ_* ++ * ++ * See ->runnable() for explanation on the task state notifiers. ++ * ++ * @p is becoming quiescent on the CPU because it's ++ * ++ * - sleeping (%SCX_DEQ_SLEEP) ++ * - being moved to another CPU ++ * - being temporarily taken off the queue for an attribute change ++ * (%SCX_DEQ_SAVE) ++ * ++ * This and ->dequeue() are related but not coupled. This operation ++ * notifies @p's state transition and may not be preceded by ->dequeue() ++ * e.g. when @p is being dispatched to a remote CPU. ++ */ ++ void (*quiescent)(struct task_struct *p, u64 deq_flags); ++ ++ /** ++ * @yield: Yield CPU ++ * @from: yielding task ++ * @to: optional yield target task ++ * ++ * If @to is NULL, @from is yielding the CPU to other runnable tasks. ++ * The BPF scheduler should ensure that other available tasks are ++ * dispatched before the yielding task. Return value is ignored in this ++ * case. ++ * ++ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf ++ * scheduler can implement the request, return %true; otherwise, %false. ++ */ ++ bool (*yield)(struct task_struct *from, struct task_struct *to); ++ ++ /** ++ * @core_sched_before: Task ordering for core-sched ++ * @a: task A ++ * @b: task B ++ * ++ * Used by core-sched to determine the ordering between two tasks. See ++ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on ++ * core-sched. ++ * ++ * Both @a and @b are runnable and may or may not currently be queued on ++ * the BPF scheduler. Should return %true if @a should run before @b. ++ * %false if there's no required ordering or @b should run before @a. ++ * ++ * If not specified, the default is ordering them according to when they ++ * became runnable. ++ */ ++ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); ++ ++ /** ++ * @set_weight: Set task weight ++ * @p: task to set weight for ++ * @weight: new weight [1..10000] ++ * ++ * Update @p's weight to @weight. ++ */ ++ void (*set_weight)(struct task_struct *p, u32 weight); ++ ++ /** ++ * @set_cpumask: Set CPU affinity ++ * @p: task to set CPU affinity for ++ * @cpumask: cpumask of cpus that @p can run on ++ * ++ * Update @p's CPU affinity to @cpumask. 
++ */ ++ void (*set_cpumask)(struct task_struct *p, ++ const struct cpumask *cpumask); ++ ++ /** ++ * @update_idle: Update the idle state of a CPU ++ * @cpu: CPU to update the idle state for ++ * @idle: whether entering or exiting the idle state ++ * ++ * This operation is called when @rq's CPU goes or leaves the idle ++ * state. By default, implementing this operation disables the built-in ++ * idle CPU tracking and the following helpers become unavailable: ++ * ++ * - scx_bpf_select_cpu_dfl() ++ * - scx_bpf_select_cpu_and() ++ * - scx_bpf_test_and_clear_cpu_idle() ++ * - scx_bpf_pick_idle_cpu() ++ * ++ * The user also must implement ops.select_cpu() as the default ++ * implementation relies on scx_bpf_select_cpu_dfl(). ++ * ++ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle ++ * tracking. ++ */ ++ void (*update_idle)(s32 cpu, bool idle); ++ ++ /** ++ * @cpu_acquire: A CPU is becoming available to the BPF scheduler ++ * @cpu: The CPU being acquired by the BPF scheduler. ++ * @args: Acquire arguments, see the struct definition. ++ * ++ * A CPU that was previously released from the BPF scheduler is now once ++ * again under its control. ++ */ ++ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); ++ ++ /** ++ * @cpu_release: A CPU is taken away from the BPF scheduler ++ * @cpu: The CPU being released by the BPF scheduler. ++ * @args: Release arguments, see the struct definition. ++ * ++ * The specified CPU is no longer under the control of the BPF ++ * scheduler. This could be because it was preempted by a higher ++ * priority sched_class, though there may be other reasons as well. The ++ * caller should consult @args->reason to determine the cause. ++ */ ++ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); ++ ++ /** ++ * @init_task: Initialize a task to run in a BPF scheduler ++ * @p: task to initialize for BPF scheduling ++ * @args: init arguments, see the struct definition ++ * ++ * Either we're loading a BPF scheduler or a new task is being forked. ++ * Initialize @p for BPF scheduling. This operation may block and can ++ * be used for allocations, and is called exactly once for a task. ++ * ++ * Return 0 for success, -errno for failure. An error return while ++ * loading will abort loading of the BPF scheduler. During a fork, it ++ * will abort that specific fork. ++ */ ++ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); ++ ++ /** ++ * @exit_task: Exit a previously-running task from the system ++ * @p: task to exit ++ * @args: exit arguments, see the struct definition ++ * ++ * @p is exiting or the BPF scheduler is being unloaded. Perform any ++ * necessary cleanup for @p. ++ */ ++ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); ++ ++ /** ++ * @enable: Enable BPF scheduling for a task ++ * @p: task to enable BPF scheduling for ++ * ++ * Enable @p for BPF scheduling. enable() is called on @p any time it ++ * enters SCX, and is always paired with a matching disable(). ++ */ ++ void (*enable)(struct task_struct *p); ++ ++ /** ++ * @disable: Disable BPF scheduling for a task ++ * @p: task to disable BPF scheduling for ++ * ++ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. ++ * Disable BPF scheduling for @p. A disable() call is always matched ++ * with a prior enable() call. 
++ */ ++ void (*disable)(struct task_struct *p); ++ ++ /** ++ * @dump: Dump BPF scheduler state on error ++ * @ctx: debug dump context ++ * ++ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. ++ */ ++ void (*dump)(struct scx_dump_ctx *ctx); ++ ++ /** ++ * @dump_cpu: Dump BPF scheduler state for a CPU on error ++ * @ctx: debug dump context ++ * @cpu: CPU to generate debug dump for ++ * @idle: @cpu is currently idle without any runnable tasks ++ * ++ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for ++ * @cpu. If @idle is %true and this operation doesn't produce any ++ * output, @cpu is skipped for dump. ++ */ ++ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); ++ ++ /** ++ * @dump_task: Dump BPF scheduler state for a runnable task on error ++ * @ctx: debug dump context ++ * @p: runnable task to generate debug dump for ++ * ++ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for ++ * @p. ++ */ ++ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); ++ ++#ifdef CONFIG_EXT_GROUP_SCHED ++ /** ++ * @cgroup_init: Initialize a cgroup ++ * @cgrp: cgroup being initialized ++ * @args: init arguments, see the struct definition ++ * ++ * Either the BPF scheduler is being loaded or @cgrp created, initialize ++ * @cgrp for sched_ext. This operation may block. ++ * ++ * Return 0 for success, -errno for failure. An error return while ++ * loading will abort loading of the BPF scheduler. During cgroup ++ * creation, it will abort the specific cgroup creation. ++ */ ++ s32 (*cgroup_init)(struct cgroup *cgrp, ++ struct scx_cgroup_init_args *args); ++ ++ /** ++ * @cgroup_exit: Exit a cgroup ++ * @cgrp: cgroup being exited ++ * ++ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit ++ * @cgrp for sched_ext. This operation my block. ++ */ ++ void (*cgroup_exit)(struct cgroup *cgrp); ++ ++ /** ++ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup ++ * @p: task being moved ++ * @from: cgroup @p is being moved from ++ * @to: cgroup @p is being moved to ++ * ++ * Prepare @p for move from cgroup @from to @to. This operation may ++ * block and can be used for allocations. ++ * ++ * Return 0 for success, -errno for failure. An error return aborts the ++ * migration. ++ */ ++ s32 (*cgroup_prep_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * @cgroup_move: Commit cgroup move ++ * @p: task being moved ++ * @from: cgroup @p is being moved from ++ * @to: cgroup @p is being moved to ++ * ++ * Commit the move. @p is dequeued during this operation. ++ */ ++ void (*cgroup_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * @cgroup_cancel_move: Cancel cgroup move ++ * @p: task whose cgroup move is being canceled ++ * @from: cgroup @p was being moved from ++ * @to: cgroup @p was being moved to ++ * ++ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). ++ * Undo the preparation. ++ */ ++ void (*cgroup_cancel_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * @cgroup_set_weight: A cgroup's weight is being changed ++ * @cgrp: cgroup whose weight is being updated ++ * @weight: new weight [1..10000] ++ * ++ * Update @cgrp's weight to @weight. 
++ */ ++ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); ++ ++ /** ++ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed ++ * @cgrp: cgroup whose bandwidth is being updated ++ * @period_us: bandwidth control period ++ * @quota_us: bandwidth control quota ++ * @burst_us: bandwidth control burst ++ * ++ * Update @cgrp's bandwidth control parameters. This is from the cpu.max ++ * cgroup interface. ++ * ++ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled ++ * to. For example, if @period_us is 1_000_000 and @quota_us is ++ * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be ++ * interpreted in the same fashion and specifies how much @cgrp can ++ * burst temporarily. The specific control mechanism and thus the ++ * interpretation of @period_us and burstiness is upto to the BPF ++ * scheduler. ++ */ ++ void (*cgroup_set_bandwidth)(struct cgroup *cgrp, ++ u64 period_us, u64 quota_us, u64 burst_us); ++ ++#endif /* CONFIG_EXT_GROUP_SCHED */ ++ ++ /* ++ * All online ops must come before ops.cpu_online(). ++ */ ++ ++ /** ++ * @cpu_online: A CPU became online ++ * @cpu: CPU which just came up ++ * ++ * @cpu just came online. @cpu will not call ops.enqueue() or ++ * ops.dispatch(), nor run tasks associated with other CPUs beforehand. ++ */ ++ void (*cpu_online)(s32 cpu); ++ ++ /** ++ * @cpu_offline: A CPU is going offline ++ * @cpu: CPU which is going offline ++ * ++ * @cpu is going offline. @cpu will not call ops.enqueue() or ++ * ops.dispatch(), nor run tasks associated with other CPUs afterwards. ++ */ ++ void (*cpu_offline)(s32 cpu); ++ ++ /* ++ * All CPU hotplug ops must come before ops.init(). ++ */ ++ ++ /** ++ * @init: Initialize the BPF scheduler ++ */ ++ s32 (*init)(void); ++ ++ /** ++ * @exit: Clean up after the BPF scheduler ++ * @info: Exit info ++ * ++ * ops.exit() is also called on ops.init() failure, which is a bit ++ * unusual. This is to allow rich reporting through @info on how ++ * ops.init() failed. ++ */ ++ void (*exit)(struct scx_exit_info *info); ++ ++ /** ++ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch ++ */ ++ u32 dispatch_max_batch; ++ ++ /** ++ * @flags: %SCX_OPS_* flags ++ */ ++ u64 flags; ++ ++ /** ++ * @timeout_ms: The maximum amount of time, in milliseconds, that a ++ * runnable task should be able to wait before being scheduled. The ++ * maximum timeout may not exceed the default timeout of 30 seconds. ++ * ++ * Defaults to the maximum allowed timeout value of 30 seconds. ++ */ ++ u32 timeout_ms; ++ ++ /** ++ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default ++ * value of 32768 is used. ++ */ ++ u32 exit_dump_len; ++ ++ /** ++ * @hotplug_seq: A sequence number that may be set by the scheduler to ++ * detect when a hotplug event has occurred during the loading process. ++ * If 0, no detection occurs. Otherwise, the scheduler will fail to ++ * load if the sequence number does not match @scx_hotplug_seq on the ++ * enable path. ++ */ ++ u64 hotplug_seq; ++ ++ /** ++ * @name: BPF scheduler's name ++ * ++ * Must be a non-zero valid BPF object name including only isalnum(), ++ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the ++ * BPF scheduler is enabled. 
++ */ ++ char name[SCX_OPS_NAME_LEN]; ++ ++ /* internal use only, must be NULL */ ++ void *priv; ++}; ++ ++enum scx_opi { ++ SCX_OPI_BEGIN = 0, ++ SCX_OPI_NORMAL_BEGIN = 0, ++ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), ++ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), ++ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), ++ SCX_OPI_END = SCX_OP_IDX(init), ++}; ++ ++/* ++ * Collection of event counters. Event types are placed in descending order. ++ */ ++struct scx_event_stats { ++ /* ++ * If ops.select_cpu() returns a CPU which can't be used by the task, ++ * the core scheduler code silently picks a fallback CPU. ++ */ ++ s64 SCX_EV_SELECT_CPU_FALLBACK; ++ ++ /* ++ * When dispatching to a local DSQ, the CPU may have gone offline in ++ * the meantime. In this case, the task is bounced to the global DSQ. ++ */ ++ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; ++ ++ /* ++ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task ++ * continued to run because there were no other tasks on the CPU. ++ */ ++ s64 SCX_EV_DISPATCH_KEEP_LAST; ++ ++ /* ++ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task ++ * is dispatched to a local DSQ when exiting. ++ */ ++ s64 SCX_EV_ENQ_SKIP_EXITING; ++ ++ /* ++ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a ++ * migration disabled task skips ops.enqueue() and is dispatched to its ++ * local DSQ. ++ */ ++ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; ++ ++ /* ++ * Total number of times a task's time slice was refilled with the ++ * default value (SCX_SLICE_DFL). ++ */ ++ s64 SCX_EV_REFILL_SLICE_DFL; ++ ++ /* ++ * The total duration of bypass modes in nanoseconds. ++ */ ++ s64 SCX_EV_BYPASS_DURATION; ++ ++ /* ++ * The number of tasks dispatched in the bypassing mode. ++ */ ++ s64 SCX_EV_BYPASS_DISPATCH; ++ ++ /* ++ * The number of times the bypassing mode has been activated. ++ */ ++ s64 SCX_EV_BYPASS_ACTIVATE; ++}; ++ ++struct scx_sched { ++ struct sched_ext_ops ops; ++ DECLARE_BITMAP(has_op, SCX_OPI_END); ++ ++ /* ++ * Dispatch queues. ++ * ++ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. ++ * This is to avoid live-locking in bypass mode where all tasks are ++ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If ++ * per-node split isn't sufficient, it can be further split. ++ */ ++ struct rhashtable dsq_hash; ++ struct scx_dispatch_q **global_dsqs; ++ ++ /* ++ * The event counters are in a per-CPU variable to minimize the ++ * accounting overhead. A system-wide view on the event counter is ++ * constructed when requested by scx_bpf_events(). ++ */ ++ struct scx_event_stats __percpu *event_stats_cpu; ++ ++ bool warned_zero_slice; ++ ++ atomic_t exit_kind; ++ struct scx_exit_info *exit_info; ++ ++ struct kobject kobj; ++ ++ struct kthread_worker *helper; ++ struct irq_work error_irq_work; ++ struct kthread_work disable_work; ++ struct rcu_work rcu_work; ++}; ++ ++enum scx_wake_flags { ++ /* expose select WF_* flags as enums */ ++ SCX_WAKE_FORK = WF_FORK, ++ SCX_WAKE_TTWU = WF_TTWU, ++ SCX_WAKE_SYNC = WF_SYNC, ++}; ++ ++enum scx_enq_flags { ++ /* expose select ENQUEUE_* flags as enums */ ++ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, ++ SCX_ENQ_HEAD = ENQUEUE_HEAD, ++ SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, ++ ++ /* high 32bits are SCX specific */ ++ ++ /* ++ * Set the following to trigger preemption when calling ++ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the ++ * current task is cleared to zero and the CPU is kicked into the ++ * scheduling path. Implies %SCX_ENQ_HEAD. 
++ */ ++ SCX_ENQ_PREEMPT = 1LLU << 32, ++ ++ /* ++ * The task being enqueued was previously enqueued on the current CPU's ++ * %SCX_DSQ_LOCAL, but was removed from it in a call to the ++ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was ++ * invoked in a ->cpu_release() callback, and the task is again ++ * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the ++ * task will not be scheduled on the CPU until at least the next invocation ++ * of the ->cpu_acquire() callback. ++ */ ++ SCX_ENQ_REENQ = 1LLU << 40, ++ ++ /* ++ * The task being enqueued is the only task available for the cpu. By ++ * default, ext core keeps executing such tasks but when ++ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the ++ * %SCX_ENQ_LAST flag set. ++ * ++ * The BPF scheduler is responsible for triggering a follow-up ++ * scheduling event. Otherwise, Execution may stall. ++ */ ++ SCX_ENQ_LAST = 1LLU << 41, ++ ++ /* high 8 bits are internal */ ++ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, ++ ++ SCX_ENQ_CLEAR_OPSS = 1LLU << 56, ++ SCX_ENQ_DSQ_PRIQ = 1LLU << 57, ++}; ++ ++enum scx_deq_flags { ++ /* expose select DEQUEUE_* flags as enums */ ++ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, ++ ++ /* high 32bits are SCX specific */ ++ ++ /* ++ * The generic core-sched layer decided to execute the task even though ++ * it hasn't been dispatched yet. Dequeue from the BPF side. ++ */ ++ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, ++}; ++ ++enum scx_pick_idle_cpu_flags { ++ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ ++ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ ++}; ++ ++enum scx_kick_flags { ++ /* ++ * Kick the target CPU if idle. Guarantees that the target CPU goes ++ * through at least one full scheduling cycle before going idle. If the ++ * target CPU can be determined to be currently not idle and going to go ++ * through a scheduling cycle before going idle, noop. ++ */ ++ SCX_KICK_IDLE = 1LLU << 0, ++ ++ /* ++ * Preempt the current task and execute the dispatch path. If the ++ * current task of the target CPU is an SCX task, its ->scx.slice is ++ * cleared to zero before the scheduling path is invoked so that the ++ * task expires and the dispatch path is invoked. ++ */ ++ SCX_KICK_PREEMPT = 1LLU << 1, ++ ++ /* ++ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will ++ * return after the target CPU finishes picking the next task. ++ */ ++ SCX_KICK_WAIT = 1LLU << 2, ++}; ++ ++enum scx_tg_flags { ++ SCX_TG_ONLINE = 1U << 0, ++ SCX_TG_INITED = 1U << 1, ++}; ++ ++enum scx_enable_state { ++ SCX_ENABLING, ++ SCX_ENABLED, ++ SCX_DISABLING, ++ SCX_DISABLED, ++}; ++ ++static const char *scx_enable_state_str[] = { ++ [SCX_ENABLING] = "enabling", ++ [SCX_ENABLED] = "enabled", ++ [SCX_DISABLING] = "disabling", ++ [SCX_DISABLED] = "disabled", ++}; ++ ++/* ++ * sched_ext_entity->ops_state ++ * ++ * Used to track the task ownership between the SCX core and the BPF scheduler. ++ * State transitions look as follows: ++ * ++ * NONE -> QUEUEING -> QUEUED -> DISPATCHING ++ * ^ | | ++ * | v v ++ * \-------------------------------/ ++ * ++ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call ++ * sites for explanations on the conditions being waited upon and why they are ++ * safe. Transitions out of them into NONE or QUEUED must store_release and the ++ * waiters should load_acquire. 
++ * ++ * Tracking scx_ops_state enables sched_ext core to reliably determine whether ++ * any given task can be dispatched by the BPF scheduler at all times and thus ++ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler ++ * to try to dispatch any task anytime regardless of its state as the SCX core ++ * can safely reject invalid dispatches. ++ */ ++enum scx_ops_state { ++ SCX_OPSS_NONE, /* owned by the SCX core */ ++ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ ++ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ ++ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ ++ ++ /* ++ * QSEQ brands each QUEUED instance so that, when dispatch races ++ * dequeue/requeue, the dispatcher can tell whether it still has a claim ++ * on the task being dispatched. ++ * ++ * As some 32bit archs can't do 64bit store_release/load_acquire, ++ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on ++ * 32bit machines. The dispatch race window QSEQ protects is very narrow ++ * and runs with IRQ disabled. 30 bits should be sufficient. ++ */ ++ SCX_OPSS_QSEQ_SHIFT = 2, ++}; ++ ++/* Use macros to ensure that the type is unsigned long for the masks */ ++#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) ++#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) ++ ++DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); ++ ++/* ++ * Return the rq currently locked from an scx callback, or NULL if no rq is ++ * locked. ++ */ ++static inline struct rq *scx_locked_rq(void) ++{ ++ return __this_cpu_read(scx_locked_rq_state); ++} ++ ++static inline bool scx_kf_allowed_if_unlocked(void) ++{ ++ return !current->scx.kf_mask; ++} ++ ++static inline bool scx_rq_bypassing(struct rq *rq) ++{ ++ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); ++} +-- +2.51.0 + diff --git a/queue-6.17/sched_ext-put-event_stats_cpu-in-struct-scx_sched_pc.patch b/queue-6.17/sched_ext-put-event_stats_cpu-in-struct-scx_sched_pc.patch new file mode 100644 index 0000000000..a9b69c8558 --- /dev/null +++ b/queue-6.17/sched_ext-put-event_stats_cpu-in-struct-scx_sched_pc.patch @@ -0,0 +1,128 @@ +From d30e5472caf956fd0d6267d20b2c9f45871ae70a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Sep 2025 11:33:28 -1000 +Subject: sched_ext: Put event_stats_cpu in struct scx_sched_pcpu + +From: Tejun Heo + +[ Upstream commit bcb7c2305682c77a8bfdbfe37106b314ac10110f ] + +scx_sched.event_stats_cpu is the percpu counters that are used to track +stats. Introduce struct scx_sched_pcpu and move the counters inside. This +will ease adding more per-cpu fields. No functional changes. + +Signed-off-by: Tejun Heo +Acked-by: Andrea Righi +Stable-dep-of: efeeaac9ae97 ("sched_ext: Sync error_irq_work before freeing scx_sched") +Signed-off-by: Sasha Levin +--- + kernel/sched/ext.c | 18 +++++++++--------- + kernel/sched/ext_internal.h | 17 ++++++++++------- + 2 files changed, 19 insertions(+), 16 deletions(-) + +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 8ecde1abb4e28..46029050b170f 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -630,7 +630,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) + * This can be used when preemption is not disabled. 
+ */ + #define scx_add_event(sch, name, cnt) do { \ +- this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ ++ this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ + trace_sched_ext_event(#name, (cnt)); \ + } while(0) + +@@ -643,7 +643,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) + * This should be used only when preemption is disabled. + */ + #define __scx_add_event(sch, name, cnt) do { \ +- __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ ++ __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ + trace_sched_ext_event(#name, cnt); \ + } while(0) + +@@ -3538,7 +3538,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work) + int node; + + kthread_stop(sch->helper->task); +- free_percpu(sch->event_stats_cpu); ++ free_percpu(sch->pcpu); + + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); +@@ -4439,13 +4439,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) + sch->global_dsqs[node] = dsq; + } + +- sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); +- if (!sch->event_stats_cpu) ++ sch->pcpu = alloc_percpu(struct scx_sched_pcpu); ++ if (!sch->pcpu) + goto err_free_gdsqs; + + sch->helper = kthread_run_worker(0, "sched_ext_helper"); + if (!sch->helper) +- goto err_free_event_stats; ++ goto err_free_pcpu; + sched_set_fifo(sch->helper->task); + + atomic_set(&sch->exit_kind, SCX_EXIT_NONE); +@@ -4463,8 +4463,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) + + err_stop_helper: + kthread_stop(sch->helper->task); +-err_free_event_stats: +- free_percpu(sch->event_stats_cpu); ++err_free_pcpu: ++ free_percpu(sch->pcpu); + err_free_gdsqs: + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); +@@ -6490,7 +6490,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event + /* Aggregate per-CPU event counters into @events. */ + memset(events, 0, sizeof(*events)); + for_each_possible_cpu(cpu) { +- e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); ++ e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; + scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); +diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h +index 76690ede8700f..af4c054fb6f85 100644 +--- a/kernel/sched/ext_internal.h ++++ b/kernel/sched/ext_internal.h +@@ -846,6 +846,15 @@ struct scx_event_stats { + s64 SCX_EV_BYPASS_ACTIVATE; + }; + ++struct scx_sched_pcpu { ++ /* ++ * The event counters are in a per-CPU variable to minimize the ++ * accounting overhead. A system-wide view on the event counter is ++ * constructed when requested by scx_bpf_events(). ++ */ ++ struct scx_event_stats event_stats; ++}; ++ + struct scx_sched { + struct sched_ext_ops ops; + DECLARE_BITMAP(has_op, SCX_OPI_END); +@@ -860,13 +869,7 @@ struct scx_sched { + */ + struct rhashtable dsq_hash; + struct scx_dispatch_q **global_dsqs; +- +- /* +- * The event counters are in a per-CPU variable to minimize the +- * accounting overhead. A system-wide view on the event counter is +- * constructed when requested by scx_bpf_events(). 
+- */ +- struct scx_event_stats __percpu *event_stats_cpu; ++ struct scx_sched_pcpu __percpu *pcpu; + + bool warned_zero_slice; + +-- +2.51.0 + diff --git a/queue-6.17/sched_ext-sync-error_irq_work-before-freeing-scx_sch.patch b/queue-6.17/sched_ext-sync-error_irq_work-before-freeing-scx_sch.patch new file mode 100644 index 0000000000..d0013daeb4 --- /dev/null +++ b/queue-6.17/sched_ext-sync-error_irq_work-before-freeing-scx_sch.patch @@ -0,0 +1,38 @@ +From 03175244f8b0c9d1e7bae5998f3bcfb7f77c59c7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 9 Oct 2025 13:56:23 -1000 +Subject: sched_ext: Sync error_irq_work before freeing scx_sched + +From: Tejun Heo + +[ Upstream commit efeeaac9ae9763f9c953e69633c86bc3031e39b5 ] + +By the time scx_sched_free_rcu_work() runs, the scx_sched is no longer +reachable. However, a previously queued error_irq_work may still be pending or +running. Ensure it completes before proceeding with teardown. + +Fixes: bff3b5aec1b7 ("sched_ext: Move disable machinery into scx_sched") +Acked-by: Andrea Righi +Signed-off-by: Tejun Heo +Signed-off-by: Sasha Levin +--- + kernel/sched/ext.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 46029050b170f..f89894476e51f 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -3537,7 +3537,9 @@ static void scx_sched_free_rcu_work(struct work_struct *work) + struct scx_dispatch_q *dsq; + int node; + ++ irq_work_sync(&sch->error_irq_work); + kthread_stop(sch->helper->task); ++ + free_percpu(sch->pcpu); + + for_each_node_state(node, N_POSSIBLE) +-- +2.51.0 + diff --git a/queue-6.17/seccomp-passthrough-uprobe-systemcall-without-filter.patch b/queue-6.17/seccomp-passthrough-uprobe-systemcall-without-filter.patch new file mode 100644 index 0000000000..7c7a311daa --- /dev/null +++ b/queue-6.17/seccomp-passthrough-uprobe-systemcall-without-filter.patch @@ -0,0 +1,85 @@ +From fe915f3331ace294cf2bb31d41fdcb2842b01530 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 20 Jul 2025 13:21:30 +0200 +Subject: seccomp: passthrough uprobe systemcall without filtering + +From: Jiri Olsa + +[ Upstream commit 89d1d8434d246c96309a6068dfcf9e36dc61227b ] + +Adding uprobe as another exception to the seccomp filter alongside +with the uretprobe syscall. + +Same as the uretprobe the uprobe syscall is installed by kernel as +replacement for the breakpoint exception and is limited to x86_64 +arch and isn't expected to ever be supported in i386. 
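For illustration, a minimal userspace sketch of the kind of strict allow-list filter this exception matters for: any classic-BPF filter that kills unknown syscall numbers would otherwise terminate the traced process once the kernel rewrites a probed instruction into the uprobe (or uretprobe) syscall. The syscall choices below are arbitrary and the architecture check a real filter needs is omitted for brevity.

/*
 * Hypothetical allow-list: permit only write() and exit_group(), kill
 * everything else.  Without the kernel-side uprobe/uretprobe exception,
 * a kernel-injected probe syscall would hit the KILL rule below.
 */
#include <stddef.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static int install_allowlist(void)
{
	struct sock_filter filter[] = {
		/* load the syscall number from seccomp_data */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 2, 0),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit_group, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}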
+ +Signed-off-by: Jiri Olsa +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Kees Cook +Link: https://lore.kernel.org/r/20250720112133.244369-21-jolsa@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/seccomp.c | 32 +++++++++++++++++++++++++------- + 1 file changed, 25 insertions(+), 7 deletions(-) + +diff --git a/kernel/seccomp.c b/kernel/seccomp.c +index 3bbfba30a777a..25f62867a16d9 100644 +--- a/kernel/seccomp.c ++++ b/kernel/seccomp.c +@@ -741,6 +741,26 @@ seccomp_prepare_user_filter(const char __user *user_filter) + } + + #ifdef SECCOMP_ARCH_NATIVE ++static bool seccomp_uprobe_exception(struct seccomp_data *sd) ++{ ++#if defined __NR_uretprobe || defined __NR_uprobe ++#ifdef SECCOMP_ARCH_COMPAT ++ if (sd->arch == SECCOMP_ARCH_NATIVE) ++#endif ++ { ++#ifdef __NR_uretprobe ++ if (sd->nr == __NR_uretprobe) ++ return true; ++#endif ++#ifdef __NR_uprobe ++ if (sd->nr == __NR_uprobe) ++ return true; ++#endif ++ } ++#endif ++ return false; ++} ++ + /** + * seccomp_is_const_allow - check if filter is constant allow with given data + * @fprog: The BPF programs +@@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, + return false; + + /* Our single exception to filtering. */ +-#ifdef __NR_uretprobe +-#ifdef SECCOMP_ARCH_COMPAT +- if (sd->arch == SECCOMP_ARCH_NATIVE) +-#endif +- if (sd->nr == __NR_uretprobe) +- return true; +-#endif ++ if (seccomp_uprobe_exception(sd)) ++ return true; + + for (pc = 0; pc < fprog->len; pc++) { + struct sock_filter *insn = &fprog->filter[pc]; +@@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = { + __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, + #ifdef __NR_uretprobe + __NR_uretprobe, ++#endif ++#ifdef __NR_uprobe ++ __NR_uprobe, + #endif + -1, /* negative terminated */ + }; +-- +2.51.0 + diff --git a/queue-6.17/series b/queue-6.17/series new file mode 100644 index 0000000000..fc76b3a9b9 --- /dev/null +++ b/queue-6.17/series @@ -0,0 +1,35 @@ +sched_ext-move-internal-type-and-accessor-definition.patch +sched_ext-put-event_stats_cpu-in-struct-scx_sched_pc.patch +sched_ext-sync-error_irq_work-before-freeing-scx_sch.patch +timekeeping-fix-aux-clocks-sysfs-initialization-loop.patch +x86-bugs-report-correct-retbleed-mitigation-status.patch +x86-bugs-qualify-retbleed_intel_msg.patch +genirq-chip-add-buslock-back-in-to-irq_set_handler.patch +genirq-manage-add-buslock-back-in-to-__disable_irq_n.patch +genirq-manage-add-buslock-back-in-to-enable_irq.patch +audit-record-fanotify-event-regardless-of-presence-o.patch +edac-ie31200-add-two-more-intel-alder-lake-s-socs-fo.patch +perf-x86-intel-add-icl_fixed_0_adaptive-bit-into-int.patch +perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch +perf-have-get_perf_callchain-return-null-if-crosstas.patch +perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch +edac-fix-wrong-executable-file-modes-for-c-source-fi.patch +seccomp-passthrough-uprobe-systemcall-without-filter.patch +sched_ext-keep-bypass-on-between-enable-failure-and-.patch +x86-bugs-add-attack-vector-controls-for-vmscape.patch +sched-fair-update_cfs_group-for-throttled-cfs_rqs.patch +x86-bugs-fix-reporting-of-lfence-retpoline.patch +edac-mc_sysfs-increase-legacy-channel-support-to-16.patch +cpuset-use-new-excpus-for-nocpu-error-check-when-ena.patch +btrfs-abort-transaction-on-specific-error-places-whe.patch +btrfs-abort-transaction-in-the-process_one_buffer-lo.patch +btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch +btrfs-zoned-refine-extent-allocator-hint-selection.patch 
+btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch +btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch +btrfs-use-level-argument-in-log-tree-walk-callback-r.patch +btrfs-abort-transaction-if-we-fail-to-update-inode-i.patch +btrfs-tree-checker-add-inode-extref-checks.patch +btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch +sched_ext-make-qmap-dump-operation-non-destructive.patch +arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch diff --git a/queue-6.17/timekeeping-fix-aux-clocks-sysfs-initialization-loop.patch b/queue-6.17/timekeeping-fix-aux-clocks-sysfs-initialization-loop.patch new file mode 100644 index 0000000000..6f0931f275 --- /dev/null +++ b/queue-6.17/timekeeping-fix-aux-clocks-sysfs-initialization-loop.patch @@ -0,0 +1,45 @@ +From 03823cc4dcccf525a9b20bef586082b1dcc89adf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Oct 2025 14:17:53 +0800 +Subject: timekeeping: Fix aux clocks sysfs initialization loop bound + +From: Haofeng Li + +[ Upstream commit 39a9ed0fb6dac58547afdf9b6cb032d326a3698f ] + +The loop in tk_aux_sysfs_init() uses `i <= MAX_AUX_CLOCKS` as the +termination condition, which results in 9 iterations (i=0 to 8) when +MAX_AUX_CLOCKS is defined as 8. However, the kernel is designed to support +only up to 8 auxiliary clocks. + +This off-by-one error causes the creation of a 9th sysfs entry that exceeds +the intended auxiliary clock range. + +Fix the loop bound to use `i < MAX_AUX_CLOCKS` to ensure exactly 8 +auxiliary clock entries are created, matching the design specification. + +Fixes: 7b95663a3d96 ("timekeeping: Provide interface to control auxiliary clocks") +Signed-off-by: Haofeng Li +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/tencent_2376993D9FC06A3616A4F981B3DE1C599607@qq.com +Signed-off-by: Sasha Levin +--- + kernel/time/timekeeping.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c +index b6974fce800cd..3a4d3b2e3f740 100644 +--- a/kernel/time/timekeeping.c ++++ b/kernel/time/timekeeping.c +@@ -3070,7 +3070,7 @@ static int __init tk_aux_sysfs_init(void) + return -ENOMEM; + } + +- for (int i = 0; i <= MAX_AUX_CLOCKS; i++) { ++ for (int i = 0; i < MAX_AUX_CLOCKS; i++) { + char id[2] = { [0] = '0' + i, }; + struct kobject *clk = kobject_create_and_add(id, auxo); + +-- +2.51.0 + diff --git a/queue-6.17/x86-bugs-add-attack-vector-controls-for-vmscape.patch b/queue-6.17/x86-bugs-add-attack-vector-controls-for-vmscape.patch new file mode 100644 index 0000000000..f2dbade985 --- /dev/null +++ b/queue-6.17/x86-bugs-add-attack-vector-controls-for-vmscape.patch @@ -0,0 +1,72 @@ +From 60bd79a607d557eed0d51b5455016a10ea60aafc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 10:24:28 -0500 +Subject: x86/bugs: Add attack vector controls for VMSCAPE + +From: David Kaplan + +[ Upstream commit 5799d5d8a6c877f03ad5b5a640977053be45059a ] + +Use attack vector controls to select whether VMSCAPE requires mitigation, +similar to other bugs. 
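As a rough model only (not the kernel's real data structures or interface), the selection logic this patch extends can be pictured as matching a vulnerability's relevant attack vector against the set of vectors the administrator asked to mitigate; VMSCAPE is a guest-to-host issue, so only that vector is consulted:

/* hypothetical model of attack-vector based mitigation selection */
#include <stdbool.h>
#include <stdio.h>

enum attack_vector {			/* names chosen for this sketch */
	VEC_USER_KERNEL = 1 << 0,
	VEC_USER_USER   = 1 << 1,
	VEC_GUEST_HOST  = 1 << 2,
	VEC_GUEST_GUEST = 1 << 3,
};

/* vectors the administrator chose to mitigate, e.g. parsed at boot */
static unsigned int mitigated_vectors = VEC_USER_KERNEL | VEC_GUEST_HOST;

static bool should_mitigate(unsigned int relevant)
{
	return (mitigated_vectors & relevant) != 0;
}

int main(void)
{
	printf("mitigate VMSCAPE: %s\n",
	       should_mitigate(VEC_GUEST_HOST) ? "yes" : "no");
	return 0;
}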
+ +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Signed-off-by: Sasha Levin +--- + .../admin-guide/hw-vuln/attack_vector_controls.rst | 1 + + arch/x86/kernel/cpu/bugs.c | 14 ++++++++++---- + 2 files changed, 11 insertions(+), 4 deletions(-) + +diff --git a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst +index 5964901d66e31..d0bdbd81dcf9f 100644 +--- a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst ++++ b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst +@@ -218,6 +218,7 @@ SRSO X X X X + SSB X + TAA X X X X * (Note 2) + TSA X X X X ++VMSCAPE X + =============== ============== ============ ============= ============== ============ ======== + + Notes: +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 9750ce448e626..c6bb8e76eb984 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -434,6 +434,9 @@ static bool __init should_mitigate_vuln(unsigned int bug) + case X86_BUG_SPEC_STORE_BYPASS: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER); + ++ case X86_BUG_VMSCAPE: ++ return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST); ++ + default: + WARN(1, "Unknown bug %x\n", bug); + return false; +@@ -3308,15 +3311,18 @@ early_param("vmscape", vmscape_parse_cmdline); + + static void __init vmscape_select_mitigation(void) + { +- if (cpu_mitigations_off() || +- !boot_cpu_has_bug(X86_BUG_VMSCAPE) || ++ if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) || + !boot_cpu_has(X86_FEATURE_IBPB)) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + return; + } + +- if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) +- vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; ++ if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) { ++ if (should_mitigate_vuln(X86_BUG_VMSCAPE)) ++ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; ++ else ++ vmscape_mitigation = VMSCAPE_MITIGATION_NONE; ++ } + } + + static void __init vmscape_update_mitigation(void) +-- +2.51.0 + diff --git a/queue-6.17/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-6.17/x86-bugs-fix-reporting-of-lfence-retpoline.patch new file mode 100644 index 0000000000..317654044f --- /dev/null +++ b/queue-6.17/x86-bugs-fix-reporting-of-lfence-retpoline.patch @@ -0,0 +1,51 @@ +From 705fc41b44d203b1500a524f3fb04ba1c63cd931 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Sep 2025 08:47:05 -0500 +Subject: x86/bugs: Fix reporting of LFENCE retpoline + +From: David Kaplan + +[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ] + +The LFENCE retpoline mitigation is not secure but the kernel prints +inconsistent messages about this fact. The dmesg log says 'Mitigation: +LFENCE', implying the system is mitigated. But sysfs reports 'Vulnerable: +LFENCE' implying the system (correctly) is not mitigated. + +Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere +when this mitigation is selected. 
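A minimal sketch (not the kernel code itself) of the pattern the fix leans on: every reporting path reads one string table keyed by the selected mitigation, so the boot log and sysfs cannot disagree, and the "LFENCE is not secure" wording lives in that single place:

#include <stdio.h>

enum spectre_v2_mitigation {
	MIT_NONE,
	MIT_LFENCE,
	MIT_RETPOLINE,
};

/* single source of truth for every reporting path */
static const char * const mitigation_strings[] = {
	[MIT_NONE]      = "Vulnerable",
	[MIT_LFENCE]    = "Vulnerable: LFENCE",	/* LFENCE alone is not secure */
	[MIT_RETPOLINE] = "Mitigation: Retpolines",
};

int main(void)
{
	enum spectre_v2_mitigation selected = MIT_LFENCE;

	/* boot log and sysfs now necessarily print the same string */
	printf("dmesg: %s\n", mitigation_strings[selected]);
	printf("sysfs: %s\n", mitigation_strings[selected]);
	return 0;
}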
+ +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index c6bb8e76eb984..26ece97011fd7 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -2052,7 +2052,7 @@ static void __init spectre_v2_user_apply_mitigation(void) + static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", +- [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", ++ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", +@@ -3636,9 +3636,6 @@ static const char *spectre_bhi_state(void) + + static ssize_t spectre_v2_show_state(char *buf) + { +- if (spectre_v2_enabled == SPECTRE_V2_LFENCE) +- return sysfs_emit(buf, "Vulnerable: LFENCE\n"); +- + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + +-- +2.51.0 + diff --git a/queue-6.17/x86-bugs-qualify-retbleed_intel_msg.patch b/queue-6.17/x86-bugs-qualify-retbleed_intel_msg.patch new file mode 100644 index 0000000000..40946f7b59 --- /dev/null +++ b/queue-6.17/x86-bugs-qualify-retbleed_intel_msg.patch @@ -0,0 +1,47 @@ +From 9965d529966df68e304d4db15a0da58fce023b71 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 3 Oct 2025 12:19:36 -0500 +Subject: x86/bugs: Qualify RETBLEED_INTEL_MSG + +From: David Kaplan + +[ Upstream commit 204ced4108f5d38f6804968fd9543cc69c3f8da6 ] + +When retbleed mitigation is disabled, the kernel already prints an info +message that the system is vulnerable. Recent code restructuring also +inadvertently led to RETBLEED_INTEL_MSG being printed as an error, which is +unnecessary as retbleed mitigation was already explicitly disabled (by config +option, cmdline, etc.). + +Qualify this print statement so the warning is not printed unless an actual +retbleed mitigation was selected and is being disabled due to incompatibility +with spectre_v2. 
+ +Fixes: e3b78a7ad5ea ("x86/bugs: Restructure retbleed mitigation") +Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220624 +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://patch.msgid.link/20251003171936.155391-1-david.kaplan@amd.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index bf79ff6a1f662..9750ce448e626 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1461,7 +1461,9 @@ static void __init retbleed_update_mitigation(void) + break; + default: + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { +- pr_err(RETBLEED_INTEL_MSG); ++ if (retbleed_mitigation != RETBLEED_MITIGATION_NONE) ++ pr_err(RETBLEED_INTEL_MSG); ++ + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } + } +-- +2.51.0 + diff --git a/queue-6.17/x86-bugs-report-correct-retbleed-mitigation-status.patch b/queue-6.17/x86-bugs-report-correct-retbleed-mitigation-status.patch new file mode 100644 index 0000000000..c7daac2154 --- /dev/null +++ b/queue-6.17/x86-bugs-report-correct-retbleed-mitigation-status.patch @@ -0,0 +1,47 @@ +From 029a4346ea7f82d5882b314eca129b1591db28b4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Sep 2025 08:47:06 -0500 +Subject: x86/bugs: Report correct retbleed mitigation status + +From: David Kaplan + +[ Upstream commit 930f2361fe542a00de9ce6070b1b6edb976f1165 ] + +On Intel CPUs, the default retbleed mitigation is IBRS/eIBRS but this +requires that a similar spectre_v2 mitigation is applied. If the user +selects a different spectre_v2 mitigation (like spectre_v2=retpoline) a +warning is printed but sysfs will still report 'Mitigation: IBRS' or +'Mitigation: Enhanced IBRS'. This is incorrect because retbleed is not +mitigated, and IBRS is not actually set. + +Fix this by choosing RETBLEED_MITIGATION_NONE in this scenario so the +kernel correctly reports the system as vulnerable to retbleed. 
+ +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com +Stable-dep-of: 204ced4108f5 ("x86/bugs: Qualify RETBLEED_INTEL_MSG") +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 36dcfc5105be9..bf79ff6a1f662 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1460,8 +1460,10 @@ static void __init retbleed_update_mitigation(void) + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: +- if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) ++ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { + pr_err(RETBLEED_INTEL_MSG); ++ retbleed_mitigation = RETBLEED_MITIGATION_NONE; ++ } + } + } + +-- +2.51.0 + diff --git a/queue-6.6/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch b/queue-6.6/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch new file mode 100644 index 0000000000..b8d4f9ec82 --- /dev/null +++ b/queue-6.6/arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch @@ -0,0 +1,295 @@ +From 54968164c79970c4670228c2de8fd262e28c5c2e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Sep 2025 14:09:13 +0800 +Subject: arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c + +From: Menglong Dong + +[ Upstream commit 35561bab768977c9e05f1f1a9bc00134c85f3e28 ] + +The include/generated/asm-offsets.h is generated in Kbuild during +compiling from arch/SRCARCH/kernel/asm-offsets.c. When we want to +generate another similar offset header file, circular dependency can +happen. + +For example, we want to generate a offset file include/generated/test.h, +which is included in include/sched/sched.h. If we generate asm-offsets.h +first, it will fail, as include/sched/sched.h is included in asm-offsets.c +and include/generated/test.h doesn't exist; If we generate test.h first, +it can't success neither, as include/generated/asm-offsets.h is included +by it. + +In x86_64, the macro COMPILE_OFFSETS is used to avoid such circular +dependency. We can generate asm-offsets.h first, and if the +COMPILE_OFFSETS is defined, we don't include the "generated/test.h". + +And we define the macro COMPILE_OFFSETS for all the asm-offsets.c for this +purpose. 
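A minimal standalone sketch of how the guard breaks the cycle, using the hypothetical generated/test.h named above; the struct and file names are invented for the example:

#include <stdio.h>
#include <stddef.h>

/*
 * This file plays the role of an asm-offsets.c: it is the generator of the
 * "generated" header, so it must not include that header itself.
 */
#define COMPILE_OFFSETS

#ifndef COMPILE_OFFSETS
#include "generated/test.h"    /* consumers get this; the generator skips it */
#endif

struct task_stub { long state; long stack_ptr; };

int main(void)
{
    /* Emit what would normally land in the generated header. */
    printf("#define TASK_STUB_STACK_PTR %zu\n",
           offsetof(struct task_stub, stack_ptr));
    return 0;
}

A consumer header would then wrap its include of the generated file in the same #ifndef COMPILE_OFFSETS guard, which is what defining the macro in every asm-offsets.c (the hunks below) makes possible.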
+ +Signed-off-by: Menglong Dong +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Sasha Levin +--- + arch/alpha/kernel/asm-offsets.c | 1 + + arch/arc/kernel/asm-offsets.c | 1 + + arch/arm/kernel/asm-offsets.c | 2 ++ + arch/arm64/kernel/asm-offsets.c | 1 + + arch/csky/kernel/asm-offsets.c | 1 + + arch/hexagon/kernel/asm-offsets.c | 1 + + arch/loongarch/kernel/asm-offsets.c | 2 ++ + arch/m68k/kernel/asm-offsets.c | 1 + + arch/microblaze/kernel/asm-offsets.c | 1 + + arch/mips/kernel/asm-offsets.c | 2 ++ + arch/nios2/kernel/asm-offsets.c | 1 + + arch/openrisc/kernel/asm-offsets.c | 1 + + arch/parisc/kernel/asm-offsets.c | 1 + + arch/powerpc/kernel/asm-offsets.c | 1 + + arch/riscv/kernel/asm-offsets.c | 1 + + arch/s390/kernel/asm-offsets.c | 1 + + arch/sh/kernel/asm-offsets.c | 1 + + arch/sparc/kernel/asm-offsets.c | 1 + + arch/um/kernel/asm-offsets.c | 2 ++ + arch/xtensa/kernel/asm-offsets.c | 1 + + 20 files changed, 24 insertions(+) + +diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c +index 11c35cf45b461..cb205f22096d7 100644 +--- a/arch/alpha/kernel/asm-offsets.c ++++ b/arch/alpha/kernel/asm-offsets.c +@@ -4,6 +4,7 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c +index f77deb7991757..2978da85fcb65 100644 +--- a/arch/arc/kernel/asm-offsets.c ++++ b/arch/arc/kernel/asm-offsets.c +@@ -2,6 +2,7 @@ + /* + * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c +index 219cbc7e5d134..3840e1e22b751 100644 +--- a/arch/arm/kernel/asm-offsets.c ++++ b/arch/arm/kernel/asm-offsets.c +@@ -7,6 +7,8 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c +index 5ff1942b04fcf..ea2d740db81c5 100644 +--- a/arch/arm64/kernel/asm-offsets.c ++++ b/arch/arm64/kernel/asm-offsets.c +@@ -6,6 +6,7 @@ + * 2001-2002 Keith Owens + * Copyright (C) 2012 ARM Ltd. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c +index d1e9035794733..5525c8e7e1d9e 100644 +--- a/arch/csky/kernel/asm-offsets.c ++++ b/arch/csky/kernel/asm-offsets.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c +index 03a7063f94561..50eea9fa6f137 100644 +--- a/arch/hexagon/kernel/asm-offsets.c ++++ b/arch/hexagon/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * + * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. 
+ */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c +index 8da0726777edb..110afd3cc8f34 100644 +--- a/arch/loongarch/kernel/asm-offsets.c ++++ b/arch/loongarch/kernel/asm-offsets.c +@@ -4,6 +4,8 @@ + * + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c +index 906d732305374..67a1990f9d748 100644 +--- a/arch/m68k/kernel/asm-offsets.c ++++ b/arch/m68k/kernel/asm-offsets.c +@@ -9,6 +9,7 @@ + * #defines from the assembly-language output. + */ + ++#define COMPILE_OFFSETS + #define ASM_OFFSETS_C + + #include +diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c +index 104c3ac5f30c8..b4b67d58e7f6a 100644 +--- a/arch/microblaze/kernel/asm-offsets.c ++++ b/arch/microblaze/kernel/asm-offsets.c +@@ -7,6 +7,7 @@ + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c +index cb1045ebab062..22c99a2cd5707 100644 +--- a/arch/mips/kernel/asm-offsets.c ++++ b/arch/mips/kernel/asm-offsets.c +@@ -9,6 +9,8 @@ + * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com + * Copyright (C) 2000 MIPS Technologies, Inc. + */ ++#define COMPILE_OFFSETS ++ + #include + #include + #include +diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c +index e3d9b7b6fb48a..88190b503ce5d 100644 +--- a/arch/nios2/kernel/asm-offsets.c ++++ b/arch/nios2/kernel/asm-offsets.c +@@ -2,6 +2,7 @@ + /* + * Copyright (C) 2011 Tobias Klauser + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c +index 710651d5aaae1..3cc826f2216b1 100644 +--- a/arch/openrisc/kernel/asm-offsets.c ++++ b/arch/openrisc/kernel/asm-offsets.c +@@ -18,6 +18,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c +index 757816a7bd4b2..9abfe65492c65 100644 +--- a/arch/parisc/kernel/asm-offsets.c ++++ b/arch/parisc/kernel/asm-offsets.c +@@ -13,6 +13,7 @@ + * Copyright (C) 2002 Randolph Chung + * Copyright (C) 2003 James Bottomley + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c +index 2affd30468bc4..e2cee2f2ededd 100644 +--- a/arch/powerpc/kernel/asm-offsets.c ++++ b/arch/powerpc/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. 
+ */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c +index 6a992cba2f287..e4589457e6085 100644 +--- a/arch/riscv/kernel/asm-offsets.c ++++ b/arch/riscv/kernel/asm-offsets.c +@@ -3,6 +3,7 @@ + * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2017 SiFive + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c +index fa5f6885c74aa..73a989dcfe208 100644 +--- a/arch/s390/kernel/asm-offsets.c ++++ b/arch/s390/kernel/asm-offsets.c +@@ -4,6 +4,7 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS + + #define ASM_OFFSETS_C + +diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c +index a0322e8328456..429b6a7631468 100644 +--- a/arch/sh/kernel/asm-offsets.c ++++ b/arch/sh/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c +index 5784f2df489a4..f1e27a7f800f4 100644 +--- a/arch/sparc/kernel/asm-offsets.c ++++ b/arch/sparc/kernel/asm-offsets.c +@@ -10,6 +10,7 @@ + * + * On sparc, thread_info data is static and TI_XXX offsets are computed by hand. + */ ++#define COMPILE_OFFSETS + + #include + #include +diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c +index 1fb12235ab9c8..a69873aa697f4 100644 +--- a/arch/um/kernel/asm-offsets.c ++++ b/arch/um/kernel/asm-offsets.c +@@ -1 +1,3 @@ ++#define COMPILE_OFFSETS ++ + #include +diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c +index da38de20ae598..cfbced95e944a 100644 +--- a/arch/xtensa/kernel/asm-offsets.c ++++ b/arch/xtensa/kernel/asm-offsets.c +@@ -11,6 +11,7 @@ + * + * Chris Zankel + */ ++#define COMPILE_OFFSETS + + #include + #include +-- +2.51.0 + diff --git a/queue-6.6/audit-record-fanotify-event-regardless-of-presence-o.patch b/queue-6.6/audit-record-fanotify-event-regardless-of-presence-o.patch new file mode 100644 index 0000000000..5c81f75e0e --- /dev/null +++ b/queue-6.6/audit-record-fanotify-event-regardless-of-presence-o.patch @@ -0,0 +1,44 @@ +From 7a103238fdb26a551efc3eec1a75f8d386103d02 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Aug 2025 17:04:07 -0400 +Subject: audit: record fanotify event regardless of presence of rules + +From: Richard Guy Briggs + +[ Upstream commit ce8370e2e62a903e18be7dd0e0be2eee079501e1 ] + +When no audit rules are in place, fanotify event results are +unconditionally dropped due to an explicit check for the existence of +any audit rules. Given this is a report from another security +sub-system, allow it to be recorded regardless of the existence of any +audit rules. + +To test, install and run the fapolicyd daemon with default config. Then +as an unprivileged user, create and run a very simple binary that should +be denied. 
Then check for an event with + ausearch -m FANOTIFY -ts recent + +Link: https://issues.redhat.com/browse/RHEL-9065 +Signed-off-by: Richard Guy Briggs +Signed-off-by: Paul Moore +Signed-off-by: Sasha Levin +--- + include/linux/audit.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/audit.h b/include/linux/audit.h +index 335e1ba5a2327..7ca75f8873799 100644 +--- a/include/linux/audit.h ++++ b/include/linux/audit.h +@@ -526,7 +526,7 @@ static inline void audit_log_kern_module(const char *name) + + static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) + { +- if (!audit_dummy_context()) ++ if (audit_enabled) + __audit_fanotify(response, friar); + } + +-- +2.51.0 + diff --git a/queue-6.6/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch b/queue-6.6/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch new file mode 100644 index 0000000000..0b54c20884 --- /dev/null +++ b/queue-6.6/btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch @@ -0,0 +1,63 @@ +From 012af0d8a5f2d9c3d7e993a07113cefeca540801 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 27 Aug 2025 12:10:28 +0100 +Subject: btrfs: always drop log root tree reference in btrfs_replay_log() + +From: Filipe Manana + +[ Upstream commit 2f5b8095ea47b142c56c09755a8b1e14145a2d30 ] + +Currently we have this odd behaviour: + +1) At btrfs_replay_log() we drop the reference of the log root tree if + the call to btrfs_recover_log_trees() failed; + +2) But if the call to btrfs_recover_log_trees() did not fail, we don't + drop the reference in btrfs_replay_log() - we expect that + btrfs_recover_log_trees() does it in case it returns success. + +Let's simplify this and make btrfs_replay_log() always drop the reference +on the log root tree, not only this simplifies code as it's what makes +sense since it's btrfs_replay_log() who grabbed the reference in the first +place. 
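The ownership rule being adopted (whoever takes the reference drops it on every path) can be sketched outside btrfs as follows; all names here are invented for the example:

#include <stdio.h>

struct root { int refs; };

static void root_get(struct root *r) { r->refs++; }
static void root_put(struct root *r) { r->refs--; }

/* The callee no longer drops the caller's reference on success. */
static int recover_log_trees(struct root *r)
{
    (void)r;
    return 0;           /* pretend replay succeeded */
}

static int replay_log(struct root *r)
{
    int ret;

    root_get(r);                    /* this function took the reference ... */
    ret = recover_log_trees(r);
    root_put(r);                    /* ... so it drops it on every path */
    return ret;
}

int main(void)
{
    struct root r = { .refs = 1 };

    replay_log(&r);
    printf("refs=%d\n", r.refs);    /* still 1: balanced on success and failure */
    return 0;
}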
+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/tree-log.c | 1 - + 2 files changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index bb5f7911d473c..7ad1734cbbfc9 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2080,10 +2080,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); ++ btrfs_put_root(log_tree_root); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, + "Failed to recover log tree"); +- btrfs_put_root(log_tree_root); + return ret; + } + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 4b53e19f7520f..e00298c6c30a1 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -7422,7 +7422,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + + log_root_tree->log_root = NULL; + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); +- btrfs_put_root(log_root_tree); + + return 0; + error: +-- +2.51.0 + diff --git a/queue-6.6/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch b/queue-6.6/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch new file mode 100644 index 0000000000..ebdb01f9cf --- /dev/null +++ b/queue-6.6/btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch @@ -0,0 +1,44 @@ +From 33914610d5e0981512b297973a9438c04ce73add Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Sep 2025 17:01:44 +0200 +Subject: btrfs: scrub: replace max_t()/min_t() with clamp() in + scrub_throttle_dev_io() + +From: Thorsten Blum + +[ Upstream commit a7f3dfb8293c4cee99743132d69863a92e8f4875 ] + +Replace max_t() followed by min_t() with a single clamp(). + +As was pointed by David Laight in +https://lore.kernel.org/linux-btrfs/20250906122458.75dfc8f0@pumpkin/ +the calculation may overflow u32 when the input value is too large, so +clamp_t() is not used. In practice the expected values are in range of +megabytes to gigabytes (throughput limit) so the bug would not happen. + +Signed-off-by: Thorsten Blum +Reviewed-by: David Sterba +[ Use clamp() and add explanation. ] +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/scrub.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 7632d652a1257..4a5a5ee360e57 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -1271,8 +1271,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. 
+ */ +- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); +- div = min_t(u32, 64, div); ++ div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); + + /* Start new epoch, set deadline */ + now = ktime_get(); +-- +2.51.0 + diff --git a/queue-6.6/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch b/queue-6.6/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch new file mode 100644 index 0000000000..f56907d3d1 --- /dev/null +++ b/queue-6.6/btrfs-use-level-argument-in-log-tree-walk-callback-r.patch @@ -0,0 +1,50 @@ +From 2a84dc26e9fa0d7a677021843e9d860d72f6a485 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 28 Aug 2025 17:46:18 +0100 +Subject: btrfs: use level argument in log tree walk callback + replay_one_buffer() + +From: Filipe Manana + +[ Upstream commit 6cb7f0b8c9b0d6a35682335fea88bd26f089306f ] + +We already have the extent buffer's level in an argument, there's no need +to first ensure the extent buffer's data is loaded (by calling +btrfs_read_extent_buffer()) and then call btrfs_header_level() to check +the level. So use the level argument and do the check before calling +btrfs_read_extent_buffer(). + +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index e00298c6c30a1..5512991b24faa 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -2493,15 +2493,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + int i; + int ret; + ++ if (level != 0) ++ return 0; ++ + ret = btrfs_read_extent_buffer(eb, &check); + if (ret) + return ret; + +- level = btrfs_header_level(eb); +- +- if (level != 0) +- return 0; +- + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +-- +2.51.0 + diff --git a/queue-6.6/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch b/queue-6.6/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch new file mode 100644 index 0000000000..4c76fca0b5 --- /dev/null +++ b/queue-6.6/btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch @@ -0,0 +1,58 @@ +From 4e2e37c8c157fbd155fedbe333bfb6f4e13941f1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Sep 2025 12:09:14 +0100 +Subject: btrfs: use smp_mb__after_atomic() when forcing COW in + create_pending_snapshot() + +From: Filipe Manana + +[ Upstream commit 45c222468d33202c07c41c113301a4b9c8451b8f ] + +After setting the BTRFS_ROOT_FORCE_COW flag on the root we are doing a +full write barrier, smp_wmb(), but we don't need to, all we need is a +smp_mb__after_atomic(). The use of the smp_wmb() is from the old days +when we didn't use a bit and used instead an int field in the root to +signal if cow is forced. After the int field was changed to a bit in +the root's state (flags field), we forgot to update the memory barrier +in create_pending_snapshot() to smp_mb__after_atomic(), but we did the +change in commit_fs_roots() after clearing BTRFS_ROOT_FORCE_COW. That +happened in commit 27cdeb7096b8 ("Btrfs: use bitfield instead of integer +data type for the some variants in btrfs_root"). On the reader side, in +should_cow_block(), we also use the counterpart smp_mb__before_atomic() +which generates further confusion. + +So change the smp_wmb() to smp_mb__after_atomic(). 
In fact we don't +even need any barrier at all since create_pending_snapshot() is called +in the critical section of a transaction commit and therefore no one +can concurrently join/attach the transaction, or start a new one, until +the transaction is unblocked. By the time someone starts a new transaction +and enters should_cow_block(), a lot of implicit memory barriers already +took place by having acquired several locks such as fs_info->trans_lock +and extent buffer locks on the root node at least. Nevertlheless, for +consistency use smp_mb__after_atomic() after setting the force cow bit +in create_pending_snapshot(). + +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/transaction.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index 3989cb19cdae7..20add63421b3d 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1796,7 +1796,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + } + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); +- smp_wmb(); ++ smp_mb__after_atomic(); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ +-- +2.51.0 + diff --git a/queue-6.6/btrfs-zoned-refine-extent-allocator-hint-selection.patch b/queue-6.6/btrfs-zoned-refine-extent-allocator-hint-selection.patch new file mode 100644 index 0000000000..ac945a1758 --- /dev/null +++ b/queue-6.6/btrfs-zoned-refine-extent-allocator-hint-selection.patch @@ -0,0 +1,59 @@ +From ffa3e67ee0a42fb8a270a866991cde00d27090a1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Jul 2025 11:13:15 +0900 +Subject: btrfs: zoned: refine extent allocator hint selection + +From: Naohiro Aota + +[ Upstream commit 0d703963d297964451783e1a0688ebdf74cd6151 ] + +The hint block group selection in the extent allocator is wrong in the +first place, as it can select the dedicated data relocation block group for +the normal data allocation. + +Since we separated the normal data space_info and the data relocation +space_info, we can easily identify a block group is for data relocation or +not. Do not choose it for the normal data allocation. 
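A reduced sketch of the hint filter this patch adds, with the structures trimmed to the fields the check needs; it is illustrative only, not the btrfs code:

#include <stdio.h>
#include <stdbool.h>

/* Reduced stand-ins for btrfs_block_group / btrfs_space_info. */
struct space_info { int id; };
struct block_group {
    struct space_info *space_info;
    unsigned long long capacity;
    unsigned long long alloc_offset;
};

static bool usable_as_hint(const struct block_group *bg,
                           const struct space_info *wanted,
                           unsigned long long num_bytes)
{
    /* Skip e.g. the data relocation block group for a normal data allocation. */
    if (bg->space_info != wanted)
        return false;
    return bg->capacity - bg->alloc_offset >= num_bytes;
}

int main(void)
{
    struct space_info data = { 1 }, reloc = { 2 };
    struct block_group bg = { &reloc, 1 << 20, 0 };

    printf("%d\n", usable_as_hint(&bg, &data, 4096));   /* 0: wrong space_info */
    return 0;
}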
+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent-tree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 8248113eb067f..5e3d1a87b7e9d 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -4175,7 +4175,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + } + + static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, +- struct find_free_extent_ctl *ffe_ctl) ++ struct find_free_extent_ctl *ffe_ctl, ++ struct btrfs_space_info *space_info) + { + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); +@@ -4199,6 +4200,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && ++ block_group->space_info == space_info && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; +@@ -4220,7 +4222,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, + return prepare_allocation_clustered(fs_info, ffe_ctl, + space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: +- return prepare_allocation_zoned(fs_info, ffe_ctl); ++ return prepare_allocation_zoned(fs_info, ffe_ctl, space_info); + default: + BUG(); + } +-- +2.51.0 + diff --git a/queue-6.6/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch b/queue-6.6/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch new file mode 100644 index 0000000000..592fdf9797 --- /dev/null +++ b/queue-6.6/btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch @@ -0,0 +1,111 @@ +From 34e6b449c4d896095ab2e81088393f75b5995c52 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 22 Jul 2025 13:39:11 +0200 +Subject: btrfs: zoned: return error from btrfs_zone_finish_endio() + +From: Johannes Thumshirn + +[ Upstream commit 3c44cd3c79fcb38a86836dea6ff8fec322a9e68c ] + +Now that btrfs_zone_finish_endio_workfn() is directly calling +do_zone_finish() the only caller of btrfs_zone_finish_endio() is +btrfs_finish_one_ordered(). + +btrfs_finish_one_ordered() already has error handling in-place so +btrfs_zone_finish_endio() can return an error if the block group lookup +fails. + +Also as btrfs_zone_finish_endio() already checks for zoned filesystems and +returns early, there's no need to do this in the caller. 
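The shape of the conversion (a void helper with an assertion becomes an int-returning helper whose caller propagates the error) in a standalone sketch; the lookup stub and error code are stand-ins:

#include <stdio.h>
#include <errno.h>

struct block_group { int placeholder; };

static struct block_group *lookup_block_group(unsigned long long logical)
{
    (void)logical;
    return NULL;        /* pretend the lookup failed */
}

/* Before: void + assertion.  After: the failure is reported to the caller. */
static int zone_finish_endio(unsigned long long logical)
{
    struct block_group *bg = lookup_block_group(logical);

    if (!bg)
        return -ENOENT;
    /* ... finish the zone, drop the block group reference ... */
    return 0;
}

int main(void)
{
    int ret = zone_finish_endio(0);

    if (ret)
        fprintf(stderr, "zone finish failed: %d\n", ret);
    return 0;
}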
+ +Reviewed-by: Damien Le Moal +Signed-off-by: Johannes Thumshirn +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/inode.c | 7 ++++--- + fs/btrfs/zoned.c | 8 +++++--- + fs/btrfs/zoned.h | 9 ++++++--- + 3 files changed, 15 insertions(+), 9 deletions(-) + +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index ee5ffeab85bb7..b1be3e0fe7282 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3051,9 +3051,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) + goto out; + } + +- if (btrfs_is_zoned(fs_info)) +- btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, +- ordered_extent->disk_num_bytes); ++ ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ++ ordered_extent->disk_num_bytes); ++ if (ret) ++ goto out; + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 3622ba1d8e09f..6e8b8c46ba18f 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2263,16 +2263,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) + return ret; + } + +-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) ++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) + { + struct btrfs_block_group *block_group; + u64 min_alloc_bytes; + + if (!btrfs_is_zoned(fs_info)) +- return; ++ return 0; + + block_group = btrfs_lookup_block_group(fs_info, logical); +- ASSERT(block_group); ++ if (WARN_ON_ONCE(!block_group)) ++ return -ENOENT; + + /* No MIXED_BG on zoned btrfs. */ + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) +@@ -2289,6 +2290,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len + + out: + btrfs_put_block_group(block_group); ++ return 0; + } + + static void btrfs_zone_finish_endio_workfn(struct work_struct *work) +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h +index 448955641d114..c18f31d3dc25f 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -71,7 +71,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + bool btrfs_zone_activate(struct btrfs_block_group *block_group); + int btrfs_zone_finish(struct btrfs_block_group *block_group); + bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); +-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, ++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); + void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb); +@@ -227,8 +227,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, + return true; + } + +-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, +- u64 logical, u64 length) { } ++static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, ++ u64 logical, u64 length) ++{ ++ return 0; ++} + + static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) { } +-- +2.51.0 + diff --git a/queue-6.6/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch b/queue-6.6/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch new file mode 100644 index 0000000000..b07bb8b86f --- /dev/null +++ b/queue-6.6/edac-mc_sysfs-increase-legacy-channel-support-to-16.patch @@ -0,0 +1,89 @@ +From 762f27a0e18a98a9747de23af1f4ad13e83ccecf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 16 Sep 2025 20:30:17 +0000 
+Subject: EDAC/mc_sysfs: Increase legacy channel support to 16 + +From: Avadhut Naik + +[ Upstream commit 6e1c2c6c2c40ce99e0d2633b212f43c702c1a002 ] + +Newer AMD systems can support up to 16 channels per EDAC "mc" device. +These are detected by the EDAC module running on the device, and the +current EDAC interface is appropriately enumerated. + +The legacy EDAC sysfs interface however, provides device attributes for +channels 0 through 11 only. Consequently, the last four channels, 12 +through 15, will not be enumerated and will not be visible through the +legacy sysfs interface. + +Add additional device attributes to ensure that all 16 channels, if +present, are enumerated by and visible through the legacy EDAC sysfs +interface. + +Signed-off-by: Avadhut Naik +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250916203242.1281036-1-avadhut.naik@amd.com +Signed-off-by: Sasha Levin +--- + drivers/edac/edac_mc_sysfs.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c +index 15f63452a9bec..b01436d9ddaed 100644 +--- a/drivers/edac/edac_mc_sysfs.c ++++ b/drivers/edac/edac_mc_sysfs.c +@@ -306,6 +306,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 10); + DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 11); ++DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 12); ++DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 13); ++DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 14); ++DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 15); + + /* Total possible dynamic DIMM Label attribute file table */ + static struct attribute *dynamic_csrow_dimm_attr[] = { +@@ -321,6 +329,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = { + &dev_attr_legacy_ch9_dimm_label.attr.attr, + &dev_attr_legacy_ch10_dimm_label.attr.attr, + &dev_attr_legacy_ch11_dimm_label.attr.attr, ++ &dev_attr_legacy_ch12_dimm_label.attr.attr, ++ &dev_attr_legacy_ch13_dimm_label.attr.attr, ++ &dev_attr_legacy_ch14_dimm_label.attr.attr, ++ &dev_attr_legacy_ch15_dimm_label.attr.attr, + NULL + }; + +@@ -349,6 +361,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 10); + DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 11); ++DEVICE_CHANNEL(ch12_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 12); ++DEVICE_CHANNEL(ch13_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 13); ++DEVICE_CHANNEL(ch14_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 14); ++DEVICE_CHANNEL(ch15_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 15); + + /* Total possible dynamic ce_count attribute file table */ + static struct attribute *dynamic_csrow_ce_count_attr[] = { +@@ -364,6 +384,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { + &dev_attr_legacy_ch9_ce_count.attr.attr, + &dev_attr_legacy_ch10_ce_count.attr.attr, + &dev_attr_legacy_ch11_ce_count.attr.attr, ++ &dev_attr_legacy_ch12_ce_count.attr.attr, ++ &dev_attr_legacy_ch13_ce_count.attr.attr, ++ &dev_attr_legacy_ch14_ce_count.attr.attr, ++ &dev_attr_legacy_ch15_ce_count.attr.attr, + NULL + }; + +-- +2.51.0 + diff --git a/queue-6.6/perf-have-get_perf_callchain-return-null-if-crosstas.patch 
b/queue-6.6/perf-have-get_perf_callchain-return-null-if-crosstas.patch new file mode 100644 index 0000000000..56ee58921f --- /dev/null +++ b/queue-6.6/perf-have-get_perf_callchain-return-null-if-crosstas.patch @@ -0,0 +1,68 @@ +From 802862b01265aaa81829f67331259dede4a3354b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 14:03:40 -0400 +Subject: perf: Have get_perf_callchain() return NULL if crosstask and user are + set + +From: Josh Poimboeuf + +[ Upstream commit 153f9e74dec230f2e070e16fa061bc7adfd2c450 ] + +get_perf_callchain() doesn't support cross-task unwinding for user space +stacks, have it return NULL if both the crosstask and user arguments are +set. + +Signed-off-by: Josh Poimboeuf +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20250820180428.426423415@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/events/callchain.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c +index 65fea424874c5..ee01cfcc35064 100644 +--- a/kernel/events/callchain.c ++++ b/kernel/events/callchain.c +@@ -184,6 +184,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + struct perf_callchain_entry_ctx ctx; + int rctx; + ++ /* crosstask is not supported for user stacks */ ++ if (crosstask && user && !kernel) ++ return NULL; ++ + entry = get_callchain_entry(&rctx); + if (!entry) + return NULL; +@@ -200,7 +204,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + perf_callchain_kernel(&ctx, regs); + } + +- if (user) { ++ if (user && !crosstask) { + if (!user_mode(regs)) { + if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) + regs = NULL; +@@ -209,9 +213,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + } + + if (regs) { +- if (crosstask) +- goto exit_put; +- + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + +@@ -219,7 +220,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + } + } + +-exit_put: + put_callchain_entry(rctx); + + return entry; +-- +2.51.0 + diff --git a/queue-6.6/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch b/queue-6.6/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch new file mode 100644 index 0000000000..1e5cbfc956 --- /dev/null +++ b/queue-6.6/perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch @@ -0,0 +1,37 @@ +From f7b7c12558c04bfe26dfb17cefd5143e4e8e98f9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 14:03:43 -0400 +Subject: perf: Skip user unwind if the task is a kernel thread + +From: Josh Poimboeuf + +[ Upstream commit 16ed389227651330879e17bd83d43bd234006722 ] + +If the task is not a user thread, there's no user stack to unwind. 
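The flag test this relies on, in a tiny standalone sketch; the PF_* bit values below are illustrative, not the ones from include/linux/sched.h:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative flag bits; the real PF_* values live in include/linux/sched.h. */
#define PF_KTHREAD     0x1
#define PF_USER_WORKER 0x2

struct task_stub { unsigned int flags; };

static bool want_user_callchain(const struct task_stub *t, bool exclude_user)
{
    if (exclude_user)
        return false;
    /* Kernel threads and user workers have no user stack to unwind. */
    return !(t->flags & (PF_KTHREAD | PF_USER_WORKER));
}

int main(void)
{
    struct task_stub kthread = { PF_KTHREAD }, app = { 0 };

    printf("%d %d\n", want_user_callchain(&kthread, false),
                      want_user_callchain(&app, false));    /* prints: 0 1 */
    return 0;
}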
+ +Signed-off-by: Josh Poimboeuf +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20250820180428.930791978@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/events/core.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 3eb9125431b43..c9a3fb6fdb2f6 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -7724,7 +7724,8 @@ struct perf_callchain_entry * + perf_callchain(struct perf_event *event, struct pt_regs *regs) + { + bool kernel = !event->attr.exclude_callchain_kernel; +- bool user = !event->attr.exclude_callchain_user; ++ bool user = !event->attr.exclude_callchain_user && ++ !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); + /* Disallow cross-task user callchains. */ + bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; +-- +2.51.0 + diff --git a/queue-6.6/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch b/queue-6.6/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch new file mode 100644 index 0000000000..42659ada66 --- /dev/null +++ b/queue-6.6/perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch @@ -0,0 +1,67 @@ +From a77239aceb868da27cb7d047c23b0e2c38130faf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Aug 2025 14:03:41 -0400 +Subject: perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of + current->mm == NULL + +From: Steven Rostedt + +[ Upstream commit 90942f9fac05702065ff82ed0bade0d08168d4ea ] + +To determine if a task is a kernel thread or not, it is more reliable to +use (current->flags & (PF_KTHREAD|PF_USER_WORKERi)) than to rely on +current->mm being NULL. That is because some kernel tasks (io_uring +helpers) may have a mm field. + +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20250820180428.592367294@kernel.org +Signed-off-by: Sasha Levin +--- + kernel/events/callchain.c | 6 +++--- + kernel/events/core.c | 4 ++-- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c +index 1273be84392cf..65fea424874c5 100644 +--- a/kernel/events/callchain.c ++++ b/kernel/events/callchain.c +@@ -202,10 +202,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + + if (user) { + if (!user_mode(regs)) { +- if (current->mm) +- regs = task_pt_regs(current); +- else ++ if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) + regs = NULL; ++ else ++ regs = task_pt_regs(current); + } + + if (regs) { +diff --git a/kernel/events/core.c b/kernel/events/core.c +index b73f5c44113d6..3eb9125431b43 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -6985,7 +6985,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, + if (user_mode(regs)) { + regs_user->abi = perf_reg_abi(current); + regs_user->regs = regs; +- } else if (!(current->flags & PF_KTHREAD)) { ++ } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + perf_get_regs_user(regs_user, regs); + } else { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; +@@ -7612,7 +7612,7 @@ static u64 perf_virt_to_phys(u64 virt) + * Try IRQ-safe get_user_page_fast_only first. + * If failed, leave phys_addr as 0. 
+ */ +- if (current->mm != NULL) { ++ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + struct page *p; + + pagefault_disable(); +-- +2.51.0 + diff --git a/queue-6.6/series b/queue-6.6/series index 64bc2c1eea..00661666af 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -1 +1,15 @@ net-sched-sch_qfq-fix-null-deref-in-agg_dequeue.patch +audit-record-fanotify-event-regardless-of-presence-o.patch +perf-use-current-flags-pf_kthread-pf_user_worker-ins.patch +perf-have-get_perf_callchain-return-null-if-crosstas.patch +perf-skip-user-unwind-if-the-task-is-a-kernel-thread.patch +x86-bugs-report-correct-retbleed-mitigation-status.patch +x86-bugs-fix-reporting-of-lfence-retpoline.patch +edac-mc_sysfs-increase-legacy-channel-support-to-16.patch +btrfs-zoned-return-error-from-btrfs_zone_finish_endi.patch +btrfs-zoned-refine-extent-allocator-hint-selection.patch +btrfs-scrub-replace-max_t-min_t-with-clamp-in-scrub_.patch +btrfs-always-drop-log-root-tree-reference-in-btrfs_r.patch +btrfs-use-level-argument-in-log-tree-walk-callback-r.patch +btrfs-use-smp_mb__after_atomic-when-forcing-cow-in-c.patch +arch-add-the-macro-compile_offsets-to-all-the-asm-of.patch diff --git a/queue-6.6/x86-bugs-fix-reporting-of-lfence-retpoline.patch b/queue-6.6/x86-bugs-fix-reporting-of-lfence-retpoline.patch new file mode 100644 index 0000000000..cdc1afc63e --- /dev/null +++ b/queue-6.6/x86-bugs-fix-reporting-of-lfence-retpoline.patch @@ -0,0 +1,51 @@ +From 9e19126d42b14a90eced153d573608f84adf3db2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Sep 2025 08:47:05 -0500 +Subject: x86/bugs: Fix reporting of LFENCE retpoline + +From: David Kaplan + +[ Upstream commit d1cc1baef67ac6c09b74629ca053bf3fb812f7dc ] + +The LFENCE retpoline mitigation is not secure but the kernel prints +inconsistent messages about this fact. The dmesg log says 'Mitigation: +LFENCE', implying the system is mitigated. But sysfs reports 'Vulnerable: +LFENCE' implying the system (correctly) is not mitigated. + +Fix this by printing a consistent 'Vulnerable: LFENCE' string everywhere +when this mitigation is selected. 
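The fix boils down to letting one table feed both reporting paths; a reduced sketch with sysfs emission replaced by printf (array trimmed to three entries):

#include <stdio.h>

enum spectre_v2 { SPECTRE_V2_NONE, SPECTRE_V2_RETPOLINE, SPECTRE_V2_LFENCE };

/* Single source of truth: boot-time log and sysfs read the same string. */
static const char * const spectre_v2_strings[] = {
    [SPECTRE_V2_NONE]      = "Vulnerable",
    [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
    [SPECTRE_V2_LFENCE]    = "Vulnerable: LFENCE",
};

int main(void)
{
    enum spectre_v2 mode = SPECTRE_V2_LFENCE;

    printf("dmesg: %s\n", spectre_v2_strings[mode]);    /* boot-time report */
    printf("sysfs: %s\n", spectre_v2_strings[mode]);    /* same string, no special case */
    return 0;
}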
+ +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index f66e0e5b49eb1..ef1d3a5024ed4 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1594,7 +1594,7 @@ spectre_v2_user_select_mitigation(void) + static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", +- [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", ++ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", +@@ -3222,9 +3222,6 @@ static const char *spectre_bhi_state(void) + + static ssize_t spectre_v2_show_state(char *buf) + { +- if (spectre_v2_enabled == SPECTRE_V2_LFENCE) +- return sysfs_emit(buf, "Vulnerable: LFENCE\n"); +- + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + +-- +2.51.0 + diff --git a/queue-6.6/x86-bugs-report-correct-retbleed-mitigation-status.patch b/queue-6.6/x86-bugs-report-correct-retbleed-mitigation-status.patch new file mode 100644 index 0000000000..cece4798c5 --- /dev/null +++ b/queue-6.6/x86-bugs-report-correct-retbleed-mitigation-status.patch @@ -0,0 +1,46 @@ +From c0f1ee31d227222ce9307b53bc63615f5c5fc2b6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Sep 2025 08:47:06 -0500 +Subject: x86/bugs: Report correct retbleed mitigation status + +From: David Kaplan + +[ Upstream commit 930f2361fe542a00de9ce6070b1b6edb976f1165 ] + +On Intel CPUs, the default retbleed mitigation is IBRS/eIBRS but this +requires that a similar spectre_v2 mitigation is applied. If the user +selects a different spectre_v2 mitigation (like spectre_v2=retpoline) a +warning is printed but sysfs will still report 'Mitigation: IBRS' or +'Mitigation: Enhanced IBRS'. This is incorrect because retbleed is not +mitigated, and IBRS is not actually set. + +Fix this by choosing RETBLEED_MITIGATION_NONE in this scenario so the +kernel correctly reports the system as vulnerable to retbleed. + +Signed-off-by: David Kaplan +Signed-off-by: Borislav Petkov (AMD) +Link: https://lore.kernel.org/20250915134706.3201818-1-david.kaplan@amd.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/cpu/bugs.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 315926ccea0fa..f66e0e5b49eb1 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -1185,8 +1185,10 @@ static void __init retbleed_select_mitigation(void) + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: +- if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) ++ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { + pr_err(RETBLEED_INTEL_MSG); ++ retbleed_mitigation = RETBLEED_MITIGATION_NONE; ++ } + } + } + +-- +2.51.0 + -- 2.47.3