From: Sasha Levin Date: Mon, 22 Jun 2026 19:53:52 +0000 (-0400) Subject: Fixes for all trees X-Git-Url: http://git.ipfire.org/index.cgi?a=commitdiff_plain;h=HEAD;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for all trees Signed-off-by: Sasha Levin --- diff --git a/queue-6.1/arm-allow-__do_kernel_fault-to-report-execution-of-m.patch b/queue-6.1/arm-allow-__do_kernel_fault-to-report-execution-of-m.patch new file mode 100644 index 0000000000..3eec8900c5 --- /dev/null +++ b/queue-6.1/arm-allow-__do_kernel_fault-to-report-execution-of-m.patch @@ -0,0 +1,39 @@ +From c9fdb67b2301d0ea95df7310796670a59113bad5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 12:26:32 +0200 +Subject: ARM: allow __do_kernel_fault() to report execution of memory faults + +From: Russell King (Oracle) + +commit 40b466db1dffb41f0529035c59c5739636d0e5b8 upstream + +Allow __do_kernel_fault() to detect the execution of memory, so we can +provide the same fault message as do_page_fault() would do. This is +required when we split the kernel address fault handling from the +main do_page_fault() code path. 
+ +Reviewed-by: Xie Yuanbin +Tested-by: Xie Yuanbin +Signed-off-by: Russell King (Oracle) +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Sasha Levin +--- + arch/arm/mm/fault.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index 2315d40760a787..c94633eb64a1bb 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -172,6 +172,8 @@ __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, + */ + if (addr < PAGE_SIZE) { + msg = "NULL pointer dereference"; ++ } else if (is_permission_fault(fsr) && fsr & FSR_LNX_PF) { ++ msg = "execution of memory"; + } else { + if (is_translation_fault(fsr) && + kfence_handle_page_fault(addr, is_write_fault(fsr), regs)) +-- +2.53.0 + diff --git a/queue-6.1/arm-fix-branch-predictor-hardening.patch b/queue-6.1/arm-fix-branch-predictor-hardening.patch new file mode 100644 index 0000000000..c8db399cee --- /dev/null +++ b/queue-6.1/arm-fix-branch-predictor-hardening.patch @@ -0,0 +1,154 @@ +From 1bbb126846de94b95bc4cb739ceb067c945455b3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 12:26:34 +0200 +Subject: ARM: fix branch predictor hardening + +From: Russell King (Oracle) + +commit fd2dee1c6e2256f726ba33fd3083a7be0efc80d3 upstream. + +__do_user_fault() may be called with indeterminate interrupt enable +state, which means we may be preemptible at this point. This causes +problems when calling harden_branch_predictor(). For example, when +called from a data abort, do_alignment_fault()->do_bad_area(). + +Move harden_branch_predictor() out of __do_user_fault() and into the +calling contexts. + +Moving it into do_kernel_address_page_fault(), we can be sure that +interrupts will be disabled here. + +Converting do_translation_fault() to use do_kernel_address_page_fault() +rather than do_bad_area() means that we keep branch predictor handling +for translation faults. Interrupts will also be disabled at this call +site. 
+ +do_sect_fault() needs special handling, so detect user mode accesses +to kernel-addresses, and add an explicit call to branch predictor +hardening. + +Finally, add branch predictor hardening to do_alignment() for the +faulting case (user mode accessing kernel addresses) before interrupts +are enabled. + +This should cover all cases where harden_branch_predictor() is called, +ensuring that it always has interrupts disabled, also ensuring that +it is called early in each call path. + +Reviewed-by: Xie Yuanbin +Tested-by: Xie Yuanbin +Signed-off-by: Russell King (Oracle) +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Sasha Levin +--- + arch/arm/mm/alignment.c | 4 ++++ + arch/arm/mm/fault.c | 39 ++++++++++++++++++++++++++------------- + 2 files changed, 30 insertions(+), 13 deletions(-) + +diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c +index f8dd0b3cc8e040..ee264737be6d26 100644 +--- a/arch/arm/mm/alignment.c ++++ b/arch/arm/mm/alignment.c +@@ -22,6 +22,7 @@ + + #include + #include ++#include + #include + #include + +@@ -809,6 +810,9 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + int thumb2_32b = 0; + int fault; + ++ if (addr >= TASK_SIZE && user_mode(regs)) ++ harden_branch_predictor(); ++ + if (interrupts_enabled(regs)) + local_irq_enable(); + +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index 907705992ab65f..d0681285dbda3e 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -195,9 +195,6 @@ __do_user_fault(unsigned long addr, unsigned int fsr, unsigned int sig, + { + struct task_struct *tsk = current; + +- if (addr > TASK_SIZE) +- harden_branch_predictor(); +- + #ifdef CONFIG_DEBUG_USER + if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) || + ((user_debug & UDBG_BUS) && (sig == SIGBUS))) { +@@ -248,8 +245,10 @@ do_kernel_address_page_fault(struct mm_struct *mm, unsigned long addr, + /* + * Fault from user mode for a kernel space address. 
User mode + * should not be faulting in kernel space, which includes the +- * vector/khelper page. Send a SIGSEGV. ++ * vector/khelper page. Handle the branch predictor hardening ++ * while interrupts are still disabled, then send a SIGSEGV. + */ ++ harden_branch_predictor(); + __do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs); + } else { + /* +@@ -419,16 +418,20 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + * We enter here because the first level page table doesn't contain + * a valid entry for the address. + * +- * If the address is in kernel space (>= TASK_SIZE), then we are +- * probably faulting in the vmalloc() area. ++ * If this is a user address (addr < TASK_SIZE), we handle this as a ++ * normal page fault. This leaves the remainder of the function to handle ++ * kernel address translation faults. + * +- * If the init_task's first level page tables contains the relevant +- * entry, we copy the it to this task. If not, we send the process +- * a signal, fixup the exception, or oops the kernel. ++ * Since user mode is not permitted to access kernel addresses, pass these ++ * directly to do_kernel_address_page_fault() to handle. + * +- * NOTE! We MUST NOT take any locks for this case. We may be in an +- * interrupt or a critical region, and should only copy the information +- * from the master page table, nothing more. ++ * Otherwise, we're probably faulting in the vmalloc() area, so try to fix ++ * that up. Note that we must not take any locks or enable interrupts in ++ * this case. ++ * ++ * If vmalloc() fixup fails, that means the non-leaf page tables did not ++ * contain an entry for this address, so handle this via ++ * do_kernel_address_page_fault(). 
+ */ + #ifdef CONFIG_MMU + static int __kprobes +@@ -494,7 +497,8 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + return 0; + + bad_area: +- do_bad_area(addr, fsr, regs); ++ do_kernel_address_page_fault(current->mm, addr, fsr, regs); ++ + return 0; + } + #else /* CONFIG_MMU */ +@@ -514,7 +518,16 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + static int + do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { ++ /* ++ * If this is a kernel address, but from user mode, then userspace ++ * is trying bad stuff. Invoke the branch predictor handling. ++ * Interrupts are disabled here. ++ */ ++ if (addr >= TASK_SIZE && user_mode(regs)) ++ harden_branch_predictor(); ++ + do_bad_area(addr, fsr, regs); ++ + return 0; + } + #endif /* CONFIG_ARM_LPAE */ +-- +2.53.0 + diff --git a/queue-6.1/arm-fix-hash_name-fault.patch b/queue-6.1/arm-fix-hash_name-fault.patch new file mode 100644 index 0000000000..e2c2eeb3ed --- /dev/null +++ b/queue-6.1/arm-fix-hash_name-fault.patch @@ -0,0 +1,109 @@ +From 3f5062f86cdd26f1ed4d1aed7a99c629bd5ea80c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 12:26:33 +0200 +Subject: ARM: fix hash_name() fault + +From: Russell King (Oracle) + +commit 7733bc7d299d682f2723dc38fc7f370b9bf973e9 upstream. + +Zizhi Wo reports: + +"During the execution of hash_name()->load_unaligned_zeropad(), a + potential memory access beyond the PAGE boundary may occur. For + example, when the filename length is near the PAGE_SIZE boundary. + This triggers a page fault, which leads to a call to + do_page_fault()->mmap_read_trylock(). If we can't acquire the lock, + we have to fall back to the mmap_read_lock() path, which calls + might_sleep(). This breaks RCU semantics because path lookup occurs + under an RCU read-side critical section." + +This is seen with CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_KFENCE=y. 
+ +Kernel addresses (with the exception of the vectors/kuser helper +page) do not have VMAs associated with them. If the vectors/kuser +helper page faults, then there are two possibilities: + +1. if the fault happened while in kernel mode, then we're basically + dead, because the CPU won't be able to vector through this page + to handle the fault. +2. if the fault happened while in user mode, that means the page was + protected from user access, and we want to fault anyway. + +Thus, we can handle kernel addresses from any context entirely +separately without going anywhere near the mmap lock. This gives us +an entirely non-sleeping path for all kernel mode kernel address +faults. + +As we handle the kernel address faults before interrupts are enabled, +this change has the side effect of improving the branch predictor +hardening, but does not completely solve the issue. + +Reported-by: Zizhi Wo +Reported-by: Xie Yuanbin +Link: https://lore.kernel.org/r/20251126090505.3057219-1-wozizhi@huaweicloud.com +Reviewed-by: Xie Yuanbin +Tested-by: Xie Yuanbin +Signed-off-by: Russell King (Oracle) +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Sasha Levin +--- + arch/arm/mm/fault.c | 35 +++++++++++++++++++++++++++++++++++ + 1 file changed, 35 insertions(+) + +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index c94633eb64a1bb..907705992ab65f 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -240,6 +240,35 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) + #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) + ++static int __kprobes ++do_kernel_address_page_fault(struct mm_struct *mm, unsigned long addr, ++ unsigned int fsr, struct pt_regs *regs) ++{ ++ if (user_mode(regs)) { ++ /* ++ * Fault from user mode for a kernel space address. User mode ++ * should not be faulting in kernel space, which includes the ++ * vector/khelper page. Send a SIGSEGV. 
++ */ ++ __do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs); ++ } else { ++ /* ++ * Fault from kernel mode. Enable interrupts if they were ++ * enabled in the parent context. Section (upper page table) ++ * translation faults are handled via do_translation_fault(), ++ * so we will only get here for a non-present kernel space ++ * PTE or PTE permission fault. This may happen in exceptional ++ * circumstances and need the fixup tables to be walked. ++ */ ++ if (interrupts_enabled(regs)) ++ local_irq_enable(); ++ ++ __do_kernel_fault(mm, addr, fsr, regs); ++ } ++ ++ return 0; ++} ++ + static int __kprobes + do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { +@@ -253,6 +282,12 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + if (kprobe_page_fault(regs, fsr)) + return 0; + ++ /* ++ * Handle kernel addresses faults separately, which avoids touching ++ * the mmap lock from contexts that are not able to sleep. ++ */ ++ if (addr >= TASK_SIZE) ++ return do_kernel_address_page_fault(mm, addr, fsr, regs); + + /* Enable interrupts if they were enabled in the parent context. */ + if (interrupts_enabled(regs)) +-- +2.53.0 + diff --git a/queue-6.1/arm-group-is_permission_fault-with-is_translation_fa.patch b/queue-6.1/arm-group-is_permission_fault-with-is_translation_fa.patch new file mode 100644 index 0000000000..dda4f1d3dd --- /dev/null +++ b/queue-6.1/arm-group-is_permission_fault-with-is_translation_fa.patch @@ -0,0 +1,68 @@ +From b2b73c84ad000c9ccaedbbfed4048b3d415b9ea5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 12:26:31 +0200 +Subject: ARM: group is_permission_fault() with is_translation_fault() + +From: Russell King (Oracle) + +commit dea20281ac88226615761c570c8ff7adc18e6ac2 upstream. + +Group is_permission_fault() with is_translation_fault(), which is +needed to use is_permission_fault() in __do_kernel_fault(). As +this is static inline, there is no need for this to be under +CONFIG_MMU. 
+ +Signed-off-by: Russell King (Oracle) +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Sasha Levin +--- + arch/arm/mm/fault.c | 26 +++++++++++++------------- + 1 file changed, 13 insertions(+), 13 deletions(-) + +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index 16a7765511f8e4..2315d40760a787 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -125,6 +125,19 @@ static inline bool is_translation_fault(unsigned int fsr) + return false; + } + ++static inline bool is_permission_fault(unsigned int fsr) ++{ ++ int fs = fsr_fs(fsr); ++#ifdef CONFIG_ARM_LPAE ++ if ((fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL) ++ return true; ++#else ++ if (fs == FS_L1_PERM || fs == FS_L2_PERM) ++ return true; ++#endif ++ return false; ++} ++ + static void die_kernel_fault(const char *msg, struct mm_struct *mm, + unsigned long addr, unsigned int fsr, + struct pt_regs *regs) +@@ -225,19 +238,6 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) + #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) + +-static inline bool is_permission_fault(unsigned int fsr) +-{ +- int fs = fsr_fs(fsr); +-#ifdef CONFIG_ARM_LPAE +- if ((fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL) +- return true; +-#else +- if (fs == FS_L1_PERM || fs == FS_L2_PERM) +- return true; +-#endif +- return false; +-} +- + static int __kprobes + do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { +-- +2.53.0 + diff --git a/queue-6.1/debugobjects-allow-to-refill-the-pool-before-system_.patch b/queue-6.1/debugobjects-allow-to-refill-the-pool-before-system_.patch new file mode 100644 index 0000000000..5f00d8c62e --- /dev/null +++ b/queue-6.1/debugobjects-allow-to-refill-the-pool-before-system_.patch @@ -0,0 +1,48 @@ +From 4c2c9ce1c285ffecb3dfdf1491191d8030426121 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:53 +0200 +Subject: debugobjects: Allow to refill the pool 
before SYSTEM_SCHEDULING + +From: Sebastian Andrzej Siewior + +commit 06e0ae988f6e3499785c407429953ade19c1096b upstream. + +The pool of free objects is refilled on several occasions such as object +initialisation. On PREEMPT_RT refilling is limited to preemptible +sections due to sleeping locks used by the memory allocator. The system +boots with disabled interrupts so the pool can not be refilled. + +If too many objects are initialized and the pool gets empty then +debugobjects disables itself. + +Refilling can also happen early in the boot with disabled interrupts as +long as the scheduler is not operational. If the scheduler can not +preempt a task then a sleeping lock can not be contended. + +Allow to additionally refill the pool if the scheduler is not +operational. + +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251127153652.291697-2-bigeasy@linutronix.de +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index 46eabbae69ccfc..bb5c909458535b 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -605,7 +605,7 @@ static void debug_objects_fill_pool(void) + * raw_spinlock_t are basically the same type and this lock-type + * inversion works just fine. 
+ */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to WAIT_SLEEP, matching +-- +2.53.0 + diff --git a/queue-6.1/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch b/queue-6.1/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch new file mode 100644 index 0000000000..fb9d519c24 --- /dev/null +++ b/queue-6.1/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch @@ -0,0 +1,65 @@ +From fc1b7b54a02c62650fce4a16e202e37742e04bb7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:56:02 +0200 +Subject: debugobjects: Do not fill_pool() if pi_blocked_on + +From: Helen Koike + +commit 5f41161059fd0f1bbf18c90f3180e38cc45a14eb upstream. + +On RT enabled kernels, fill_pool() ends up calling rtlock_lock(), which +asserts if current::pi_blocked_on is set, because a task can obviously only +block on one lock as otherwise the priority inheritance chain gets +corrupted. + +Prevent this by expanding the conditional to take current::pi_blocked_on +into account. 
+ +Fixes: 4bedcc28469a ("debugobjects: Make them PREEMPT_RT aware") +Reported-by: syzbot+b8ca586b9fc235f0c0df@syzkaller.appspotmail.com +Signed-off-by: Helen Koike +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20260511215359.3351259-1-koike@igalia.com +Closes: https://syzkaller.appspot.com/bug?extid=b8ca586b9fc235f0c0df +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 18 ++++++++++++++---- + 1 file changed, 14 insertions(+), 4 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index a7f3c6f15125a9..5e653e2ffa8dac 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -597,15 +597,25 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket + return NULL; + } + ++static inline bool debug_objects_is_pi_blocked_on(void) ++{ ++#ifdef CONFIG_RT_MUTEXES ++ return current->pi_blocked_on != NULL; ++#else ++ return false; ++#endif ++} ++ + static void debug_objects_fill_pool(void) + { + /* + * On RT enabled kernels the pool refill must happen in preemptible +- * context -- for !RT kernels we rely on the fact that spinlock_t and +- * raw_spinlock_t are basically the same type and this lock-type +- * inversion works just fine. ++ * context and not enqueued on an rt_mutex -- for !RT kernels we rely ++ * on the fact that spinlock_t and raw_spinlock_t are basically the ++ * same type and this lock-type inversion works just fine. 
+ */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || system_state < SYSTEM_SCHEDULING || ++ (preemptible() && !debug_objects_is_pi_blocked_on())) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching +-- +2.53.0 + diff --git a/queue-6.1/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch b/queue-6.1/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch new file mode 100644 index 0000000000..71cedd65a7 --- /dev/null +++ b/queue-6.1/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch @@ -0,0 +1,101 @@ +From 3eb961e2cde12b5c17ca9cc4d08ca32b7d04f19d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:56:07 +0200 +Subject: debugobjects: Dont call fill_pool() in early boot hardirq context + +From: Waiman Long + +commit 0d046ae106255cba5eb83b23f78ee93f3620247d upstream. + +When booting a debug PREEMPT_RT kernel on an ARM64 system, a "inconsistent +{HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage" lockdep warning message was +reported to the console. + +During early boot, interrupts are enabled before the scheduler is +enabled. In this window (before SYSTEM_SCHEDULING is set) interrupts can +fire and in the hard interrupt context handler attempt to fill the pool + +This can lead to a deadlock when the interrupt occurred when the interrupt +hits a region which holds a lock that is required to be taken in the +allocation path. + +Add a new can_fill_pool() helper and reorder the exception rule and forbid +this scenario by excluding allocations from hard interrupt context. 
+ +Fixes: 06e0ae988f6e ("debugobjects: Allow to refill the pool before SYSTEM_SCHEDULING") +Suggested-by: Sebastian Andrzej Siewior +Suggested-by: Thomas Gleixner +Signed-off-by: Waiman Long +Signed-off-by: Thomas Gleixner +Reviewed-by: Sebastian Andrzej Siewior +Cc: stable@vger.kernel.org +Link: https://patch.msgid.link/20260605173038.495075-1-longman@redhat.com +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 44 ++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 36 insertions(+), 8 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index 5e653e2ffa8dac..ef1fea990df70a 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -606,20 +606,48 @@ static inline bool debug_objects_is_pi_blocked_on(void) + #endif + } + +-static void debug_objects_fill_pool(void) ++static inline bool can_fill_pool(void) + { + /* +- * On RT enabled kernels the pool refill must happen in preemptible +- * context and not enqueued on an rt_mutex -- for !RT kernels we rely +- * on the fact that spinlock_t and raw_spinlock_t are basically the +- * same type and this lock-type inversion works just fine. ++ * On !RT enabled kernels there are no restrictions and spinlock_t and ++ * raw_spinlock_t are the same types. ++ */ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ return true; ++ ++ /* ++ * On RT enabled kernels, the task must not be blocked on a lock as ++ * that could corrupt the PI state when blocking on a lock in the ++ * allocation path. ++ */ ++ if (debug_objects_is_pi_blocked_on()) ++ return false; ++ ++ /* ++ * On RT enabled kernels the pool refill should happen in preemptible ++ * context. + */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || system_state < SYSTEM_SCHEDULING || +- (preemptible() && !debug_objects_is_pi_blocked_on())) { ++ if (preemptible()) ++ return true; ++ ++ /* ++ * Though during system boot before scheduling is set up, preemption is ++ * disabled and the pool can get exhausted. 
Before scheduling is active ++ * a task cannot be blocked on a sleeping lock, but it might hold a lock ++ * and if interrupted then hard interrupt context might run into a lock ++ * inversion. So exclude hard interrupt context from allocations before ++ * scheduling is active. ++ */ ++ return system_state < SYSTEM_SCHEDULING && !in_hardirq(); ++} ++ ++static void debug_objects_fill_pool(void) ++{ ++ if (can_fill_pool()) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching +- * the preemptible() condition above. ++ * the preemptible() condition in can_fill_pool(). + */ + static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG); + lock_map_acquire_try(&fill_pool_map); +-- +2.53.0 + diff --git a/queue-6.1/debugobjects-locking-annotate-debug_object_fill_pool.patch b/queue-6.1/debugobjects-locking-annotate-debug_object_fill_pool.patch new file mode 100644 index 0000000000..17bc52878c --- /dev/null +++ b/queue-6.1/debugobjects-locking-annotate-debug_object_fill_pool.patch @@ -0,0 +1,180 @@ +From aa674b172bc3fa6f2e17a2e80139e4ac8c0a76bc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:49 +0200 +Subject: debugobjects,locking: Annotate debug_object_fill_pool() wait type + violation + +From: Peter Zijlstra + +commit 0cce06ba859a515bd06224085d3addb870608b6d upstream. + +There is an explicit wait-type violation in debug_object_fill_pool() +for PREEMPT_RT=n kernels which allows them to more easily fill the +object pool and reduce the chance of allocation failures. + +Lockdep's wait-type checks are designed to check the PREEMPT_RT +locking rules even for PREEMPT_RT=n kernels and object to this, so +create a lockdep annotation to allow this to stand. + +Specifically, create a 'lock' type that overrides the inner wait-type +while it is held -- allowing one to temporarily raise it, such that +the violation is hidden. 
+ +Reported-by: Vlastimil Babka +Reported-by: Qi Zheng +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Qi Zheng +Link: https://lkml.kernel.org/r/20230429100614.GA1489784@hirez.programming.kicks-ass.net +Signed-off-by: Thomas Gleixner +Signed-off-by: Sasha Levin +--- + include/linux/lockdep.h | 14 ++++++++++++++ + include/linux/lockdep_types.h | 1 + + kernel/locking/lockdep.c | 28 +++++++++++++++++++++------- + lib/debugobjects.c | 15 +++++++++++++-- + 4 files changed, 49 insertions(+), 9 deletions(-) + +diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h +index 43d8734ac0eb0b..90aa802a30669c 100644 +--- a/include/linux/lockdep.h ++++ b/include/linux/lockdep.h +@@ -339,6 +339,16 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); + #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) + #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) + ++/* ++ * Must use lock_map_aquire_try() with override maps to avoid ++ * lockdep thinking they participate in the block chain. 
++ */ ++#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ ++ struct lockdep_map _name = { \ ++ .name = #_name "-wait-type-override", \ ++ .wait_type_inner = _wait_type, \ ++ .lock_type = LD_LOCK_WAIT_OVERRIDE, } ++ + #else /* !CONFIG_LOCKDEP */ + + static inline void lockdep_init_task(struct task_struct *task) +@@ -427,6 +437,9 @@ extern int lockdep_is_held(const void *); + #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) + #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) + ++#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ ++ struct lockdep_map __maybe_unused _name = {} ++ + #endif /* !LOCKDEP */ + + enum xhlock_context_t { +@@ -552,6 +565,7 @@ do { \ + #define rwsem_release(l, i) lock_release(l, i) + + #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) ++#define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_) + #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) + #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) + #define lock_map_release(l) lock_release(l, _THIS_IP_) +diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h +index d22430840b53f9..59f4fb1626ea60 100644 +--- a/include/linux/lockdep_types.h ++++ b/include/linux/lockdep_types.h +@@ -33,6 +33,7 @@ enum lockdep_wait_type { + enum lockdep_lock_type { + LD_LOCK_NORMAL = 0, /* normal, catch all */ + LD_LOCK_PERCPU, /* percpu */ ++ LD_LOCK_WAIT_OVERRIDE, /* annotation */ + LD_LOCK_MAX, + }; + +diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c +index 5f8ce961cd9a3f..463834778b4b04 100644 +--- a/kernel/locking/lockdep.c ++++ b/kernel/locking/lockdep.c +@@ -2245,6 +2245,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask) + + static inline bool usage_skip(struct lock_list *entry, void *mask) + { ++ if (entry->class->lock_type == LD_LOCK_NORMAL) ++ return false; ++ + /* + * 
Skip local_lock() for irq inversion detection. + * +@@ -2271,14 +2274,16 @@ static inline bool usage_skip(struct lock_list *entry, void *mask) + * As a result, we will skip local_lock(), when we search for irq + * inversion bugs. + */ +- if (entry->class->lock_type == LD_LOCK_PERCPU) { +- if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) +- return false; ++ if (entry->class->lock_type == LD_LOCK_PERCPU && ++ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) ++ return false; + +- return true; +- } ++ /* ++ * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually ++ * a lock and only used to override the wait_type. ++ */ + +- return false; ++ return true; + } + + /* +@@ -4745,7 +4750,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) + + for (; depth < curr->lockdep_depth; depth++) { + struct held_lock *prev = curr->held_locks + depth; +- u8 prev_inner = hlock_class(prev)->wait_type_inner; ++ struct lock_class *class = hlock_class(prev); ++ u8 prev_inner = class->wait_type_inner; + + if (prev_inner) { + /* +@@ -4755,6 +4761,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) + * Also due to trylocks. + */ + curr_inner = min(curr_inner, prev_inner); ++ ++ /* ++ * Allow override for annotations -- this is typically ++ * only valid/needed for code that only exists when ++ * CONFIG_PREEMPT_RT=n. 
++ */ ++ if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE)) ++ curr_inner = prev_inner; + } + } + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index 1e193a5f6b4a72..46eabbae69ccfc 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -601,10 +601,21 @@ static void debug_objects_fill_pool(void) + { + /* + * On RT enabled kernels the pool refill must happen in preemptible +- * context: ++ * context -- for !RT kernels we rely on the fact that spinlock_t and ++ * raw_spinlock_t are basically the same type and this lock-type ++ * inversion works just fine. + */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { ++ /* ++ * Annotate away the spinlock_t inside raw_spinlock_t warning ++ * by temporarily raising the wait-type to WAIT_SLEEP, matching ++ * the preemptible() condition above. ++ */ ++ static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP); ++ lock_map_acquire_try(&fill_pool_map); + fill_pool(); ++ lock_map_release(&fill_pool_map); ++ } + } + + static void +-- +2.53.0 + diff --git a/queue-6.1/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch b/queue-6.1/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch new file mode 100644 index 0000000000..e66802237c --- /dev/null +++ b/queue-6.1/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch @@ -0,0 +1,47 @@ +From 4af4bc44d340a7c542af22a2ae2c2aef2c5fa440 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:57 +0200 +Subject: debugobjects: Use LD_WAIT_CONFIG instead of LD_WAIT_SLEEP + +From: Sebastian Andrzej Siewior + +commit 37de2dbc318ee10577c1c2704de5a803e75e55a2 upstream. + +fill_pool_map is used to suppress nesting violations caused by acquiring +a spinlock_t (from within the memory allocator) while holding a +raw_spinlock_t. The used annotation is wrong. + +LD_WAIT_SLEEP is for always sleeping lock types such as mutex_t. 
+LD_WAIT_CONFIG is for lock type which are sleeping while spinning on +PREEMPT_RT such as spinlock_t. + +Use LD_WAIT_CONFIG as override. + +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251127153652.291697-3-bigeasy@linutronix.de +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index bb5c909458535b..a7f3c6f15125a9 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -608,10 +608,10 @@ static void debug_objects_fill_pool(void) + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning +- * by temporarily raising the wait-type to WAIT_SLEEP, matching ++ * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching + * the preemptible() condition above. + */ +- static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP); ++ static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG); + lock_map_acquire_try(&fill_pool_map); + fill_pool(); + lock_map_release(&fill_pool_map); +-- +2.53.0 + diff --git a/queue-6.1/series b/queue-6.1/series index ae4d1e6159..7ad6497664 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -19,3 +19,12 @@ kvm-nvmx-add-a-helper-to-get-highest-pending-from-po.patch kvm-nvmx-check-for-pending-posted-interrupts-when-lo.patch kvm-nvmx-fold-requested-virtual-interrupt-check-into.patch net-annotate-data-races-around-sk-sk_-data_ready-wri.patch +debugobjects-locking-annotate-debug_object_fill_pool.patch +debugobjects-allow-to-refill-the-pool-before-system_.patch +debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch +debugobjects-do-not-fill_pool-if-pi_blocked_on.patch +debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch +arm-group-is_permission_fault-with-is_translation_fa.patch 
+arm-allow-__do_kernel_fault-to-report-execution-of-m.patch +arm-fix-hash_name-fault.patch +arm-fix-branch-predictor-hardening.patch diff --git a/queue-6.12/debugobjects-allow-to-refill-the-pool-before-system_.patch b/queue-6.12/debugobjects-allow-to-refill-the-pool-before-system_.patch new file mode 100644 index 0000000000..d598e1b541 --- /dev/null +++ b/queue-6.12/debugobjects-allow-to-refill-the-pool-before-system_.patch @@ -0,0 +1,48 @@ +From 12b5d1a683efb3c8108cc3207313db1eb1282b25 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:12 +0200 +Subject: debugobjects: Allow to refill the pool before SYSTEM_SCHEDULING + +From: Sebastian Andrzej Siewior + +commit 06e0ae988f6e3499785c407429953ade19c1096b upstream. + +The pool of free objects is refilled on several occasions such as object +initialisation. On PREEMPT_RT refilling is limited to preemptible +sections due to sleeping locks used by the memory allocator. The system +boots with disabled interrupts so the pool can not be refilled. + +If too many objects are initialized and the pool gets empty then +debugobjects disables itself. + +Refilling can also happen early in the boot with disabled interrupts as +long as the scheduler is not operational. If the scheduler can not +preempt a task then a sleeping lock can not be contended. + +Allow to additionally refill the pool if the scheduler is not +operational. + +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251127153652.291697-2-bigeasy@linutronix.de +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index 932e2d8dbd9b9b..d69721bb78b797 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -604,7 +604,7 @@ static void debug_objects_fill_pool(void) + * raw_spinlock_t are basically the same type and this lock-type + * inversion works just fine.
+ */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to WAIT_SLEEP, matching +-- +2.53.0 + diff --git a/queue-6.12/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch b/queue-6.12/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch new file mode 100644 index 0000000000..98391455ed --- /dev/null +++ b/queue-6.12/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch @@ -0,0 +1,65 @@ +From 8c61c8d212c0f6e38df746af46593ca893b6bf28 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:21 +0200 +Subject: debugobjects: Do not fill_pool() if pi_blocked_on + +From: Helen Koike + +commit 5f41161059fd0f1bbf18c90f3180e38cc45a14eb upstream. + +On RT enabled kernels, fill_pool() ends up calling rtlock_lock(), which +asserts if current::pi_blocked_on is set, because a task can obviously only +block on one lock as otherwise the priority inheritance chain gets +corrupted. + +Prevent this by expanding the conditional to take current::pi_blocked_on +into account.
+ +Fixes: 4bedcc28469a ("debugobjects: Make them PREEMPT_RT aware") +Reported-by: syzbot+b8ca586b9fc235f0c0df@syzkaller.appspotmail.com +Signed-off-by: Helen Koike +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20260511215359.3351259-1-koike@igalia.com +Closes: https://syzkaller.appspot.com/bug?extid=b8ca586b9fc235f0c0df +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 18 ++++++++++++++---- + 1 file changed, 14 insertions(+), 4 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index 628fe54d927ecb..c1b8b754572caa 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -596,15 +596,25 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket + return NULL; + } + ++static inline bool debug_objects_is_pi_blocked_on(void) ++{ ++#ifdef CONFIG_RT_MUTEXES ++ return current->pi_blocked_on != NULL; ++#else ++ return false; ++#endif ++} ++ + static void debug_objects_fill_pool(void) + { + /* + * On RT enabled kernels the pool refill must happen in preemptible +- * context -- for !RT kernels we rely on the fact that spinlock_t and +- * raw_spinlock_t are basically the same type and this lock-type +- * inversion works just fine. ++ * context and not enqueued on an rt_mutex -- for !RT kernels we rely ++ * on the fact that spinlock_t and raw_spinlock_t are basically the ++ * same type and this lock-type inversion works just fine. 
+ */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || system_state < SYSTEM_SCHEDULING || ++ (preemptible() && !debug_objects_is_pi_blocked_on())) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching +-- +2.53.0 + diff --git a/queue-6.12/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch b/queue-6.12/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch new file mode 100644 index 0000000000..2fc63c78c4 --- /dev/null +++ b/queue-6.12/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch @@ -0,0 +1,101 @@ +From 813c2352f2652717378c8f1b4a18b99dafd72fcc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:26 +0200 +Subject: debugobjects: Dont call fill_pool() in early boot hardirq context + +From: Waiman Long + +commit 0d046ae106255cba5eb83b23f78ee93f3620247d upstream. + +When booting a debug PREEMPT_RT kernel on an ARM64 system, an "inconsistent +{HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage" lockdep warning message was +reported to the console. + +During early boot, interrupts are enabled before the scheduler is +enabled. In this window (before SYSTEM_SCHEDULING is set) interrupts can +fire and the hard interrupt context handler can attempt to fill the pool. + +This can lead to a deadlock when the interrupt +hits a region which holds a lock that is required to be taken in the +allocation path. + +Add a new can_fill_pool() helper and reorder the exception rule and forbid +this scenario by excluding allocations from hard interrupt context.
+ +Fixes: 06e0ae988f6e ("debugobjects: Allow to refill the pool before SYSTEM_SCHEDULING") +Suggested-by: Sebastian Andrzej Siewior +Suggested-by: Thomas Gleixner +Signed-off-by: Waiman Long +Signed-off-by: Thomas Gleixner +Reviewed-by: Sebastian Andrzej Siewior +Cc: stable@vger.kernel.org +Link: https://patch.msgid.link/20260605173038.495075-1-longman@redhat.com +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 44 ++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 36 insertions(+), 8 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index c1b8b754572caa..7abd909c8076af 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -605,20 +605,48 @@ static inline bool debug_objects_is_pi_blocked_on(void) + #endif + } + +-static void debug_objects_fill_pool(void) ++static inline bool can_fill_pool(void) + { + /* +- * On RT enabled kernels the pool refill must happen in preemptible +- * context and not enqueued on an rt_mutex -- for !RT kernels we rely +- * on the fact that spinlock_t and raw_spinlock_t are basically the +- * same type and this lock-type inversion works just fine. ++ * On !RT enabled kernels there are no restrictions and spinlock_t and ++ * raw_spinlock_t are the same types. ++ */ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ return true; ++ ++ /* ++ * On RT enabled kernels, the task must not be blocked on a lock as ++ * that could corrupt the PI state when blocking on a lock in the ++ * allocation path. ++ */ ++ if (debug_objects_is_pi_blocked_on()) ++ return false; ++ ++ /* ++ * On RT enabled kernels the pool refill should happen in preemptible ++ * context. + */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || system_state < SYSTEM_SCHEDULING || +- (preemptible() && !debug_objects_is_pi_blocked_on())) { ++ if (preemptible()) ++ return true; ++ ++ /* ++ * Though during system boot before scheduling is set up, preemption is ++ * disabled and the pool can get exhausted. 
Before scheduling is active ++ * a task cannot be blocked on a sleeping lock, but it might hold a lock ++ * and if interrupted then hard interrupt context might run into a lock ++ * inversion. So exclude hard interrupt context from allocations before ++ * scheduling is active. ++ */ ++ return system_state < SYSTEM_SCHEDULING && !in_hardirq(); ++} ++ ++static void debug_objects_fill_pool(void) ++{ ++ if (can_fill_pool()) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching +- * the preemptible() condition above. ++ * the preemptible() condition in can_fill_pool(). + */ + static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG); + lock_map_acquire_try(&fill_pool_map); +-- +2.53.0 + diff --git a/queue-6.12/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch b/queue-6.12/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch new file mode 100644 index 0000000000..6fea05ba2d --- /dev/null +++ b/queue-6.12/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch @@ -0,0 +1,47 @@ +From 884c8d325e8c6dd26cc2ae32010fb69e5cad868d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:17 +0200 +Subject: debugobjects: Use LD_WAIT_CONFIG instead of LD_WAIT_SLEEP + +From: Sebastian Andrzej Siewior + +commit 37de2dbc318ee10577c1c2704de5a803e75e55a2 upstream. + +fill_pool_map is used to suppress nesting violations caused by acquiring +a spinlock_t (from within the memory allocator) while holding a +raw_spinlock_t. The used annotation is wrong. + +LD_WAIT_SLEEP is for always sleeping lock types such as mutex_t. +LD_WAIT_CONFIG is for lock type which are sleeping while spinning on +PREEMPT_RT such as spinlock_t. + +Use LD_WAIT_CONFIG as override. 
+ +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251127153652.291697-3-bigeasy@linutronix.de +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index d69721bb78b797..628fe54d927ecb 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -607,10 +607,10 @@ static void debug_objects_fill_pool(void) + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning +- * by temporarily raising the wait-type to WAIT_SLEEP, matching ++ * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching + * the preemptible() condition above. + */ +- static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP); ++ static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG); + lock_map_acquire_try(&fill_pool_map); + fill_pool(); + lock_map_release(&fill_pool_map); +-- +2.53.0 + diff --git a/queue-6.12/series b/queue-6.12/series index 0aacead6bc..9cdb097658 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -20,3 +20,7 @@ iio-light-bh1780-fix-pm-runtime-leak-on-error-path.patch net-drop-the-lock-in-skb_may_tx_timestamp.patch ip6_vti-set-netns_immutable-on-the-fallback-device.patch reapply-selftest-ptp-update-ptp-selftest-to-exercise-the-gettimex-options.patch +debugobjects-allow-to-refill-the-pool-before-system_.patch +debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch +debugobjects-do-not-fill_pool-if-pi_blocked_on.patch +debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch diff --git a/queue-6.18/debugobjects-allow-to-refill-the-pool-before-system_.patch b/queue-6.18/debugobjects-allow-to-refill-the-pool-before-system_.patch new file mode 100644 index 0000000000..b59a881932 --- /dev/null +++ b/queue-6.18/debugobjects-allow-to-refill-the-pool-before-system_.patch @@ -0,0 
+1,48 @@ +From 3d6f64a6578fbc098b21153b91241d1405c9c35d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:54:38 +0200 +Subject: debugobjects: Allow to refill the pool before SYSTEM_SCHEDULING + +From: Sebastian Andrzej Siewior + +commit 06e0ae988f6e3499785c407429953ade19c1096b upstream. + +The pool of free objects is refilled on several occasions such as object +initialisation. On PREEMPT_RT refilling is limited to preemptible +sections due to sleeping locks used by the memory allocator. The system +boots with disabled interrupts so the pool can not be refilled. + +If too many objects are initialized and the pool gets empty then +debugobjects disables itself. + +Refilling can also happen early in the boot with disabled interrupts as +long as the scheduler is not operational. If the scheduler can not +preempt a task then a sleeping lock can not be contended. + +Allow to additionally refill the pool if the scheduler is not +operational. + +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251127153652.291697-2-bigeasy@linutronix.de +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index e4b7f77ece3b4f..9d59b797d1b507 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -731,7 +731,7 @@ static void debug_objects_fill_pool(void) + * raw_spinlock_t are basically the same type and this lock-type + * inversion works just fine.
+ */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to WAIT_SLEEP, matching +-- +2.53.0 + diff --git a/queue-6.18/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch b/queue-6.18/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch new file mode 100644 index 0000000000..cbb404966c --- /dev/null +++ b/queue-6.18/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch @@ -0,0 +1,68 @@ +From a2727de7629c871fac17773b06f044f0e97feeba Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:54:46 +0200 +Subject: debugobjects: Do not fill_pool() if pi_blocked_on + +From: Helen Koike + +commit 5f41161059fd0f1bbf18c90f3180e38cc45a14eb upstream. + +On RT enabled kernels, fill_pool() ends up calling rtlock_lock(), which +asserts if current::pi_blocked_on is set, because a task can obviously only +block on one lock as otherwise the priority inheritance chain gets +corrupted. + +Prevent this by expanding the conditional to take current::pi_blocked_on +into account.
+ +Fixes: 4bedcc28469a ("debugobjects: Make them PREEMPT_RT aware") +Reported-by: syzbot+b8ca586b9fc235f0c0df@syzkaller.appspotmail.com +Signed-off-by: Helen Koike +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20260511215359.3351259-1-koike@igalia.com +Closes: https://syzkaller.appspot.com/bug?extid=b8ca586b9fc235f0c0df +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 18 ++++++++++++++---- + 1 file changed, 14 insertions(+), 4 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index 4343dc5e5c99da..cbd025dae5ce92 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -711,6 +711,15 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket + return NULL; + } + ++static inline bool debug_objects_is_pi_blocked_on(void) ++{ ++#ifdef CONFIG_RT_MUTEXES ++ return current->pi_blocked_on != NULL; ++#else ++ return false; ++#endif ++} ++ + static void debug_objects_fill_pool(void) + { + if (!static_branch_likely(&obj_cache_enabled)) +@@ -727,11 +736,12 @@ static void debug_objects_fill_pool(void) + + /* + * On RT enabled kernels the pool refill must happen in preemptible +- * context -- for !RT kernels we rely on the fact that spinlock_t and +- * raw_spinlock_t are basically the same type and this lock-type +- * inversion works just fine. ++ * context and not enqueued on an rt_mutex -- for !RT kernels we rely ++ * on the fact that spinlock_t and raw_spinlock_t are basically the ++ * same type and this lock-type inversion works just fine. 
+ */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || system_state < SYSTEM_SCHEDULING || ++ (preemptible() && !debug_objects_is_pi_blocked_on())) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching +-- +2.53.0 + diff --git a/queue-6.18/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch b/queue-6.18/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch new file mode 100644 index 0000000000..12c2010983 --- /dev/null +++ b/queue-6.18/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch @@ -0,0 +1,107 @@ +From 4f4378e74e7cfd681c329bf4c2a295d900e53a53 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:54:51 +0200 +Subject: debugobjects: Dont call fill_pool() in early boot hardirq context + +From: Waiman Long + +commit 0d046ae106255cba5eb83b23f78ee93f3620247d upstream. + +When booting a debug PREEMPT_RT kernel on an ARM64 system, an "inconsistent +{HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage" lockdep warning message was +reported to the console. + +During early boot, interrupts are enabled before the scheduler is +enabled. In this window (before SYSTEM_SCHEDULING is set) interrupts can +fire and the hard interrupt context handler can attempt to fill the pool. + +This can lead to a deadlock when the interrupt +hits a region which holds a lock that is required to be taken in the +allocation path. + +Add a new can_fill_pool() helper and reorder the exception rule and forbid +this scenario by excluding allocations from hard interrupt context.
+ +Fixes: 06e0ae988f6e ("debugobjects: Allow to refill the pool before SYSTEM_SCHEDULING") +Suggested-by: Sebastian Andrzej Siewior +Suggested-by: Thomas Gleixner +Signed-off-by: Waiman Long +Signed-off-by: Thomas Gleixner +Reviewed-by: Sebastian Andrzej Siewior +Cc: stable@vger.kernel.org +Link: https://patch.msgid.link/20260605173038.495075-1-longman@redhat.com +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 46 +++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 37 insertions(+), 9 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index cbd025dae5ce92..17f116247bb423 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -720,6 +720,41 @@ static inline bool debug_objects_is_pi_blocked_on(void) + #endif + } + ++static inline bool can_fill_pool(void) ++{ ++ /* ++ * On !RT enabled kernels there are no restrictions and spinlock_t and ++ * raw_spinlock_t are the same types. ++ */ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ return true; ++ ++ /* ++ * On RT enabled kernels, the task must not be blocked on a lock as ++ * that could corrupt the PI state when blocking on a lock in the ++ * allocation path. ++ */ ++ if (debug_objects_is_pi_blocked_on()) ++ return false; ++ ++ /* ++ * On RT enabled kernels the pool refill should happen in preemptible ++ * context. ++ */ ++ if (preemptible()) ++ return true; ++ ++ /* ++ * Though during system boot before scheduling is set up, preemption is ++ * disabled and the pool can get exhausted. Before scheduling is active ++ * a task cannot be blocked on a sleeping lock, but it might hold a lock ++ * and if interrupted then hard interrupt context might run into a lock ++ * inversion. So exclude hard interrupt context from allocations before ++ * scheduling is active. 
++ */ ++ return system_state < SYSTEM_SCHEDULING && !in_hardirq(); ++} ++ + static void debug_objects_fill_pool(void) + { + if (!static_branch_likely(&obj_cache_enabled)) +@@ -734,18 +769,11 @@ static void debug_objects_fill_pool(void) + if (likely(!pool_should_refill(&pool_global))) + return; + +- /* +- * On RT enabled kernels the pool refill must happen in preemptible +- * context and not enqueued on an rt_mutex -- for !RT kernels we rely +- * on the fact that spinlock_t and raw_spinlock_t are basically the +- * same type and this lock-type inversion works just fine. +- */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || system_state < SYSTEM_SCHEDULING || +- (preemptible() && !debug_objects_is_pi_blocked_on())) { ++ if (can_fill_pool()) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching +- * the preemptible() condition above. ++ * the preemptible() condition in can_fill_pool(). + */ + static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG); + lock_map_acquire_try(&fill_pool_map); +-- +2.53.0 + diff --git a/queue-6.18/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch b/queue-6.18/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch new file mode 100644 index 0000000000..8c22dc22fd --- /dev/null +++ b/queue-6.18/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch @@ -0,0 +1,47 @@ +From 05a54bb56aeea1aab8b1d2c1e336d1df3253773b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:54:42 +0200 +Subject: debugobjects: Use LD_WAIT_CONFIG instead of LD_WAIT_SLEEP + +From: Sebastian Andrzej Siewior + +commit 37de2dbc318ee10577c1c2704de5a803e75e55a2 upstream. + +fill_pool_map is used to suppress nesting violations caused by acquiring +a spinlock_t (from within the memory allocator) while holding a +raw_spinlock_t. The used annotation is wrong. + +LD_WAIT_SLEEP is for always sleeping lock types such as mutex_t. 
+LD_WAIT_CONFIG is for lock type which are sleeping while spinning on +PREEMPT_RT such as spinlock_t. + +Use LD_WAIT_CONFIG as override. + +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251127153652.291697-3-bigeasy@linutronix.de +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index 9d59b797d1b507..4343dc5e5c99da 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -734,10 +734,10 @@ static void debug_objects_fill_pool(void) + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning +- * by temporarily raising the wait-type to WAIT_SLEEP, matching ++ * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching + * the preemptible() condition above. + */ +- static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP); ++ static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG); + lock_map_acquire_try(&fill_pool_map); + fill_pool(); + lock_map_release(&fill_pool_map); +-- +2.53.0 + diff --git a/queue-6.18/series b/queue-6.18/series index 33e0b0b97f..49aa8c68eb 100644 --- a/queue-6.18/series +++ b/queue-6.18/series @@ -2,3 +2,7 @@ io_uring-net-avoid-msghdr-on-op_connect-op_bind-asyn.patch net-stmmac-fix-stm32-and-potentially-others-resume-r.patch fuse-re-lock-request-before-replacing-page-cache-folio.patch revert-nfsd-defer-sub-object-cleanup-in-export-put-callbacks.patch +debugobjects-allow-to-refill-the-pool-before-system_.patch +debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch +debugobjects-do-not-fill_pool-if-pi_blocked_on.patch +debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch diff --git a/queue-6.6/arm-allow-__do_kernel_fault-to-report-execution-of-m.patch b/queue-6.6/arm-allow-__do_kernel_fault-to-report-execution-of-m.patch new file mode 
100644 index 0000000000..d528b3540b --- /dev/null +++ b/queue-6.6/arm-allow-__do_kernel_fault-to-report-execution-of-m.patch @@ -0,0 +1,39 @@ +From 5b506c182e8aba4ea5fcce8d9cb4cf5d410885f4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 12:26:32 +0200 +Subject: ARM: allow __do_kernel_fault() to report execution of memory faults + +From: Russell King (Oracle) + +commit 40b466db1dffb41f0529035c59c5739636d0e5b8 upstream + +Allow __do_kernel_fault() to detect the execution of memory, so we can +provide the same fault message as do_page_fault() would do. This is +required when we split the kernel address fault handling from the +main do_page_fault() code path. + +Reviewed-by: Xie Yuanbin +Tested-by: Xie Yuanbin +Signed-off-by: Russell King (Oracle) +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Sasha Levin +--- + arch/arm/mm/fault.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index 879730a47c4a20..4c0ee81befb1ed 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -176,6 +176,8 @@ __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, + */ + if (addr < PAGE_SIZE) { + msg = "NULL pointer dereference"; ++ } else if (is_permission_fault(fsr) && fsr & FSR_LNX_PF) { ++ msg = "execution of memory"; + } else { + if (is_translation_fault(fsr) && + kfence_handle_page_fault(addr, is_write_fault(fsr), regs)) +-- +2.53.0 + diff --git a/queue-6.6/arm-fix-branch-predictor-hardening.patch b/queue-6.6/arm-fix-branch-predictor-hardening.patch new file mode 100644 index 0000000000..5bdacb1d43 --- /dev/null +++ b/queue-6.6/arm-fix-branch-predictor-hardening.patch @@ -0,0 +1,154 @@ +From 7b1fd36e7b330a5a35d5877e2fc6f30f8f43adcf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 12:26:34 +0200 +Subject: ARM: fix branch predictor hardening + +From: Russell King (Oracle) + +commit fd2dee1c6e2256f726ba33fd3083a7be0efc80d3 upstream. 
+ +__do_user_fault() may be called with indeterminate interrupt enable +state, which means we may be preemptible at this point. This causes +problems when calling harden_branch_predictor(). For example, when +called from a data abort, do_alignment_fault()->do_bad_area(). + +Move harden_branch_predictor() out of __do_user_fault() and into the +calling contexts. + +Moving it into do_kernel_address_page_fault(), we can be sure that +interrupts will be disabled here. + +Converting do_translation_fault() to use do_kernel_address_page_fault() +rather than do_bad_area() means that we keep branch predictor handling +for translation faults. Interrupts will also be disabled at this call +site. + +do_sect_fault() needs special handling, so detect user mode accesses +to kernel-addresses, and add an explicit call to branch predictor +hardening. + +Finally, add branch predictor hardening to do_alignment() for the +faulting case (user mode accessing kernel addresses) before interrupts +are enabled. + +This should cover all cases where harden_branch_predictor() is called, +ensuring that it always has interrupts disabled, also ensuring that +it is called early in each call path.
+ +Reviewed-by: Xie Yuanbin +Tested-by: Xie Yuanbin +Signed-off-by: Russell King (Oracle) +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Sasha Levin +--- + arch/arm/mm/alignment.c | 4 ++++ + arch/arm/mm/fault.c | 39 ++++++++++++++++++++++++++------------- + 2 files changed, 30 insertions(+), 13 deletions(-) + +diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c +index f8dd0b3cc8e040..ee264737be6d26 100644 +--- a/arch/arm/mm/alignment.c ++++ b/arch/arm/mm/alignment.c +@@ -22,6 +22,7 @@ + + #include + #include ++#include + #include + #include + +@@ -809,6 +810,9 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + int thumb2_32b = 0; + int fault; + ++ if (addr >= TASK_SIZE && user_mode(regs)) ++ harden_branch_predictor(); ++ + if (interrupts_enabled(regs)) + local_irq_enable(); + +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index 47eecdf29a8312..87ed5da30e44f1 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -199,9 +199,6 @@ __do_user_fault(unsigned long addr, unsigned int fsr, unsigned int sig, + { + struct task_struct *tsk = current; + +- if (addr > TASK_SIZE) +- harden_branch_predictor(); +- + #ifdef CONFIG_DEBUG_USER + if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) || + ((user_debug & UDBG_BUS) && (sig == SIGBUS))) { +@@ -252,8 +249,10 @@ do_kernel_address_page_fault(struct mm_struct *mm, unsigned long addr, + /* + * Fault from user mode for a kernel space address. User mode + * should not be faulting in kernel space, which includes the +- * vector/khelper page. Send a SIGSEGV. ++ * vector/khelper page. Handle the branch predictor hardening ++ * while interrupts are still disabled, then send a SIGSEGV. 
+ */ ++ harden_branch_predictor(); + __do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs); + } else { + /* +@@ -423,16 +422,20 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + * We enter here because the first level page table doesn't contain + * a valid entry for the address. + * +- * If the address is in kernel space (>= TASK_SIZE), then we are +- * probably faulting in the vmalloc() area. ++ * If this is a user address (addr < TASK_SIZE), we handle this as a ++ * normal page fault. This leaves the remainder of the function to handle ++ * kernel address translation faults. + * +- * If the init_task's first level page tables contains the relevant +- * entry, we copy the it to this task. If not, we send the process +- * a signal, fixup the exception, or oops the kernel. ++ * Since user mode is not permitted to access kernel addresses, pass these ++ * directly to do_kernel_address_page_fault() to handle. + * +- * NOTE! We MUST NOT take any locks for this case. We may be in an +- * interrupt or a critical region, and should only copy the information +- * from the master page table, nothing more. ++ * Otherwise, we're probably faulting in the vmalloc() area, so try to fix ++ * that up. Note that we must not take any locks or enable interrupts in ++ * this case. ++ * ++ * If vmalloc() fixup fails, that means the non-leaf page tables did not ++ * contain an entry for this address, so handle this via ++ * do_kernel_address_page_fault(). 
+ */ + #ifdef CONFIG_MMU + static int __kprobes +@@ -498,7 +501,8 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + return 0; + + bad_area: +- do_bad_area(addr, fsr, regs); ++ do_kernel_address_page_fault(current->mm, addr, fsr, regs); ++ + return 0; + } + #else /* CONFIG_MMU */ +@@ -518,7 +522,16 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + static int + do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { ++ /* ++ * If this is a kernel address, but from user mode, then userspace ++ * is trying bad stuff. Invoke the branch predictor handling. ++ * Interrupts are disabled here. ++ */ ++ if (addr >= TASK_SIZE && user_mode(regs)) ++ harden_branch_predictor(); ++ + do_bad_area(addr, fsr, regs); ++ + return 0; + } + #endif /* CONFIG_ARM_LPAE */ +-- +2.53.0 + diff --git a/queue-6.6/arm-fix-hash_name-fault.patch b/queue-6.6/arm-fix-hash_name-fault.patch new file mode 100644 index 0000000000..8d613bb9e8 --- /dev/null +++ b/queue-6.6/arm-fix-hash_name-fault.patch @@ -0,0 +1,109 @@ +From 83b9f7020ba31768a5eed6c46aef3623442184e1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 12:26:33 +0200 +Subject: ARM: fix hash_name() fault + +From: Russell King (Oracle) + +commit 7733bc7d299d682f2723dc38fc7f370b9bf973e9 upstream. + +Zizhi Wo reports: + +"During the execution of hash_name()->load_unaligned_zeropad(), a + potential memory access beyond the PAGE boundary may occur. For + example, when the filename length is near the PAGE_SIZE boundary. + This triggers a page fault, which leads to a call to + do_page_fault()->mmap_read_trylock(). If we can't acquire the lock, + we have to fall back to the mmap_read_lock() path, which calls + might_sleep(). This breaks RCU semantics because path lookup occurs + under an RCU read-side critical section." + +This is seen with CONFIG_DEBUG_ATOMIC_SLEEP=y and CONFIG_KFENCE=y. 
+ +Kernel addresses (with the exception of the vectors/kuser helper +page) do not have VMAs associated with them. If the vectors/kuser +helper page faults, then there are two possibilities: + +1. if the fault happened while in kernel mode, then we're basically + dead, because the CPU won't be able to vector through this page + to handle the fault. +2. if the fault happened while in user mode, that means the page was + protected from user access, and we want to fault anyway. + +Thus, we can handle kernel addresses from any context entirely +separately without going anywhere near the mmap lock. This gives us +an entirely non-sleeping path for all kernel mode kernel address +faults. + +As we handle the kernel address faults before interrupts are enabled, +this change has the side effect of improving the branch predictor +hardening, but does not completely solve the issue. + +Reported-by: Zizhi Wo +Reported-by: Xie Yuanbin +Link: https://lore.kernel.org/r/20251126090505.3057219-1-wozizhi@huaweicloud.com +Reviewed-by: Xie Yuanbin +Tested-by: Xie Yuanbin +Signed-off-by: Russell King (Oracle) +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Sasha Levin +--- + arch/arm/mm/fault.c | 35 +++++++++++++++++++++++++++++++++++ + 1 file changed, 35 insertions(+) + +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index 4c0ee81befb1ed..47eecdf29a8312 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -244,6 +244,35 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) + #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) + ++static int __kprobes ++do_kernel_address_page_fault(struct mm_struct *mm, unsigned long addr, ++ unsigned int fsr, struct pt_regs *regs) ++{ ++ if (user_mode(regs)) { ++ /* ++ * Fault from user mode for a kernel space address. User mode ++ * should not be faulting in kernel space, which includes the ++ * vector/khelper page. Send a SIGSEGV. 
++ */ ++ __do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs); ++ } else { ++ /* ++ * Fault from kernel mode. Enable interrupts if they were ++ * enabled in the parent context. Section (upper page table) ++ * translation faults are handled via do_translation_fault(), ++ * so we will only get here for a non-present kernel space ++ * PTE or PTE permission fault. This may happen in exceptional ++ * circumstances and need the fixup tables to be walked. ++ */ ++ if (interrupts_enabled(regs)) ++ local_irq_enable(); ++ ++ __do_kernel_fault(mm, addr, fsr, regs); ++ } ++ ++ return 0; ++} ++ + static int __kprobes + do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { +@@ -257,6 +286,12 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + if (kprobe_page_fault(regs, fsr)) + return 0; + ++ /* ++ * Handle kernel addresses faults separately, which avoids touching ++ * the mmap lock from contexts that are not able to sleep. ++ */ ++ if (addr >= TASK_SIZE) ++ return do_kernel_address_page_fault(mm, addr, fsr, regs); + + /* Enable interrupts if they were enabled in the parent context. */ + if (interrupts_enabled(regs)) +-- +2.53.0 + diff --git a/queue-6.6/arm-group-is_permission_fault-with-is_translation_fa.patch b/queue-6.6/arm-group-is_permission_fault-with-is_translation_fa.patch new file mode 100644 index 0000000000..681d9a641c --- /dev/null +++ b/queue-6.6/arm-group-is_permission_fault-with-is_translation_fa.patch @@ -0,0 +1,68 @@ +From 5524d39d849a013ac180a852c8bd9eb6d32ee67b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 12:26:31 +0200 +Subject: ARM: group is_permission_fault() with is_translation_fault() + +From: Russell King (Oracle) + +commit dea20281ac88226615761c570c8ff7adc18e6ac2 upstream. + +Group is_permission_fault() with is_translation_fault(), which is +needed to use is_permission_fault() in __do_kernel_fault(). As +this is static inline, there is no need for this to be under +CONFIG_MMU. 
+ +Signed-off-by: Russell King (Oracle) +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Sasha Levin +--- + arch/arm/mm/fault.c | 26 +++++++++++++------------- + 1 file changed, 13 insertions(+), 13 deletions(-) + +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index ed1a25f457e48e..879730a47c4a20 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -128,6 +128,19 @@ static inline bool is_translation_fault(unsigned int fsr) + return false; + } + ++static inline bool is_permission_fault(unsigned int fsr) ++{ ++ int fs = fsr_fs(fsr); ++#ifdef CONFIG_ARM_LPAE ++ if ((fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL) ++ return true; ++#else ++ if (fs == FS_L1_PERM || fs == FS_L2_PERM) ++ return true; ++#endif ++ return false; ++} ++ + static void die_kernel_fault(const char *msg, struct mm_struct *mm, + unsigned long addr, unsigned int fsr, + struct pt_regs *regs) +@@ -229,19 +242,6 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) + #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) + +-static inline bool is_permission_fault(unsigned int fsr) +-{ +- int fs = fsr_fs(fsr); +-#ifdef CONFIG_ARM_LPAE +- if ((fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL) +- return true; +-#else +- if (fs == FS_L1_PERM || fs == FS_L2_PERM) +- return true; +-#endif +- return false; +-} +- + static int __kprobes + do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { +-- +2.53.0 + diff --git a/queue-6.6/debugobjects-allow-to-refill-the-pool-before-system_.patch b/queue-6.6/debugobjects-allow-to-refill-the-pool-before-system_.patch new file mode 100644 index 0000000000..990b17de7a --- /dev/null +++ b/queue-6.6/debugobjects-allow-to-refill-the-pool-before-system_.patch @@ -0,0 +1,48 @@ +From 5fcaa55af1d122a45987dd987a0aae167010cedd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:12 +0200 +Subject: debugobjects: Allow to refill the pool 
before SYSTEM_SCHEDULING + +From: Sebastian Andrzej Siewior + +commit 06e0ae988f6e3499785c407429953ade19c1096b upstream. + +The pool of free objects is refilled on several occasions such as object +initialisation. On PREEMPT_RT refilling is limited to preemptible +sections due to sleeping locks used by the memory allocator. The system +boots with disabled interrupts so the pool can not be refilled. + +If too many objects are initialized and the pool gets empty then +debugobjects disables itself. + +Refilling can also happen early in the boot with disabled interrupts as +long as the scheduler is not operational. If the scheduler can not +preempt a task then a sleeping lock can not be contended. + +Allow to additionally refill the pool if the scheduler is not +operational. + +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251127153652.291697-2-bigeasy@linutronix.de +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index 35cd384f7e8a2f..5b462a45a9c17b 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -605,7 +605,7 @@ static void debug_objects_fill_pool(void) + * raw_spinlock_t are basically the same type and this lock-type + * inversion works just fine.
+ */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to WAIT_SLEEP, matching +-- +2.53.0 + diff --git a/queue-6.6/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch b/queue-6.6/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch new file mode 100644 index 0000000000..883b00c6de --- /dev/null +++ b/queue-6.6/debugobjects-do-not-fill_pool-if-pi_blocked_on.patch @@ -0,0 +1,65 @@ +From 126dc6c2c747f2864e665ce22c7734af591db48c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:21 +0200 +Subject: debugobjects: Do not fill_pool() if pi_blocked_on + +From: Helen Koike + +commit 5f41161059fd0f1bbf18c90f3180e38cc45a14eb upstream. + +On RT enabled kernels, fill_pool() ends up calling rtlock_lock(), which +asserts if current::pi_blocked_on is set, because a task can obviously only +block on one lock as otherwise the priority inheritance chain gets +corrupted. + +Prevent this by expanding the conditional to take current::pi_blocked_on +into account.
+ +Fixes: 4bedcc28469a ("debugobjects: Make them PREEMPT_RT aware") +Reported-by: syzbot+b8ca586b9fc235f0c0df@syzkaller.appspotmail.com +Signed-off-by: Helen Koike +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20260511215359.3351259-1-koike@igalia.com +Closes: https://syzkaller.appspot.com/bug?extid=b8ca586b9fc235f0c0df +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 18 ++++++++++++++---- + 1 file changed, 14 insertions(+), 4 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index bebc00aacafedd..b1c3e873a71d40 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -597,15 +597,25 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket + return NULL; + } + ++static inline bool debug_objects_is_pi_blocked_on(void) ++{ ++#ifdef CONFIG_RT_MUTEXES ++ return current->pi_blocked_on != NULL; ++#else ++ return false; ++#endif ++} ++ + static void debug_objects_fill_pool(void) + { + /* + * On RT enabled kernels the pool refill must happen in preemptible +- * context -- for !RT kernels we rely on the fact that spinlock_t and +- * raw_spinlock_t are basically the same type and this lock-type +- * inversion works just fine. ++ * context and not enqueued on an rt_mutex -- for !RT kernels we rely ++ * on the fact that spinlock_t and raw_spinlock_t are basically the ++ * same type and this lock-type inversion works just fine. 
+ */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || system_state < SYSTEM_SCHEDULING || ++ (preemptible() && !debug_objects_is_pi_blocked_on())) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching +-- +2.53.0 + diff --git a/queue-6.6/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch b/queue-6.6/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch new file mode 100644 index 0000000000..da771903cf --- /dev/null +++ b/queue-6.6/debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch @@ -0,0 +1,101 @@ +From 1f0ca563af654ec409735e370fc93240bc83f7d7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:26 +0200 +Subject: debugobjects: Dont call fill_pool() in early boot hardirq context + +From: Waiman Long + +commit 0d046ae106255cba5eb83b23f78ee93f3620247d upstream. + +When booting a debug PREEMPT_RT kernel on an ARM64 system, an "inconsistent +{HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage" lockdep warning message was +reported to the console. + +During early boot, interrupts are enabled before the scheduler is +enabled. In this window (before SYSTEM_SCHEDULING is set) interrupts can +fire and in the hard interrupt context handler attempt to fill the pool. + +This can lead to a deadlock when the interrupt +hits a region which holds a lock that is required to be taken in the +allocation path. + +Add a new can_fill_pool() helper and reorder the exception rule and forbid +this scenario by excluding allocations from hard interrupt context.
+ +Fixes: 06e0ae988f6e ("debugobjects: Allow to refill the pool before SYSTEM_SCHEDULING") +Suggested-by: Sebastian Andrzej Siewior +Suggested-by: Thomas Gleixner +Signed-off-by: Waiman Long +Signed-off-by: Thomas Gleixner +Reviewed-by: Sebastian Andrzej Siewior +Cc: stable@vger.kernel.org +Link: https://patch.msgid.link/20260605173038.495075-1-longman@redhat.com +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 44 ++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 36 insertions(+), 8 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index b1c3e873a71d40..e4ef9d032d1749 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -606,20 +606,48 @@ static inline bool debug_objects_is_pi_blocked_on(void) + #endif + } + +-static void debug_objects_fill_pool(void) ++static inline bool can_fill_pool(void) + { + /* +- * On RT enabled kernels the pool refill must happen in preemptible +- * context and not enqueued on an rt_mutex -- for !RT kernels we rely +- * on the fact that spinlock_t and raw_spinlock_t are basically the +- * same type and this lock-type inversion works just fine. ++ * On !RT enabled kernels there are no restrictions and spinlock_t and ++ * raw_spinlock_t are the same types. ++ */ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ return true; ++ ++ /* ++ * On RT enabled kernels, the task must not be blocked on a lock as ++ * that could corrupt the PI state when blocking on a lock in the ++ * allocation path. ++ */ ++ if (debug_objects_is_pi_blocked_on()) ++ return false; ++ ++ /* ++ * On RT enabled kernels the pool refill should happen in preemptible ++ * context. + */ +- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || system_state < SYSTEM_SCHEDULING || +- (preemptible() && !debug_objects_is_pi_blocked_on())) { ++ if (preemptible()) ++ return true; ++ ++ /* ++ * Though during system boot before scheduling is set up, preemption is ++ * disabled and the pool can get exhausted. 
Before scheduling is active ++ * a task cannot be blocked on a sleeping lock, but it might hold a lock ++ * and if interrupted then hard interrupt context might run into a lock ++ * inversion. So exclude hard interrupt context from allocations before ++ * scheduling is active. ++ */ ++ return system_state < SYSTEM_SCHEDULING && !in_hardirq(); ++} ++ ++static void debug_objects_fill_pool(void) ++{ ++ if (can_fill_pool()) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching +- * the preemptible() condition above. ++ * the preemptible() condition in can_fill_pool(). + */ + static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG); + lock_map_acquire_try(&fill_pool_map); +-- +2.53.0 + diff --git a/queue-6.6/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch b/queue-6.6/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch new file mode 100644 index 0000000000..1dbc323587 --- /dev/null +++ b/queue-6.6/debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch @@ -0,0 +1,47 @@ +From ff13abd57c38583c7704613f7f3090a2c74716ad Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 11:55:17 +0200 +Subject: debugobjects: Use LD_WAIT_CONFIG instead of LD_WAIT_SLEEP + +From: Sebastian Andrzej Siewior + +commit 37de2dbc318ee10577c1c2704de5a803e75e55a2 upstream. + +fill_pool_map is used to suppress nesting violations caused by acquiring +a spinlock_t (from within the memory allocator) while holding a +raw_spinlock_t. The used annotation is wrong. + +LD_WAIT_SLEEP is for always sleeping lock types such as mutex_t. +LD_WAIT_CONFIG is for lock type which are sleeping while spinning on +PREEMPT_RT such as spinlock_t. + +Use LD_WAIT_CONFIG as override. 
+ +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Thomas Gleixner +Link: https://patch.msgid.link/20251127153652.291697-3-bigeasy@linutronix.de +Signed-off-by: Sasha Levin +--- + lib/debugobjects.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index 5b462a45a9c17b..bebc00aacafedd 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -608,10 +608,10 @@ static void debug_objects_fill_pool(void) + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning +- * by temporarily raising the wait-type to WAIT_SLEEP, matching ++ * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching + * the preemptible() condition above. + */ +- static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP); ++ static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG); + lock_map_acquire_try(&fill_pool_map); + fill_pool(); + lock_map_release(&fill_pool_map); +-- +2.53.0 + diff --git a/queue-6.6/kvm-vmx-update-svi-during-runtime-apicv-activation.patch b/queue-6.6/kvm-vmx-update-svi-during-runtime-apicv-activation.patch new file mode 100644 index 0000000000..0c2ffed895 --- /dev/null +++ b/queue-6.6/kvm-vmx-update-svi-during-runtime-apicv-activation.patch @@ -0,0 +1,149 @@ +From 9566e0e5e46151432d51d44b6699747369eb9faf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jun 2026 10:03:43 +0000 +Subject: KVM: VMX: Update SVI during runtime APICv activation + +From: Dongli Zhang + +[ Upstream commit b2849bec936be642b5420801f902337f2507648e ] + +The APICv (apic->apicv_active) can be activated or deactivated at runtime, +for instance, because of APICv inhibit reasons. Intel VMX employs different +mechanisms to virtualize LAPIC based on whether APICv is active. + +When APICv is activated at runtime, GUEST_INTR_STATUS is used to configure +and report the current pending IRR and ISR states. 
Unless a specific vector +is explicitly included in EOI_EXIT_BITMAP, its EOI will not be trapped to +KVM. Intel VMX automatically clears the corresponding ISR bit based on the +GUEST_INTR_STATUS.SVI field. + +When APICv is deactivated at runtime, the VM_ENTRY_INTR_INFO_FIELD is used +to specify the next interrupt vector to invoke upon VM-entry. The +VMX IDT_VECTORING_INFO_FIELD is used to report un-invoked vectors on +VM-exit. EOIs are always trapped to KVM, so the software can manually clear +pending ISR bits. + +There are scenarios where, with APICv activated at runtime, a guest-issued +EOI may not be able to clear the pending ISR bit. + +Taking vector 236 as an example, here is one scenario. + +1. Suppose APICv is inactive. Vector 236 is pending in the IRR. +2. To handle KVM_REQ_EVENT, KVM moves vector 236 from the IRR to the ISR, +and configures the VM_ENTRY_INTR_INFO_FIELD via vmx_inject_irq(). +3. After VM-entry, vector 236 is invoked through the guest IDT. At this +point, the data in VM_ENTRY_INTR_INFO_FIELD is no longer valid. The guest +interrupt handler for vector 236 is invoked. +4. Suppose a VM exit occurs very early in the guest interrupt handler, +before the EOI is issued. +5. Nothing is reported through the IDT_VECTORING_INFO_FIELD because +vector 236 has already been invoked in the guest. +6. Now, suppose APICv is activated. Before the next VM-entry, KVM calls +kvm_vcpu_update_apicv() to activate APICv. +7. Unfortunately, GUEST_INTR_STATUS.SVI is not configured, although +vector 236 is still pending in the ISR. +8. After VM-entry, the guest finally issues the EOI for vector 236. +However, because SVI is not configured, vector 236 is not cleared. +9. ISR is stalled forever on vector 236. + +Here is another scenario. + +1. Suppose APICv is inactive. Vector 236 is pending in the IRR. +2. To handle KVM_REQ_EVENT, KVM moves vector 236 from the IRR to the ISR, +and configures the VM_ENTRY_INTR_INFO_FIELD via vmx_inject_irq(). +3. 
VM-exit occurs immediately after the next VM-entry. The vector 236 is +not invoked through the guest IDT. Instead, it is saved to the +IDT_VECTORING_INFO_FIELD during the VM-exit. +4. KVM calls kvm_queue_interrupt() to re-queue the un-invoked vector 236 +into vcpu->arch.interrupt. A KVM_REQ_EVENT is requested. +5. Now, suppose APICv is activated. Before the next VM-entry, KVM calls +kvm_vcpu_update_apicv() to activate APICv. +6. Although APICv is now active, KVM still uses the legacy +VM_ENTRY_INTR_INFO_FIELD to re-inject vector 236. GUEST_INTR_STATUS.SVI is +not configured. +7. After the next VM-entry, vector 236 is invoked through the guest IDT. +Finally, an EOI occurs. However, due to the lack of GUEST_INTR_STATUS.SVI +configuration, vector 236 is not cleared from the ISR. +8. ISR is stalled forever on vector 236. + +Using QEMU as an example, vector 236 is stuck in ISR forever. + +(qemu) info lapic 1 +dumping local APIC state for CPU 1 + +LVT0 0x00010700 active-hi edge masked ExtINT (vec 0) +LVT1 0x00010400 active-hi edge masked NMI +LVTPC 0x00000400 active-hi edge NMI +LVTERR 0x000000fe active-hi edge Fixed (vec 254) +LVTTHMR 0x00010000 active-hi edge masked Fixed (vec 0) +LVTT 0x000400ec active-hi edge tsc-deadline Fixed (vec 236) +Timer DCR=0x0 (divide by 2) initial_count = 0 current_count = 0 +SPIV 0x000001ff APIC enabled, focus=off, spurious vec 255 +ICR 0x000000fd physical edge de-assert no-shorthand +ICR2 0x00000000 cpu 0 (X2APIC ID) +ESR 0x00000000 +ISR 236 +IRR 37(level) 236 + +The issue isn't applicable to AMD SVM as KVM simply writes vmcb01 directly +irrespective of whether L1 (vmcs01) or L2 (vmcb02) is active (unlike VMX, +there is no need/cost to switch between VMCBs). In addition, +APICV_INHIBIT_REASON_IRQWIN ensures AMD SVM AVIC is not activated until +the last interrupt is EOI'd. + +Fix the bug by configuring Intel VMX GUEST_INTR_STATUS.SVI if APICv is +activated at runtime. 
+ +Signed-off-by: Dongli Zhang +Reviewed-by: Chao Gao +Link: https://patch.msgid.link/20251110063212.34902-1-dongli.zhang@oracle.com +[sean: call out that SVM writes vmcb01 directly, tweak comment] +Link: https://patch.msgid.link/20251205231913.441872-2-seanjc@google.com +Signed-off-by: Sean Christopherson +[gulshan: resolved a minor conflict in vmx.c arising from a comment] +Signed-off-by: Gulshan Gabel +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/vmx.c | 4 ---- + arch/x86/kvm/x86.c | 7 +++++++ + 2 files changed, 7 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 4a45e86c5e2fcc..85d301a03b2004 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -6851,10 +6851,6 @@ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) + * VM-Exit, otherwise L1 with run with a stale SVI. + */ + if (is_guest_mode(vcpu)) { +- /* +- * KVM is supposed to forward intercepted L2 EOIs to L1 if VID +- * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. +- */ + to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; + return; + } +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 3838b7336590dd..c04277b35e2edf 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10426,9 +10426,16 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) + * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was + * still active when the interrupt got accepted. Make sure + * kvm_check_and_inject_events() is called to check for that. ++ * ++ * Update SVI when APICv gets enabled, otherwise SVI won't reflect the ++ * highest bit in vISR and the next accelerated EOI in the guest won't ++ * be virtualized correctly (the CPU uses SVI to determine which vISR ++ * vector to clear). 
+ */ + if (!apic->apicv_active) + kvm_make_request(KVM_REQ_EVENT, vcpu); ++ else ++ kvm_apic_update_hwapic_isr(vcpu); + + out: + preempt_enable(); +-- +2.53.0 + diff --git a/queue-6.6/series b/queue-6.6/series index 3b9cc6db80..3ec1959530 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -5,3 +5,12 @@ ip6_vti-set-netns_immutable-on-the-fallback-device.patch drm-v3d-store-the-active-job-inside-the-queue-s-stat.patch drm-v3d-skip-csd-when-it-has-zeroed-workgroups.patch batman-adv-tt-prevent-tvlv-entry-number-overflow.patch +debugobjects-allow-to-refill-the-pool-before-system_.patch +debugobjects-use-ld_wait_config-instead-of-ld_wait_s.patch +debugobjects-do-not-fill_pool-if-pi_blocked_on.patch +debugobjects-dont-call-fill_pool-in-early-boot-hardi.patch +arm-group-is_permission_fault-with-is_translation_fa.patch +arm-allow-__do_kernel_fault-to-report-execution-of-m.patch +arm-fix-hash_name-fault.patch +arm-fix-branch-predictor-hardening.patch +kvm-vmx-update-svi-during-runtime-apicv-activation.patch