From: Greg Kroah-Hartman Date: Thu, 19 Mar 2026 11:12:50 +0000 (+0100) Subject: 6.6-stable patches X-Git-Tag: v6.18.19~12 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=230ebf8017967f83b371ff6b6f2ad9d17226dc37;p=thirdparty%2Fkernel%2Fstable-queue.git 6.6-stable patches added patches: arm64-mm-batch-dsb-and-isb-when-populating-pgtables.patch arm64-mm-don-t-remap-pgtables-for-allocate-vs-populate.patch arm64-mm-don-t-remap-pgtables-per-cont-pte-pmd-block.patch btrfs-always-fallback-to-buffered-write-if-the-inode-requires-checksum.patch btrfs-fix-null-dereference-on-root-when-tracing-inode-eviction.patch dm-verity-disable-recursive-forward-error-correction.patch dst-fix-races-in-rt6_uncached_list_del-and-rt_del_uncached_list.patch eth-bnxt-always-recalculate-features-after-xdp-clearing-fix-null-deref.patch ext4-always-allocate-blocks-only-from-groups-inode-can-use.patch ext4-fix-dirtyclusters-double-decrement-on-fs-shutdown.patch net-stmmac-dwmac-loongson-set-clk_csr_i-to-100-150mhz.patch nfs-fix-a-deadlock-involving-nfs_release_folio.patch nfs-pass-explicit-offset-count-to-trace-events.patch pnfs-fix-a-deadlock-when-returning-a-delegation-during-open.patch rxrpc-fix-recvmsg-unconditional-requeue.patch usb-typec-ucsi-move-unregister-out-of-atomic-section.patch --- diff --git a/queue-6.6/arm64-mm-batch-dsb-and-isb-when-populating-pgtables.patch b/queue-6.6/arm64-mm-batch-dsb-and-isb-when-populating-pgtables.patch new file mode 100644 index 0000000000..f52de71ca0 --- /dev/null +++ b/queue-6.6/arm64-mm-batch-dsb-and-isb-when-populating-pgtables.patch @@ -0,0 +1,89 @@ +From stable+bounces-216826-greg=kroah.com@vger.kernel.org Tue Feb 17 14:35:05 2026 +From: Ryan Roberts +Date: Tue, 17 Feb 2026 13:34:07 +0000 +Subject: arm64: mm: Batch dsb and isb when populating pgtables +To: stable@vger.kernel.org +Cc: Ryan Roberts , catalin.marinas@arm.com, will@kernel.org, linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org, Jack Aboutboul , Sharath 
George John , Noah Meyerhans , Jim Perrin , Itaru Kitayama , Eric Chanudet , Mark Rutland , Ard Biesheuvel +Message-ID: <20260217133411.2881311-3-ryan.roberts@arm.com> + +From: Ryan Roberts + +[ Upstream commit 1fcb7cea8a5f7747e02230f816c2c80b060d9517 ] + +After removing unnecessary TLBIs, the next bottleneck when creating the +page tables for the linear map is DSB and ISB, which were previously +issued per-pte in __set_pte(). Since we are writing multiple ptes in a +given pte table, we can elide these barriers and insert them once we +have finished writing to the table. + +Execution time of map_mem(), which creates the kernel linear map page +tables, was measured on different machines with different RAM configs: + + | Apple M2 VM | Ampere Altra| Ampere Altra| Ampere Altra + | VM, 16G | VM, 64G | VM, 256G | Metal, 512G +---------------|-------------|-------------|-------------|------------- + | ms (%) | ms (%) | ms (%) | ms (%) +---------------|-------------|-------------|-------------|------------- +before | 78 (0%) | 435 (0%) | 1723 (0%) | 3779 (0%) +after | 11 (-86%) | 161 (-63%) | 656 (-62%) | 1654 (-56%) + +Signed-off-by: Ryan Roberts +Tested-by: Itaru Kitayama +Tested-by: Eric Chanudet +Reviewed-by: Mark Rutland +Reviewed-by: Ard Biesheuvel +Link: https://lore.kernel.org/r/20240412131908.433043-3-ryan.roberts@arm.com +Signed-off-by: Will Deacon +[ Ryan: Trivial backport ] +Signed-off-by: Ryan Roberts +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/include/asm/pgtable.h | 7 ++++++- + arch/arm64/mm/mmu.c | 11 ++++++++++- + 2 files changed, 16 insertions(+), 2 deletions(-) + +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -262,9 +262,14 @@ static inline pte_t pte_mkdevmap(pte_t p + return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); + } + +-static inline void set_pte(pte_t *ptep, pte_t pte) ++static inline void set_pte_nosync(pte_t *ptep, pte_t pte) + { + WRITE_ONCE(*ptep, pte); ++} ++ ++static inline void 
set_pte(pte_t *ptep, pte_t pte) ++{ ++ set_pte_nosync(ptep, pte); + + /* + * Only if the new pte is valid and kernel, otherwise TLB maintenance +--- a/arch/arm64/mm/mmu.c ++++ b/arch/arm64/mm/mmu.c +@@ -175,7 +175,11 @@ static void init_pte(pte_t *ptep, unsign + do { + pte_t old_pte = READ_ONCE(*ptep); + +- set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); ++ /* ++ * Required barriers to make this visible to the table walker ++ * are deferred to the end of alloc_init_cont_pte(). ++ */ ++ set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + + /* + * After the PTE entry has been populated once, we +@@ -229,6 +233,11 @@ static void alloc_init_cont_pte(pmd_t *p + phys += next - addr; + } while (addr = next, addr != end); + ++ /* ++ * Note: barriers and maintenance necessary to clear the fixmap slot ++ * ensure that all previous pgtable writes are visible to the table ++ * walker. ++ */ + pte_clear_fixmap(); + } + diff --git a/queue-6.6/arm64-mm-don-t-remap-pgtables-for-allocate-vs-populate.patch b/queue-6.6/arm64-mm-don-t-remap-pgtables-for-allocate-vs-populate.patch new file mode 100644 index 0000000000..9638591e7a --- /dev/null +++ b/queue-6.6/arm64-mm-don-t-remap-pgtables-for-allocate-vs-populate.patch @@ -0,0 +1,177 @@ +From stable+bounces-216827-greg=kroah.com@vger.kernel.org Tue Feb 17 14:34:55 2026 +From: Ryan Roberts +Date: Tue, 17 Feb 2026 13:34:08 +0000 +Subject: arm64: mm: Don't remap pgtables for allocate vs populate +To: stable@vger.kernel.org +Cc: Ryan Roberts , catalin.marinas@arm.com, will@kernel.org, linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org, Jack Aboutboul , Sharath George John , Noah Meyerhans , Jim Perrin , Mark Rutland , Itaru Kitayama , Eric Chanudet , Ard Biesheuvel +Message-ID: <20260217133411.2881311-4-ryan.roberts@arm.com> + +From: Ryan Roberts + +[ Upstream commit 0e9df1c905d8293d333ace86c13d147382f5caf9 ] + +During linear map pgtable creation, each pgtable is fixmapped / +fixunmapped twice; once during 
allocation to zero the memory, and +again during population to write the entries. This means each table has +2 TLB invalidations issued against it. Let's fix this so that each table +is only fixmapped/fixunmapped once, halving the number of TLBIs, and +improving performance. + +Achieve this by separating allocation and initialization (zeroing) of +the page. The allocated page is now fixmapped directly by the walker and +initialized, before being populated and finally fixunmapped. + +This approach keeps the change small, but has the side effect that late +allocations (using __get_free_page()) must also go through the generic +memory clearing routine. So let's tell __get_free_page() not to zero the +memory to avoid duplication. + +Additionally this approach means that fixmap/fixunmap is still used for +late pgtable modifications. That's not technically needed since the +memory is all mapped in the linear map by that point. That's left as a +possible future optimization if found to be needed. 
+ +Execution time of map_mem(), which creates the kernel linear map page +tables, was measured on different machines with different RAM configs: + + | Apple M2 VM | Ampere Altra| Ampere Altra| Ampere Altra + | VM, 16G | VM, 64G | VM, 256G | Metal, 512G +---------------|-------------|-------------|-------------|------------- + | ms (%) | ms (%) | ms (%) | ms (%) +---------------|-------------|-------------|-------------|------------- +before | 11 (0%) | 161 (0%) | 656 (0%) | 1654 (0%) +after | 10 (-11%) | 104 (-35%) | 438 (-33%) | 1223 (-26%) + +Signed-off-by: Ryan Roberts +Suggested-by: Mark Rutland +Tested-by: Itaru Kitayama +Tested-by: Eric Chanudet +Reviewed-by: Mark Rutland +Reviewed-by: Ard Biesheuvel +Link: https://lore.kernel.org/r/20240412131908.433043-4-ryan.roberts@arm.com +Signed-off-by: Will Deacon +[ Ryan: Trivial backport ] +Signed-off-by: Ryan Roberts +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/mm/mmu.c | 58 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 29 insertions(+), 29 deletions(-) + +--- a/arch/arm64/mm/mmu.c ++++ b/arch/arm64/mm/mmu.c +@@ -106,28 +106,12 @@ EXPORT_SYMBOL(phys_mem_access_prot); + static phys_addr_t __init early_pgtable_alloc(int shift) + { + phys_addr_t phys; +- void *ptr; + + phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, + MEMBLOCK_ALLOC_NOLEAKTRACE); + if (!phys) + panic("Failed to allocate page table page\n"); + +- /* +- * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE +- * slot will be free, so we can (ab)use the FIX_PTE slot to initialise +- * any level of table. 
+- */ +- ptr = pte_set_fixmap(phys); +- +- memset(ptr, 0, PAGE_SIZE); +- +- /* +- * Implicit barriers also ensure the zeroed page is visible to the page +- * table walker +- */ +- pte_clear_fixmap(); +- + return phys; + } + +@@ -169,6 +153,14 @@ bool pgattr_change_is_safe(u64 old, u64 + return ((old ^ new) & ~mask) == 0; + } + ++static void init_clear_pgtable(void *table) ++{ ++ clear_page(table); ++ ++ /* Ensure the zeroing is observed by page table walks. */ ++ dsb(ishst); ++} ++ + static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot) + { +@@ -211,12 +203,15 @@ static void alloc_init_cont_pte(pmd_t *p + pmdval |= PMD_TABLE_PXN; + BUG_ON(!pgtable_alloc); + pte_phys = pgtable_alloc(PAGE_SHIFT); ++ ptep = pte_set_fixmap(pte_phys); ++ init_clear_pgtable(ptep); ++ ptep += pte_index(addr); + __pmd_populate(pmdp, pte_phys, pmdval); +- pmd = READ_ONCE(*pmdp); ++ } else { ++ BUG_ON(pmd_bad(pmd)); ++ ptep = pte_set_fixmap_offset(pmdp, addr); + } +- BUG_ON(pmd_bad(pmd)); + +- ptep = pte_set_fixmap_offset(pmdp, addr); + do { + pgprot_t __prot = prot; + +@@ -295,12 +290,15 @@ static void alloc_init_cont_pmd(pud_t *p + pudval |= PUD_TABLE_PXN; + BUG_ON(!pgtable_alloc); + pmd_phys = pgtable_alloc(PMD_SHIFT); ++ pmdp = pmd_set_fixmap(pmd_phys); ++ init_clear_pgtable(pmdp); ++ pmdp += pmd_index(addr); + __pud_populate(pudp, pmd_phys, pudval); +- pud = READ_ONCE(*pudp); ++ } else { ++ BUG_ON(pud_bad(pud)); ++ pmdp = pmd_set_fixmap_offset(pudp, addr); + } +- BUG_ON(pud_bad(pud)); + +- pmdp = pmd_set_fixmap_offset(pudp, addr); + do { + pgprot_t __prot = prot; + +@@ -338,12 +336,15 @@ static void alloc_init_pud(pgd_t *pgdp, + p4dval |= P4D_TABLE_PXN; + BUG_ON(!pgtable_alloc); + pud_phys = pgtable_alloc(PUD_SHIFT); ++ pudp = pud_set_fixmap(pud_phys); ++ init_clear_pgtable(pudp); ++ pudp += pud_index(addr); + __p4d_populate(p4dp, pud_phys, p4dval); +- p4d = READ_ONCE(*p4dp); ++ } else { ++ BUG_ON(p4d_bad(p4d)); ++ pudp = 
pud_set_fixmap_offset(p4dp, addr); + } +- BUG_ON(p4d_bad(p4d)); + +- pudp = pud_set_fixmap_offset(p4dp, addr); + do { + pud_t old_pud = READ_ONCE(*pudp); + +@@ -425,11 +426,10 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdi + + static phys_addr_t __pgd_pgtable_alloc(int shift) + { +- void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL); +- BUG_ON(!ptr); ++ /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */ ++ void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO); + +- /* Ensure the zeroed page is visible to the page table walker */ +- dsb(ishst); ++ BUG_ON(!ptr); + return __pa(ptr); + } + diff --git a/queue-6.6/arm64-mm-don-t-remap-pgtables-per-cont-pte-pmd-block.patch b/queue-6.6/arm64-mm-don-t-remap-pgtables-per-cont-pte-pmd-block.patch new file mode 100644 index 0000000000..14483a1934 --- /dev/null +++ b/queue-6.6/arm64-mm-don-t-remap-pgtables-per-cont-pte-pmd-block.patch @@ -0,0 +1,160 @@ +From stable+bounces-216825-greg=kroah.com@vger.kernel.org Tue Feb 17 14:34:47 2026 +From: Ryan Roberts +Date: Tue, 17 Feb 2026 13:34:06 +0000 +Subject: arm64: mm: Don't remap pgtables per-cont(pte|pmd) block +To: stable@vger.kernel.org +Cc: Ryan Roberts , catalin.marinas@arm.com, will@kernel.org, linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org, Jack Aboutboul , Sharath George John , Noah Meyerhans , Jim Perrin , Itaru Kitayama , Eric Chanudet , Mark Rutland , Ard Biesheuvel +Message-ID: <20260217133411.2881311-2-ryan.roberts@arm.com> + +From: Ryan Roberts + +[ Upstream commit 5c63db59c5f89925add57642be4f789d0d671ccd ] + +A large part of the kernel boot time is creating the kernel linear map +page tables. When rodata=full, all memory is mapped by pte. And when +there is lots of physical ram, there are lots of pte tables to populate. +The primary cost associated with this is mapping and unmapping the pte +table memory in the fixmap; at unmap time, the TLB entry must be +invalidated and this is expensive. 
+ +Previously, each pmd and pte table was fixmapped/fixunmapped for each +cont(pte|pmd) block of mappings (16 entries with 4K granule). This means +we ended up issuing 32 TLBIs per (pmd|pte) table during the population +phase. + +Let's fix that, and fixmap/fixunmap each page once per population, for a +saving of 31 TLBIs per (pmd|pte) table. This gives a significant boot +speedup. + +Execution time of map_mem(), which creates the kernel linear map page +tables, was measured on different machines with different RAM configs: + + | Apple M2 VM | Ampere Altra| Ampere Altra| Ampere Altra + | VM, 16G | VM, 64G | VM, 256G | Metal, 512G +---------------|-------------|-------------|-------------|------------- + | ms (%) | ms (%) | ms (%) | ms (%) +---------------|-------------|-------------|-------------|------------- +before | 168 (0%) | 2198 (0%) | 8644 (0%) | 17447 (0%) +after | 78 (-53%) | 435 (-80%) | 1723 (-80%) | 3779 (-78%) + +Signed-off-by: Ryan Roberts +Tested-by: Itaru Kitayama +Tested-by: Eric Chanudet +Reviewed-by: Mark Rutland +Reviewed-by: Ard Biesheuvel +Link: https://lore.kernel.org/r/20240412131908.433043-2-ryan.roberts@arm.com +Signed-off-by: Will Deacon +[ Ryan: Trivial backport ] +Signed-off-by: Ryan Roberts +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/mm/mmu.c | 27 ++++++++++++++------------- + 1 file changed, 14 insertions(+), 13 deletions(-) + +--- a/arch/arm64/mm/mmu.c ++++ b/arch/arm64/mm/mmu.c +@@ -169,12 +169,9 @@ bool pgattr_change_is_safe(u64 old, u64 + return ((old ^ new) & ~mask) == 0; + } + +-static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ++static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot) + { +- pte_t *ptep; +- +- ptep = pte_set_fixmap_offset(pmdp, addr); + do { + pte_t old_pte = READ_ONCE(*ptep); + +@@ -189,8 +186,6 @@ static void init_pte(pmd_t *pmdp, unsign + + phys += PAGE_SIZE; + } while (ptep++, addr += PAGE_SIZE, addr != end); +- +- 
pte_clear_fixmap(); + } + + static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, +@@ -201,6 +196,7 @@ static void alloc_init_cont_pte(pmd_t *p + { + unsigned long next; + pmd_t pmd = READ_ONCE(*pmdp); ++ pte_t *ptep; + + BUG_ON(pmd_sect(pmd)); + if (pmd_none(pmd)) { +@@ -216,6 +212,7 @@ static void alloc_init_cont_pte(pmd_t *p + } + BUG_ON(pmd_bad(pmd)); + ++ ptep = pte_set_fixmap_offset(pmdp, addr); + do { + pgprot_t __prot = prot; + +@@ -226,20 +223,21 @@ static void alloc_init_cont_pte(pmd_t *p + (flags & NO_CONT_MAPPINGS) == 0) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + +- init_pte(pmdp, addr, next, phys, __prot); ++ init_pte(ptep, addr, next, phys, __prot); + ++ ptep += pte_index(next) - pte_index(addr); + phys += next - addr; + } while (addr = next, addr != end); ++ ++ pte_clear_fixmap(); + } + +-static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, ++static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), int flags) + { + unsigned long next; +- pmd_t *pmdp; + +- pmdp = pmd_set_fixmap_offset(pudp, addr); + do { + pmd_t old_pmd = READ_ONCE(*pmdp); + +@@ -265,8 +263,6 @@ static void init_pmd(pud_t *pudp, unsign + } + phys += next - addr; + } while (pmdp++, addr = next, addr != end); +- +- pmd_clear_fixmap(); + } + + static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, +@@ -276,6 +272,7 @@ static void alloc_init_cont_pmd(pud_t *p + { + unsigned long next; + pud_t pud = READ_ONCE(*pudp); ++ pmd_t *pmdp; + + /* + * Check for initial section mappings in the pgd/pud. 
+@@ -294,6 +291,7 @@ static void alloc_init_cont_pmd(pud_t *p + } + BUG_ON(pud_bad(pud)); + ++ pmdp = pmd_set_fixmap_offset(pudp, addr); + do { + pgprot_t __prot = prot; + +@@ -304,10 +302,13 @@ static void alloc_init_cont_pmd(pud_t *p + (flags & NO_CONT_MAPPINGS) == 0) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + +- init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags); ++ init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); + ++ pmdp += pmd_index(next) - pmd_index(addr); + phys += next - addr; + } while (addr = next, addr != end); ++ ++ pmd_clear_fixmap(); + } + + static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, diff --git a/queue-6.6/btrfs-always-fallback-to-buffered-write-if-the-inode-requires-checksum.patch b/queue-6.6/btrfs-always-fallback-to-buffered-write-if-the-inode-requires-checksum.patch new file mode 100644 index 0000000000..60f2aca4b3 --- /dev/null +++ b/queue-6.6/btrfs-always-fallback-to-buffered-write-if-the-inode-requires-checksum.patch @@ -0,0 +1,108 @@ +From stable+bounces-217704-greg=kroah.com@vger.kernel.org Mon Feb 23 09:07:37 2026 +From: Qu Wenruo +Date: Mon, 23 Feb 2026 18:33:48 +1030 +Subject: btrfs: always fallback to buffered write if the inode requires checksum +To: linux-btrfs@vger.kernel.org +Cc: stable@vger.kernel.org, Christoph Hellwig , Filipe Manana , David Sterba +Message-ID: <5c3a9c8f484ed1ba8fe897e67057eec24968f7bd.1771833812.git.wqu@suse.com> + +From: Qu Wenruo + +commit 968f19c5b1b7d5595423b0ac0020cc18dfed8cb5 upstream. + +[BUG] +It is a long known bug that VM image on btrfs can lead to data csum +mismatch, if the qemu is using direct-io for the image (this is commonly +known as cache mode 'none'). + +[CAUSE] +Inside the VM, if the fs is EXT4 or XFS, or even NTFS from Windows, the +fs is allowed to dirty/modify the folio even if the folio is under +writeback (as long as the address space doesn't have AS_STABLE_WRITES +flag inherited from the block device). 
+ +This is a valid optimization to improve the concurrency, and since these +filesystems have no extra checksum on data, the content change is not a +problem at all. + +But the final write into the image file is handled by btrfs, which needs +the content not to be modified during writeback, or the checksum will +not match the data (checksum is calculated before submitting the bio). + +So EXT4/XFS/NTFS assume they can modify the folio under writeback, but +btrfs requires no modification, this leads to the false csum mismatch. + +This is only a controlled example, there are even cases where +multi-thread programs can submit a direct IO write, then another thread +modifies the direct IO buffer for whatever reason. + +For such cases, btrfs has no sane way to detect such cases and leads to +false data csum mismatch. + +[FIX] +I have considered the following ideas to solve the problem: + +- Make direct IO to always skip data checksum + This not only requires a new incompatible flag, as it breaks the + current per-inode NODATASUM flag. + But also requires extra handling for no csum found cases. + + And this also reduces our checksum protection. + +- Let hardware handle all the checksum + AKA, just nodatasum mount option. + That requires trust for hardware (which is not that trustful in a lot + of cases), and it's not generic at all. + +- Always fallback to buffered write if the inode requires checksum + This was suggested by Christoph, and is the solution utilized by this + patch. + + The cost is obvious, the extra buffer copying into page cache, thus it + reduces the performance. + But at least it's still user configurable, if the end user still wants + the zero-copy performance, just set NODATASUM flag for the inode + (which is a common practice for VM images on btrfs). + + Since we cannot trust user space programs to keep the buffer + consistent during direct IO, we have no choice but always falling back + to buffered IO. 
At least by this, we avoid the more deadly false data + checksum mismatch error. + +Cc: stable@vger.kernel.org # 6.6 +[ Conflicts caused by code extracted into direct-io.c ] +Suggested-by: Christoph Hellwig +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/file.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1514,6 +1514,22 @@ relock: + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto buffered; + } ++ /* ++ * We can't control the folios being passed in, applications can write ++ * to them while a direct IO write is in progress. This means the ++ * content might change after we calculated the data checksum. ++ * Therefore we can end up storing a checksum that doesn't match the ++ * persisted data. ++ * ++ * To be extra safe and avoid false data checksum mismatch, if the ++ * inode requires data checksum, just fallback to buffered IO. ++ * For buffered IO we have full control of page cache and can ensure ++ * no one is modifying the content during writeback. ++ */ ++ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ++ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); ++ goto buffered; ++ } + + /* + * The iov_iter can be mapped to the same file range we are writing to. 
diff --git a/queue-6.6/btrfs-fix-null-dereference-on-root-when-tracing-inode-eviction.patch b/queue-6.6/btrfs-fix-null-dereference-on-root-when-tracing-inode-eviction.patch new file mode 100644 index 0000000000..f626d226b9 --- /dev/null +++ b/queue-6.6/btrfs-fix-null-dereference-on-root-when-tracing-inode-eviction.patch @@ -0,0 +1,49 @@ +From stable+bounces-217853-greg=kroah.com@vger.kernel.org Tue Feb 24 04:37:02 2026 +From: Bin Lan +Date: Tue, 24 Feb 2026 03:32:14 +0000 +Subject: btrfs: fix NULL dereference on root when tracing inode eviction +To: stable@vger.kernel.org, gregkh@linuxfoundation.org +Cc: "Miquel Sabaté Solà" , syzbot+d991fea1b4b23b1f6bf8@syzkaller.appspotmail.com, "David Sterba" , "Bin Lan" +Message-ID: <20260224033214.4976-1-lanbincn@139.com> + +From: Miquel Sabaté Solà + +[ Upstream commit f157dd661339fc6f5f2b574fe2429c43bd309534 ] + +When evicting an inode the first thing we do is to setup tracing for it, +which implies fetching the root's id. But in btrfs_evict_inode() the +root might be NULL, as implied in the next check that we do in +btrfs_evict_inode(). + +Hence, we either should set the ->root_objectid to 0 in case the root is +NULL, or we move tracing setup after checking that the root is not +NULL. Setting the rootid to 0 at least gives us the possibility to trace +this call even in the case when the root is NULL, so that's the solution +taken here. 
+ +Fixes: 1abe9b8a138c ("Btrfs: add initial tracepoint support for btrfs") +Reported-by: syzbot+d991fea1b4b23b1f6bf8@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=d991fea1b4b23b1f6bf8 +Signed-off-by: Miquel Sabaté Solà +Reviewed-by: David Sterba +Signed-off-by: David Sterba +[ Adjust context ] +Signed-off-by: Bin Lan +Signed-off-by: Greg Kroah-Hartman +--- + include/trace/events/btrfs.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/include/trace/events/btrfs.h ++++ b/include/trace/events/btrfs.h +@@ -225,8 +225,8 @@ DECLARE_EVENT_CLASS(btrfs__inode, + __entry->generation = BTRFS_I(inode)->generation; + __entry->last_trans = BTRFS_I(inode)->last_trans; + __entry->logged_trans = BTRFS_I(inode)->logged_trans; +- __entry->root_objectid = +- BTRFS_I(inode)->root->root_key.objectid; ++ __entry->root_objectid = BTRFS_I(inode)->root ? ++ btrfs_root_id(BTRFS_I(inode)->root) : 0; + ), + + TP_printk_btrfs("root=%llu(%s) gen=%llu ino=%llu blocks=%llu " diff --git a/queue-6.6/dm-verity-disable-recursive-forward-error-correction.patch b/queue-6.6/dm-verity-disable-recursive-forward-error-correction.patch new file mode 100644 index 0000000000..cb952a7d46 --- /dev/null +++ b/queue-6.6/dm-verity-disable-recursive-forward-error-correction.patch @@ -0,0 +1,68 @@ +From stable+bounces-219751-greg=kroah.com@vger.kernel.org Thu Feb 26 06:05:28 2026 +From: Rahul Sharma +Date: Thu, 26 Feb 2026 13:04:18 +0800 +Subject: dm-verity: disable recursive forward error correction +To: gregkh@linuxfoundation.org, stable@vger.kernel.org +Cc: linux-kernel@vger.kernel.org, Mikulas Patocka , Guangwu Zhang , Sami Tolvanen , Eric Biggers , Rahul Sharma +Message-ID: <20260226050418.159241-1-black.hawk@163.com> + +From: Mikulas Patocka + +[ Upstream commit d9f3e47d3fae0c101d9094bc956ed24e7a0ee801 ] + +There are two problems with the recursive correction: + +1. It may cause denial-of-service. In fec_read_bufs, there is a loop that +has 253 iterations. 
For each iteration, we may call verity_hash_for_block +recursively. There is a limit of 4 nested recursions - that means that +there may be at most 253^4 (4 billion) iterations. Red Hat QE team +actually created an image that pushes dm-verity to this limit - and this +image just makes the udev-worker process get stuck in the 'D' state. + +2. It doesn't work. In fec_read_bufs we store data into the variable +"fio->bufs", but fio bufs is shared between recursive invocations, if +"verity_hash_for_block" invoked correction recursively, it would +overwrite partially filled fio->bufs. + +Signed-off-by: Mikulas Patocka +Reported-by: Guangwu Zhang +Reviewed-by: Sami Tolvanen +Reviewed-by: Eric Biggers +[ The context change is due to the commit bdf253d580d7 +("dm-verity: remove support for asynchronous hashes") +in v6.18 and the commit 9356fcfe0ac4 +("dm verity: set DM_TARGET_SINGLETON feature flag") in v6.9 +which are irrelevant to the logic of this patch. ] +Signed-off-by: Rahul Sharma +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-verity-fec.c | 4 +--- + drivers/md/dm-verity-fec.h | 3 --- + 2 files changed, 1 insertion(+), 6 deletions(-) + +--- a/drivers/md/dm-verity-fec.c ++++ b/drivers/md/dm-verity-fec.c +@@ -439,10 +439,8 @@ int verity_fec_decode(struct dm_verity * + if (!verity_fec_is_enabled(v)) + return -EOPNOTSUPP; + +- if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) { +- DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name); ++ if (fio->level) + return -EIO; +- } + + fio->level++; + +--- a/drivers/md/dm-verity-fec.h ++++ b/drivers/md/dm-verity-fec.h +@@ -23,9 +23,6 @@ + #define DM_VERITY_FEC_BUF_MAX \ + (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) + +-/* maximum recursion level for verity_fec_decode */ +-#define DM_VERITY_FEC_MAX_RECURSION 4 +- + #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" + #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" + #define DM_VERITY_OPT_FEC_START "fec_start" diff --git 
a/queue-6.6/dst-fix-races-in-rt6_uncached_list_del-and-rt_del_uncached_list.patch b/queue-6.6/dst-fix-races-in-rt6_uncached_list_del-and-rt_del_uncached_list.patch new file mode 100644 index 0000000000..85db19853d --- /dev/null +++ b/queue-6.6/dst-fix-races-in-rt6_uncached_list_del-and-rt_del_uncached_list.patch @@ -0,0 +1,263 @@ +From stable+bounces-217861-greg=kroah.com@vger.kernel.org Tue Feb 24 06:50:52 2026 +From: Rahul Sharma +Date: Tue, 24 Feb 2026 13:49:43 +0800 +Subject: dst: fix races in rt6_uncached_list_del() and rt_del_uncached_list() +To: gregkh@linuxfoundation.org, stable@vger.kernel.org +Cc: linux-kernel@vger.kernel.org, Eric Dumazet , syzbot+179fc225724092b8b2b2@syzkaller.appspotmail.com, Martin KaFai Lau , David Ahern , Jakub Kicinski , Rahul Sharma +Message-ID: <20260224054943.3324184-1-black.hawk@163.com> + +From: Eric Dumazet + +[ Upstream commit 9a6f0c4d5796ab89b5a28a890ce542344d58bd69 ] + +syzbot was able to crash the kernel in rt6_uncached_list_flush_dev() +in an interesting way [1] + +Crash happens in list_del_init()/INIT_LIST_HEAD() while writing +list->prev, while the prior write on list->next went well. + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + WRITE_ONCE(list->next, list); // This went well + WRITE_ONCE(list->prev, list); // Crash, @list has been freed. +} + +Issue here is that rt6_uncached_list_del() did not attempt to lock +ul->lock, as list_empty(&rt->dst.rt_uncached) returned +true because the WRITE_ONCE(list->next, list) happened on the other CPU. + +We might use list_del_init_careful() and list_empty_careful(), +or make sure rt6_uncached_list_del() always grabs the spinlock +whenever rt->dst.rt_uncached_list has been set. + +A similar fix is needed for IPv4. 
+ +[1] + + BUG: KASAN: slab-use-after-free in INIT_LIST_HEAD include/linux/list.h:46 [inline] + BUG: KASAN: slab-use-after-free in list_del_init include/linux/list.h:296 [inline] + BUG: KASAN: slab-use-after-free in rt6_uncached_list_flush_dev net/ipv6/route.c:191 [inline] + BUG: KASAN: slab-use-after-free in rt6_disable_ip+0x633/0x730 net/ipv6/route.c:5020 +Write of size 8 at addr ffff8880294cfa78 by task kworker/u8:14/3450 + +CPU: 0 UID: 0 PID: 3450 Comm: kworker/u8:14 Tainted: G L syzkaller #0 PREEMPT_{RT,(full)} +Tainted: [L]=SOFTLOCKUP +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/25/2025 +Workqueue: netns cleanup_net +Call Trace: + + dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 + print_address_description mm/kasan/report.c:378 [inline] + print_report+0xca/0x240 mm/kasan/report.c:482 + kasan_report+0x118/0x150 mm/kasan/report.c:595 + INIT_LIST_HEAD include/linux/list.h:46 [inline] + list_del_init include/linux/list.h:296 [inline] + rt6_uncached_list_flush_dev net/ipv6/route.c:191 [inline] + rt6_disable_ip+0x633/0x730 net/ipv6/route.c:5020 + addrconf_ifdown+0x143/0x18a0 net/ipv6/addrconf.c:3853 + addrconf_notify+0x1bc/0x1050 net/ipv6/addrconf.c:-1 + notifier_call_chain+0x19d/0x3a0 kernel/notifier.c:85 + call_netdevice_notifiers_extack net/core/dev.c:2268 [inline] + call_netdevice_notifiers net/core/dev.c:2282 [inline] + netif_close_many+0x29c/0x410 net/core/dev.c:1785 + unregister_netdevice_many_notify+0xb50/0x2330 net/core/dev.c:12353 + ops_exit_rtnl_list net/core/net_namespace.c:187 [inline] + ops_undo_list+0x3dc/0x990 net/core/net_namespace.c:248 + cleanup_net+0x4de/0x7b0 net/core/net_namespace.c:696 + process_one_work kernel/workqueue.c:3257 [inline] + process_scheduled_works+0xad1/0x1770 kernel/workqueue.c:3340 + worker_thread+0x8a0/0xda0 kernel/workqueue.c:3421 + kthread+0x711/0x8a0 kernel/kthread.c:463 + ret_from_fork+0x510/0xa50 arch/x86/kernel/process.c:158 + ret_from_fork_asm+0x1a/0x30 
arch/x86/entry/entry_64.S:246 + + +Allocated by task 803: + kasan_save_stack mm/kasan/common.c:57 [inline] + kasan_save_track+0x3e/0x80 mm/kasan/common.c:78 + unpoison_slab_object mm/kasan/common.c:340 [inline] + __kasan_slab_alloc+0x6c/0x80 mm/kasan/common.c:366 + kasan_slab_alloc include/linux/kasan.h:253 [inline] + slab_post_alloc_hook mm/slub.c:4953 [inline] + slab_alloc_node mm/slub.c:5263 [inline] + kmem_cache_alloc_noprof+0x18d/0x6c0 mm/slub.c:5270 + dst_alloc+0x105/0x170 net/core/dst.c:89 + ip6_dst_alloc net/ipv6/route.c:342 [inline] + icmp6_dst_alloc+0x75/0x460 net/ipv6/route.c:3333 + mld_sendpack+0x683/0xe60 net/ipv6/mcast.c:1844 + mld_send_cr net/ipv6/mcast.c:2154 [inline] + mld_ifc_work+0x83e/0xd60 net/ipv6/mcast.c:2693 + process_one_work kernel/workqueue.c:3257 [inline] + process_scheduled_works+0xad1/0x1770 kernel/workqueue.c:3340 + worker_thread+0x8a0/0xda0 kernel/workqueue.c:3421 + kthread+0x711/0x8a0 kernel/kthread.c:463 + ret_from_fork+0x510/0xa50 arch/x86/kernel/process.c:158 + ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:246 + +Freed by task 20: + kasan_save_stack mm/kasan/common.c:57 [inline] + kasan_save_track+0x3e/0x80 mm/kasan/common.c:78 + kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:584 + poison_slab_object mm/kasan/common.c:253 [inline] + __kasan_slab_free+0x5c/0x80 mm/kasan/common.c:285 + kasan_slab_free include/linux/kasan.h:235 [inline] + slab_free_hook mm/slub.c:2540 [inline] + slab_free mm/slub.c:6670 [inline] + kmem_cache_free+0x18f/0x8d0 mm/slub.c:6781 + dst_destroy+0x235/0x350 net/core/dst.c:121 + rcu_do_batch kernel/rcu/tree.c:2605 [inline] + rcu_core kernel/rcu/tree.c:2857 [inline] + rcu_cpu_kthread+0xba5/0x1af0 kernel/rcu/tree.c:2945 + smpboot_thread_fn+0x542/0xa60 kernel/smpboot.c:160 + kthread+0x711/0x8a0 kernel/kthread.c:463 + ret_from_fork+0x510/0xa50 arch/x86/kernel/process.c:158 + ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:246 + +Last potentially related work creation: + kasan_save_stack+0x3e/0x60 
mm/kasan/common.c:57 + kasan_record_aux_stack+0xbd/0xd0 mm/kasan/generic.c:556 + __call_rcu_common kernel/rcu/tree.c:3119 [inline] + call_rcu+0xee/0x890 kernel/rcu/tree.c:3239 + refdst_drop include/net/dst.h:266 [inline] + skb_dst_drop include/net/dst.h:278 [inline] + skb_release_head_state+0x71/0x360 net/core/skbuff.c:1156 + skb_release_all net/core/skbuff.c:1180 [inline] + __kfree_skb net/core/skbuff.c:1196 [inline] + sk_skb_reason_drop+0xe9/0x170 net/core/skbuff.c:1234 + kfree_skb_reason include/linux/skbuff.h:1322 [inline] + tcf_kfree_skb_list include/net/sch_generic.h:1127 [inline] + __dev_xmit_skb net/core/dev.c:4260 [inline] + __dev_queue_xmit+0x26aa/0x3210 net/core/dev.c:4785 + NF_HOOK_COND include/linux/netfilter.h:307 [inline] + ip6_output+0x340/0x550 net/ipv6/ip6_output.c:247 + NF_HOOK+0x9e/0x380 include/linux/netfilter.h:318 + mld_sendpack+0x8d4/0xe60 net/ipv6/mcast.c:1855 + mld_send_cr net/ipv6/mcast.c:2154 [inline] + mld_ifc_work+0x83e/0xd60 net/ipv6/mcast.c:2693 + process_one_work kernel/workqueue.c:3257 [inline] + process_scheduled_works+0xad1/0x1770 kernel/workqueue.c:3340 + worker_thread+0x8a0/0xda0 kernel/workqueue.c:3421 + kthread+0x711/0x8a0 kernel/kthread.c:463 + ret_from_fork+0x510/0xa50 arch/x86/kernel/process.c:158 + ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:246 + +The buggy address belongs to the object at ffff8880294cfa00 + which belongs to the cache ip6_dst_cache of size 232 +The buggy address is located 120 bytes inside of + freed 232-byte region [ffff8880294cfa00, ffff8880294cfae8) + +The buggy address belongs to the physical page: +page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x294cf +memcg:ffff88803536b781 +flags: 0x80000000000000(node=0|zone=1) +page_type: f5(slab) +raw: 0080000000000000 ffff88802ff1c8c0 ffffea0000bf2bc0 dead000000000006 +raw: 0000000000000000 00000000800c000c 00000000f5000000 ffff88803536b781 +page dumped because: kasan: bad access detected +page_owner tracks the page as allocated 
+page last allocated via order 0, migratetype Unmovable, gfp_mask 0x52820(GFP_ATOMIC|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP), pid 9, tgid 9 (kworker/0:0), ts 91119585830, free_ts 91088628818 + set_page_owner include/linux/page_owner.h:32 [inline] + post_alloc_hook+0x234/0x290 mm/page_alloc.c:1857 + prep_new_page mm/page_alloc.c:1865 [inline] + get_page_from_freelist+0x28c0/0x2960 mm/page_alloc.c:3915 + __alloc_frozen_pages_noprof+0x181/0x370 mm/page_alloc.c:5210 + alloc_pages_mpol+0xd1/0x380 mm/mempolicy.c:2486 + alloc_slab_page mm/slub.c:3075 [inline] + allocate_slab+0x86/0x3b0 mm/slub.c:3248 + new_slab mm/slub.c:3302 [inline] + ___slab_alloc+0xb10/0x13e0 mm/slub.c:4656 + __slab_alloc+0xc6/0x1f0 mm/slub.c:4779 + __slab_alloc_node mm/slub.c:4855 [inline] + slab_alloc_node mm/slub.c:5251 [inline] + kmem_cache_alloc_noprof+0x101/0x6c0 mm/slub.c:5270 + dst_alloc+0x105/0x170 net/core/dst.c:89 + ip6_dst_alloc net/ipv6/route.c:342 [inline] + icmp6_dst_alloc+0x75/0x460 net/ipv6/route.c:3333 + mld_sendpack+0x683/0xe60 net/ipv6/mcast.c:1844 + mld_send_cr net/ipv6/mcast.c:2154 [inline] + mld_ifc_work+0x83e/0xd60 net/ipv6/mcast.c:2693 + process_one_work kernel/workqueue.c:3257 [inline] + process_scheduled_works+0xad1/0x1770 kernel/workqueue.c:3340 + worker_thread+0x8a0/0xda0 kernel/workqueue.c:3421 + kthread+0x711/0x8a0 kernel/kthread.c:463 + ret_from_fork+0x510/0xa50 arch/x86/kernel/process.c:158 +page last free pid 5859 tgid 5859 stack trace: + reset_page_owner include/linux/page_owner.h:25 [inline] + free_pages_prepare mm/page_alloc.c:1406 [inline] + __free_frozen_pages+0xfe1/0x1170 mm/page_alloc.c:2943 + discard_slab mm/slub.c:3346 [inline] + __put_partials+0x149/0x170 mm/slub.c:3886 + __slab_free+0x2af/0x330 mm/slub.c:5952 + qlink_free mm/kasan/quarantine.c:163 [inline] + qlist_free_all+0x97/0x100 mm/kasan/quarantine.c:179 + kasan_quarantine_reduce+0x148/0x160 mm/kasan/quarantine.c:286 + __kasan_slab_alloc+0x22/0x80 mm/kasan/common.c:350 + kasan_slab_alloc 
include/linux/kasan.h:253 [inline] + slab_post_alloc_hook mm/slub.c:4953 [inline] + slab_alloc_node mm/slub.c:5263 [inline] + kmem_cache_alloc_noprof+0x18d/0x6c0 mm/slub.c:5270 + getname_flags+0xb8/0x540 fs/namei.c:146 + getname include/linux/fs.h:2498 [inline] + do_sys_openat2+0xbc/0x200 fs/open.c:1426 + do_sys_open fs/open.c:1436 [inline] + __do_sys_openat fs/open.c:1452 [inline] + __se_sys_openat fs/open.c:1447 [inline] + __x64_sys_openat+0x138/0x170 fs/open.c:1447 + do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] + do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 + +Fixes: 8d0b94afdca8 ("ipv6: Keep track of DST_NOCACHE routes in case of iface down/unregister") +Fixes: 78df76a065ae ("ipv4: take rt_uncached_lock only if needed") +Reported-by: syzbot+179fc225724092b8b2b2@syzkaller.appspotmail.com +Closes: https://lore.kernel.org/netdev/6964cdf2.050a0220.eaf7.009d.GAE@google.com/T/#u +Signed-off-by: Eric Dumazet +Cc: Martin KaFai Lau +Reviewed-by: David Ahern +Link: https://patch.msgid.link/20260112103825.3810713-1-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Rahul Sharma +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dst.c | 1 + + net/ipv4/route.c | 4 ++-- + net/ipv6/route.c | 4 ++-- + 3 files changed, 5 insertions(+), 4 deletions(-) + +--- a/net/core/dst.c ++++ b/net/core/dst.c +@@ -68,6 +68,7 @@ void dst_init(struct dst_entry *dst, str + dst->lwtstate = NULL; + rcuref_init(&dst->__rcuref, initial_ref); + INIT_LIST_HEAD(&dst->rt_uncached); ++ dst->rt_uncached_list = NULL; + dst->__use = 0; + dst->lastuse = jiffies; + dst->flags = flags; +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -1546,9 +1546,9 @@ void rt_add_uncached_list(struct rtable + + void rt_del_uncached_list(struct rtable *rt) + { +- if (!list_empty(&rt->dst.rt_uncached)) { +- struct uncached_list *ul = rt->dst.rt_uncached_list; ++ struct uncached_list *ul = rt->dst.rt_uncached_list; + ++ if (ul) { + spin_lock_bh(&ul->lock); + 
list_del_init(&rt->dst.rt_uncached); + spin_unlock_bh(&ul->lock); +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -149,9 +149,9 @@ void rt6_uncached_list_add(struct rt6_in + + void rt6_uncached_list_del(struct rt6_info *rt) + { +- if (!list_empty(&rt->dst.rt_uncached)) { +- struct uncached_list *ul = rt->dst.rt_uncached_list; ++ struct uncached_list *ul = rt->dst.rt_uncached_list; + ++ if (ul) { + spin_lock_bh(&ul->lock); + list_del_init(&rt->dst.rt_uncached); + spin_unlock_bh(&ul->lock); diff --git a/queue-6.6/eth-bnxt-always-recalculate-features-after-xdp-clearing-fix-null-deref.patch b/queue-6.6/eth-bnxt-always-recalculate-features-after-xdp-clearing-fix-null-deref.patch new file mode 100644 index 0000000000..38b2963eae --- /dev/null +++ b/queue-6.6/eth-bnxt-always-recalculate-features-after-xdp-clearing-fix-null-deref.patch @@ -0,0 +1,162 @@ +From stable+bounces-219193-greg=kroah.com@vger.kernel.org Wed Feb 25 07:01:18 2026 +From: Rahul Sharma +Date: Wed, 25 Feb 2026 14:00:20 +0800 +Subject: eth: bnxt: always recalculate features after XDP clearing, fix null-deref +To: gregkh@linuxfoundation.org, stable@vger.kernel.org +Cc: linux-kernel@vger.kernel.org, Jakub Kicinski , Michael Chan , Somnath Kotur , Rahul Sharma +Message-ID: <20260225060020.3361855-1-black.hawk@163.com> + +From: Jakub Kicinski + +[ Upstream commit f0aa6a37a3dbb40b272df5fc6db93c114688adcd ] + +Recalculate features when XDP is detached. + +Before: + # ip li set dev eth0 xdp obj xdp_dummy.bpf.o sec xdp + # ip li set dev eth0 xdp off + # ethtool -k eth0 | grep gro + rx-gro-hw: off [requested on] + +After: + # ip li set dev eth0 xdp obj xdp_dummy.bpf.o sec xdp + # ip li set dev eth0 xdp off + # ethtool -k eth0 | grep gro + rx-gro-hw: on + +The fact that HW-GRO doesn't get re-enabled automatically is just +a minor annoyance. The real issue is that the features will randomly +come back during another reconfiguration which just happens to invoke +netdev_update_features(). 
The driver doesn't handle reconfiguring +two things at a time very robustly. + +Starting with commit 98ba1d931f61 ("bnxt_en: Fix RSS logic in +__bnxt_reserve_rings()") we only reconfigure the RSS hash table +if the "effective" number of Rx rings has changed. If HW-GRO is +enabled "effective" number of rings is 2x what user sees. +So if we are in the bad state, with HW-GRO re-enablement "pending" +after XDP off, and we lower the rings by / 2 - the HW-GRO rings +doing 2x and the ethtool -L doing / 2 may cancel each other out, +and the: + + if (old_rx_rings != bp->hw_resc.resv_rx_rings && + +condition in __bnxt_reserve_rings() will be false. +The RSS map won't get updated, and we'll crash with: + + BUG: kernel NULL pointer dereference, address: 0000000000000168 + RIP: 0010:__bnxt_hwrm_vnic_set_rss+0x13a/0x1a0 + bnxt_hwrm_vnic_rss_cfg_p5+0x47/0x180 + __bnxt_setup_vnic_p5+0x58/0x110 + bnxt_init_nic+0xb72/0xf50 + __bnxt_open_nic+0x40d/0xab0 + bnxt_open_nic+0x2b/0x60 + ethtool_set_channels+0x18c/0x1d0 + +As we try to access a freed ring. + +The issue is present since XDP support was added, really, but +prior to commit 98ba1d931f61 ("bnxt_en: Fix RSS logic in +__bnxt_reserve_rings()") it wasn't causing major issues. + +Fixes: 1054aee82321 ("bnxt_en: Use NETIF_F_GRO_HW.") +Fixes: 98ba1d931f61 ("bnxt_en: Fix RSS logic in __bnxt_reserve_rings()") +Reviewed-by: Michael Chan +Reviewed-by: Somnath Kotur +Link: https://patch.msgid.link/20250109043057.2888953-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +[ The context change is due to the commit 1f6e77cb9b32 +("bnxt_en: Add bnxt_l2_filter hash table.") in v6.8 and the commit +8336a974f37d ("bnxt_en: Save user configured filters in a lookup list") +in v6.9 which are irrelevant to the logic of this patch. 
] +Signed-off-by: Rahul Sharma +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 25 ++++++++++++++++++++----- + drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +- + drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 7 ------- + 3 files changed, 21 insertions(+), 13 deletions(-) + +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -3996,7 +3996,7 @@ void bnxt_set_ring_params(struct bnxt *b + /* Changing allocation mode of RX rings. + * TODO: Update when extending xdp_rxq_info to support allocation modes. + */ +-int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode) ++static void __bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode) + { + struct net_device *dev = bp->dev; + +@@ -4017,15 +4017,30 @@ int bnxt_set_rx_skb_mode(struct bnxt *bp + bp->rx_skb_func = bnxt_rx_page_skb; + } + bp->rx_dir = DMA_BIDIRECTIONAL; +- /* Disable LRO or GRO_HW */ +- netdev_update_features(dev); + } else { + dev->max_mtu = bp->max_mtu; + bp->flags &= ~BNXT_FLAG_RX_PAGE_MODE; + bp->rx_dir = DMA_FROM_DEVICE; + bp->rx_skb_func = bnxt_rx_skb; + } +- return 0; ++} ++ ++void bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode) ++{ ++ __bnxt_set_rx_skb_mode(bp, page_mode); ++ ++ if (!page_mode) { ++ int rx, tx; ++ ++ bnxt_get_max_rings(bp, &rx, &tx, true); ++ if (rx > 1) { ++ bp->flags &= ~BNXT_FLAG_NO_AGG_RINGS; ++ bp->dev->hw_features |= NETIF_F_LRO; ++ } ++ } ++ ++ /* Update LRO and GRO_HW availability */ ++ netdev_update_features(bp->dev); + } + + static void bnxt_free_vnic_attributes(struct bnxt *bp) +@@ -13773,7 +13788,7 @@ static int bnxt_init_one(struct pci_dev + if (rc) + goto init_err_pci_clean; + +- bnxt_set_rx_skb_mode(bp, false); ++ __bnxt_set_rx_skb_mode(bp, false); + bnxt_set_tpa_flags(bp); + bnxt_set_ring_params(bp); + rc = bnxt_set_dflt_rings(bp, true); +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h +@@ -2332,7 +2332,7 @@ void 
bnxt_reuse_rx_data(struct bnxt_rx_r + u32 bnxt_fw_health_readl(struct bnxt *bp, int reg_idx); + void bnxt_set_tpa_flags(struct bnxt *bp); + void bnxt_set_ring_params(struct bnxt *); +-int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode); ++void bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode); + int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp, unsigned long *bmap, + int bmap_size, bool async_only); + int bnxt_hwrm_func_drv_unrgtr(struct bnxt *bp); +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +@@ -422,15 +422,8 @@ static int bnxt_xdp_set(struct bnxt *bp, + bnxt_set_rx_skb_mode(bp, true); + xdp_features_set_redirect_target(dev, true); + } else { +- int rx, tx; +- + xdp_features_clear_redirect_target(dev); + bnxt_set_rx_skb_mode(bp, false); +- bnxt_get_max_rings(bp, &rx, &tx, true); +- if (rx > 1) { +- bp->flags &= ~BNXT_FLAG_NO_AGG_RINGS; +- bp->dev->hw_features |= NETIF_F_LRO; +- } + } + bp->tx_nr_rings_xdp = tx_xdp; + bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tc + tx_xdp; diff --git a/queue-6.6/ext4-always-allocate-blocks-only-from-groups-inode-can-use.patch b/queue-6.6/ext4-always-allocate-blocks-only-from-groups-inode-can-use.patch new file mode 100644 index 0000000000..799d13f20d --- /dev/null +++ b/queue-6.6/ext4-always-allocate-blocks-only-from-groups-inode-can-use.patch @@ -0,0 +1,71 @@ +From stable+bounces-219686-greg=kroah.com@vger.kernel.org Wed Feb 25 19:15:41 2026 +From: Sasha Levin +Date: Wed, 25 Feb 2026 13:15:35 -0500 +Subject: ext4: always allocate blocks only from groups inode can use +To: stable@vger.kernel.org +Cc: Jan Kara , Baokun Li , Zhang Yi , Pedro Falcato , stable@kernel.org, Theodore Ts'o , Sasha Levin +Message-ID: <20260225181535.912817-1-sashal@kernel.org> + +From: Jan Kara + +[ Upstream commit 4865c768b563deff1b6a6384e74a62f143427b42 ] + +For filesystems with more than 2^32 blocks inodes using indirect block +based format cannot use blocks beyond the 32-bit limit. 
+ext4_mb_scan_groups_linear() takes care to not select these unsupported +groups for such inodes however other functions selecting groups for +allocation don't. So far this is harmless because the other selection +functions are used only with mb_optimize_scan and this is currently +disabled for inodes with indirect blocks however in the following patch +we want to enable mb_optimize_scan regardless of inode format. + +Reviewed-by: Baokun Li +Reviewed-by: Zhang Yi +Signed-off-by: Jan Kara +Acked-by: Pedro Falcato +Cc: stable@kernel.org +Link: https://patch.msgid.link/20260114182836.14120-3-jack@suse.cz +Signed-off-by: Theodore Ts'o +[ Drop a few hunks not needed in older trees ] +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 20 ++++++++++++++++---- + 1 file changed, 16 insertions(+), 4 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -883,6 +883,21 @@ mb_update_avg_fragment_size(struct super + } + } + ++static ext4_group_t ext4_get_allocation_groups_count( ++ struct ext4_allocation_context *ac) ++{ ++ ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); ++ ++ /* non-extent files are limited to low blocks/groups */ ++ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ++ ngroups = EXT4_SB(ac->ac_sb)->s_blockfile_groups; ++ ++ /* Pairs with smp_wmb() in ext4_update_super() */ ++ smp_rmb(); ++ ++ return ngroups; ++} ++ + /* + * Choose next group by traversing largest_free_order lists. Updates *new_cr if + * cr level needs an update. 
+@@ -2817,10 +2832,7 @@ ext4_mb_regular_allocator(struct ext4_al + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); +- ngroups = ext4_get_groups_count(sb); +- /* non-extent files are limited to low blocks/groups */ +- if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) +- ngroups = sbi->s_blockfile_groups; ++ ngroups = ext4_get_allocation_groups_count(ac); + + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + diff --git a/queue-6.6/ext4-fix-dirtyclusters-double-decrement-on-fs-shutdown.patch b/queue-6.6/ext4-fix-dirtyclusters-double-decrement-on-fs-shutdown.patch new file mode 100644 index 0000000000..fc2f05c96e --- /dev/null +++ b/queue-6.6/ext4-fix-dirtyclusters-double-decrement-on-fs-shutdown.patch @@ -0,0 +1,109 @@ +From stable+bounces-219625-greg=kroah.com@vger.kernel.org Wed Feb 25 14:52:44 2026 +From: Sasha Levin +Date: Wed, 25 Feb 2026 08:47:44 -0500 +Subject: ext4: fix dirtyclusters double decrement on fs shutdown +To: stable@vger.kernel.org +Cc: Brian Foster , Baokun Li , Theodore Ts'o , stable@kernel.org, Sasha Levin +Message-ID: <20260225134744.311174-1-sashal@kernel.org> + +From: Brian Foster + +[ Upstream commit 94a8cea54cd935c54fa2fba70354757c0fc245e3 ] + +fstests test generic/388 occasionally reproduces a warning in +ext4_put_super() associated with the dirty clusters count: + + WARNING: CPU: 7 PID: 76064 at fs/ext4/super.c:1324 ext4_put_super+0x48c/0x590 [ext4] + +Tracing the failure shows that the warning fires due to an +s_dirtyclusters_counter value of -1. IOW, this appears to be a +spurious decrement as opposed to some sort of leak. Further tracing +of the dirty cluster count deltas and an LLM scan of the resulting +output identified the cause as a double decrement in the error path +between ext4_mb_mark_diskspace_used() and the caller +ext4_mb_new_blocks(). + +First, note that generic/388 is a shutdown vs. fsstress test and so +produces a random set of operations and shutdown injections. 
In the +problematic case, the shutdown triggers an error return from the +ext4_handle_dirty_metadata() call(s) made from +ext4_mb_mark_context(). The changed value is non-zero at this point, +so ext4_mb_mark_diskspace_used() does not exit after the error +bubbles up from ext4_mb_mark_context(). Instead, the former +decrements both cluster counters and returns the error up to +ext4_mb_new_blocks(). The latter falls into the !ar->len out path +which decrements the dirty clusters counter a second time, creating +the inconsistency. + +To avoid this problem and simplify ownership of the cluster +reservation in this codepath, lift the counter reduction to a single +place in the caller. This makes it more clear that +ext4_mb_new_blocks() is responsible for acquiring cluster +reservation (via ext4_claim_free_clusters()) in the !delalloc case +as well as releasing it, regardless of whether it ends up consumed +or returned due to failure. + +Fixes: 0087d9fb3f29 ("ext4: Fix s_dirty_blocks_counter if block allocation failed with nodelalloc") +Signed-off-by: Brian Foster +Reviewed-by: Baokun Li +Link: https://patch.msgid.link/20260113171905.118284-1-bfoster@redhat.com +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +[ Drop mballoc-test changes ] +Signed-off-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 21 +++++---------------- + 1 file changed, 5 insertions(+), 16 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3999,8 +3999,7 @@ void ext4_exit_mballoc(void) + * Returns 0 if success or error code + */ + static noinline_for_stack int +-ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, +- handle_t *handle, unsigned int reserv_clstrs) ++ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle) + { + struct buffer_head *bitmap_bh = NULL; + struct ext4_group_desc *gdp; +@@ -4086,13 +4085,6 @@ ext4_mb_mark_diskspace_used(struct ext4_ + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + 
percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); +- /* +- * Now reduce the dirty block count also. Should not go negative +- */ +- if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) +- /* release all the reserved blocks if non delalloc */ +- percpu_counter_sub(&sbi->s_dirtyclusters_counter, +- reserv_clstrs); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, +@@ -6265,7 +6257,7 @@ repeat: + ext4_mb_pa_put_free(ac); + } + if (likely(ac->ac_status == AC_STATUS_FOUND)) { +- *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); ++ *errp = ext4_mb_mark_diskspace_used(ac, handle); + if (*errp) { + ext4_discard_allocated_blocks(ac); + goto errout; +@@ -6296,12 +6288,9 @@ errout: + out: + if (inquota && ar->len < inquota) + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); +- if (!ar->len) { +- if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) +- /* release all the reserved blocks if non delalloc */ +- percpu_counter_sub(&sbi->s_dirtyclusters_counter, +- reserv_clstrs); +- } ++ /* release any reserved blocks */ ++ if (reserv_clstrs) ++ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); + + trace_ext4_allocate_blocks(ar, (unsigned long long)block); + diff --git a/queue-6.6/net-stmmac-dwmac-loongson-set-clk_csr_i-to-100-150mhz.patch b/queue-6.6/net-stmmac-dwmac-loongson-set-clk_csr_i-to-100-150mhz.patch new file mode 100644 index 0000000000..802c744fd2 --- /dev/null +++ b/queue-6.6/net-stmmac-dwmac-loongson-set-clk_csr_i-to-100-150mhz.patch @@ -0,0 +1,38 @@ +From stable+bounces-217275-greg=kroah.com@vger.kernel.org Wed Feb 18 13:13:16 2026 +From: Huacai Chen +Date: Wed, 18 Feb 2026 20:12:42 +0800 +Subject: net: stmmac: dwmac-loongson: Set clk_csr_i to 100-150MHz +To: Greg Kroah-Hartman , Sasha Levin , Huacai Chen +Cc: Xuerui Wang , stable@vger.kernel.org, Andrew Lunn , "David S . 
Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Yanteng Si , linux-kernel@vger.kernel.org, loongarch@lists.linux.dev, netdev@vger.kernel.org, Huacai Chen , Hongliang Wang +Message-ID: <20260218121242.2545128-1-chenhuacai@loongson.cn> + +From: Huacai Chen + +commit e1aa5ef892fb4fa9014a25e87b64b97347919d37 upstream. + +Current clk_csr_i setting of Loongson STMMAC (including LS7A1000/2000 +and LS2K1000/2000/3000) are copy & paste from other drivers. In fact, +Loongson STMMAC use 125MHz clocks and need 62 freq division to within +2.5MHz, meeting most PHY MDC requirement. So fix by setting clk_csr_i +to 100-150MHz, otherwise some PHYs may link fail. + +Cc: stable@vger.kernel.org +Fixes: 30bba69d7db40e7 ("stmmac: pci: Add dwmac support for Loongson") +Signed-off-by: Hongliang Wang +Signed-off-by: Huacai Chen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c ++++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +@@ -14,7 +14,7 @@ + static int loongson_default_data(struct plat_stmmacenet_data *plat) + + { +- plat->clk_csr = 2; /* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */ ++ plat->clk_csr = 1; /* clk_csr_i = 100-150MHz & MDC = clk_csr_i/62 */ + plat->has_gmac = 1; + plat->force_sf_dma_mode = 1; + diff --git a/queue-6.6/nfs-fix-a-deadlock-involving-nfs_release_folio.patch b/queue-6.6/nfs-fix-a-deadlock-involving-nfs_release_folio.patch new file mode 100644 index 0000000000..4167b87d8c --- /dev/null +++ b/queue-6.6/nfs-fix-a-deadlock-involving-nfs_release_folio.patch @@ -0,0 +1,110 @@ +From stable+bounces-217867-greg=kroah.com@vger.kernel.org Tue Feb 24 08:02:54 2026 +From: Li hongliang <1468888505@139.com> +Date: Tue, 24 Feb 2026 15:02:37 +0800 +Subject: NFS: Fix a deadlock involving nfs_release_folio() +To: gregkh@linuxfoundation.org, stable@vger.kernel.org, trond.myklebust@hammerspace.com +Cc: 
patches@lists.linux.dev, linux-kernel@vger.kernel.org, anna@kernel.org, linux-nfs@vger.kernel.org, wangzhaolong@huaweicloud.com +Message-ID: <20260224070237.2933965-1-1468888505@139.com> + +From: Trond Myklebust + +[ Upstream commit cce0be6eb4971456b703aaeafd571650d314bcca ] + +Wang Zhaolong reports a deadlock involving NFSv4.1 state recovery +waiting on kthreadd, which is attempting to reclaim memory by calling +nfs_release_folio(). The latter cannot make progress due to state +recovery being needed. + +It seems that the only safe thing to do here is to kick off a writeback +of the folio, without waiting for completion, or else kicking off an +asynchronous commit. + +Reported-by: Wang Zhaolong +Fixes: 96780ca55e3c ("NFS: fix up nfs_release_folio() to try to release the page") +Signed-off-by: Trond Myklebust +[ Minor conflict resolved. ] +Signed-off-by: Li hongliang <1468888505@139.com> +Signed-off-by: Greg Kroah-Hartman +--- + fs/nfs/file.c | 3 ++- + fs/nfs/nfstrace.h | 3 +++ + fs/nfs/write.c | 33 +++++++++++++++++++++++++++++++++ + include/linux/nfs_fs.h | 1 + + 4 files changed, 39 insertions(+), 1 deletion(-) + +--- a/fs/nfs/file.c ++++ b/fs/nfs/file.c +@@ -459,7 +459,8 @@ static bool nfs_release_folio(struct fol + if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL || + current_is_kswapd() || current_is_kcompactd()) + return false; +- if (nfs_wb_folio(folio_file_mapping(folio)->host, folio) < 0) ++ if (nfs_wb_folio_reclaim(folio_file_mapping(folio)->host, folio) < 0 || ++ folio_test_private(folio)) + return false; + } + return nfs_fscache_release_folio(folio, gfp); +--- a/fs/nfs/nfstrace.h ++++ b/fs/nfs/nfstrace.h +@@ -1033,6 +1033,9 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done + DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage); + DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done); + ++DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio_reclaim); ++DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_reclaim_done); ++ + DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio); + 
DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); + +--- a/fs/nfs/write.c ++++ b/fs/nfs/write.c +@@ -2122,6 +2122,39 @@ int nfs_wb_folio_cancel(struct inode *in + } + + /** ++ * nfs_wb_folio_reclaim - Write back all requests on one page ++ * @inode: pointer to page ++ * @folio: pointer to folio ++ * ++ * Assumes that the folio has been locked by the caller ++ */ ++int nfs_wb_folio_reclaim(struct inode *inode, struct folio *folio) ++{ ++ loff_t range_start = folio_pos(folio); ++ size_t len = folio_size(folio); ++ struct writeback_control wbc = { ++ .sync_mode = WB_SYNC_ALL, ++ .nr_to_write = 0, ++ .range_start = range_start, ++ .range_end = range_start + len - 1, ++ .for_sync = 1, ++ }; ++ int ret; ++ ++ if (folio_test_writeback(folio)) ++ return -EBUSY; ++ if (folio_clear_dirty_for_io(folio)) { ++ trace_nfs_writeback_folio_reclaim(inode, range_start, len); ++ ret = nfs_writepage_locked(folio, &wbc); ++ trace_nfs_writeback_folio_reclaim_done(inode, range_start, len, ++ ret); ++ return ret; ++ } ++ nfs_commit_inode(inode, 0); ++ return 0; ++} ++ ++/** + * nfs_wb_folio - Write back all requests on one page + * @inode: pointer to page + * @folio: pointer to folio +--- a/include/linux/nfs_fs.h ++++ b/include/linux/nfs_fs.h +@@ -608,6 +608,7 @@ extern int nfs_update_folio(struct file + extern int nfs_sync_inode(struct inode *inode); + extern int nfs_wb_all(struct inode *inode); + extern int nfs_wb_folio(struct inode *inode, struct folio *folio); ++extern int nfs_wb_folio_reclaim(struct inode *inode, struct folio *folio); + int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio); + extern int nfs_commit_inode(struct inode *, int); + extern struct nfs_commit_data *nfs_commitdata_alloc(void); diff --git a/queue-6.6/nfs-pass-explicit-offset-count-to-trace-events.patch b/queue-6.6/nfs-pass-explicit-offset-count-to-trace-events.patch new file mode 100644 index 0000000000..d5b4fd3a0c --- /dev/null +++ 
b/queue-6.6/nfs-pass-explicit-offset-count-to-trace-events.patch @@ -0,0 +1,226 @@ +From stable+bounces-217866-greg=kroah.com@vger.kernel.org Tue Feb 24 08:01:20 2026 +From: Li hongliang <1468888505@139.com> +Date: Tue, 24 Feb 2026 15:00:58 +0800 +Subject: nfs: pass explicit offset/count to trace events +To: gregkh@linuxfoundation.org, stable@vger.kernel.org, hch@lst.de +Cc: patches@lists.linux.dev, linux-kernel@vger.kernel.org, trond.myklebust@hammerspace.com, anna@kernel.org, linux-nfs@vger.kernel.org, chuck.lever@oracle.com, Anna.Schumaker@Netapp.com +Message-ID: <20260224070058.2933695-1-1468888505@139.com> + +From: Christoph Hellwig + +[ Upstream commit fada32ed6dbc748f447c8d050a961b75d946055a ] + +nfs_folio_length is unsafe to use without having the folio locked and a +check for a NULL ->f_mapping that protects against truncations and can +lead to kernel crashes. E.g. when running xfstests generic/065 with +all nfs trace points enabled. + +Follow the model of the XFS trace points and pass in an explicit offset +and length. This has the additional benefit that these values can +be more accurate as some of the users touch partial folio ranges. + +Fixes: eb5654b3b89d ("NFS: Enable tracing of nfs_invalidate_folio() and nfs_launder_folio()") +Reported-by: Chuck Lever +Signed-off-by: Christoph Hellwig +Signed-off-by: Anna Schumaker +[ Minor conflict resolved. 
] +Signed-off-by: Li hongliang <1468888505@139.com> +Signed-off-by: Greg Kroah-Hartman +--- + fs/nfs/file.c | 5 +++-- + fs/nfs/nfstrace.h | 36 ++++++++++++++++++++---------------- + fs/nfs/read.c | 8 +++++--- + fs/nfs/write.c | 10 +++++----- + 4 files changed, 33 insertions(+), 26 deletions(-) + +--- a/fs/nfs/file.c ++++ b/fs/nfs/file.c +@@ -441,7 +441,7 @@ static void nfs_invalidate_folio(struct + /* Cancel any unstarted writes on this page */ + nfs_wb_folio_cancel(inode, folio); + folio_wait_fscache(folio); +- trace_nfs_invalidate_folio(inode, folio); ++ trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length); + } + + /* +@@ -509,7 +509,8 @@ static int nfs_launder_folio(struct foli + + folio_wait_fscache(folio); + ret = nfs_wb_folio(inode, folio); +- trace_nfs_launder_folio_done(inode, folio, ret); ++ trace_nfs_launder_folio_done(inode, folio_pos(folio), ++ folio_size(folio), ret); + return ret; + } + +--- a/fs/nfs/nfstrace.h ++++ b/fs/nfs/nfstrace.h +@@ -933,10 +933,11 @@ TRACE_EVENT(nfs_sillyrename_unlink, + DECLARE_EVENT_CLASS(nfs_folio_event, + TP_PROTO( + const struct inode *inode, +- struct folio *folio ++ loff_t offset, ++ size_t count + ), + +- TP_ARGS(inode, folio), ++ TP_ARGS(inode, offset, count), + + TP_STRUCT__entry( + __field(dev_t, dev) +@@ -944,7 +945,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event, + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) +- __field(u32, count) ++ __field(size_t, count) + ), + + TP_fast_assign( +@@ -954,13 +955,13 @@ DECLARE_EVENT_CLASS(nfs_folio_event, + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); +- __entry->offset = folio_file_pos(folio); +- __entry->count = nfs_folio_length(folio); ++ __entry->offset = offset, ++ __entry->count = count; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " +- "offset=%lld count=%u", ++ "offset=%lld count=%zu", + MAJOR(__entry->dev), 
MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, +@@ -972,18 +973,20 @@ DECLARE_EVENT_CLASS(nfs_folio_event, + DEFINE_EVENT(nfs_folio_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ +- struct folio *folio \ ++ loff_t offset, \ ++ size_t count \ + ), \ +- TP_ARGS(inode, folio)) ++ TP_ARGS(inode, offset, count)) + + DECLARE_EVENT_CLASS(nfs_folio_event_done, + TP_PROTO( + const struct inode *inode, +- struct folio *folio, ++ loff_t offset, ++ size_t count, + int ret + ), + +- TP_ARGS(inode, folio, ret), ++ TP_ARGS(inode, offset, count, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) +@@ -992,7 +995,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) +- __field(u32, count) ++ __field(size_t, count) + ), + + TP_fast_assign( +@@ -1002,14 +1005,14 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); +- __entry->offset = folio_file_pos(folio); +- __entry->count = nfs_folio_length(folio); ++ __entry->offset = offset, ++ __entry->count = count, + __entry->ret = ret; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " +- "offset=%lld count=%u ret=%d", ++ "offset=%lld count=%zu ret=%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, +@@ -1021,10 +1024,11 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done + DEFINE_EVENT(nfs_folio_event_done, name, \ + TP_PROTO( \ + const struct inode *inode, \ +- struct folio *folio, \ ++ loff_t offset, \ ++ size_t count, \ + int ret \ + ), \ +- TP_ARGS(inode, folio, ret)) ++ TP_ARGS(inode, offset, count, ret)) + + DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage); + DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done); +--- a/fs/nfs/read.c ++++ b/fs/nfs/read.c +@@ -333,13 +333,15 @@ out: + int 
nfs_read_folio(struct file *file, struct folio *folio) + { + struct inode *inode = file_inode(file); ++ loff_t pos = folio_pos(folio); ++ size_t len = folio_size(folio); + struct nfs_pageio_descriptor pgio; + struct nfs_open_context *ctx; + int ret; + +- trace_nfs_aop_readpage(inode, folio); ++ trace_nfs_aop_readpage(inode, pos, len); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); +- task_io_account_read(folio_size(folio)); ++ task_io_account_read(len); + + /* + * Try to flush any pending writes to the file.. +@@ -382,7 +384,7 @@ int nfs_read_folio(struct file *file, st + out_put: + put_nfs_open_context(ctx); + out: +- trace_nfs_aop_readpage_done(inode, folio, ret); ++ trace_nfs_aop_readpage_done(inode, pos, len, ret); + return ret; + out_unlock: + folio_unlock(folio); +--- a/fs/nfs/write.c ++++ b/fs/nfs/write.c +@@ -2131,17 +2131,17 @@ int nfs_wb_folio_cancel(struct inode *in + */ + int nfs_wb_folio(struct inode *inode, struct folio *folio) + { +- loff_t range_start = folio_file_pos(folio); +- loff_t range_end = range_start + (loff_t)folio_size(folio) - 1; ++ loff_t range_start = folio_pos(folio); ++ size_t len = folio_size(folio); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + .range_start = range_start, +- .range_end = range_end, ++ .range_end = range_start + len - 1, + }; + int ret; + +- trace_nfs_writeback_folio(inode, folio); ++ trace_nfs_writeback_folio(inode, range_start, len); + + for (;;) { + folio_wait_writeback(folio); +@@ -2159,7 +2159,7 @@ int nfs_wb_folio(struct inode *inode, st + goto out_error; + } + out_error: +- trace_nfs_writeback_folio_done(inode, folio, ret); ++ trace_nfs_writeback_folio_done(inode, range_start, len, ret); + return ret; + } + diff --git a/queue-6.6/pnfs-fix-a-deadlock-when-returning-a-delegation-during-open.patch b/queue-6.6/pnfs-fix-a-deadlock-when-returning-a-delegation-during-open.patch new file mode 100644 index 0000000000..01e6fbea70 --- /dev/null +++ 
b/queue-6.6/pnfs-fix-a-deadlock-when-returning-a-delegation-during-open.patch @@ -0,0 +1,214 @@ +From stable+bounces-217868-greg=kroah.com@vger.kernel.org Tue Feb 24 08:22:45 2026 +From: Li hongliang <1468888505@139.com> +Date: Tue, 24 Feb 2026 15:22:02 +0800 +Subject: pNFS: Fix a deadlock when returning a delegation during open() +To: gregkh@linuxfoundation.org, stable@vger.kernel.org, trond.myklebust@hammerspace.com +Cc: patches@lists.linux.dev, linux-kernel@vger.kernel.org, bcodding@hammerspace.com, anna@kernel.org, linux-nfs@vger.kernel.org, wangzhaolong@huaweicloud.com +Message-ID: <20260224072202.2940831-1-1468888505@139.com> + +From: Trond Myklebust + +[ Upstream commit 857bf9056291a16785ae3be1d291026b2437fc48 ] + +Ben Coddington reports seeing a hang in the following stack trace: + 0 [ffffd0b50e1774e0] __schedule at ffffffff9ca05415 + 1 [ffffd0b50e177548] schedule at ffffffff9ca05717 + 2 [ffffd0b50e177558] bit_wait at ffffffff9ca061e1 + 3 [ffffd0b50e177568] __wait_on_bit at ffffffff9ca05cfb + 4 [ffffd0b50e1775c8] out_of_line_wait_on_bit at ffffffff9ca05ea5 + 5 [ffffd0b50e177618] pnfs_roc at ffffffffc154207b [nfsv4] + 6 [ffffd0b50e1776b8] _nfs4_proc_delegreturn at ffffffffc1506586 [nfsv4] + 7 [ffffd0b50e177788] nfs4_proc_delegreturn at ffffffffc1507480 [nfsv4] + 8 [ffffd0b50e1777f8] nfs_do_return_delegation at ffffffffc1523e41 [nfsv4] + 9 [ffffd0b50e177838] nfs_inode_set_delegation at ffffffffc1524a75 [nfsv4] + 10 [ffffd0b50e177888] nfs4_process_delegation at ffffffffc14f41dd [nfsv4] + 11 [ffffd0b50e1778a0] _nfs4_opendata_to_nfs4_state at ffffffffc1503edf [nfsv4] + 12 [ffffd0b50e1778c0] _nfs4_open_and_get_state at ffffffffc1504e56 [nfsv4] + 13 [ffffd0b50e177978] _nfs4_do_open at ffffffffc15051b8 [nfsv4] + 14 [ffffd0b50e1779f8] nfs4_do_open at ffffffffc150559c [nfsv4] + 15 [ffffd0b50e177a80] nfs4_atomic_open at ffffffffc15057fb [nfsv4] + 16 [ffffd0b50e177ad0] nfs4_file_open at ffffffffc15219be [nfsv4] + 17 [ffffd0b50e177b78] do_dentry_open at ffffffff9c09e6ea 
+ 18 [ffffd0b50e177ba8] vfs_open at ffffffff9c0a082e + 19 [ffffd0b50e177bd0] dentry_open at ffffffff9c0a0935 + +The issue is that the delegreturn is being asked to wait for a layout +return that cannot complete because a state recovery was initiated. The +state recovery cannot complete until the open() finishes processing the +delegations it was given. + +The solution is to propagate the existing flags that indicate a +non-blocking call to the function pnfs_roc(), so that it knows not to +wait in this situation. + +Reported-by: Benjamin Coddington +Fixes: 29ade5db1293 ("pNFS: Wait on outstanding layoutreturns to complete in pnfs_roc()") +Signed-off-by: Trond Myklebust +[ Minor conflict resolved. ] +Signed-off-by: Li hongliang <1468888505@139.com> +Signed-off-by: Greg Kroah-Hartman +--- + fs/nfs/nfs4proc.c | 6 ++--- + fs/nfs/pnfs.c | 58 ++++++++++++++++++++++++++++++++++++++---------------- + fs/nfs/pnfs.h | 17 ++++++--------- + 3 files changed, 51 insertions(+), 30 deletions(-) + +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -3792,8 +3792,8 @@ int nfs4_do_close(struct nfs4_state *sta + calldata->res.seqid = calldata->arg.seqid; + calldata->res.server = server; + calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; +- calldata->lr.roc = pnfs_roc(state->inode, +- &calldata->lr.arg, &calldata->lr.res, msg.rpc_cred); ++ calldata->lr.roc = pnfs_roc(state->inode, &calldata->lr.arg, ++ &calldata->lr.res, msg.rpc_cred, wait); + if (calldata->lr.roc) { + calldata->arg.lr_args = &calldata->lr.arg; + calldata->res.lr_res = &calldata->lr.res; +@@ -6742,7 +6742,7 @@ static int _nfs4_proc_delegreturn(struct + data->inode = nfs_igrab_and_active(inode); + if (data->inode || issync) { + data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res, +- cred); ++ cred, issync); + if (data->lr.roc) { + data->args.lr_args = &data->lr.arg; + data->res.lr_res = &data->lr.res; +--- a/fs/nfs/pnfs.c ++++ b/fs/nfs/pnfs.c +@@ -1427,10 +1427,9 @@ pnfs_commit_and_return_layout(struct ino + 
return ret; + } + +-bool pnfs_roc(struct inode *ino, +- struct nfs4_layoutreturn_args *args, +- struct nfs4_layoutreturn_res *res, +- const struct cred *cred) ++bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, ++ struct nfs4_layoutreturn_res *res, const struct cred *cred, ++ bool sync) + { + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_open_context *ctx; +@@ -1441,7 +1440,7 @@ bool pnfs_roc(struct inode *ino, + nfs4_stateid stateid; + enum pnfs_iomode iomode = 0; + bool layoutreturn = false, roc = false; +- bool skip_read = false; ++ bool skip_read; + + if (!nfs_have_layout(ino)) + return false; +@@ -1454,20 +1453,14 @@ retry: + lo = NULL; + goto out_noroc; + } +- pnfs_get_layout_hdr(lo); +- if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { +- spin_unlock(&ino->i_lock); +- rcu_read_unlock(); +- wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, +- TASK_UNINTERRUPTIBLE); +- pnfs_put_layout_hdr(lo); +- goto retry; +- } + + /* no roc if we hold a delegation */ ++ skip_read = false; + if (nfs4_check_delegation(ino, FMODE_READ)) { +- if (nfs4_check_delegation(ino, FMODE_WRITE)) ++ if (nfs4_check_delegation(ino, FMODE_WRITE)) { ++ lo = NULL; + goto out_noroc; ++ } + skip_read = true; + } + +@@ -1476,12 +1469,43 @@ retry: + if (state == NULL) + continue; + /* Don't return layout if there is open file state */ +- if (state->state & FMODE_WRITE) ++ if (state->state & FMODE_WRITE) { ++ lo = NULL; + goto out_noroc; ++ } + if (state->state & FMODE_READ) + skip_read = true; + } + ++ if (skip_read) { ++ bool writes = false; ++ ++ list_for_each_entry(lseg, &lo->plh_segs, pls_list) { ++ if (lseg->pls_range.iomode != IOMODE_READ) { ++ writes = true; ++ break; ++ } ++ } ++ if (!writes) { ++ lo = NULL; ++ goto out_noroc; ++ } ++ } ++ ++ pnfs_get_layout_hdr(lo); ++ if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { ++ if (!sync) { ++ pnfs_set_plh_return_info( ++ lo, skip_read ? 
IOMODE_RW : IOMODE_ANY, 0); ++ goto out_noroc; ++ } ++ spin_unlock(&ino->i_lock); ++ rcu_read_unlock(); ++ wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, ++ TASK_UNINTERRUPTIBLE); ++ pnfs_put_layout_hdr(lo); ++ goto retry; ++ } + + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) { + if (skip_read && lseg->pls_range.iomode == IOMODE_READ) +@@ -1521,7 +1545,7 @@ retry: + out_noroc: + spin_unlock(&ino->i_lock); + rcu_read_unlock(); +- pnfs_layoutcommit_inode(ino, true); ++ pnfs_layoutcommit_inode(ino, sync); + if (roc) { + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; + if (ld->prepare_layoutreturn) +--- a/fs/nfs/pnfs.h ++++ b/fs/nfs/pnfs.h +@@ -295,10 +295,9 @@ int pnfs_mark_matching_lsegs_return(stru + u32 seq); + int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list); +-bool pnfs_roc(struct inode *ino, +- struct nfs4_layoutreturn_args *args, +- struct nfs4_layoutreturn_res *res, +- const struct cred *cred); ++bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, ++ struct nfs4_layoutreturn_res *res, const struct cred *cred, ++ bool sync); + int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, int *ret); + void pnfs_roc_release(struct nfs4_layoutreturn_args *args, +@@ -769,12 +768,10 @@ pnfs_layoutcommit_outstanding(struct ino + return false; + } + +- +-static inline bool +-pnfs_roc(struct inode *ino, +- struct nfs4_layoutreturn_args *args, +- struct nfs4_layoutreturn_res *res, +- const struct cred *cred) ++static inline bool pnfs_roc(struct inode *ino, ++ struct nfs4_layoutreturn_args *args, ++ struct nfs4_layoutreturn_res *res, ++ const struct cred *cred, bool sync) + { + return false; + } diff --git a/queue-6.6/rxrpc-fix-recvmsg-unconditional-requeue.patch b/queue-6.6/rxrpc-fix-recvmsg-unconditional-requeue.patch new file mode 100644 index 0000000000..29912a0ef8 --- /dev/null +++ 
b/queue-6.6/rxrpc-fix-recvmsg-unconditional-requeue.patch @@ -0,0 +1,106 @@ +From stable+bounces-219745-greg=kroah.com@vger.kernel.org Thu Feb 26 03:42:48 2026 +From: Robert Garcia +Date: Thu, 26 Feb 2026 10:41:02 +0800 +Subject: rxrpc: Fix recvmsg() unconditional requeue +To: stable@vger.kernel.org, David Howells +Cc: Marc Dionne , Robert Garcia , Steven Rostedt , linux-kernel@vger.kernel.org, Masami Hiramatsu , "David S . Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , linux-afs@lists.infradead.org, linux-trace-kernel@vger.kernel.org, netdev@vger.kernel.org, Faith , Pumpkin Chang , Nir Ohfeld , Willy Tarreau , Simon Horman +Message-ID: <20260226024102.3522867-1-rob_garcia@163.com> + +From: David Howells + +[ Upstream commit 2c28769a51deb6022d7fbd499987e237a01dd63a ] + +If rxrpc_recvmsg() fails because MSG_DONTWAIT was specified but the call at +the front of the recvmsg queue already has its mutex locked, it requeues +the call - whether or not the call is already queued. The call may be on +the queue because MSG_PEEK was also passed and so the call was not dequeued +or because the I/O thread requeued it. + +The unconditional requeue may then corrupt the recvmsg queue, leading to +things like UAFs or refcount underruns. + +Fix this by only requeuing the call if it isn't already on the queue - and +moving it to the front if it is already queued. If we don't queue it, we +have to put the ref we obtained by dequeuing it. + +Also, MSG_PEEK doesn't dequeue the call so shouldn't call +rxrpc_notify_socket() for the call if we didn't use up all the data on the +queue, so fix that also. 
+ +Fixes: 540b1c48c37a ("rxrpc: Fix deadlock between call creation and sendmsg/recvmsg") +Reported-by: Faith +Reported-by: Pumpkin Chang +Signed-off-by: David Howells +Acked-by: Marc Dionne +cc: Nir Ohfeld +cc: Willy Tarreau +cc: Simon Horman +cc: linux-afs@lists.infradead.org +cc: stable@kernel.org +Link: https://patch.msgid.link/95163.1768428203@warthog.procyon.org.uk +Signed-off-by: Jakub Kicinski +[Use spin_unlock instead of spin_unlock_irq to maintain context consistency.] +Signed-off-by: Robert Garcia +Signed-off-by: Greg Kroah-Hartman +--- + include/trace/events/rxrpc.h | 4 ++++ + net/rxrpc/recvmsg.c | 19 +++++++++++++++---- + 2 files changed, 19 insertions(+), 4 deletions(-) + +--- a/include/trace/events/rxrpc.h ++++ b/include/trace/events/rxrpc.h +@@ -270,6 +270,7 @@ + EM(rxrpc_call_put_kernel, "PUT kernel ") \ + EM(rxrpc_call_put_poke, "PUT poke ") \ + EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ ++ EM(rxrpc_call_put_recvmsg_peek_nowait, "PUT peek-nwt") \ + EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ + EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ + EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ +@@ -287,6 +288,9 @@ + EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ + EM(rxrpc_call_see_input, "SEE input ") \ + EM(rxrpc_call_see_recvmsg, "SEE recvmsg ") \ ++ EM(rxrpc_call_see_recvmsg_requeue, "SEE recv-rqu") \ ++ EM(rxrpc_call_see_recvmsg_requeue_first, "SEE recv-rqF") \ ++ EM(rxrpc_call_see_recvmsg_requeue_move, "SEE recv-rqM") \ + EM(rxrpc_call_see_release, "SEE release ") \ + EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ + EM(rxrpc_call_see_waiting_call, "SEE q-conn ") \ +--- a/net/rxrpc/recvmsg.c ++++ b/net/rxrpc/recvmsg.c +@@ -430,7 +430,8 @@ try_again: + if (rxrpc_call_has_failed(call)) + goto call_failed; + +- if (!skb_queue_empty(&call->recvmsg_queue)) ++ if (!(flags & MSG_PEEK) && ++ !skb_queue_empty(&call->recvmsg_queue)) + rxrpc_notify_socket(call); + goto not_yet_complete; + +@@ -461,11 +462,21 @@ error_unlock_call: + 
error_requeue_call: + if (!(flags & MSG_PEEK)) { + spin_lock(&rx->recvmsg_lock); +- list_add(&call->recvmsg_link, &rx->recvmsg_q); +- spin_unlock(&rx->recvmsg_lock); ++ if (list_empty(&call->recvmsg_link)) { ++ list_add(&call->recvmsg_link, &rx->recvmsg_q); ++ rxrpc_see_call(call, rxrpc_call_see_recvmsg_requeue); ++ spin_unlock(&rx->recvmsg_lock); ++ } else if (list_is_first(&call->recvmsg_link, &rx->recvmsg_q)) { ++ spin_unlock(&rx->recvmsg_lock); ++ rxrpc_put_call(call, rxrpc_call_see_recvmsg_requeue_first); ++ } else { ++ list_move(&call->recvmsg_link, &rx->recvmsg_q); ++ spin_unlock(&rx->recvmsg_lock); ++ rxrpc_put_call(call, rxrpc_call_see_recvmsg_requeue_move); ++ } + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); + } else { +- rxrpc_put_call(call, rxrpc_call_put_recvmsg); ++ rxrpc_put_call(call, rxrpc_call_put_recvmsg_peek_nowait); + } + error_no_call: + release_sock(&rx->sk); diff --git a/queue-6.6/series b/queue-6.6/series index 37a1df4b0c..725e1c387d 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -396,3 +396,19 @@ smb-client-compare-macs-in-constant-time.patch ksmbd-compare-macs-in-constant-time.patch net-tcp-md5-fix-mac-comparison-to-be-constant-time.patch f2fs-fix-to-avoid-migrating-empty-section.patch +ext4-fix-dirtyclusters-double-decrement-on-fs-shutdown.patch +btrfs-always-fallback-to-buffered-write-if-the-inode-requires-checksum.patch +net-stmmac-dwmac-loongson-set-clk_csr_i-to-100-150mhz.patch +arm64-mm-don-t-remap-pgtables-per-cont-pte-pmd-block.patch +arm64-mm-batch-dsb-and-isb-when-populating-pgtables.patch +arm64-mm-don-t-remap-pgtables-for-allocate-vs-populate.patch +btrfs-fix-null-dereference-on-root-when-tracing-inode-eviction.patch +dst-fix-races-in-rt6_uncached_list_del-and-rt_del_uncached_list.patch +nfs-pass-explicit-offset-count-to-trace-events.patch +nfs-fix-a-deadlock-involving-nfs_release_folio.patch +pnfs-fix-a-deadlock-when-returning-a-delegation-during-open.patch 
+usb-typec-ucsi-move-unregister-out-of-atomic-section.patch +eth-bnxt-always-recalculate-features-after-xdp-clearing-fix-null-deref.patch +ext4-always-allocate-blocks-only-from-groups-inode-can-use.patch +rxrpc-fix-recvmsg-unconditional-requeue.patch +dm-verity-disable-recursive-forward-error-correction.patch diff --git a/queue-6.6/usb-typec-ucsi-move-unregister-out-of-atomic-section.patch b/queue-6.6/usb-typec-ucsi-move-unregister-out-of-atomic-section.patch new file mode 100644 index 0000000000..8bd1d46a02 --- /dev/null +++ b/queue-6.6/usb-typec-ucsi-move-unregister-out-of-atomic-section.patch @@ -0,0 +1,131 @@ +From black.hawk@163.com Wed Feb 25 06:10:42 2026 +From: Rahul Sharma +Date: Wed, 25 Feb 2026 13:10:08 +0800 +Subject: usb: typec: ucsi: Move unregister out of atomic section +To: gregkh@linuxfoundation.org, stable@vger.kernel.org +Cc: linux-kernel@vger.kernel.org, Bjorn Andersson , Heikki Krogerus , Neil Armstrong , Dmitry Baryshkov , Amit Pundir , Johan Hovold , Bjorn Andersson , Rahul Sharma +Message-ID: <20260225051008.2547855-1-black.hawk@163.com> + +From: Bjorn Andersson + +[ Upstream commit 11bb2ffb679399f99041540cf662409905179e3a ] + +Commit '9329933699b3 ("soc: qcom: pmic_glink: Make client-lock +non-sleeping")' moved the pmic_glink client list under a spinlock, as it +is accessed by the rpmsg/glink callback, which in turn is invoked from +IRQ context. + +This means that ucsi_unregister() is now called from atomic context, +which isn't feasible as it's expecting a sleepable context. An effort is +under way to get GLINK to invoke its callbacks in a sleepable context, +but until then let's schedule the unregistration. + +A side effect of this is that ucsi_unregister() can now happen +after the remote processor, and thereby the communication link with it, is +gone. pmic_glink_send() is amended with a check to avoid the resulting NULL +pointer dereference.
+This does however result in the user being informed about this error by +the following entry in the kernel log: + + ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI write request: -5 + +Fixes: 9329933699b3 ("soc: qcom: pmic_glink: Make client-lock non-sleeping") +Cc: stable@vger.kernel.org +Reviewed-by: Heikki Krogerus +Reviewed-by: Neil Armstrong +Reviewed-by: Dmitry Baryshkov +Tested-by: Amit Pundir +Reviewed-by: Johan Hovold +Tested-by: Johan Hovold +Signed-off-by: Bjorn Andersson +Link: https://lore.kernel.org/r/20240820-pmic-glink-v6-11-races-v3-2-eec53c750a04@quicinc.com +Signed-off-by: Bjorn Andersson +[ The context change is due to the commit 584e8df58942 +("usb: typec: ucsi: extract common code for command handling") +in v6.11 which is irrelevant to the logic of this patch. ] +Signed-off-by: Rahul Sharma +Signed-off-by: Greg Kroah-Hartman +--- + drivers/soc/qcom/pmic_glink.c | 10 +++++++++- + drivers/usb/typec/ucsi/ucsi_glink.c | 27 ++++++++++++++++++++++----- + 2 files changed, 31 insertions(+), 6 deletions(-) + +--- a/drivers/soc/qcom/pmic_glink.c ++++ b/drivers/soc/qcom/pmic_glink.c +@@ -115,8 +115,16 @@ EXPORT_SYMBOL_GPL(pmic_glink_client_regi + int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len) + { + struct pmic_glink *pg = client->pg; ++ int ret; + +- return rpmsg_send(pg->ept, data, len); ++ mutex_lock(&pg->state_lock); ++ if (!pg->ept) ++ ret = -ECONNRESET; ++ else ++ ret = rpmsg_send(pg->ept, data, len); ++ mutex_unlock(&pg->state_lock); ++ ++ return ret; + } + EXPORT_SYMBOL_GPL(pmic_glink_send); + +--- a/drivers/usb/typec/ucsi/ucsi_glink.c ++++ b/drivers/usb/typec/ucsi/ucsi_glink.c +@@ -72,6 +72,9 @@ struct pmic_glink_ucsi { + + struct work_struct notify_work; + struct work_struct register_work; ++ spinlock_t state_lock; ++ bool ucsi_registered; ++ bool pd_running; + + u8 read_buf[UCSI_BUF_SIZE]; + }; +@@ -270,8 +273,20 @@ static void pmic_glink_ucsi_notify(struc + static void 
pmic_glink_ucsi_register(struct work_struct *work) + { + struct pmic_glink_ucsi *ucsi = container_of(work, struct pmic_glink_ucsi, register_work); ++ unsigned long flags; ++ bool pd_running; + +- ucsi_register(ucsi->ucsi); ++ spin_lock_irqsave(&ucsi->state_lock, flags); ++ pd_running = ucsi->pd_running; ++ spin_unlock_irqrestore(&ucsi->state_lock, flags); ++ ++ if (!ucsi->ucsi_registered && pd_running) { ++ ucsi_register(ucsi->ucsi); ++ ucsi->ucsi_registered = true; ++ } else if (ucsi->ucsi_registered && !pd_running) { ++ ucsi_unregister(ucsi->ucsi); ++ ucsi->ucsi_registered = false; ++ } + } + + static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv) +@@ -295,11 +310,12 @@ static void pmic_glink_ucsi_callback(con + static void pmic_glink_ucsi_pdr_notify(void *priv, int state) + { + struct pmic_glink_ucsi *ucsi = priv; ++ unsigned long flags; + +- if (state == SERVREG_SERVICE_STATE_UP) +- schedule_work(&ucsi->register_work); +- else if (state == SERVREG_SERVICE_STATE_DOWN) +- ucsi_unregister(ucsi->ucsi); ++ spin_lock_irqsave(&ucsi->state_lock, flags); ++ ucsi->pd_running = (state == SERVREG_SERVICE_STATE_UP); ++ spin_unlock_irqrestore(&ucsi->state_lock, flags); ++ schedule_work(&ucsi->register_work); + } + + static void pmic_glink_ucsi_destroy(void *data) +@@ -332,6 +348,7 @@ static int pmic_glink_ucsi_probe(struct + init_completion(&ucsi->read_ack); + init_completion(&ucsi->write_ack); + init_completion(&ucsi->sync_ack); ++ spin_lock_init(&ucsi->state_lock); + mutex_init(&ucsi->lock); + + ucsi->ucsi = ucsi_create(dev, &pmic_glink_ucsi_ops);