From: Sasha Levin Date: Tue, 16 Jun 2026 02:52:23 +0000 (-0400) Subject: Fixes for all trees X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=5424ef9f0e398723151ef54dfb967143384377d0;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for all trees Signed-off-by: Sasha Levin --- diff --git a/queue-5.15/mm-damon-ops-common-call-folio_test_lru-after-folio_.patch b/queue-5.15/mm-damon-ops-common-call-folio_test_lru-after-folio_.patch new file mode 100644 index 0000000000..49745331d8 --- /dev/null +++ b/queue-5.15/mm-damon-ops-common-call-folio_test_lru-after-folio_.patch @@ -0,0 +1,62 @@ +From 73842fe6334738462d552db8c40885a156aaebad Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Jun 2026 15:37:31 -0400 +Subject: mm/damon/ops-common: call folio_test_lru() after folio_get() + +From: SeongJae Park + +[ Upstream commit d6b8b02a27b3dd09ec12144322b3dac46d9bc9ef ] + +damon_get_folio() speculatively calls folio_test_lru() before +folio_try_get(). The folio can get freed and reallocated to a tail page. +In the case, VM_BUG_ON_PGFLAGS() in const_folio_flags() can be triggered. +Remove the speculative call. + +Also mark folio_test_lru() check right after folio_try_get() success as no +more unlikely. + +The race should be rare. Also the problem can happen only if the kernel +has enabled CONFIG_DEBUG_VM_PGFLAGS. No real world report of this issue +has been made so far. This fix is based on only theoretical analysis. +That said, a bug is a bug. A similar issue was also fixed via commit +3203b3ab0fcf ("mm/filemap: don't call folio_test_locked() without a +reference in next_uptodate_folio()"). I don't expect this change will +make a meaningful impact to DAMON performance in the real world, though I +will be happy to be corrected from the real world reports. + +The issue was discovered [1] by Sashiko. 
+ +Link: https://lore.kernel.org/20260525162256.8317-1-sj@kernel.org +Link: https://lore.kernel.org/20260517234112.89245-1-sj@kernel.org [1] +Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces") +Signed-off-by: SeongJae Park +Cc: Fernand Sieber +Cc: Leonard Foerster +Cc: Shakeel Butt +Cc: # 5.15.x +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/damon/vaddr.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c +index 6d8036671e60e5..dbb0f0fb2e598c 100644 +--- a/mm/damon/vaddr.c ++++ b/mm/damon/vaddr.c +@@ -383,10 +383,10 @@ static struct page *damon_get_page(unsigned long pfn) + { + struct page *page = pfn_to_online_page(pfn); + +- if (!page || !PageLRU(page) || !get_page_unless_zero(page)) ++ if (!page || !get_page_unless_zero(page)) + return NULL; + +- if (unlikely(!PageLRU(page))) { ++ if (!PageLRU(page)) { + put_page(page); + page = NULL; + } +-- +2.53.0 + diff --git a/queue-5.15/series b/queue-5.15/series index 6140606334..692b3964d5 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -244,3 +244,4 @@ drm-amd-display-clamp-vbios-hdmi-retimer-register-count-to-array-size.patch drm-amd-display-fix-null-deref-and-buffer-over-read-in-sdp-debugfs.patch drm-amd-display-use-krealloc_array-in-dal_vector_reserve.patch fs-fcntl-fix-softirq-unsafe-lock-order-in-fasync-signaling.patch +mm-damon-ops-common-call-folio_test_lru-after-folio_.patch diff --git a/queue-6.1/mm-damon-ops-common-call-folio_test_lru-after-folio_.patch b/queue-6.1/mm-damon-ops-common-call-folio_test_lru-after-folio_.patch new file mode 100644 index 0000000000..e01f65291b --- /dev/null +++ b/queue-6.1/mm-damon-ops-common-call-folio_test_lru-after-folio_.patch @@ -0,0 +1,62 @@ +From 650644bfc9f69ad9f66a3b03a9807f3298524ad7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Jun 2026 15:03:29 -0400 +Subject: mm/damon/ops-common: call folio_test_lru() after folio_get() + 
+From: SeongJae Park + +[ Upstream commit d6b8b02a27b3dd09ec12144322b3dac46d9bc9ef ] + +damon_get_folio() speculatively calls folio_test_lru() before +folio_try_get(). The folio can get freed and reallocated to a tail page. +In the case, VM_BUG_ON_PGFLAGS() in const_folio_flags() can be triggered. +Remove the speculative call. + +Also mark folio_test_lru() check right after folio_try_get() success as no +more unlikely. + +The race should be rare. Also the problem can happen only if the kernel +has enabled CONFIG_DEBUG_VM_PGFLAGS. No real world report of this issue +has been made so far. This fix is based on only theoretical analysis. +That said, a bug is a bug. A similar issue was also fixed via commit +3203b3ab0fcf ("mm/filemap: don't call folio_test_locked() without a +reference in next_uptodate_folio()"). I don't expect this change will +make a meaningful impact to DAMON performance in the real world, though I +will be happy to be corrected from the real world reports. + +The issue was discovered [1] by Sashiko. 
+ +Link: https://lore.kernel.org/20260525162256.8317-1-sj@kernel.org +Link: https://lore.kernel.org/20260517234112.89245-1-sj@kernel.org [1] +Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces") +Signed-off-by: SeongJae Park +Cc: Fernand Sieber +Cc: Leonard Foerster +Cc: Shakeel Butt +Cc: # 5.15.x +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/damon/ops-common.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c +index 0b75a8d5c70684..cea4401e95a35e 100644 +--- a/mm/damon/ops-common.c ++++ b/mm/damon/ops-common.c +@@ -23,10 +23,10 @@ struct page *damon_get_page(unsigned long pfn) + { + struct page *page = pfn_to_online_page(pfn); + +- if (!page || !PageLRU(page) || !get_page_unless_zero(page)) ++ if (!page || !get_page_unless_zero(page)) + return NULL; + +- if (unlikely(!PageLRU(page))) { ++ if (!PageLRU(page)) { + put_page(page); + page = NULL; + } +-- +2.53.0 + diff --git a/queue-6.1/mm-huge_memory-update-file-pmd-counter-before-folio_.patch b/queue-6.1/mm-huge_memory-update-file-pmd-counter-before-folio_.patch new file mode 100644 index 0000000000..db28aa02f3 --- /dev/null +++ b/queue-6.1/mm-huge_memory-update-file-pmd-counter-before-folio_.patch @@ -0,0 +1,58 @@ +From 3d59160c13601f82a5cf36e80b867367bf1bbba1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Jun 2026 19:58:37 -0400 +Subject: mm/huge_memory: update file PMD counter before folio_put() + +From: Yin Tirui + +[ Upstream commit 8d878059924f12c1bc24556a92ec56add74de3c8 ] + +__split_huge_pmd_locked() updates the file/shmem RSS counter after +dropping the PMD mapping's folio reference. If folio_put() drops the last +reference, mm_counter_file() can later read freed folio state via +folio_test_swapbacked(). + +Move the counter update before folio_put(). 
+ +Link: https://lore.kernel.org/20260526101337.1984081-1-yintirui@huawei.com +Fixes: fadae2953072 ("thp: use mm_file_counter to determine update which rss counter") +Signed-off-by: Yin Tirui +Reviewed-by: Lorenzo Stoakes +Acked-by: David Hildenbrand (arm) +Reviewed-by: Lance Yang +Reviewed-by: Dev Jain +Cc: Baolin Wang +Cc: Barry Song +Cc: Chen Jun +Cc: Kefeng Wang +Cc: Liam R. Howlett +Cc: Nico Pache +Cc: Ryan Roberts +Cc: Vlastimil Babka +Cc: Yang Shi +Cc: Zi Yan +Cc: +Signed-off-by: Andrew Morton +[ changed folio API calls (folio_remove_rmap_pmd/mm_counter_file(folio)/folio_put) to page-based equivalents (page_remove_rmap/mm_counter_file(page)/put_page) ] +Signed-off-by: Sasha Levin +--- + mm/huge_memory.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 2c118713f77126..7023bdf4896055 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2085,7 +2085,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, + if (!PageReferenced(page) && pmd_young(old_pmd)) + SetPageReferenced(page); + page_remove_rmap(page, vma, true); ++ add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); + put_page(page); ++ return; + } + add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); + return; +-- +2.53.0 + diff --git a/queue-6.1/mm-hugetlb-avoid-false-positive-lockdep-assertion.patch b/queue-6.1/mm-hugetlb-avoid-false-positive-lockdep-assertion.patch new file mode 100644 index 0000000000..dcc99c7fb8 --- /dev/null +++ b/queue-6.1/mm-hugetlb-avoid-false-positive-lockdep-assertion.patch @@ -0,0 +1,255 @@ +From 1331ac0bcc81dbe6c2ec492d01c4b75218c7dbda Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Jun 2026 17:49:19 +0100 +Subject: mm/hugetlb: avoid false positive lockdep assertion + +From: Lorenzo Stoakes + +[ Upstream commit b4aea43cd37afad714b5684fe9fdfcb0e78dba26 ] + +Commit 081056dc00a2 ("mm/hugetlb: unshare page tables during VMA split, +not before") changed the locking model 
around hugetlbfs PMD unsharing on +VMA split, but did not update the function which asserts the locks, +hugetlb_vma_assert_locked(). + +This function asserts that either the hugetlb VMA lock is held (if a +shared mapping) or that the reservation map lock is held (if private). + +If you get an unfortunate race between something which results in one of +these locks being released and a hugetlb VMA split and you have +CONFIG_LOCKDEP enabled, you can therefore see a false positive assertion +arise when there is in fact no issue. + +Since this change introduced a new take_locks parameter to +hugetlb_unshare_pmds(), which, when set to false, indicates that locking +is sufficient, simply pass this to the unsharing logic and predicate the +lock assertions on this. + +This is safe, as we already asserted the file rmap lock and the VMA write +lock prior to this (implying exclusive mmap write lock), so we cannot be +raced by either rmap or page fault page table walkers which the asserted +locks are intended to protect against (we don't mind GUP-fast). + +Separate out huge_pmd_unshare() into __huge_pmd_unshare() to add a +check_locks parameter, and update hugetlb_unshare_pmds() to pass this +parameter to it. + +This leaves all other callers of huge_pmd_unshare() still correctly +asserting the locks. + +The below reproducer will trigger the assert in a kernel with +CONFIG_LOCKDEP enabled by racing process teardown (which will release the +hugetlb lock) against a hugetlb split. + +void execute_one(void) +{ + void *ptr; + pid_t pid; + + /* + * Create a hugetlb mapping spanning a PUD entry. + * + * We force the hugetlb page allocation with populate and + * noreserve. 
+ * + * |---------------------| + * | | + * |---------------------| + * 0 PUD boundary + */ + ptr = mmap(0, PUD_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_SHARED | MAP_ANON | + MAP_NORESERVE | MAP_HUGETLB | MAP_POPULATE, + -1, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + exit(EXIT_FAILURE); + } + + /* + * Fork but with a bogus stack pointer so we try to execute code in + * a non-VM_EXEC VMA, causing segfault + teardown via exit_mmap(). + * + * The clone will cause PMD page table sharing between the + * processes first via: + * copy_process() -> ... -> huge_pte_alloc() -> huge_pmd_share() + * + * Then tear down and release the hugetlb 'VMA' lock via: + * exit_mmap() -> ... -> vma_close() -> hugetlb_vma_lock_free() + */ + pid = syscall(__NR_clone, 0, 2 * PMD_SIZE, 0, 0, 0); + if (pid < 0) { + perror("clone"); + exit(EXIT_FAILURE); + } if (pid == 0) { + /* Pop stack... */ + return; + } + + /* + * We are the parent process. + * + * Race the child process's teardown with a PMD unshare. + * + * We do this by triggering: + * + * __split_vma() -> hugetlb_split() -> hugetlb_unshare_pmds() + * + * Which, importantly, doesn't hold the hugetlb VMA lock (nor can + * it), meaning we assert in hugetlb_vma_assert_locked(). + * + * . + * |----------.----------| + * | . | + * |----------.----------| + * 0 . PUD boundary + */ + mmap(0, PUD_SIZE / 2, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); +} + +int main(void) +{ + int i; + + /* Kick off fork children. */ + for (i = 0; i < NUM_FORKS; i++) { + pid_t pid = fork(); + + if (pid < 0) { + perror("fork"); + exit(EXIT_FAILURE); + } + + /* Fork children do their work and exit. */ + if (!pid) { + int j; + + for (j = 0; j < NUM_ITERS; j++) + execute_one(); + return EXIT_SUCCESS; + } + } + + /* If we succeeded, wait on children. 
*/ + for (i = 0; i < NUM_FORKS; i++) + wait(NULL); + + return EXIT_SUCCESS; +} + +[ljs@kernel.org: account for the !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING case] + Link: https://lore.kernel.org/agWZsPGYid08uU6O@lucifer +Link: https://lore.kernel.org/20260513085658.45264-1-ljs@kernel.org +Fixes: 081056dc00a2 ("mm/hugetlb: unshare page tables during VMA split, not before") +Signed-off-by: Lorenzo Stoakes +Acked-by: David Hildenbrand (Arm) +Acked-by: Oscar Salvador +Cc: Jann Horn +Cc: Muchun Song +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Lorenzo Stoakes +Signed-off-by: Sasha Levin +--- + mm/hugetlb.c | 56 ++++++++++++++++++++++++++++++++++------------------ + 1 file changed, 37 insertions(+), 19 deletions(-) + +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 161f95473c2ac2..6585389f93199d 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -94,6 +94,9 @@ static int hugetlb_acct_memory(struct hstate *h, long delta); + static void hugetlb_vma_lock_free(struct vm_area_struct *vma); + static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); + static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); ++static int __huge_pmd_unshare(struct mmu_gather *tlb, ++ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, ++ bool check_locks); + static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + unsigned long start, unsigned long end, bool take_locks); + static struct resv_map *vma_resv_map(struct vm_area_struct *vma); +@@ -7116,6 +7119,31 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + return pte; + } + ++static int __huge_pmd_unshare(struct mmu_gather *tlb, ++ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, ++ bool check_locks) ++{ ++ unsigned long sz = huge_page_size(hstate_vma(vma)); ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd = pgd_offset(mm, addr); ++ p4d_t *p4d = p4d_offset(pgd, addr); ++ pud_t *pud = pud_offset(p4d, addr); ++ ++ if (sz != PMD_SIZE) ++ return 0; ++ if 
(!atomic_read(&virt_to_page(ptep)->pt_share_count)) ++ return 0; ++ i_mmap_assert_write_locked(vma->vm_file->f_mapping); ++ if (check_locks) ++ hugetlb_vma_assert_locked(vma); ++ pud_clear(pud); ++ ++ tlb_unshare_pmd_ptdesc(tlb, virt_to_page(ptep), addr); ++ ++ mm_dec_nr_pmds(mm); ++ return 1; ++} ++ + /** + * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users + * @tlb: the current mmu_gather. +@@ -7135,24 +7163,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { +- unsigned long sz = huge_page_size(hstate_vma(vma)); +- struct mm_struct *mm = vma->vm_mm; +- pgd_t *pgd = pgd_offset(mm, addr); +- p4d_t *p4d = p4d_offset(pgd, addr); +- pud_t *pud = pud_offset(p4d, addr); +- +- i_mmap_assert_write_locked(vma->vm_file->f_mapping); +- hugetlb_vma_assert_locked(vma); +- if (sz != PMD_SIZE) +- return 0; +- if (!atomic_read(&virt_to_page(ptep)->pt_share_count)) +- return 0; +- +- pud_clear(pud); +- tlb_unshare_pmd_ptdesc(tlb, virt_to_page(ptep), addr); +- +- mm_dec_nr_pmds(mm); +- return 1; ++ return __huge_pmd_unshare(tlb, vma, addr, ptep, /*check_locks=*/true); + } + + /* +@@ -7186,6 +7197,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + return NULL; + } + ++static int __huge_pmd_unshare(struct mmu_gather *tlb, ++ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, ++ bool check_locks) ++{ ++ return 0; ++} ++ + int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { +@@ -7569,7 +7587,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + if (!ptep) + continue; + ptl = huge_pte_lock(h, mm, ptep); +- huge_pmd_unshare(&tlb, vma, address, ptep); ++ __huge_pmd_unshare(&tlb, vma, address, ptep, take_locks); + spin_unlock(ptl); + } + huge_pmd_unshare_flush(&tlb, vma); +-- +2.53.0 + diff --git a/queue-6.1/series 
b/queue-6.1/series index db1649eb75..4744f6bc27 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -325,3 +325,6 @@ drm-amd-display-clamp-vbios-hdmi-retimer-register-count-to-array-size.patch drm-amd-display-fix-null-deref-and-buffer-over-read-in-sdp-debugfs.patch drm-amd-display-use-krealloc_array-in-dal_vector_reserve.patch fs-fcntl-fix-softirq-unsafe-lock-order-in-fasync-signaling.patch +mm-hugetlb-avoid-false-positive-lockdep-assertion.patch +mm-damon-ops-common-call-folio_test_lru-after-folio_.patch +mm-huge_memory-update-file-pmd-counter-before-folio_.patch diff --git a/queue-6.12/mm-hugetlb-avoid-false-positive-lockdep-assertion.patch b/queue-6.12/mm-hugetlb-avoid-false-positive-lockdep-assertion.patch new file mode 100644 index 0000000000..aa6fc32068 --- /dev/null +++ b/queue-6.12/mm-hugetlb-avoid-false-positive-lockdep-assertion.patch @@ -0,0 +1,256 @@ +From 7e18802d214cd8cce91548d5dcce2b2e6b1e59d5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Jun 2026 17:09:55 +0100 +Subject: mm/hugetlb: avoid false positive lockdep assertion + +From: Lorenzo Stoakes + +[ Upstream commit b4aea43cd37afad714b5684fe9fdfcb0e78dba26 ] + +Commit 081056dc00a2 ("mm/hugetlb: unshare page tables during VMA split, +not before") changed the locking model around hugetlbfs PMD unsharing on +VMA split, but did not update the function which asserts the locks, +hugetlb_vma_assert_locked(). + +This function asserts that either the hugetlb VMA lock is held (if a +shared mapping) or that the reservation map lock is held (if private). + +If you get an unfortunate race between something which results in one of +these locks being released and a hugetlb VMA split and you have +CONFIG_LOCKDEP enabled, you can therefore see a false positive assertion +arise when there is in fact no issue. 
+ +Since this change introduced a new take_locks parameter to +hugetlb_unshare_pmds(), which, when set to false, indicates that locking +is sufficient, simply pass this to the unsharing logic and predicate the +lock assertions on this. + +This is safe, as we already asserted the file rmap lock and the VMA write +lock prior to this (implying exclusive mmap write lock), so we cannot be +raced by either rmap or page fault page table walkers which the asserted +locks are intended to protect against (we don't mind GUP-fast). + +Separate out huge_pmd_unshare() into __huge_pmd_unshare() to add a +check_locks parameter, and update hugetlb_unshare_pmds() to pass this +parameter to it. + +This leaves all other callers of huge_pmd_unshare() still correctly +asserting the locks. + +The below reproducer will trigger the assert in a kernel with +CONFIG_LOCKDEP enabled by racing process teardown (which will release the +hugetlb lock) against a hugetlb split. + +void execute_one(void) +{ + void *ptr; + pid_t pid; + + /* + * Create a hugetlb mapping spanning a PUD entry. + * + * We force the hugetlb page allocation with populate and + * noreserve. + * + * |---------------------| + * | | + * |---------------------| + * 0 PUD boundary + */ + ptr = mmap(0, PUD_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_SHARED | MAP_ANON | + MAP_NORESERVE | MAP_HUGETLB | MAP_POPULATE, + -1, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + exit(EXIT_FAILURE); + } + + /* + * Fork but with a bogus stack pointer so we try to execute code in + * a non-VM_EXEC VMA, causing segfault + teardown via exit_mmap(). + * + * The clone will cause PMD page table sharing between the + * processes first via: + * copy_process() -> ... -> huge_pte_alloc() -> huge_pmd_share() + * + * Then tear down and release the hugetlb 'VMA' lock via: + * exit_mmap() -> ... 
-> vma_close() -> hugetlb_vma_lock_free() + */ + pid = syscall(__NR_clone, 0, 2 * PMD_SIZE, 0, 0, 0); + if (pid < 0) { + perror("clone"); + exit(EXIT_FAILURE); + } if (pid == 0) { + /* Pop stack... */ + return; + } + + /* + * We are the parent process. + * + * Race the child process's teardown with a PMD unshare. + * + * We do this by triggering: + * + * __split_vma() -> hugetlb_split() -> hugetlb_unshare_pmds() + * + * Which, importantly, doesn't hold the hugetlb VMA lock (nor can + * it), meaning we assert in hugetlb_vma_assert_locked(). + * + * . + * |----------.----------| + * | . | + * |----------.----------| + * 0 . PUD boundary + */ + mmap(0, PUD_SIZE / 2, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); +} + +int main(void) +{ + int i; + + /* Kick off fork children. */ + for (i = 0; i < NUM_FORKS; i++) { + pid_t pid = fork(); + + if (pid < 0) { + perror("fork"); + exit(EXIT_FAILURE); + } + + /* Fork children do their work and exit. */ + if (!pid) { + int j; + + for (j = 0; j < NUM_ITERS; j++) + execute_one(); + return EXIT_SUCCESS; + } + } + + /* If we succeeded, wait on children. 
*/ + for (i = 0; i < NUM_FORKS; i++) + wait(NULL); + + return EXIT_SUCCESS; +} + +[ljs@kernel.org: account for the !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING case] + Link: https://lore.kernel.org/agWZsPGYid08uU6O@lucifer +Link: https://lore.kernel.org/20260513085658.45264-1-ljs@kernel.org +Fixes: 081056dc00a2 ("mm/hugetlb: unshare page tables during VMA split, not before") +Signed-off-by: Lorenzo Stoakes +Acked-by: David Hildenbrand (Arm) +Acked-by: Oscar Salvador +Cc: Jann Horn +Cc: Muchun Song +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Lorenzo Stoakes +Signed-off-by: Sasha Levin +--- + mm/hugetlb.c | 57 ++++++++++++++++++++++++++++++++++------------------ + 1 file changed, 37 insertions(+), 20 deletions(-) + +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 0f0b9483df6328..75ab83bfec9379 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -86,6 +86,9 @@ static int hugetlb_acct_memory(struct hstate *h, long delta); + static void hugetlb_vma_lock_free(struct vm_area_struct *vma); + static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); + static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); ++static int __huge_pmd_unshare(struct mmu_gather *tlb, ++ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, ++ bool check_locks); + static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + unsigned long start, unsigned long end, bool take_locks); + static struct resv_map *vma_resv_map(struct vm_area_struct *vma); +@@ -7225,6 +7228,31 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + return pte; + } + ++static int __huge_pmd_unshare(struct mmu_gather *tlb, ++ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, ++ bool check_locks) ++{ ++ unsigned long sz = huge_page_size(hstate_vma(vma)); ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd = pgd_offset(mm, addr); ++ p4d_t *p4d = p4d_offset(pgd, addr); ++ pud_t *pud = pud_offset(p4d, addr); ++ ++ if (sz != PMD_SIZE) ++ return 0; ++ if 
(!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) ++ return 0; ++ i_mmap_assert_write_locked(vma->vm_file->f_mapping); ++ if (check_locks) ++ hugetlb_vma_assert_locked(vma); ++ pud_clear(pud); ++ ++ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); ++ ++ mm_dec_nr_pmds(mm); ++ return 1; ++} ++ + /** + * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users + * @tlb: the current mmu_gather. +@@ -7244,25 +7272,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { +- unsigned long sz = huge_page_size(hstate_vma(vma)); +- struct mm_struct *mm = vma->vm_mm; +- pgd_t *pgd = pgd_offset(mm, addr); +- p4d_t *p4d = p4d_offset(pgd, addr); +- pud_t *pud = pud_offset(p4d, addr); +- +- i_mmap_assert_write_locked(vma->vm_file->f_mapping); +- hugetlb_vma_assert_locked(vma); +- if (sz != PMD_SIZE) +- return 0; +- if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) +- return 0; +- +- pud_clear(pud); +- +- tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); +- +- mm_dec_nr_pmds(mm); +- return 1; ++ return __huge_pmd_unshare(tlb, vma, addr, ptep, /*check_locks=*/true); + } + + /* +@@ -7296,6 +7306,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + return NULL; + } + ++static int __huge_pmd_unshare(struct mmu_gather *tlb, ++ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, ++ bool check_locks) ++{ ++ return 0; ++} ++ + int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { +@@ -7566,7 +7583,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + if (!ptep) + continue; + ptl = huge_pte_lock(h, mm, ptep); +- huge_pmd_unshare(&tlb, vma, address, ptep); ++ __huge_pmd_unshare(&tlb, vma, address, ptep, take_locks); + spin_unlock(ptl); + } + huge_pmd_unshare_flush(&tlb, vma); +-- +2.53.0 + diff --git 
a/queue-6.12/sched_ext-don-t-warn-on-null-cgrp_moving_from-in-scx.patch b/queue-6.12/sched_ext-don-t-warn-on-null-cgrp_moving_from-in-scx.patch new file mode 100644 index 0000000000..8e5e918889 --- /dev/null +++ b/queue-6.12/sched_ext-don-t-warn-on-null-cgrp_moving_from-in-scx.patch @@ -0,0 +1,76 @@ +From 3f543974aae8ef708c658233a113e6a721d18d34 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Jun 2026 13:37:30 -0400 +Subject: sched_ext: Don't warn on NULL cgrp_moving_from in + scx_cgroup_move_task() + +From: Tejun Heo + +[ Upstream commit 02e545c4297a26dbbc41df81b831e7f605bcd306 ] + +A WARN fires when systemd's user manager writes "+cpu +memory +pids" to +its own subtree_control while a sched_ext scheduler is loaded: + + WARNING: at kernel/sched/ext.c:3227 scx_cgroup_move_task+0xa8/0xb0 + scx_cgroup_move_task+0xa8/0xb0 + sched_move_task+0x134/0x290 + cpu_cgroup_attach+0x39/0x70 + cgroup_migrate_execute+0x37d/0x450 + cgroup_update_dfl_csses+0x1e3/0x270 + cgroup_subtree_control_write+0x3e7/0x440 + +scx_cgroup_can_attach() arms cgrp_moving_from only when a task's cpu +cgroup changes. It can still be NULL when scx_cgroup_move_task() runs, +through this sequence: + + Step Result + --------------------------------- ---------------------------------- + 1. cpu enabled on cgroup G cpu css = A + 2. cpu toggled off then on for G A killed, B created (same cgroup) + 3. an exiting task keeps A alive migration skips it, A now stale + 4. +memory migrates G stale A vs current B pulls cpu in + 5. cpu attach runs for all tasks hits a live, cpu-unchanged task + 6. scx_cgroup_move_task() on it cgrp_moving_from NULL -> WARN + +The mismatch is that scx_cgroup_can_attach() keys on cgroup identity +while migration drives the move on css identity, so a NULL cgrp_moving_from +here is a legitimate css-only migration, not a missing prep. + +The call is already gated on cgrp_moving_from, so just drop the warning. +ops.cgroup_prep_move() and ops.cgroup_move() stay paired. 
+ +Fixes: 819513666966 ("sched_ext: Add cgroup support") +Cc: stable@vger.kernel.org # v6.12+ +Reported-by: Matt Fleming +Closes: https://lore.kernel.org/all/20260601124156.2205704-1-mfleming@cloudflare.com/ +Signed-off-by: Tejun Heo +Reviewed-by: Andrea Righi +Signed-off-by: Sasha Levin +--- + kernel/sched/ext.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 01dc2a613868fc..428cde37130dfc 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -4069,10 +4069,13 @@ void scx_cgroup_move_task(struct task_struct *p) + return; + + /* +- * @p must have ops.cgroup_prep_move() called on it and thus +- * cgrp_moving_from set. ++ * scx_cgroup_can_attach() sets cgrp_moving_from only when the task's ++ * cgroup changes. Migration keys off css rather than cgroup identity, ++ * so it can hand an unchanged-cgroup task here with cgrp_moving_from ++ * NULL. Nothing to report to the BPF scheduler then, so skip it and ++ * keep prep_move and move paired. 
+ */ +- if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) ++ if (SCX_HAS_OP(cgroup_move) && p->scx.cgrp_moving_from) + SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, + p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + p->scx.cgrp_moving_from = NULL; +-- +2.53.0 + diff --git a/queue-6.12/series b/queue-6.12/series index 83fdb38b68..99a4987133 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -228,3 +228,5 @@ drm-amd-display-fix-null-deref-and-buffer-over-read-in-sdp-debugfs.patch drm-amd-display-use-krealloc_array-in-dal_vector_reserve.patch fs-fcntl-fix-softirq-unsafe-lock-order-in-fasync-signaling.patch driver-core-reject-devices-with-unregistered-buses.patch +mm-hugetlb-avoid-false-positive-lockdep-assertion.patch +sched_ext-don-t-warn-on-null-cgrp_moving_from-in-scx.patch diff --git a/queue-6.18/sched_ext-don-t-warn-on-null-cgrp_moving_from-in-scx.patch b/queue-6.18/sched_ext-don-t-warn-on-null-cgrp_moving_from-in-scx.patch new file mode 100644 index 0000000000..3c15ef9e55 --- /dev/null +++ b/queue-6.18/sched_ext-don-t-warn-on-null-cgrp_moving_from-in-scx.patch @@ -0,0 +1,80 @@ +From fe5e6948c5f81949919d15bb5bd19bad65a06ba5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Jun 2026 13:47:03 +0100 +Subject: sched_ext: Don't warn on NULL cgrp_moving_from in + scx_cgroup_move_task() + +From: Tejun Heo + +commit 02e545c4297a26dbbc41df81b831e7f605bcd306 upstream. + +A WARN fires when systemd's user manager writes "+cpu +memory +pids" to +its own subtree_control while a sched_ext scheduler is loaded: + + WARNING: at kernel/sched/ext.c:3227 scx_cgroup_move_task+0xa8/0xb0 + scx_cgroup_move_task+0xa8/0xb0 + sched_move_task+0x134/0x290 + cpu_cgroup_attach+0x39/0x70 + cgroup_migrate_execute+0x37d/0x450 + cgroup_update_dfl_csses+0x1e3/0x270 + cgroup_subtree_control_write+0x3e7/0x440 + +scx_cgroup_can_attach() arms cgrp_moving_from only when a task's cpu +cgroup changes. 
It can still be NULL when scx_cgroup_move_task() runs, +through this sequence: + + Step Result + --------------------------------- ---------------------------------- + 1. cpu enabled on cgroup G cpu css = A + 2. cpu toggled off then on for G A killed, B created (same cgroup) + 3. an exiting task keeps A alive migration skips it, A now stale + 4. +memory migrates G stale A vs current B pulls cpu in + 5. cpu attach runs for all tasks hits a live, cpu-unchanged task + 6. scx_cgroup_move_task() on it cgrp_moving_from NULL -> WARN + +The mismatch is that scx_cgroup_can_attach() keys on cgroup identity +while migration drives the move on css identity, so a NULL cgrp_moving_from +here is a legitimate css-only migration, not a missing prep. + +The call is already gated on cgrp_moving_from, so just drop the warning. +ops.cgroup_prep_move() and ops.cgroup_move() stay paired. + +Fixes: 819513666966 ("sched_ext: Add cgroup support") +Cc: stable@vger.kernel.org # v6.12+ +Reported-by: Matt Fleming +Closes: https://lore.kernel.org/all/20260601124156.2205704-1-mfleming@cloudflare.com/ +Signed-off-by: Tejun Heo +Reviewed-by: Andrea Righi +[ mfleming: keep the 6.18.y SCX_KF_REST argument in the + SCX_CALL_OP_TASK() call. ] +Signed-off-by: Matt Fleming +Signed-off-by: Sasha Levin +--- + kernel/sched/ext.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 7b750bf42698cc..d8280f87443310 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -3221,11 +3221,13 @@ void scx_cgroup_move_task(struct task_struct *p) + return; + + /* +- * @p must have ops.cgroup_prep_move() called on it and thus +- * cgrp_moving_from set. ++ * scx_cgroup_can_attach() sets cgrp_moving_from only when the task's ++ * cgroup changes. Migration keys off css rather than cgroup identity, ++ * so it can hand an unchanged-cgroup task here with cgrp_moving_from ++ * NULL. 
Nothing to report to the BPF scheduler then, so skip it and ++ * keep prep_move and move paired. + */ +- if (SCX_HAS_OP(sch, cgroup_move) && +- !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) ++ if (SCX_HAS_OP(sch, cgroup_move) && p->scx.cgrp_moving_from) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, cgroup_move, task_rq(p), + p, p->scx.cgrp_moving_from, + tg_cgrp(task_group(p))); +-- +2.53.0 + diff --git a/queue-6.18/series b/queue-6.18/series index 7d568f769a..1a922230db 100644 --- a/queue-6.18/series +++ b/queue-6.18/series @@ -303,3 +303,4 @@ drm-amd-display-fix-out-of-bounds-read-in-dp_get_eq_aux_rd_interval.patch drm-amd-display-use-krealloc_array-in-dal_vector_reserve.patch fs-fcntl-fix-softirq-unsafe-lock-order-in-fasync-signaling.patch driver-core-reject-devices-with-unregistered-buses.patch +sched_ext-don-t-warn-on-null-cgrp_moving_from-in-scx.patch diff --git a/queue-6.6/mm-huge_memory-update-file-pmd-counter-before-folio_.patch b/queue-6.6/mm-huge_memory-update-file-pmd-counter-before-folio_.patch new file mode 100644 index 0000000000..ad127c8da5 --- /dev/null +++ b/queue-6.6/mm-huge_memory-update-file-pmd-counter-before-folio_.patch @@ -0,0 +1,58 @@ +From 5ec0c0bf6aa28b92b8f507146f78db231142530d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Jun 2026 18:52:33 -0400 +Subject: mm/huge_memory: update file PMD counter before folio_put() + +From: Yin Tirui + +[ Upstream commit 8d878059924f12c1bc24556a92ec56add74de3c8 ] + +__split_huge_pmd_locked() updates the file/shmem RSS counter after +dropping the PMD mapping's folio reference. If folio_put() drops the last +reference, mm_counter_file() can later read freed folio state via +folio_test_swapbacked(). + +Move the counter update before folio_put(). 
+ +Link: https://lore.kernel.org/20260526101337.1984081-1-yintirui@huawei.com +Fixes: fadae2953072 ("thp: use mm_file_counter to determine update which rss counter") +Signed-off-by: Yin Tirui +Reviewed-by: Lorenzo Stoakes +Acked-by: David Hildenbrand (arm) +Reviewed-by: Lance Yang +Reviewed-by: Dev Jain +Cc: Baolin Wang +Cc: Barry Song +Cc: Chen Jun +Cc: Kefeng Wang +Cc: Liam R. Howlett +Cc: Nico Pache +Cc: Ryan Roberts +Cc: Vlastimil Babka +Cc: Yang Shi +Cc: Zi Yan +Cc: stable@vger.kernel.org +Signed-off-by: Andrew Morton +[ changed folio API calls (folio_remove_rmap_pmd/mm_counter_file(folio)/folio_put) to page-based equivalents (page_remove_rmap/mm_counter_file(page)/put_page) ] +Signed-off-by: Sasha Levin +--- + mm/huge_memory.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 78f5df12b8eb37..4443cc44cbf9f1 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2095,7 +2095,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, + if (!PageReferenced(page) && pmd_young(old_pmd)) + SetPageReferenced(page); + page_remove_rmap(page, vma, true); ++ add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); + put_page(page); ++ return; + } + add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); + return; +-- +2.53.0 + diff --git a/queue-6.6/mm-hugetlb-avoid-false-positive-lockdep-assertion.patch b/queue-6.6/mm-hugetlb-avoid-false-positive-lockdep-assertion.patch new file mode 100644 index 0000000000..c0a81112e7 --- /dev/null +++ b/queue-6.6/mm-hugetlb-avoid-false-positive-lockdep-assertion.patch @@ -0,0 +1,256 @@ +From e59b2c227b95933909bc95053cd36ad10fe33dbe Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Jun 2026 17:43:54 +0100 +Subject: mm/hugetlb: avoid false positive lockdep assertion + +From: Lorenzo Stoakes + +[ Upstream commit b4aea43cd37afad714b5684fe9fdfcb0e78dba26 ] + +Commit 081056dc00a2 ("mm/hugetlb: unshare page tables during VMA split, +not before") changed the locking model
around hugetlbfs PMD unsharing on +VMA split, but did not update the function which asserts the locks, +hugetlb_vma_assert_locked(). + +This function asserts that either the hugetlb VMA lock is held (if a +shared mapping) or that the reservation map lock is held (if private). + +If you get an unfortunate race between something which results in one of +these locks being released and a hugetlb VMA split and you have +CONFIG_LOCKDEP enabled, you can therefore see a false positive assertion +arise when there is in fact no issue. + +Since this change introduced a new take_locks parameter to +hugetlb_unshare_pmds(), which, when set to false, indicates that locking +is sufficient, simply pass this to the unsharing logic and predicate the +lock assertions on this. + +This is safe, as we already asserted the file rmap lock and the VMA write +lock prior to this (implying exclusive mmap write lock), so we cannot be +raced by either rmap or page fault page table walkers which the asserted +locks are intended to protect against (we don't mind GUP-fast). + +Separate out huge_pmd_unshare() into __huge_pmd_unshare() to add a +check_locks parameter, and update hugetlb_unshare_pmds() to pass this +parameter to it. + +This leaves all other callers of huge_pmd_unshare() still correctly +asserting the locks. + +The below reproducer will trigger the assert in a kernel with +CONFIG_LOCKDEP enabled by racing process teardown (which will release the +hugetlb lock) against a hugetlb split. + +void execute_one(void) +{ + void *ptr; + pid_t pid; + + /* + * Create a hugetlb mapping spanning a PUD entry. + * + * We force the hugetlb page allocation with populate and + * noreserve. 
+ * + * |---------------------| + * | | + * |---------------------| + * 0 PUD boundary + */ + ptr = mmap(0, PUD_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_SHARED | MAP_ANON | + MAP_NORESERVE | MAP_HUGETLB | MAP_POPULATE, + -1, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + exit(EXIT_FAILURE); + } + + /* + * Fork but with a bogus stack pointer so we try to execute code in + * a non-VM_EXEC VMA, causing segfault + teardown via exit_mmap(). + * + * The clone will cause PMD page table sharing between the + * processes first via: + * copy_process() -> ... -> huge_pte_alloc() -> huge_pmd_share() + * + * Then tear down and release the hugetlb 'VMA' lock via: + * exit_mmap() -> ... -> vma_close() -> hugetlb_vma_lock_free() + */ + pid = syscall(__NR_clone, 0, 2 * PMD_SIZE, 0, 0, 0); + if (pid < 0) { + perror("clone"); + exit(EXIT_FAILURE); + } if (pid == 0) { + /* Pop stack... */ + return; + } + + /* + * We are the parent process. + * + * Race the child process's teardown with a PMD unshare. + * + * We do this by triggering: + * + * __split_vma() -> hugetlb_split() -> hugetlb_unshare_pmds() + * + * Which, importantly, doesn't hold the hugetlb VMA lock (nor can + * it), meaning we assert in hugetlb_vma_assert_locked(). + * + * . + * |----------.----------| + * | . | + * |----------.----------| + * 0 . PUD boundary + */ + mmap(0, PUD_SIZE / 2, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); +} + +int main(void) +{ + int i; + + /* Kick off fork children. */ + for (i = 0; i < NUM_FORKS; i++) { + pid_t pid = fork(); + + if (pid < 0) { + perror("fork"); + exit(EXIT_FAILURE); + } + + /* Fork children do their work and exit. */ + if (!pid) { + int j; + + for (j = 0; j < NUM_ITERS; j++) + execute_one(); + return EXIT_SUCCESS; + } + } + + /* If we succeeded, wait on children. 
*/ + for (i = 0; i < NUM_FORKS; i++) + wait(NULL); + + return EXIT_SUCCESS; +} + +[ljs@kernel.org: account for the !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING case] + Link: https://lore.kernel.org/agWZsPGYid08uU6O@lucifer +Link: https://lore.kernel.org/20260513085658.45264-1-ljs@kernel.org +Fixes: 081056dc00a2 ("mm/hugetlb: unshare page tables during VMA split, not before") +Signed-off-by: Lorenzo Stoakes +Acked-by: David Hildenbrand (Arm) +Acked-by: Oscar Salvador +Cc: Jann Horn +Cc: Muchun Song +Cc: stable@vger.kernel.org +Signed-off-by: Andrew Morton +Signed-off-by: Lorenzo Stoakes +Signed-off-by: Sasha Levin +--- + mm/hugetlb.c | 57 ++++++++++++++++++++++++++++++++++------------------ + 1 file changed, 37 insertions(+), 20 deletions(-) + +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 8c2128a8c3a844..f6be7c93251be6 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -95,6 +95,9 @@ static int hugetlb_acct_memory(struct hstate *h, long delta); + static void hugetlb_vma_lock_free(struct vm_area_struct *vma); + static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); + static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); ++static int __huge_pmd_unshare(struct mmu_gather *tlb, ++ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, ++ bool check_locks); + static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + unsigned long start, unsigned long end, bool take_locks); + static struct resv_map *vma_resv_map(struct vm_area_struct *vma); +@@ -7032,6 +7035,31 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + return pte; + } + ++static int __huge_pmd_unshare(struct mmu_gather *tlb, ++ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, ++ bool check_locks) ++{ ++ unsigned long sz = huge_page_size(hstate_vma(vma)); ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd = pgd_offset(mm, addr); ++ p4d_t *p4d = p4d_offset(pgd, addr); ++ pud_t *pud = pud_offset(p4d, addr); ++ ++ if (sz != PMD_SIZE) ++ return 0; ++ if
(!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) ++ return 0; ++ i_mmap_assert_write_locked(vma->vm_file->f_mapping); ++ if (check_locks) ++ hugetlb_vma_assert_locked(vma); ++ pud_clear(pud); ++ ++ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); ++ ++ mm_dec_nr_pmds(mm); ++ return 1; ++} ++ + /** + * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users + * @tlb: the current mmu_gather. +@@ -7051,25 +7079,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { +- unsigned long sz = huge_page_size(hstate_vma(vma)); +- struct mm_struct *mm = vma->vm_mm; +- pgd_t *pgd = pgd_offset(mm, addr); +- p4d_t *p4d = p4d_offset(pgd, addr); +- pud_t *pud = pud_offset(p4d, addr); +- +- i_mmap_assert_write_locked(vma->vm_file->f_mapping); +- hugetlb_vma_assert_locked(vma); +- if (sz != PMD_SIZE) +- return 0; +- if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) +- return 0; +- +- pud_clear(pud); +- +- tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); +- +- mm_dec_nr_pmds(mm); +- return 1; ++ return __huge_pmd_unshare(tlb, vma, addr, ptep, /*check_locks=*/true); + } + + /* +@@ -7103,6 +7113,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + return NULL; + } + ++static int __huge_pmd_unshare(struct mmu_gather *tlb, ++ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, ++ bool check_locks) ++{ ++ return 0; ++} ++ + int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { +@@ -7377,7 +7394,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + if (!ptep) + continue; + ptl = huge_pte_lock(h, mm, ptep); +- huge_pmd_unshare(&tlb, vma, address, ptep); ++ __huge_pmd_unshare(&tlb, vma, address, ptep, take_locks); + spin_unlock(ptl); + } + huge_pmd_unshare_flush(&tlb, vma); +-- +2.53.0 + diff --git a/queue-6.6/series 
b/queue-6.6/series index 19ef521c32..c1f13dd174 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -367,3 +367,6 @@ drm-amd-display-fix-null-deref-and-buffer-over-read-in-sdp-debugfs.patch drm-amd-display-use-krealloc_array-in-dal_vector_reserve.patch fs-fcntl-fix-softirq-unsafe-lock-order-in-fasync-signaling.patch driver-core-reject-devices-with-unregistered-buses.patch +mm-hugetlb-avoid-false-positive-lockdep-assertion.patch +soc-qcom-ice-fix-race-between-qcom_ice_probe-and-of_.patch +mm-huge_memory-update-file-pmd-counter-before-folio_.patch diff --git a/queue-6.6/soc-qcom-ice-fix-race-between-qcom_ice_probe-and-of_.patch b/queue-6.6/soc-qcom-ice-fix-race-between-qcom_ice_probe-and-of_.patch new file mode 100644 index 0000000000..6dea7715ea --- /dev/null +++ b/queue-6.6/soc-qcom-ice-fix-race-between-qcom_ice_probe-and-of_.patch @@ -0,0 +1,155 @@ +From 3e9022aa00900a87c1f7d3b342b58d1b0b824800 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 15 Jun 2026 10:43:52 -0400 +Subject: soc: qcom: ice: Fix race between qcom_ice_probe() and + of_qcom_ice_get() + +From: Manivannan Sadhasivam + +[ Upstream commit d922113ef91e6e7e8065e9070f349365341ba32e ] + +The current platform driver design causes probe ordering races with +consumers (UFS, eMMC) due to ICE's dependency on SCM firmware calls. If ICE +probe fails (missing ICE SCM or DT registers), devm_of_qcom_ice_get() loops +with -EPROBE_DEFER, leaving consumers non-functional even when ICE should +be gracefully disabled. devm_of_qcom_ice_get() doesn't know if the ICE +driver probe has failed due to above reasons or it is waiting for the SCM +driver. + +Moreover, there is no devlink dependency between ICE and consumer drivers +as 'qcom,ice' is not considered as a DT 'supplier'. So the consumer drivers +have no idea of when the ICE driver is going to probe. 
+ +To address these issues, store the error pointer in a global xarray with +ice node phandle as a key during probe in addition to the valid ice pointer +and synchronize both qcom_ice_probe() and of_qcom_ice_get() using a mutex. + +If the xarray entry is NULL, then it implies that the driver is not +probed yet, so return -EPROBE_DEFER. If it has any error pointer, return +that error pointer directly. Otherwise, add the devlink as usual and return +the valid pointer to the consumer. + +Xarray is used instead of platform drvdata, since driver core frees the +drvdata during probe failure. So it cannot be used to pass the error +pointer to the consumers. + +Note that this change only fixes the standalone ICE DT node bindings and +not the ones with 'ice' range embedded in the consumer nodes, where there +is no issue. + +Fixes: 2afbf43a4aec ("soc: qcom: Make the Qualcomm UFS/SDCC ICE a dedicated driver") +Reported-by: Sumit Garg +Tested-by: Sumit Garg # OP-TEE as TZ +Acked-by: Sumit Garg +Cc: stable@vger.kernel.org # 6.4 +Signed-off-by: Manivannan Sadhasivam +Link: https://lore.kernel.org/r/20260518-qcom-ice-fix-v7-1-2a595382185b@oss.qualcomm.com +Signed-off-by: Bjorn Andersson +[ changed `.remove` to `.remove_new` for the void callback and replaced the `__free(device_node)` direct-return with an explicit `goto out` in `of_qcom_ice_get()` ] +Signed-off-by: Sasha Levin +--- + drivers/soc/qcom/ice.c | 36 +++++++++++++++++++++++++++++------- + 1 file changed, 29 insertions(+), 7 deletions(-) + +diff --git a/drivers/soc/qcom/ice.c b/drivers/soc/qcom/ice.c +index d6e205e3812a96..94e91835062b26 100644 +--- a/drivers/soc/qcom/ice.c ++++ b/drivers/soc/qcom/ice.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + #include + +@@ -49,6 +50,9 @@ struct qcom_ice { + struct clk *core_clk; + }; + ++static DEFINE_XARRAY(ice_handles); ++static DEFINE_MUTEX(ice_mutex); ++ + static bool qcom_ice_check_supported(struct qcom_ice *ice) + { + u32 regval = qcom_ice_readl(ice, 
QCOM_ICE_REG_VERSION); +@@ -288,6 +292,8 @@ struct qcom_ice *of_qcom_ice_get(struct device *dev) + return qcom_ice_create(&pdev->dev, base); + } + ++ guard(mutex)(&ice_mutex); ++ + /* + * If the consumer node does not provider an 'ice' reg range + * (legacy DT binding), then it must at least provide a phandle +@@ -304,12 +310,11 @@ struct qcom_ice *of_qcom_ice_get(struct device *dev) + goto out; + } + +- ice = platform_get_drvdata(pdev); +- if (!ice) { +- dev_err(dev, "Cannot get ice instance from %s\n", +- dev_name(&pdev->dev)); ++ ice = xa_load(&ice_handles, pdev->dev.of_node->phandle); ++ if (IS_ERR_OR_NULL(ice)) { + platform_device_put(pdev); +- ice = ERR_PTR(-EPROBE_DEFER); ++ if (!ice) ++ ice = ERR_PTR(-EPROBE_DEFER); + goto out; + } + +@@ -378,24 +383,40 @@ EXPORT_SYMBOL_GPL(devm_of_qcom_ice_get); + + static int qcom_ice_probe(struct platform_device *pdev) + { ++ unsigned long phandle = pdev->dev.of_node->phandle; + struct qcom_ice *engine; + void __iomem *base; + ++ guard(mutex)(&ice_mutex); ++ + base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(base)) { + dev_warn(&pdev->dev, "ICE registers not found\n"); ++ /* Store the error pointer for devm_of_qcom_ice_get() */ ++ xa_store(&ice_handles, phandle, (__force void *)base, GFP_KERNEL); + return PTR_ERR(base); + } + + engine = qcom_ice_create(&pdev->dev, base); +- if (IS_ERR(engine)) ++ if (IS_ERR(engine)) { ++ /* Store the error pointer for devm_of_qcom_ice_get() */ ++ xa_store(&ice_handles, phandle, engine, GFP_KERNEL); + return PTR_ERR(engine); ++ } + +- platform_set_drvdata(pdev, engine); ++ xa_store(&ice_handles, phandle, engine, GFP_KERNEL); + + return 0; + } + ++static void qcom_ice_remove(struct platform_device *pdev) ++{ ++ unsigned long phandle = pdev->dev.of_node->phandle; ++ ++ guard(mutex)(&ice_mutex); ++ xa_store(&ice_handles, phandle, NULL, GFP_KERNEL); ++} ++ + static const struct of_device_id qcom_ice_of_match_table[] = { + { .compatible = "qcom,inline-crypto-engine" }, + { }, +@@ 
-404,6 +425,7 @@ MODULE_DEVICE_TABLE(of, qcom_ice_of_match_table); + + static struct platform_driver qcom_ice_driver = { + .probe = qcom_ice_probe, ++ .remove_new = qcom_ice_remove, + .driver = { + .name = "qcom-ice", + .of_match_table = qcom_ice_of_match_table, +-- +2.53.0 +