From: Greg Kroah-Hartman
Date: Mon, 6 Jan 2025 12:09:40 +0000 (+0100)
Subject: 6.12-stable patches
X-Git-Tag: v5.4.289~22
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=eff404c366e000a14bb5ce97e7a172dc96f60f25;p=thirdparty%2Fkernel%2Fstable-queue.git

6.12-stable patches

added patches:
	mm-hugetlb-independent-pmd-page-table-shared-count.patch
	mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch
---

diff --git a/queue-6.12/mm-hugetlb-independent-pmd-page-table-shared-count.patch b/queue-6.12/mm-hugetlb-independent-pmd-page-table-shared-count.patch
new file mode 100644
index 00000000000..8df7a8880be
--- /dev/null
+++ b/queue-6.12/mm-hugetlb-independent-pmd-page-table-shared-count.patch
@@ -0,0 +1,197 @@
+From 59d9094df3d79443937add8700b2ef1a866b1081 Mon Sep 17 00:00:00 2001
+From: Liu Shixin
+Date: Mon, 16 Dec 2024 15:11:47 +0800
+Subject: mm: hugetlb: independent PMD page table shared count
+
+From: Liu Shixin
+
+commit 59d9094df3d79443937add8700b2ef1a866b1081 upstream.
+
+The folio refcount may be increased unexpectedly through try_get_folio() by
+a caller such as split_huge_pages. In huge_pmd_unshare(), we use the refcount
+to check whether a pmd page table is shared. The check is incorrect if
+the refcount is increased by the above caller, and this can cause the page
+table to be leaked:
+
+ BUG: Bad page state in process sh pfn:109324
+ page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
+ flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
+ page_type: f2(table)
+ raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
+ raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
+ page dumped because: nonzero mapcount
+ ...
+ CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7
+ Tainted: [B]=BAD_PAGE
+ Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
+ Call trace:
+  show_stack+0x20/0x38 (C)
+  dump_stack_lvl+0x80/0xf8
+  dump_stack+0x18/0x28
+  bad_page+0x8c/0x130
+  free_page_is_bad_report+0xa4/0xb0
+  free_unref_page+0x3cc/0x620
+  __folio_put+0xf4/0x158
+  split_huge_pages_all+0x1e0/0x3e8
+  split_huge_pages_write+0x25c/0x2d8
+  full_proxy_write+0x64/0xd8
+  vfs_write+0xcc/0x280
+  ksys_write+0x70/0x110
+  __arm64_sys_write+0x24/0x38
+  invoke_syscall+0x50/0x120
+  el0_svc_common.constprop.0+0xc8/0xf0
+  do_el0_svc+0x24/0x38
+  el0_svc+0x34/0x128
+  el0t_64_sync_handler+0xc8/0xd0
+  el0t_64_sync+0x190/0x198
+
+The issue may be triggered by damon, offline_page, page_idle, etc., which
+will increase the refcount of the page table.
+
+1. The page table itself will be discarded after reporting the
+   "nonzero mapcount".
+
+2. The HugeTLB page mapped by the page table is not freed, since we
+   treat the page table as shared and a shared page table will not be
+   unmapped.
+
+Fix it by introducing an independent PMD page table shared count. As
+described by the comment, pt_index/pt_mm/pt_frag_refcount are used for s390
+gmap, x86 pgds and powerpc, and pt_share_count is used for x86/arm64/riscv
+pmds, so we can reuse the field as pt_share_count.
+ +Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com +Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") +Signed-off-by: Liu Shixin +Cc: Kefeng Wang +Cc: Ken Chen +Cc: Muchun Song +Cc: Nanyong Sun +Cc: Jane Chu +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/mm.h | 1 + + include/linux/mm_types.h | 30 ++++++++++++++++++++++++++++++ + mm/hugetlb.c | 16 +++++++--------- + 3 files changed, 38 insertions(+), 9 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3100,6 +3100,7 @@ static inline bool pagetable_pmd_ctor(st + if (!pmd_ptlock_init(ptdesc)) + return false; + __folio_set_pgtable(folio); ++ ptdesc_pmd_pts_init(ptdesc); + lruvec_stat_add_folio(folio, NR_PAGETABLE); + return true; + } +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a); + * @pt_index: Used for s390 gmap. + * @pt_mm: Used for x86 pgds. + * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. ++ * @pt_share_count: Used for HugeTLB PMD page table share count. + * @_pt_pad_2: Padding to ensure proper alignment. + * @ptl: Lock for the page table. + * @__page_type: Same as page->page_type. Unused for page tables. +@@ -471,6 +472,9 @@ struct ptdesc { + pgoff_t pt_index; + struct mm_struct *pt_mm; + atomic_t pt_frag_refcount; ++#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING ++ atomic_t pt_share_count; ++#endif + }; + + union { +@@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= s + const struct page *: (const struct ptdesc *)(p), \ + struct page *: (struct ptdesc *)(p))) + ++#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING ++static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) ++{ ++ atomic_set(&ptdesc->pt_share_count, 0); ++} ++ ++static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc) ++{ ++ atomic_inc(&ptdesc->pt_share_count); ++} ++ ++static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) ++{ ++ atomic_dec(&ptdesc->pt_share_count); ++} ++ ++static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) ++{ ++ return atomic_read(&ptdesc->pt_share_count); ++} ++#else ++static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) ++{ ++} ++#endif ++ + /* + * Used for sizing the vmemmap region on some architectures + */ +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -7200,7 +7200,7 @@ pte_t *huge_pmd_share(struct mm_struct * + spte = hugetlb_walk(svma, saddr, + vma_mmu_pagesize(svma)); + if (spte) { +- get_page(virt_to_page(spte)); ++ ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); + break; + } + } +@@ -7215,7 +7215,7 @@ pte_t *huge_pmd_share(struct mm_struct * + (pmd_t *)((unsigned long)spte & PAGE_MASK)); + mm_inc_nr_pmds(mm); + } else { +- put_page(virt_to_page(spte)); ++ ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); + } + spin_unlock(&mm->page_table_lock); + out: +@@ -7227,10 +7227,6 @@ out: + /* + * unmap huge page backed by shared pte. + * +- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared +- * indicated by page_count > 1, unmap is achieved by clearing pud and +- * decrementing the ref count. If count == 1, the pte page is not shared. +- * + * Called with page table lock held. 
+ * + * returns: 1 successfully unmapped a shared pte page +@@ -7239,18 +7235,20 @@ out: + int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { ++ unsigned long sz = huge_page_size(hstate_vma(vma)); + pgd_t *pgd = pgd_offset(mm, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + hugetlb_vma_assert_locked(vma); +- BUG_ON(page_count(virt_to_page(ptep)) == 0); +- if (page_count(virt_to_page(ptep)) == 1) ++ if (sz != PMD_SIZE) ++ return 0; ++ if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) + return 0; + + pud_clear(pud); +- put_page(virt_to_page(ptep)); ++ ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); + mm_dec_nr_pmds(mm); + return 1; + } diff --git a/queue-6.12/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch b/queue-6.12/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch new file mode 100644 index 00000000000..90f6ea371ba --- /dev/null +++ b/queue-6.12/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch @@ -0,0 +1,233 @@ +From 8ec396d05d1b737c87311fb7311f753b02c2a6b1 Mon Sep 17 00:00:00 2001 +From: Lorenzo Stoakes +Date: Thu, 28 Nov 2024 15:06:17 +0000 +Subject: mm: reinstate ability to map write-sealed memfd mappings read-only + +From: Lorenzo Stoakes + +commit 8ec396d05d1b737c87311fb7311f753b02c2a6b1 upstream. + +Patch series "mm: reinstate ability to map write-sealed memfd mappings +read-only". + +In commit 158978945f31 ("mm: perform the mapping_map_writable() check +after call_mmap()") (and preceding changes in the same series) it became +possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. + +Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path +behaviour") unintentionally undid this logic by moving the +mapping_map_writable() check before the shmem_mmap() hook is invoked, +thereby regressing this change. + +This series reworks how we both permit write-sealed mappings being mapped +read-only and disallow mprotect() from undoing the write-seal, fixing this +regression. + +We also add a regression test to ensure that we do not accidentally +regress this in future. + +Thanks to Julian Orth for reporting this regression. + + +This patch (of 2): + +In commit 158978945f31 ("mm: perform the mapping_map_writable() check +after call_mmap()") (and preceding changes in the same series) it became +possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. + +This was previously unnecessarily disallowed, despite the man page +documentation indicating that it would be, thereby limiting the usefulness +of F_SEAL_WRITE logic. + +We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE +seal (one which disallows future writes to the memfd) to also be used for +F_SEAL_WRITE. + +For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a +read-only mapping to disallow mprotect() from overriding the seal - an +operation performed by seal_check_write(), invoked from shmem_mmap(), the +f_op->mmap() hook used by shmem mappings. + +By extending this to F_SEAL_WRITE and critically - checking +mapping_map_writable() to determine if we may map the memfd AFTER we +invoke shmem_mmap() - the desired logic becomes possible. This is because +mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will +have cleared. 
+
+Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path
+behaviour") unintentionally undid this logic by moving the
+mapping_map_writable() check before the shmem_mmap() hook is invoked,
+thereby regressing this change.
+
+We reinstate this functionality by moving the check out of shmem_mmap()
+and instead performing it in do_mmap() at the point at which VMA flags are
+being determined, which seems in any case to be a more appropriate place
+in which to make this determination.
+
+In order to achieve this we rework the memfd seal logic to allow us access to
+this information using existing logic, and eliminate the clearing of
+VM_MAYWRITE from seal_check_write(), which we now perform in do_mmap()
+instead.
+
+Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com
+Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour")
+Signed-off-by: Lorenzo Stoakes
+Reported-by: Julian Orth
+Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/
+Cc: Jann Horn
+Cc: Liam R. Howlett
+Cc: Linus Torvalds
+Cc: Shuah Khan
+Cc: Vlastimil Babka
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/linux/memfd.h |   14 ++++++++++++
+ include/linux/mm.h    |   58 ++++++++++++++++++++++++++++++++++----------------
+ mm/memfd.c            |    2 -
+ mm/mmap.c             |    4 +++
+ 4 files changed, 59 insertions(+), 19 deletions(-)
+
+--- a/include/linux/memfd.h
++++ b/include/linux/memfd.h
+@@ -7,6 +7,7 @@
+ #ifdef CONFIG_MEMFD_CREATE
+ extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg);
+ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx);
++unsigned int *memfd_file_seals_ptr(struct file *file);
+ #else
+ static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a)
+ {
+@@ -16,6 +17,19 @@ static inline struct folio *memfd_alloc_
+ {
+ 	return ERR_PTR(-EINVAL);
+ }
++
++static inline unsigned int *memfd_file_seals_ptr(struct file *file)
++{
++	return NULL;
++}
+ #endif
+ 
++/* Retrieve memfd seals associated with the file, if any. */
++static inline unsigned int memfd_file_seals(struct file *file)
++{
++	unsigned int *sealsp = memfd_file_seals_ptr(file);
++
++	return sealsp ? *sealsp : 0;
++}
++
+ #endif /* __LINUX_MEMFD_H */
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -4079,6 +4079,37 @@ void mem_dump_obj(void *object);
+ static inline void mem_dump_obj(void *object) {}
+ #endif
+ 
++static inline bool is_write_sealed(int seals)
++{
++	return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
++}
++
++/**
++ * is_readonly_sealed - Checks whether write-sealed but mapped read-only,
++ *                      in which case writes should be disallowed going
++ *                      forward.
++ * @seals: the seals to check
++ * @vm_flags: the VMA flags to check
++ *
++ * Returns whether readonly sealed, in which case writes should be disallowed
++ * going forward.
++ */
++static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags)
++{
++	/*
++	 * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
++	 * MAP_SHARED and read-only, take care to not allow mprotect to
++	 * revert protections on such mappings. Do this only for shared
++	 * mappings. For private mappings, don't need to mask
++	 * VM_MAYWRITE as we still want them to be COW-writable.
++ */ ++ if (is_write_sealed(seals) && ++ ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) ++ return true; ++ ++ return false; ++} ++ + /** + * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and + * handle them. +@@ -4090,24 +4121,15 @@ static inline void mem_dump_obj(void *ob + */ + static inline int seal_check_write(int seals, struct vm_area_struct *vma) + { +- if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { +- /* +- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when +- * write seals are active. +- */ +- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) +- return -EPERM; +- +- /* +- * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as +- * MAP_SHARED and read-only, take care to not allow mprotect to +- * revert protections on such mappings. Do this only for shared +- * mappings. For private mappings, don't need to mask +- * VM_MAYWRITE as we still want them to be COW-writable. +- */ +- if (vma->vm_flags & VM_SHARED) +- vm_flags_clear(vma, VM_MAYWRITE); +- } ++ if (!is_write_sealed(seals)) ++ return 0; ++ ++ /* ++ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when ++ * write seals are active. ++ */ ++ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) ++ return -EPERM; + + return 0; + } +--- a/mm/memfd.c ++++ b/mm/memfd.c +@@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct ad + return error; + } + +-static unsigned int *memfd_file_seals_ptr(struct file *file) ++unsigned int *memfd_file_seals_ptr(struct file *file) + { + if (shmem_file(file)) + return &SHMEM_I(file_inode(file))->seals; +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -47,6 +47,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -368,6 +369,7 @@ unsigned long do_mmap(struct file *file, + + if (file) { + struct inode *inode = file_inode(file); ++ unsigned int seals = memfd_file_seals(file); + unsigned long flags_mask; + + if (!file_mmap_ok(file, inode, pgoff, len)) +@@ -408,6 +410,8 @@ unsigned long do_mmap(struct file *file, + vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & FMODE_WRITE)) + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); ++ else if (is_readonly_sealed(seals, vm_flags)) ++ vm_flags &= ~VM_MAYWRITE; + fallthrough; + case MAP_PRIVATE: + if (!(file->f_mode & FMODE_READ)) diff --git a/queue-6.12/series b/queue-6.12/series index eee70ffbe6c..99624302040 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -150,3 +150,5 @@ gve-trigger-rx-napi-instead-of-tx-napi-in-gve_xsk_wakeup.patch mm-readahead-fix-large-folio-support-in-async-readahead.patch mm-kmemleak-fix-sleeping-function-called-from-invalid-context-at-print-message.patch mm-vmscan-account-for-free-pages-to-prevent-infinite-loop-in-throttle_direct_reclaim.patch +mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch +mm-hugetlb-independent-pmd-page-table-shared-count.patch
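
Illustration (not part of the queued patches): the behaviour restored by
mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch can be
exercised from userspace roughly as follows. This is a minimal sketch using the
standard memfd_create()/fcntl() sealing APIs; the memfd name and the 4 KiB size
are arbitrary choices. On a kernel carrying the fix, the read-only MAP_SHARED
mmap() of the F_SEAL_WRITE-sealed memfd is expected to succeed, while a later
mprotect(PROT_WRITE) on that mapping is still refused.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* Create an anonymous memfd that permits sealing and size it. */
	int fd = memfd_create("sealed-example", MFD_ALLOW_SEALING);
	if (fd < 0 || ftruncate(fd, 4096) != 0)
		return 1;

	/* Populate it while writes are still allowed, then drop the mapping. */
	void *w = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (w == MAP_FAILED)
		return 1;
	memcpy(w, "hello", 5);
	munmap(w, 4096);

	/* Seal the memfd against any further writes. */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) != 0)
		return 1;

	/* A read-only shared mapping of the write-sealed memfd should work. */
	void *r = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	if (r == MAP_FAILED) {
		perror("read-only mmap of write-sealed memfd");
		return 1;
	}

	/* Upgrading that mapping to writable must still be rejected. */
	if (mprotect(r, 4096, PROT_READ | PROT_WRITE) == 0) {
		fprintf(stderr, "mprotect() unexpectedly succeeded\n");
		return 1;
	}

	puts("write-sealed memfd mapped read-only as expected");
	return 0;
}

The upstream series also adds a kselftest exercising the same behaviour, as
noted in the cover letter quoted above.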