6.12-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 6 Jan 2025 12:09:40 +0000 (13:09 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 6 Jan 2025 12:09:40 +0000 (13:09 +0100)
added patches:
mm-hugetlb-independent-pmd-page-table-shared-count.patch
mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch

queue-6.12/mm-hugetlb-independent-pmd-page-table-shared-count.patch [new file with mode: 0644]
queue-6.12/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch [new file with mode: 0644]
queue-6.12/series

diff --git a/queue-6.12/mm-hugetlb-independent-pmd-page-table-shared-count.patch b/queue-6.12/mm-hugetlb-independent-pmd-page-table-shared-count.patch
new file mode 100644 (file)
index 0000000..8df7a88
--- /dev/null
@@ -0,0 +1,197 @@
+From 59d9094df3d79443937add8700b2ef1a866b1081 Mon Sep 17 00:00:00 2001
+From: Liu Shixin <liushixin2@huawei.com>
+Date: Mon, 16 Dec 2024 15:11:47 +0800
+Subject: mm: hugetlb: independent PMD page table shared count
+
+From: Liu Shixin <liushixin2@huawei.com>
+
+commit 59d9094df3d79443937add8700b2ef1a866b1081 upstream.
+
+The folio refcount may be increased unexpectedly through try_get_folio()
+by callers such as split_huge_pages.  In huge_pmd_unshare(), we use the
+refcount to check whether a pmd page table is shared.  The check is
+incorrect if the refcount is increased by the above callers, and this can
+cause the page table to be leaked:
+
+ BUG: Bad page state in process sh  pfn:109324
+ page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
+ flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
+ page_type: f2(table)
+ raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
+ raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
+ page dumped because: nonzero mapcount
+ ...
+ CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G    B              6.13.0-rc2master+ #7
+ Tainted: [B]=BAD_PAGE
+ Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
+ Call trace:
+  show_stack+0x20/0x38 (C)
+  dump_stack_lvl+0x80/0xf8
+  dump_stack+0x18/0x28
+  bad_page+0x8c/0x130
+  free_page_is_bad_report+0xa4/0xb0
+  free_unref_page+0x3cc/0x620
+  __folio_put+0xf4/0x158
+  split_huge_pages_all+0x1e0/0x3e8
+  split_huge_pages_write+0x25c/0x2d8
+  full_proxy_write+0x64/0xd8
+  vfs_write+0xcc/0x280
+  ksys_write+0x70/0x110
+  __arm64_sys_write+0x24/0x38
+  invoke_syscall+0x50/0x120
+  el0_svc_common.constprop.0+0xc8/0xf0
+  do_el0_svc+0x24/0x38
+  el0_svc+0x34/0x128
+  el0t_64_sync_handler+0xc8/0xd0
+  el0t_64_sync+0x190/0x198
+
+The issue may be triggered by damon, offline_page, page_idle, etc., which
+will increase the refcount of the page table, with two consequences:
+
+1. The page table itself will be discarded after reporting the
+   "nonzero mapcount".
+
+2. The HugeTLB page mapped by the page table misses being freed, since
+   we treat the page table as shared and a shared page table will not
+   be unmapped.
+
+Fix it by introducing an independent PMD page table shared count.  As
+described by the comment, pt_index/pt_mm/pt_frag_refcount are only used
+for s390 gmap, x86 pgds and powerpc respectively, so on x86/arm64/riscv
+we can reuse the field as pt_share_count for pmd page tables.
+
+Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com
+Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
+Signed-off-by: Liu Shixin <liushixin2@huawei.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Ken Chen <kenneth.w.chen@intel.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Nanyong Sun <sunnanyong@huawei.com>
+Cc: Jane Chu <jane.chu@oracle.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h       |    1 +
+ include/linux/mm_types.h |   30 ++++++++++++++++++++++++++++++
+ mm/hugetlb.c             |   16 +++++++---------
+ 3 files changed, 38 insertions(+), 9 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3100,6 +3100,7 @@ static inline bool pagetable_pmd_ctor(st
+       if (!pmd_ptlock_init(ptdesc))
+               return false;
+       __folio_set_pgtable(folio);
++      ptdesc_pmd_pts_init(ptdesc);
+       lruvec_stat_add_folio(folio, NR_PAGETABLE);
+       return true;
+ }
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a);
+  * @pt_index:         Used for s390 gmap.
+  * @pt_mm:            Used for x86 pgds.
+  * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
++ * @pt_share_count:   Used for HugeTLB PMD page table share count.
+  * @_pt_pad_2:        Padding to ensure proper alignment.
+  * @ptl:              Lock for the page table.
+  * @__page_type:      Same as page->page_type. Unused for page tables.
+@@ -471,6 +472,9 @@ struct ptdesc {
+               pgoff_t pt_index;
+               struct mm_struct *pt_mm;
+               atomic_t pt_frag_refcount;
++#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
++              atomic_t pt_share_count;
++#endif
+       };
+       union {
+@@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= s
+       const struct page *:            (const struct ptdesc *)(p),     \
+       struct page *:                  (struct ptdesc *)(p)))
++#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
++static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
++{
++      atomic_set(&ptdesc->pt_share_count, 0);
++}
++
++static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc)
++{
++      atomic_inc(&ptdesc->pt_share_count);
++}
++
++static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
++{
++      atomic_dec(&ptdesc->pt_share_count);
++}
++
++static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
++{
++      return atomic_read(&ptdesc->pt_share_count);
++}
++#else
++static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
++{
++}
++#endif
++
+ /*
+  * Used for sizing the vmemmap region on some architectures
+  */
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -7200,7 +7200,7 @@ pte_t *huge_pmd_share(struct mm_struct *
+                       spte = hugetlb_walk(svma, saddr,
+                                           vma_mmu_pagesize(svma));
+                       if (spte) {
+-                              get_page(virt_to_page(spte));
++                              ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
+                               break;
+                       }
+               }
+@@ -7215,7 +7215,7 @@ pte_t *huge_pmd_share(struct mm_struct *
+                               (pmd_t *)((unsigned long)spte & PAGE_MASK));
+               mm_inc_nr_pmds(mm);
+       } else {
+-              put_page(virt_to_page(spte));
++              ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
+       }
+       spin_unlock(&mm->page_table_lock);
+ out:
+@@ -7227,10 +7227,6 @@ out:
+ /*
+  * unmap huge page backed by shared pte.
+  *
+- * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
+- * indicated by page_count > 1, unmap is achieved by clearing pud and
+- * decrementing the ref count. If count == 1, the pte page is not shared.
+- *
+  * Called with page table lock held.
+  *
+  * returns: 1 successfully unmapped a shared pte page
+@@ -7239,18 +7235,20 @@ out:
+ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+                                       unsigned long addr, pte_t *ptep)
+ {
++      unsigned long sz = huge_page_size(hstate_vma(vma));
+       pgd_t *pgd = pgd_offset(mm, addr);
+       p4d_t *p4d = p4d_offset(pgd, addr);
+       pud_t *pud = pud_offset(p4d, addr);
+       i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+       hugetlb_vma_assert_locked(vma);
+-      BUG_ON(page_count(virt_to_page(ptep)) == 0);
+-      if (page_count(virt_to_page(ptep)) == 1)
++      if (sz != PMD_SIZE)
++              return 0;
++      if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
+               return 0;
+       pud_clear(pud);
+-      put_page(virt_to_page(ptep));
++      ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
+       mm_dec_nr_pmds(mm);
+       return 1;
+ }
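
A minimal user-space sketch of the reasoning above, under the assumption that
the page refcount can be bumped by unrelated, speculative callers (as
split_huge_pages does via try_get_folio()) while only real sharers touch a
dedicated counter. All names below are hypothetical stand-ins, not kernel APIs.

/*
 * Hypothetical sketch (not kernel code, not part of the patch):
 * "refcount" stands in for the page refcount, which unrelated callers
 * may bump speculatively; "share_count" stands in for the new
 * pt_share_count, which is only touched by real PMD-table sharers.
 */
#include <stdatomic.h>
#include <stdio.h>

struct pmd_table {
	atomic_int refcount;    /* bumped by anyone, e.g. a speculative walker */
	atomic_int share_count; /* bumped only on actual sharing */
};

/* Old scheme: "shared" if refcount > 1 -- fooled by a speculative get. */
static int shared_by_refcount(struct pmd_table *pt)
{
	return atomic_load(&pt->refcount) > 1;
}

/* New scheme: "shared" only if someone actually shares the table. */
static int shared_by_count(struct pmd_table *pt)
{
	return atomic_load(&pt->share_count) > 0;
}

int main(void)
{
	struct pmd_table pt = { .refcount = 1, .share_count = 0 };

	/* A split_huge_pages-style caller takes a temporary reference. */
	atomic_fetch_add(&pt.refcount, 1);

	printf("refcount check thinks the table is shared:   %d (misclassified)\n",
	       shared_by_refcount(&pt));
	printf("share-count check thinks the table is shared: %d (correct)\n",
	       shared_by_count(&pt));
	return 0;
}
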
diff --git a/queue-6.12/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch b/queue-6.12/mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch
new file mode 100644 (file)
index 0000000..90f6ea3
--- /dev/null
@@ -0,0 +1,233 @@
+From 8ec396d05d1b737c87311fb7311f753b02c2a6b1 Mon Sep 17 00:00:00 2001
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Date: Thu, 28 Nov 2024 15:06:17 +0000
+Subject: mm: reinstate ability to map write-sealed memfd mappings read-only
+
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+
+commit 8ec396d05d1b737c87311fb7311f753b02c2a6b1 upstream.
+
+Patch series "mm: reinstate ability to map write-sealed memfd mappings
+read-only".
+
+In commit 158978945f31 ("mm: perform the mapping_map_writable() check
+after call_mmap()") (and preceding changes in the same series) it became
+possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only.
+
+Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path
+behaviour") unintentionally undid this logic by moving the
+mapping_map_writable() check before the shmem_mmap() hook is invoked,
+thereby regressing this change.
+
+This series reworks how we both permit write-sealed mappings to be mapped
+read-only and disallow mprotect() from undoing the write-seal, fixing this
+regression.
+
+We also add a regression test to ensure that we do not accidentally
+regress this in future.
+
+Thanks to Julian Orth for reporting this regression.
+
+
+This patch (of 2):
+
+In commit 158978945f31 ("mm: perform the mapping_map_writable() check
+after call_mmap()") (and preceding changes in the same series) it became
+possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only.
+
+This was previously unnecessarily disallowed, despite the man page
+documentation indicating that it should be permitted, thereby limiting the
+usefulness of the F_SEAL_WRITE logic.
+
+We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE
+seal (one which disallows future writes to the memfd) to also be used for
+F_SEAL_WRITE.
+
+For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a
+read-only mapping to disallow mprotect() from overriding the seal - an
+operation performed by seal_check_write(), invoked from shmem_mmap(), the
+f_op->mmap() hook used by shmem mappings.
+
+By extending this to F_SEAL_WRITE and critically - checking
+mapping_map_writable() to determine if we may map the memfd AFTER we
+invoke shmem_mmap() - the desired logic becomes possible.  This is because
+mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will
+have cleared.
+
+Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path
+behaviour") unintentionally undid this logic by moving the
+mapping_map_writable() check before the shmem_mmap() hook is invoked,
+thereby regressing this change.
+
+We reinstate this functionality by moving the check out of shmem_mmap()
+and instead performing it in do_mmap() at the point at which VMA flags are
+being determined, which seems in any case to be a more appropriate place
+in which to make this determination.
+
+In order to achieve this, we rework the memfd seal logic to give us access
+to this information using existing logic, and eliminate the clearing of
+VM_MAYWRITE from seal_check_write(), since we now perform it in do_mmap()
+instead.
+
+Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com
+Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour")
+Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Reported-by: Julian Orth <ju.orth@gmail.com>
+Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/
+Cc: Jann Horn <jannh@google.com>
+Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memfd.h |   14 ++++++++++++
+ include/linux/mm.h    |   58 ++++++++++++++++++++++++++++++++++----------------
+ mm/memfd.c            |    2 -
+ mm/mmap.c             |    4 +++
+ 4 files changed, 59 insertions(+), 19 deletions(-)
+
+--- a/include/linux/memfd.h
++++ b/include/linux/memfd.h
+@@ -7,6 +7,7 @@
+ #ifdef CONFIG_MEMFD_CREATE
+ extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg);
+ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx);
++unsigned int *memfd_file_seals_ptr(struct file *file);
+ #else
+ static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a)
+ {
+@@ -16,6 +17,19 @@ static inline struct folio *memfd_alloc_
+ {
+       return ERR_PTR(-EINVAL);
+ }
++
++static inline unsigned int *memfd_file_seals_ptr(struct file *file)
++{
++      return NULL;
++}
+ #endif
++/* Retrieve memfd seals associated with the file, if any. */
++static inline unsigned int memfd_file_seals(struct file *file)
++{
++      unsigned int *sealsp = memfd_file_seals_ptr(file);
++
++      return sealsp ? *sealsp : 0;
++}
++
+ #endif /* __LINUX_MEMFD_H */
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -4079,6 +4079,37 @@ void mem_dump_obj(void *object);
+ static inline void mem_dump_obj(void *object) {}
+ #endif
++static inline bool is_write_sealed(int seals)
++{
++      return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
++}
++
++/**
++ * is_readonly_sealed - Checks whether write-sealed but mapped read-only,
++ *                      in which case writes should be disallowed moving
++ *                      forwards.
++ * @seals: the seals to check
++ * @vm_flags: the VMA flags to check
++ *
++ * Returns whether readonly sealed, in which case writes should be disallowed
++ * going forward.
++ */
++static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags)
++{
++      /*
++       * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
++       * MAP_SHARED and read-only, take care to not allow mprotect to
++       * revert protections on such mappings. Do this only for shared
++       * mappings. For private mappings, don't need to mask
++       * VM_MAYWRITE as we still want them to be COW-writable.
++       */
++      if (is_write_sealed(seals) &&
++          ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED))
++              return true;
++
++      return false;
++}
++
+ /**
+  * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
+  *                    handle them.
+@@ -4090,24 +4121,15 @@ static inline void mem_dump_obj(void *ob
+  */
+ static inline int seal_check_write(int seals, struct vm_area_struct *vma)
+ {
+-      if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+-              /*
+-               * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+-               * write seals are active.
+-               */
+-              if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+-                      return -EPERM;
+-
+-              /*
+-               * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
+-               * MAP_SHARED and read-only, take care to not allow mprotect to
+-               * revert protections on such mappings. Do this only for shared
+-               * mappings. For private mappings, don't need to mask
+-               * VM_MAYWRITE as we still want them to be COW-writable.
+-               */
+-              if (vma->vm_flags & VM_SHARED)
+-                      vm_flags_clear(vma, VM_MAYWRITE);
+-      }
++      if (!is_write_sealed(seals))
++              return 0;
++
++      /*
++       * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
++       * write seals are active.
++       */
++      if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
++              return -EPERM;
+       return 0;
+ }
+--- a/mm/memfd.c
++++ b/mm/memfd.c
+@@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct ad
+       return error;
+ }
+-static unsigned int *memfd_file_seals_ptr(struct file *file)
++unsigned int *memfd_file_seals_ptr(struct file *file)
+ {
+       if (shmem_file(file))
+               return &SHMEM_I(file_inode(file))->seals;
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -47,6 +47,7 @@
+ #include <linux/oom.h>
+ #include <linux/sched/mm.h>
+ #include <linux/ksm.h>
++#include <linux/memfd.h>
+ #include <linux/uaccess.h>
+ #include <asm/cacheflush.h>
+@@ -368,6 +369,7 @@ unsigned long do_mmap(struct file *file,
+       if (file) {
+               struct inode *inode = file_inode(file);
++              unsigned int seals = memfd_file_seals(file);
+               unsigned long flags_mask;
+               if (!file_mmap_ok(file, inode, pgoff, len))
+@@ -408,6 +410,8 @@ unsigned long do_mmap(struct file *file,
+                       vm_flags |= VM_SHARED | VM_MAYSHARE;
+                       if (!(file->f_mode & FMODE_WRITE))
+                               vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
++                      else if (is_readonly_sealed(seals, vm_flags))
++                              vm_flags &= ~VM_MAYWRITE;
+                       fallthrough;
+               case MAP_PRIVATE:
+                       if (!(file->f_mode & FMODE_READ))
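
A small user-space demo of the behaviour this patch reinstates, assuming a
kernel with the fix applied: an F_SEAL_WRITE sealed memfd can again be
mmap()ed MAP_SHARED read-only, while a writable shared mapping and a later
mprotect(PROT_WRITE) remain refused. Illustrative only, not part of the patch.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "sealed";
	int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);

	if (fd < 0 || write(fd, msg, sizeof(msg)) != (ssize_t)sizeof(msg))
		return 1;

	/* Seal against any further writes, via write() or shared mappings. */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0)
		return 1;

	/* Read-only shared mapping: succeeds on fixed kernels. */
	char *ro = mmap(NULL, sizeof(msg), PROT_READ, MAP_SHARED, fd, 0);
	printf("PROT_READ mmap: %s\n", ro == MAP_FAILED ? "failed" : "ok");

	/* Writable shared mapping must still be refused (EPERM). */
	void *rw = mmap(NULL, sizeof(msg), PROT_READ | PROT_WRITE,
			MAP_SHARED, fd, 0);
	printf("PROT_WRITE mmap refused: %s\n", rw == MAP_FAILED ? "yes" : "no");

	/* mprotect() cannot undo the seal since VM_MAYWRITE is cleared. */
	if (ro != MAP_FAILED)
		printf("mprotect(PROT_WRITE) refused: %s\n",
		       mprotect(ro, sizeof(msg), PROT_READ | PROT_WRITE) ?
		       "yes" : "no");

	return 0;
}
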
diff --git a/queue-6.12/series b/queue-6.12/series
index eee70ffbe6c65c75d5a67f4c0b55902f1978f05d..9962430204032beeb7e3965897a660e9695dd0aa 100644 (file)
@@ -150,3 +150,5 @@ gve-trigger-rx-napi-instead-of-tx-napi-in-gve_xsk_wakeup.patch
 mm-readahead-fix-large-folio-support-in-async-readahead.patch
 mm-kmemleak-fix-sleeping-function-called-from-invalid-context-at-print-message.patch
 mm-vmscan-account-for-free-pages-to-prevent-infinite-loop-in-throttle_direct_reclaim.patch
+mm-reinstate-ability-to-map-write-sealed-memfd-mappings-read-only.patch
+mm-hugetlb-independent-pmd-page-table-shared-count.patch