6.10-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Mon, 29 Jul 2024 07:44:25 +0000 (09:44 +0200)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Mon, 29 Jul 2024 07:44:25 +0000 (09:44 +0200)
added patches:
hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch
landlock-don-t-lose-track-of-restrictions-on-cred_transfer.patch
mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch
mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch
mm-hugetlb-fix-possible-recursive-locking-detected-warning.patch
mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch
mm-mglru-fix-ineffective-protection-calculation.patch
mm-mglru-fix-overshooting-shrinker-memory.patch
mm-mmap_lock-replace-get_memcg_path_buf-with-on-stack-buffer.patch
selftests-landlock-add-cred_transfer-test.patch
x86-efistub-avoid-returning-efi_success-on-error.patch
x86-efistub-revert-to-heap-allocated-boot_params-for-pe-entrypoint.patch

13 files changed:
queue-6.10/hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch [new file with mode: 0644]
queue-6.10/landlock-don-t-lose-track-of-restrictions-on-cred_transfer.patch [new file with mode: 0644]
queue-6.10/mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch [new file with mode: 0644]
queue-6.10/mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch [new file with mode: 0644]
queue-6.10/mm-hugetlb-fix-possible-recursive-locking-detected-warning.patch [new file with mode: 0644]
queue-6.10/mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch [new file with mode: 0644]
queue-6.10/mm-mglru-fix-ineffective-protection-calculation.patch [new file with mode: 0644]
queue-6.10/mm-mglru-fix-overshooting-shrinker-memory.patch [new file with mode: 0644]
queue-6.10/mm-mmap_lock-replace-get_memcg_path_buf-with-on-stack-buffer.patch [new file with mode: 0644]
queue-6.10/selftests-landlock-add-cred_transfer-test.patch [new file with mode: 0644]
queue-6.10/series
queue-6.10/x86-efistub-avoid-returning-efi_success-on-error.patch [new file with mode: 0644]
queue-6.10/x86-efistub-revert-to-heap-allocated-boot_params-for-pe-entrypoint.patch [new file with mode: 0644]

diff --git a/queue-6.10/hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch b/queue-6.10/hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch
new file mode 100644
index 0000000..008f3d1
--- /dev/null
@@ -0,0 +1,131 @@
+From 003af997c8a945493859dd1a2d015cc9387ff27a Mon Sep 17 00:00:00 2001
+From: Aristeu Rozanski <aris@redhat.com>
+Date: Fri, 21 Jun 2024 15:00:50 -0400
+Subject: hugetlb: force allocating surplus hugepages on mempolicy allowed nodes
+
+From: Aristeu Rozanski <aris@redhat.com>
+
+commit 003af997c8a945493859dd1a2d015cc9387ff27a upstream.
+
+When trying to allocate a hugepage while no reserved ones are free, the
+allocation may still be allowed if a number of overcommit hugepages was
+configured (using /proc/sys/vm/nr_overcommit_hugepages) and that number
+hasn't been reached yet.  This allows extra hugepages to be allocated
+dynamically when resources are available.  Some sysadmins even prefer not
+to reserve any hugepages and to set a large number of overcommit hugepages.
+
+But when attempting to allocate overcommit hugepages on a multi-node
+system (either NUMA or mempolicy/cpuset), these allocations may randomly
+fail even when there are resources available for the allocation.
+
+This happens because allowed_mems_nr() only accounts for the number of
+free hugepages in the nodes the current process belongs to, while the
+surplus hugepage allocation can be satisfied from any node.  In case one
+or more of the requested surplus hugepages are allocated on a different
+node, the whole allocation will fail due to allowed_mems_nr() returning a
+lower value.
+
+So allocate surplus hugepages in one of the nodes the current process
+belongs to.
+
+An easy way to reproduce this issue is to use a system with 2+ NUMA nodes:
+
+       # echo 0 >/proc/sys/vm/nr_hugepages
+       # echo 1 >/proc/sys/vm/nr_overcommit_hugepages
+       # numactl -m0 ./tools/testing/selftests/mm/map_hugetlb 2
+
+Repeated runs of the map_hugetlb test application will eventually fail
+when the hugepage ends up being allocated on a different node.
+
+[aris@ruivo.org: v2]
+  Link: https://lkml.kernel.org/r/20240701212343.GG844599@cathedrallabs.org
+Link: https://lkml.kernel.org/r/20240621190050.mhxwb65zn37doegp@redhat.com
+Signed-off-by: Aristeu Rozanski <aris@redhat.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Aristeu Rozanski <aris@ruivo.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Vishal Moola <vishal.moola@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c |   47 ++++++++++++++++++++++++++++-------------------
+ 1 file changed, 28 insertions(+), 19 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -2586,6 +2586,23 @@ struct folio *alloc_hugetlb_folio_nodema
+       return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
+ }
++static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
++{
++#ifdef CONFIG_NUMA
++      struct mempolicy *mpol = get_task_policy(current);
++
++      /*
++       * Only enforce MPOL_BIND policy which overlaps with cpuset policy
++       * (from policy_nodemask) specifically for hugetlb case
++       */
++      if (mpol->mode == MPOL_BIND &&
++              (apply_policy_zone(mpol, gfp_zone(gfp)) &&
++               cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
++              return &mpol->nodes;
++#endif
++      return NULL;
++}
++
+ /*
+  * Increase the hugetlb pool such that it can accommodate a reservation
+  * of size 'delta'.
+@@ -2599,6 +2616,8 @@ static int gather_surplus_pages(struct h
+       long i;
+       long needed, allocated;
+       bool alloc_ok = true;
++      int node;
++      nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
+       lockdep_assert_held(&hugetlb_lock);
+       needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
+@@ -2613,8 +2632,15 @@ static int gather_surplus_pages(struct h
+ retry:
+       spin_unlock_irq(&hugetlb_lock);
+       for (i = 0; i < needed; i++) {
+-              folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+-                              NUMA_NO_NODE, NULL);
++              folio = NULL;
++              for_each_node_mask(node, cpuset_current_mems_allowed) {
++                      if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) {
++                              folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
++                                              node, NULL);
++                              if (folio)
++                                      break;
++                      }
++              }
+               if (!folio) {
+                       alloc_ok = false;
+                       break;
+@@ -4840,23 +4866,6 @@ static int __init default_hugepagesz_set
+ }
+ __setup("default_hugepagesz=", default_hugepagesz_setup);
+-static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+-{
+-#ifdef CONFIG_NUMA
+-      struct mempolicy *mpol = get_task_policy(current);
+-
+-      /*
+-       * Only enforce MPOL_BIND policy which overlaps with cpuset policy
+-       * (from policy_nodemask) specifically for hugetlb case
+-       */
+-      if (mpol->mode == MPOL_BIND &&
+-              (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+-               cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+-              return &mpol->nodes;
+-#endif
+-      return NULL;
+-}
+-
+ static unsigned int allowed_mems_nr(struct hstate *h)
+ {
+       int node;
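
For readers who want to trigger the pre-fix failure without the selftests
tree, the shell reproducer above can also be expressed directly in C.  The
sketch below is illustrative only (the 2MB default hugepage size, the
single-node mask and the sysctl preconditions are assumptions, and
set_mempolicy() here comes from libnuma's numaif.h); it binds the task to
node 0 and maps one hugepage that must come from the overcommit pool,
which failed intermittently before this patch whenever the surplus page
landed on another node.

/* surplus_repro.c - hypothetical reproducer sketch, not the in-tree test.
 * Build with: gcc surplus_repro.c -o surplus_repro -lnuma
 * Assumes nr_hugepages=0 and nr_overcommit_hugepages=1, as in the commit
 * message, and a default hugepage size of 2MB. */
#define _GNU_SOURCE
#include <numaif.h>		/* set_mempolicy(), MPOL_BIND */
#include <sys/mman.h>
#include <stdio.h>

#define HPAGE_SIZE	(2UL << 20)

int main(void)
{
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *p;

	if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8)) {
		perror("set_mempolicy");
		return 1;
	}

	/* The reservation path runs gather_surplus_pages(); before the fix
	 * the surplus page could land on node 1, making mmap() fail. */
	p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}

	*(volatile char *)p = 1;	/* touch the huge page */
	munmap(p, HPAGE_SIZE);
	return 0;
}
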
diff --git a/queue-6.10/landlock-don-t-lose-track-of-restrictions-on-cred_transfer.patch b/queue-6.10/landlock-don-t-lose-track-of-restrictions-on-cred_transfer.patch
new file mode 100644
index 0000000..76236de
--- /dev/null
@@ -0,0 +1,72 @@
+From 39705a6c29f8a2b93cf5b99528a55366c50014d1 Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Wed, 24 Jul 2024 14:49:01 +0200
+Subject: landlock: Don't lose track of restrictions on cred_transfer
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jann Horn <jannh@google.com>
+
+commit 39705a6c29f8a2b93cf5b99528a55366c50014d1 upstream.
+
+When a process' cred struct is replaced, this _almost_ always invokes
+the cred_prepare LSM hook; but in one special case (when
+KEYCTL_SESSION_TO_PARENT updates the parent's credentials), the
+cred_transfer LSM hook is used instead.  Landlock only implements the
+cred_prepare hook, not cred_transfer, so KEYCTL_SESSION_TO_PARENT causes
+all information on Landlock restrictions to be lost.
+
+This basically means that a process with the ability to use the fork()
+and keyctl() syscalls can get rid of all Landlock restrictions on
+itself.
+
+Fix it by adding a cred_transfer hook that does the same thing as the
+existing cred_prepare hook. (Implemented by having hook_cred_prepare()
+call hook_cred_transfer() so that the two functions are less likely to
+accidentally diverge in the future.)
+
+Cc: stable@kernel.org
+Fixes: 385975dca53e ("landlock: Set up the security framework and manage credentials")
+Signed-off-by: Jann Horn <jannh@google.com>
+Link: https://lore.kernel.org/r/20240724-landlock-houdini-fix-v1-1-df89a4560ca3@google.com
+Signed-off-by: Mickaël Salaün <mic@digikod.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ security/landlock/cred.c |   11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+--- a/security/landlock/cred.c
++++ b/security/landlock/cred.c
+@@ -14,8 +14,8 @@
+ #include "ruleset.h"
+ #include "setup.h"
+-static int hook_cred_prepare(struct cred *const new,
+-                           const struct cred *const old, const gfp_t gfp)
++static void hook_cred_transfer(struct cred *const new,
++                             const struct cred *const old)
+ {
+       struct landlock_ruleset *const old_dom = landlock_cred(old)->domain;
+@@ -23,6 +23,12 @@ static int hook_cred_prepare(struct cred
+               landlock_get_ruleset(old_dom);
+               landlock_cred(new)->domain = old_dom;
+       }
++}
++
++static int hook_cred_prepare(struct cred *const new,
++                           const struct cred *const old, const gfp_t gfp)
++{
++      hook_cred_transfer(new, old);
+       return 0;
+ }
+@@ -36,6 +42,7 @@ static void hook_cred_free(struct cred *
+ static struct security_hook_list landlock_hooks[] __ro_after_init = {
+       LSM_HOOK_INIT(cred_prepare, hook_cred_prepare),
++      LSM_HOOK_INIT(cred_transfer, hook_cred_transfer),
+       LSM_HOOK_INIT(cred_free, hook_cred_free),
+ };
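
As a concrete illustration of the hole this closes (it is the same
sequence the new cred_transfer selftest later in this series exercises),
the escape needed nothing more than two keyctl() calls issued by a
restricted child; before this patch, the second call installed fresh
credentials on the parent through the cred_transfer path only, so the
parent's Landlock domain was silently dropped.  Sketch only:

#include <linux/keyctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static void shed_parent_landlock_domain(void)
{
	/* The child needs its own session keyring first, otherwise
	 * KEYCTL_SESSION_TO_PARENT is a no-op. */
	syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL, 0, 0, 0);

	/* Installs new credentials on the parent; this path used to skip
	 * Landlock's cred_prepare hook entirely. */
	syscall(__NR_keyctl, KEYCTL_SESSION_TO_PARENT, 0, 0, 0, 0);
}
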
diff --git a/queue-6.10/mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch b/queue-6.10/mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch
new file mode 100644
index 0000000..c262d9a
--- /dev/null
@@ -0,0 +1,197 @@
+From d659b715e94ac039803d7601505d3473393fc0be Mon Sep 17 00:00:00 2001
+From: Gavin Shan <gshan@redhat.com>
+Date: Mon, 15 Jul 2024 10:04:23 +1000
+Subject: mm/huge_memory: avoid PMD-size page cache if needed
+
+From: Gavin Shan <gshan@redhat.com>
+
+commit d659b715e94ac039803d7601505d3473393fc0be upstream.
+
+xarray can't support arbitrary page cache sizes.  The largest supported
+page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71
+("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray").  However,
+it's possible to create a 512MB page cache folio in the huge memory
+collapsing path on an ARM64 system whose base page size is 64KB.  A 512MB
+page cache folio breaks that limitation, and a warning is raised when the
+xarray entry is split, as shown in the following example.
+
+[root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize
+KernelPageSize:       64 kB
+[root@dhcp-10-26-1-207 ~]# cat /tmp/test.c
+   :
+int main(int argc, char **argv)
+{
+       const char *filename = TEST_XFS_FILENAME;
+       int fd = 0;
+       void *buf = (void *)-1, *p;
+       int pgsize = getpagesize();
+       int ret = 0;
+
+       if (pgsize != 0x10000) {
+               fprintf(stdout, "System with 64KB base page size is required!\n");
+               return -EPERM;
+       }
+
+       system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb");
+       system("echo 1 > /proc/sys/vm/drop_caches");
+
+       /* Open the xfs file */
+       fd = open(filename, O_RDONLY);
+       assert(fd > 0);
+
+       /* Create VMA */
+       buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0);
+       assert(buf != (void *)-1);
+       fprintf(stdout, "mapped buffer at 0x%p\n", buf);
+
+       /* Populate VMA */
+       ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE);
+       assert(ret == 0);
+       ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ);
+       assert(ret == 0);
+
+       /* Collapse VMA */
+       ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE);
+       assert(ret == 0);
+       ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE);
+       if (ret) {
+               fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno);
+               goto out;
+       }
+
+       /* Split xarray entry. Write permission is needed */
+       munmap(buf, TEST_MEM_SIZE);
+       buf = (void *)-1;
+       close(fd);
+       fd = open(filename, O_RDWR);
+       assert(fd > 0);
+       fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+                 TEST_MEM_SIZE - pgsize, pgsize);
+out:
+       if (buf != (void *)-1)
+               munmap(buf, TEST_MEM_SIZE);
+       if (fd > 0)
+               close(fd);
+
+       return ret;
+}
+
+[root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test
+[root@dhcp-10-26-1-207 ~]# /tmp/test
+ ------------[ cut here ]------------
+ WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
+ Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib    \
+ nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct      \
+ nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4      \
+ ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse   \
+ xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net  \
+ sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio
+ CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9
+ Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
+ pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
+ pc : xas_split_alloc+0xf8/0x128
+ lr : split_huge_page_to_list_to_order+0x1c4/0x780
+ sp : ffff8000ac32f660
+ x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0
+ x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d
+ x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000
+ x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000
+ x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000
+ x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
+ x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c
+ x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8
+ x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40
+ x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
+ Call trace:
+  xas_split_alloc+0xf8/0x128
+  split_huge_page_to_list_to_order+0x1c4/0x780
+  truncate_inode_partial_folio+0xdc/0x160
+  truncate_inode_pages_range+0x1b4/0x4a8
+  truncate_pagecache_range+0x84/0xa0
+  xfs_flush_unmap_range+0x70/0x90 [xfs]
+  xfs_file_fallocate+0xfc/0x4d8 [xfs]
+  vfs_fallocate+0x124/0x2f0
+  ksys_fallocate+0x4c/0xa0
+  __arm64_sys_fallocate+0x24/0x38
+  invoke_syscall.constprop.0+0x7c/0xd8
+  do_el0_svc+0xb4/0xd0
+  el0_svc+0x44/0x1d8
+  el0t_64_sync_handler+0x134/0x150
+  el0t_64_sync+0x17c/0x180
+
+Fix it by correcting the supported page cache orders, with different sets
+for DAX and other files.  With this corrected, a 512MB page cache becomes
+disallowed for all non-DAX files on ARM64 systems where the base page size
+is 64KB.  After this patch is applied, the test program fails with error
+-EINVAL returned from __thp_vma_allowable_orders() and the madvise()
+system call used to collapse the page cache.
+
+Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com
+Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
+Signed-off-by: Gavin Shan <gshan@redhat.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
+Acked-by: Zi Yan <ziy@nvidia.com>
+Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Barry Song <baohua@kernel.org>
+Cc: Don Dutile <ddutile@redhat.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: William Kucharski <william.kucharski@oracle.com>
+Cc: <stable@vger.kernel.org>   [5.17+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/huge_mm.h |   12 +++++++++---
+ mm/huge_memory.c        |   12 ++++++++++--
+ 2 files changed, 19 insertions(+), 5 deletions(-)
+
+--- a/include/linux/huge_mm.h
++++ b/include/linux/huge_mm.h
+@@ -72,14 +72,20 @@ extern struct kobj_attribute shmem_enabl
+ #define THP_ORDERS_ALL_ANON   ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
+ /*
+- * Mask of all large folio orders supported for file THP.
++ * Mask of all large folio orders supported for file THP. Folios in a DAX
++ * file are never split, and the MAX_PAGECACHE_ORDER limit does not apply
++ * to them.
+  */
+-#define THP_ORDERS_ALL_FILE   (BIT(PMD_ORDER) | BIT(PUD_ORDER))
++#define THP_ORDERS_ALL_FILE_DAX               \
++      (BIT(PMD_ORDER) | BIT(PUD_ORDER))
++#define THP_ORDERS_ALL_FILE_DEFAULT   \
++      ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
+ /*
+  * Mask of all large folio orders supported for THP.
+  */
+-#define THP_ORDERS_ALL                (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
++#define THP_ORDERS_ALL        \
++      (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
+ #define TVA_SMAPS             (1 << 0)        /* Will be used for procfs */
+ #define TVA_IN_PF             (1 << 1)        /* Page fault handler */
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders
+       bool smaps = tva_flags & TVA_SMAPS;
+       bool in_pf = tva_flags & TVA_IN_PF;
+       bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
++      unsigned long supported_orders;
++
+       /* Check the intersection of requested and supported orders. */
+-      orders &= vma_is_anonymous(vma) ?
+-                      THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
++      if (vma_is_anonymous(vma))
++              supported_orders = THP_ORDERS_ALL_ANON;
++      else if (vma_is_dax(vma))
++              supported_orders = THP_ORDERS_ALL_FILE_DAX;
++      else
++              supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
++
++      orders &= supported_orders;
+       if (!orders)
+               return 0;
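
A short worked example of what the two new masks mean in the failing
configuration described above (the concrete numbers are illustrative: on
arm64 with 64KB base pages PMD_ORDER is 13, and MAX_PAGECACHE_ORDER works
out lower, around 11, because of the xarray split limit):

  THP_ORDERS_ALL_FILE_DEFAULT = ((1 << (MAX_PAGECACHE_ORDER + 1)) - 1) & ~1
                              = ((1 << 12) - 1) & ~1 = 0xffe  -> orders 1..11 (up to 128MB)
  THP_ORDERS_ALL_FILE_DAX     = (1 << PMD_ORDER) | (1 << PUD_ORDER)  -> order 13 (512MB) still allowed

So __thp_vma_allowable_orders() now rejects the order-13 collapse for
regular files (the case that tripped the xas_split_alloc() warning above),
while DAX mappings, whose folios are never split, keep the PMD/PUD orders.
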
diff --git a/queue-6.10/mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch b/queue-6.10/mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch
new file mode 100644
index 0000000..b942f30
--- /dev/null
@@ -0,0 +1,48 @@
+From d9592025000b3cf26c742f3505da7b83aedc26d5 Mon Sep 17 00:00:00 2001
+From: Yang Shi <yang@os.amperecomputing.com>
+Date: Fri, 12 Jul 2024 08:58:55 -0700
+Subject: mm: huge_memory: use !CONFIG_64BIT to relax huge page alignment on 32 bit machines
+
+From: Yang Shi <yang@os.amperecomputing.com>
+
+commit d9592025000b3cf26c742f3505da7b83aedc26d5 upstream.
+
+Yves-Alexis Perez reported commit 4ef9ad19e176 ("mm: huge_memory: don't
+force huge page alignment on 32 bit") didn't work for x86_32 [1].  It is
+because x86_32 uses CONFIG_X86_32 instead of CONFIG_32BIT.
+
+!CONFIG_64BIT should cover all 32 bit machines.
+
+[1] https://lore.kernel.org/linux-mm/CAHbLzkr1LwH3pcTgM+aGQ31ip2bKqiqEQ8=FQB+t2c3dhNKNHA@mail.gmail.com/
+
+Link: https://lkml.kernel.org/r/20240712155855.1130330-1-yang@os.amperecomputing.com
+Fixes: 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit")
+Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
+Reported-by: Yves-Alexis Perez <corsac@debian.org>
+Tested-by: Yves-Alexis Perez <corsac@debian.org>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Ben Hutchings <ben@decadent.org.uk>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Jiri Slaby <jirislaby@kernel.org>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Rik van Riel <riel@surriel.com>
+Cc: Salvatore Bonaccorso <carnil@debian.org>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: <stable@vger.kernel.org>   [6.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/huge_memory.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -857,7 +857,7 @@ static unsigned long __thp_get_unmapped_
+       loff_t off_align = round_up(off, size);
+       unsigned long len_pad, ret, off_sub;
+-      if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
++      if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
+               return 0;
+       if (off_end <= off_align || (off_end - off_align) < size)
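
A brief note on why the original check was a no-op (the architecture list
is illustrative): 32BIT is an arch-local Kconfig symbol that only a few
architectures such as mips and riscv define, and x86 is not one of them,
so on an x86_32 build IS_ENABLED(CONFIG_32BIT) always evaluated to 0 and
the early return was compiled away.  64BIT, on the other hand, is defined
by the 64-bit architectures and absent from every 32-bit build, so
!IS_ENABLED(CONFIG_64BIT) is the portable way to say "this is a 32-bit
kernel".
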
diff --git a/queue-6.10/mm-hugetlb-fix-possible-recursive-locking-detected-warning.patch b/queue-6.10/mm-hugetlb-fix-possible-recursive-locking-detected-warning.patch
new file mode 100644
index 0000000..48fc6b6
--- /dev/null
@@ -0,0 +1,100 @@
+From 667574e873b5f77a220b2a93329689f36fb56d5d Mon Sep 17 00:00:00 2001
+From: Miaohe Lin <linmiaohe@huawei.com>
+Date: Fri, 12 Jul 2024 11:13:14 +0800
+Subject: mm/hugetlb: fix possible recursive locking detected warning
+
+From: Miaohe Lin <linmiaohe@huawei.com>
+
+commit 667574e873b5f77a220b2a93329689f36fb56d5d upstream.
+
+When trying to demote 1G hugetlb folios, a lockdep warning is observed:
+
+============================================
+WARNING: possible recursive locking detected
+6.10.0-rc6-00452-ga4d0275fa660-dirty #79 Not tainted
+--------------------------------------------
+bash/710 is trying to acquire lock:
+ffffffff8f0a7850 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0x244/0x460
+
+but task is already holding lock:
+ffffffff8f0a6f48 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0xae/0x460
+
+other info that might help us debug this:
+ Possible unsafe locking scenario:
+
+       CPU0
+       ----
+  lock(&h->resize_lock);
+  lock(&h->resize_lock);
+
+ *** DEADLOCK ***
+
+ May be due to missing lock nesting notation
+
+4 locks held by bash/710:
+ #0: ffff8f118439c3f0 (sb_writers#5){.+.+}-{0:0}, at: ksys_write+0x64/0xe0
+ #1: ffff8f11893b9e88 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0xf8/0x1d0
+ #2: ffff8f1183dc4428 (kn->active#98){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1d0
+ #3: ffffffff8f0a6f48 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0xae/0x460
+
+stack backtrace:
+CPU: 3 PID: 710 Comm: bash Not tainted 6.10.0-rc6-00452-ga4d0275fa660-dirty #79
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x68/0xa0
+ __lock_acquire+0x10f2/0x1ca0
+ lock_acquire+0xbe/0x2d0
+ __mutex_lock+0x6d/0x400
+ demote_store+0x244/0x460
+ kernfs_fop_write_iter+0x12c/0x1d0
+ vfs_write+0x380/0x540
+ ksys_write+0x64/0xe0
+ do_syscall_64+0xb9/0x1d0
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+RIP: 0033:0x7fa61db14887
+RSP: 002b:00007ffc56c48358 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fa61db14887
+RDX: 0000000000000002 RSI: 000055a030050220 RDI: 0000000000000001
+RBP: 000055a030050220 R08: 00007fa61dbd1460 R09: 000000007fffffff
+R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002
+R13: 00007fa61dc1b780 R14: 00007fa61dc17600 R15: 00007fa61dc16a00
+ </TASK>
+
+Lockdep considers this an AA deadlock because the different resize_lock
+mutexes reside in the same lockdep class, but this is a false positive.
+Place them in distinct classes to avoid these warnings.
+
+Link: https://lkml.kernel.org/r/20240712031314.2570452-1-linmiaohe@huawei.com
+Fixes: 8531fc6f52f5 ("hugetlb: add hugetlb demote page support")
+Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
+Acked-by: Muchun Song <muchun.song@linux.dev>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/hugetlb.h |    1 +
+ mm/hugetlb.c            |    2 +-
+ 2 files changed, 2 insertions(+), 1 deletion(-)
+
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -681,6 +681,7 @@ HPAGEFLAG(RawHwpUnreliable, raw_hwp_unre
+ /* Defines one hugetlb page size */
+ struct hstate {
+       struct mutex resize_lock;
++      struct lock_class_key resize_key;
+       int next_nid_to_alloc;
+       int next_nid_to_free;
+       unsigned int order;
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -4643,7 +4643,7 @@ void __init hugetlb_add_hstate(unsigned
+       BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
+       BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
+       h = &hstates[hugetlb_max_hstate++];
+-      mutex_init(&h->resize_lock);
++      __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key);
+       h->order = order;
+       h->mask = ~(huge_page_size(h) - 1);
+       for (i = 0; i < MAX_NUMNODES; ++i)
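
The fix leans on a standard lockdep idiom: mutex_init() gives every mutex
initialized at the same call site one shared, static lock class, so all
hstate resize_locks landed in a single class and the nested 1G -> 2MB
demotion locking looked like recursive (AA) locking.  Embedding a
lock_class_key per instance separates them.  A minimal sketch of the
pattern outside hugetlb (names are made up; like hstates[], the containing
objects must live in static storage, otherwise the key would have to be
registered with lockdep_register_key()):

#include <linux/mutex.h>

/* One lock class per pool instance instead of one per init call site. */
struct demo_pool {
	struct mutex lock;
	struct lock_class_key key;
};

static struct demo_pool pools[2];	/* static storage, like hstates[] */

static void demo_pool_init(struct demo_pool *pool, const char *name)
{
	/* mutex_init() would put both pools in the same class; __mutex_init()
	 * registers a distinct class keyed on &pool->key. */
	__mutex_init(&pool->lock, name, &pool->key);
}

static void demote_from_pool0_to_pool1(void)
{
	mutex_lock(&pools[0].lock);
	/* Nested acquisition of a different pool's lock: no false AA report
	 * now that the two mutexes belong to different lockdep classes. */
	mutex_lock(&pools[1].lock);
	mutex_unlock(&pools[1].lock);
	mutex_unlock(&pools[0].lock);
}
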
diff --git a/queue-6.10/mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch b/queue-6.10/mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch
new file mode 100644
index 0000000..4208c35
--- /dev/null
@@ -0,0 +1,51 @@
+From 8b671fe1a879923ecfb72dda6caf01460dd885ef Mon Sep 17 00:00:00 2001
+From: Yu Zhao <yuzhao@google.com>
+Date: Thu, 11 Jul 2024 13:19:56 -0600
+Subject: mm/mglru: fix div-by-zero in vmpressure_calc_level()
+
+From: Yu Zhao <yuzhao@google.com>
+
+commit 8b671fe1a879923ecfb72dda6caf01460dd885ef upstream.
+
+evict_folios() uses a second pass to reclaim folios that have gone through
+page writeback and become clean before it finishes the first pass, since
+folio_rotate_reclaimable() cannot handle those folios due to the
+isolation.
+
+The second pass tries to avoid potential double counting by deducting
+scan_control->nr_scanned.  However, this can result in underflow of
+nr_scanned, under a condition where shrink_folio_list() does not increment
+nr_scanned, i.e., when folio_trylock() fails.
+
+The underflow can cause the divisor, i.e., scale=scanned+reclaimed in
+vmpressure_calc_level(), to become zero, resulting in the following crash:
+
+  [exception RIP: vmpressure_work_fn+101]
+  process_one_work at ffffffffa3313f2b
+
+Since scan_control->nr_scanned has no established semantics, the potential
+double counting has minimal risks.  Therefore, fix the problem by not
+deducting scan_control->nr_scanned in evict_folios().
+
+Link: https://lkml.kernel.org/r/20240711191957.939105-1-yuzhao@google.com
+Fixes: 359a5e1416ca ("mm: multi-gen LRU: retry folios written back while isolated")
+Reported-by: Wei Xu <weixugc@google.com>
+Signed-off-by: Yu Zhao <yuzhao@google.com>
+Cc: Alexander Motin <mav@ixsystems.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4582,7 +4582,6 @@ retry:
+               /* retry folios that may have missed folio_rotate_reclaimable() */
+               list_move(&folio->lru, &clean);
+-              sc->nr_scanned -= folio_nr_pages(folio);
+       }
+       spin_lock_irq(&lruvec->lru_lock);
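
To make the failure mode concrete (the numbers are purely illustrative):
sc->nr_scanned is an unsigned long, so if the second pass deducts, say, 4
more pages than shrink_folio_list() actually counted, it wraps around to
ULONG_MAX - 3.  If the same reclaim window then reports 4 reclaimed pages,
scanned + reclaimed wraps again to 0, and that sum is exactly the scale
value vmpressure_calc_level() divides by, hence the crash in
vmpressure_work_fn() quoted above.
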
diff --git a/queue-6.10/mm-mglru-fix-ineffective-protection-calculation.patch b/queue-6.10/mm-mglru-fix-ineffective-protection-calculation.patch
new file mode 100644
index 0000000..17f85f7
--- /dev/null
@@ -0,0 +1,183 @@
+From 30d77b7eef019fa4422980806e8b7cdc8674493e Mon Sep 17 00:00:00 2001
+From: Yu Zhao <yuzhao@google.com>
+Date: Fri, 12 Jul 2024 17:29:56 -0600
+Subject: mm/mglru: fix ineffective protection calculation
+
+From: Yu Zhao <yuzhao@google.com>
+
+commit 30d77b7eef019fa4422980806e8b7cdc8674493e upstream.
+
+mem_cgroup_calculate_protection() is not stateless and should only be used
+as part of a top-down tree traversal.  shrink_one() traverses the per-node
+memcg LRU instead of the root_mem_cgroup tree, and therefore it should not
+call mem_cgroup_calculate_protection().
+
+The existing misuse in shrink_one() can cause ineffective protection of
+sub-trees that are grandchildren of root_mem_cgroup.  Fix it by reusing
+lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to
+calculate the protection.
+
+Previously lru_gen_age_node() opportunistically skips the first pass,
+i.e., when scan_control->priority is DEF_PRIORITY.  On the second pass,
+lruvec_is_sizable() uses appropriate scan_control->priority, set by
+set_initial_priority() from lru_gen_shrink_node(), to decide whether a
+memcg is too small to reclaim from.
+
+Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup tree.
+So it should call set_initial_priority() upfront, to make sure
+lruvec_is_sizable() uses appropriate scan_control->priority on the first
+pass.  Otherwise, lruvec_is_reclaimable() can return false negatives and
+result in premature OOM kills when min_ttl_ms is used.
+
+Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@google.com
+Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
+Signed-off-by: Yu Zhao <yuzhao@google.com>
+Reported-by: T.J. Mercier <tjmercier@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c |   82 +++++++++++++++++++++++++++---------------------------------
+ 1 file changed, 38 insertions(+), 44 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -3900,6 +3900,32 @@ done:
+  *                          working set protection
+  ******************************************************************************/
++static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
++{
++      int priority;
++      unsigned long reclaimable;
++
++      if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
++              return;
++      /*
++       * Determine the initial priority based on
++       * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
++       * where reclaimed_to_scanned_ratio = inactive / total.
++       */
++      reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
++      if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
++              reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
++
++      /* round down reclaimable and round up sc->nr_to_reclaim */
++      priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
++
++      /*
++       * The estimation is based on LRU pages only, so cap it to prevent
++       * overshoots of shrinker objects by large margins.
++       */
++      sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
++}
++
+ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
+ {
+       int gen, type, zone;
+@@ -3933,19 +3959,17 @@ static bool lruvec_is_reclaimable(struct
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+       DEFINE_MIN_SEQ(lruvec);
+-      /* see the comment on lru_gen_folio */
+-      gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+-      birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+-
+-      if (time_is_after_jiffies(birth + min_ttl))
++      if (mem_cgroup_below_min(NULL, memcg))
+               return false;
+       if (!lruvec_is_sizable(lruvec, sc))
+               return false;
+-      mem_cgroup_calculate_protection(NULL, memcg);
++      /* see the comment on lru_gen_folio */
++      gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
++      birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+-      return !mem_cgroup_below_min(NULL, memcg);
++      return time_is_before_jiffies(birth + min_ttl);
+ }
+ /* to protect the working set of the last N jiffies */
+@@ -3955,23 +3979,20 @@ static void lru_gen_age_node(struct pgli
+ {
+       struct mem_cgroup *memcg;
+       unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
++      bool reclaimable = !min_ttl;
+       VM_WARN_ON_ONCE(!current_is_kswapd());
+-      /* check the order to exclude compaction-induced reclaim */
+-      if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
+-              return;
++      set_initial_priority(pgdat, sc);
+       memcg = mem_cgroup_iter(NULL, NULL, NULL);
+       do {
+               struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+-              if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
+-                      mem_cgroup_iter_break(NULL, memcg);
+-                      return;
+-              }
++              mem_cgroup_calculate_protection(NULL, memcg);
+-              cond_resched();
++              if (!reclaimable)
++                      reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
+       } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+       /*
+@@ -3979,7 +4000,7 @@ static void lru_gen_age_node(struct pgli
+        * younger than min_ttl. However, another possibility is all memcgs are
+        * either too small or below min.
+        */
+-      if (mutex_trylock(&oom_lock)) {
++      if (!reclaimable && mutex_trylock(&oom_lock)) {
+               struct oom_control oc = {
+                       .gfp_mask = sc->gfp_mask,
+               };
+@@ -4771,8 +4792,7 @@ static int shrink_one(struct lruvec *lru
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+-      mem_cgroup_calculate_protection(NULL, memcg);
+-
++      /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
+       if (mem_cgroup_below_min(NULL, memcg))
+               return MEMCG_LRU_YOUNG;
+@@ -4896,32 +4916,6 @@ static void lru_gen_shrink_lruvec(struct
+       blk_finish_plug(&plug);
+ }
+-static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+-{
+-      int priority;
+-      unsigned long reclaimable;
+-
+-      if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+-              return;
+-      /*
+-       * Determine the initial priority based on
+-       * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+-       * where reclaimed_to_scanned_ratio = inactive / total.
+-       */
+-      reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+-      if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
+-              reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+-
+-      /* round down reclaimable and round up sc->nr_to_reclaim */
+-      priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+-
+-      /*
+-       * The estimation is based on LRU pages only, so cap it to prevent
+-       * overshoots of shrinker objects by large margins.
+-       */
+-      sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
+-}
+-
+ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+ {
+       struct blk_plug plug;
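
For reference, the set_initial_priority() helper moved above picks the
starting priority from the formula in its comment; a worked example with
made-up but plausible numbers: with 32768 (2^15) inactive pages, roughly
128MB on a 4KB-page system, and nr_to_reclaim = 32, the priority becomes
fls_long(32768) - 1 - fls_long(31) = 16 - 1 - 5 = 10, which already sits
inside the clamp range [DEF_PRIORITY / 2, DEF_PRIORITY] = [6, 12], so the
first pass scans roughly total >> 10 pages instead of starting at
DEF_PRIORITY and ramping up over extra passes.  (The DEF_PRIORITY / 2
lower bound visible in the moved hunk comes from the preceding "fix
overshooting shrinker memory" patch in this series.)
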
diff --git a/queue-6.10/mm-mglru-fix-overshooting-shrinker-memory.patch b/queue-6.10/mm-mglru-fix-overshooting-shrinker-memory.patch
new file mode 100644
index 0000000..a6f7b0e
--- /dev/null
@@ -0,0 +1,89 @@
+From 3f74e6bd3b84a8b6bb3cc51609c89e5b9d58eed7 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <yuzhao@google.com>
+Date: Thu, 11 Jul 2024 13:19:57 -0600
+Subject: mm/mglru: fix overshooting shrinker memory
+
+From: Yu Zhao <yuzhao@google.com>
+
+commit 3f74e6bd3b84a8b6bb3cc51609c89e5b9d58eed7 upstream.
+
+set_initial_priority() tries to jump-start global reclaim by estimating
+the priority based on cold/hot LRU pages.  The estimation does not account
+for shrinker objects, and it cannot do so because their sizes can be in
+units other than pages.
+
+If shrinker objects are the majority, e.g., on TrueNAS SCALE 24.04.0 where
+ZFS ARC can use almost all system memory, set_initial_priority() can
+vastly underestimate how much memory the ARC shrinker can evict and assign
+extremely low values to scan_control->priority, resulting in overshoots of
+shrinker objects.
+
+To reproduce the problem, using TrueNAS SCALE 24.04.0 with 32GB DRAM, a
+test ZFS pool and the following commands:
+
+  fio --name=mglru.file --numjobs=36 --ioengine=io_uring \
+      --directory=/root/test-zfs-pool/ --size=1024m --buffered=1 \
+      --rw=randread --random_distribution=random \
+      --time_based --runtime=1h &
+
+  for ((i = 0; i < 20; i++))
+  do
+    sleep 120
+    fio --name=mglru.anon --numjobs=16 --ioengine=mmap \
+      --filename=/dev/zero --size=1024m --fadvise_hint=0 \
+      --rw=randrw --random_distribution=random \
+      --time_based --runtime=1m
+  done
+
+To fix the problem:
+1. Cap scan_control->priority at or above DEF_PRIORITY/2, to prevent
+   the jump-start from being overly aggressive.
+2. Account for the progress from mm_account_reclaimed_pages(), to
+   prevent kswapd_shrink_node() from raising the priority
+   unnecessarily.
+
+Link: https://lkml.kernel.org/r/20240711191957.939105-2-yuzhao@google.com
+Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
+Signed-off-by: Yu Zhao <yuzhao@google.com>
+Reported-by: Alexander Motin <mav@ixsystems.com>
+Cc: Wei Xu <weixugc@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4915,7 +4915,11 @@ static void set_initial_priority(struct
+       /* round down reclaimable and round up sc->nr_to_reclaim */
+       priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+-      sc->priority = clamp(priority, 0, DEF_PRIORITY);
++      /*
++       * The estimation is based on LRU pages only, so cap it to prevent
++       * overshoots of shrinker objects by large margins.
++       */
++      sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
+ }
+ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+@@ -6701,6 +6705,7 @@ static bool kswapd_shrink_node(pg_data_t
+ {
+       struct zone *zone;
+       int z;
++      unsigned long nr_reclaimed = sc->nr_reclaimed;
+       /* Reclaim a number of pages proportional to the number of zones */
+       sc->nr_to_reclaim = 0;
+@@ -6728,7 +6733,8 @@ static bool kswapd_shrink_node(pg_data_t
+       if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
+               sc->order = 0;
+-      return sc->nr_scanned >= sc->nr_to_reclaim;
++      /* account for progress from mm_account_reclaimed_pages() */
++      return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim;
+ }
+ /* Page allocator PCP high watermark is lowered if reclaim is active. */
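
The second hunk is easiest to read with example numbers (hypothetical):
suppose kswapd's target for this pass is sc->nr_to_reclaim = 1024 pages,
the LRU scan only touched 100 pages, but shrinkers reported 2048 freed
pages through mm_account_reclaimed_pages(), which end up in
sc->nr_reclaimed.  The old check, sc->nr_scanned >= sc->nr_to_reclaim,
fails (100 < 1024) and balance_pgdat() raises the priority even though
plenty of memory was just released; the new
max(scanned, reclaimed delta) >= nr_to_reclaim check sees 2048 >= 1024,
counts it as progress, and the priority (and with it the pressure on
shrinker objects) stops ratcheting up.
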
diff --git a/queue-6.10/mm-mmap_lock-replace-get_memcg_path_buf-with-on-stack-buffer.patch b/queue-6.10/mm-mmap_lock-replace-get_memcg_path_buf-with-on-stack-buffer.patch
new file mode 100644
index 0000000..8b9335a
--- /dev/null
@@ -0,0 +1,262 @@
+From 7d6be67cfdd4a53cea7147313ca13c531e3a470f Mon Sep 17 00:00:00 2001
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Date: Fri, 21 Jun 2024 10:08:41 +0900
+Subject: mm: mmap_lock: replace get_memcg_path_buf() with on-stack buffer
+
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+
+commit 7d6be67cfdd4a53cea7147313ca13c531e3a470f upstream.
+
+Commit 2b5067a8143e ("mm: mmap_lock: add tracepoints around lock
+acquisition") introduced TRACE_MMAP_LOCK_EVENT() macro using
+preempt_disable() in order to let get_mm_memcg_path() return a percpu
+buffer exclusively used by normal, softirq, irq and NMI contexts
+respectively.
+
+Commit 832b50725373 ("mm: mmap_lock: use local locks instead of disabling
+preemption") replaced preempt_disable() with local_lock(&memcg_paths.lock)
+based on an argument that preempt_disable() has to be avoided because
+get_mm_memcg_path() might sleep if PREEMPT_RT=y.
+
+But syzbot started reporting
+
+  inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
+
+and
+
+  inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
+
+messages, because local_lock() does not disable IRQs.
+
+We could replace local_lock() with local_lock_irqsave() in order to
+suppress these messages.  But this patch instead replaces the percpu
+buffers with an on-stack buffer, since each buffer returned by
+get_memcg_path_buf() is only 256 bytes, which is small enough to allocate
+from the current thread's kernel stack.
+
+Link: https://lkml.kernel.org/r/ef22d289-eadb-4ed9-863b-fbc922b33d8d@I-love.SAKURA.ne.jp
+Reported-by: syzbot <syzbot+40905bca570ae6784745@syzkaller.appspotmail.com>
+Closes: https://syzkaller.appspot.com/bug?extid=40905bca570ae6784745
+Fixes: 832b50725373 ("mm: mmap_lock: use local locks instead of disabling preemption")
+Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Reviewed-by: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Nicolas Saenz Julienne <nsaenzju@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mmap_lock.c |  175 ++++++---------------------------------------------------
+ 1 file changed, 20 insertions(+), 155 deletions(-)
+
+--- a/mm/mmap_lock.c
++++ b/mm/mmap_lock.c
+@@ -19,14 +19,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_relea
+ #ifdef CONFIG_MEMCG
+-/*
+- * Our various events all share the same buffer (because we don't want or need
+- * to allocate a set of buffers *per event type*), so we need to protect against
+- * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
+- * been made.
+- */
+-static DEFINE_MUTEX(reg_lock);
+-static int reg_refcount; /* Protected by reg_lock. */
++static atomic_t reg_refcount;
+ /*
+  * Size of the buffer for memcg path names. Ignoring stack trace support,
+@@ -34,136 +27,22 @@ static int reg_refcount; /* Protected by
+  */
+ #define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
+-/*
+- * How many contexts our trace events might be called in: normal, softirq, irq,
+- * and NMI.
+- */
+-#define CONTEXT_COUNT 4
+-
+-struct memcg_path {
+-      local_lock_t lock;
+-      char __rcu *buf;
+-      local_t buf_idx;
+-};
+-static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
+-      .lock = INIT_LOCAL_LOCK(lock),
+-      .buf_idx = LOCAL_INIT(0),
+-};
+-
+-static char **tmp_bufs;
+-
+-/* Called with reg_lock held. */
+-static void free_memcg_path_bufs(void)
+-{
+-      struct memcg_path *memcg_path;
+-      int cpu;
+-      char **old = tmp_bufs;
+-
+-      for_each_possible_cpu(cpu) {
+-              memcg_path = per_cpu_ptr(&memcg_paths, cpu);
+-              *(old++) = rcu_dereference_protected(memcg_path->buf,
+-                      lockdep_is_held(&reg_lock));
+-              rcu_assign_pointer(memcg_path->buf, NULL);
+-      }
+-
+-      /* Wait for inflight memcg_path_buf users to finish. */
+-      synchronize_rcu();
+-
+-      old = tmp_bufs;
+-      for_each_possible_cpu(cpu) {
+-              kfree(*(old++));
+-      }
+-
+-      kfree(tmp_bufs);
+-      tmp_bufs = NULL;
+-}
+-
+ int trace_mmap_lock_reg(void)
+ {
+-      int cpu;
+-      char *new;
+-
+-      mutex_lock(&reg_lock);
+-
+-      /* If the refcount is going 0->1, proceed with allocating buffers. */
+-      if (reg_refcount++)
+-              goto out;
+-
+-      tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs),
+-                               GFP_KERNEL);
+-      if (tmp_bufs == NULL)
+-              goto out_fail;
+-
+-      for_each_possible_cpu(cpu) {
+-              new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
+-              if (new == NULL)
+-                      goto out_fail_free;
+-              rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
+-              /* Don't need to wait for inflights, they'd have gotten NULL. */
+-      }
+-
+-out:
+-      mutex_unlock(&reg_lock);
++      atomic_inc(&reg_refcount);
+       return 0;
+-
+-out_fail_free:
+-      free_memcg_path_bufs();
+-out_fail:
+-      /* Since we failed, undo the earlier ref increment. */
+-      --reg_refcount;
+-
+-      mutex_unlock(&reg_lock);
+-      return -ENOMEM;
+ }
+ void trace_mmap_lock_unreg(void)
+ {
+-      mutex_lock(&reg_lock);
+-
+-      /* If the refcount is going 1->0, proceed with freeing buffers. */
+-      if (--reg_refcount)
+-              goto out;
+-
+-      free_memcg_path_bufs();
+-
+-out:
+-      mutex_unlock(&reg_lock);
+-}
+-
+-static inline char *get_memcg_path_buf(void)
+-{
+-      struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
+-      char *buf;
+-      int idx;
+-
+-      rcu_read_lock();
+-      buf = rcu_dereference(memcg_path->buf);
+-      if (buf == NULL) {
+-              rcu_read_unlock();
+-              return NULL;
+-      }
+-      idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) -
+-            MEMCG_PATH_BUF_SIZE;
+-      return &buf[idx];
++      atomic_dec(&reg_refcount);
+ }
+-static inline void put_memcg_path_buf(void)
+-{
+-      local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
+-      rcu_read_unlock();
+-}
+-
+-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
+-      do {                                                                   \
+-              const char *memcg_path;                                        \
+-              local_lock(&memcg_paths.lock);                                 \
+-              memcg_path = get_mm_memcg_path(mm);                            \
+-              trace_mmap_lock_##type(mm,                                     \
+-                                     memcg_path != NULL ? memcg_path : "",   \
+-                                     ##__VA_ARGS__);                         \
+-              if (likely(memcg_path != NULL))                                \
+-                      put_memcg_path_buf();                                  \
+-              local_unlock(&memcg_paths.lock);                               \
++#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                    \
++      do {                                                    \
++              char buf[MEMCG_PATH_BUF_SIZE];                  \
++              get_mm_memcg_path(mm, buf, sizeof(buf));        \
++              trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
+       } while (0)
+ #else /* !CONFIG_MEMCG */
+@@ -185,37 +64,23 @@ void trace_mmap_lock_unreg(void)
+ #ifdef CONFIG_TRACING
+ #ifdef CONFIG_MEMCG
+ /*
+- * Write the given mm_struct's memcg path to a percpu buffer, and return a
+- * pointer to it. If the path cannot be determined, or no buffer was available
+- * (because the trace event is being unregistered), NULL is returned.
+- *
+- * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
+- * disabled by the caller before calling us, and re-enabled only after the
+- * caller is done with the pointer.
+- *
+- * The caller must call put_memcg_path_buf() once the buffer is no longer
+- * needed. This must be done while preemption is still disabled.
++ * Write the given mm_struct's memcg path to a buffer. If the path cannot be
++ * determined or the trace event is being unregistered, empty string is written.
+  */
+-static const char *get_mm_memcg_path(struct mm_struct *mm)
++static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
+ {
+-      char *buf = NULL;
+-      struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
++      struct mem_cgroup *memcg;
++      buf[0] = '\0';
++      /* No need to get path if no trace event is registered. */
++      if (!atomic_read(&reg_refcount))
++              return;
++      memcg = get_mem_cgroup_from_mm(mm);
+       if (memcg == NULL)
+-              goto out;
+-      if (unlikely(memcg->css.cgroup == NULL))
+-              goto out_put;
+-
+-      buf = get_memcg_path_buf();
+-      if (buf == NULL)
+-              goto out_put;
+-
+-      cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
+-
+-out_put:
++              return;
++      if (memcg->css.cgroup)
++              cgroup_path(memcg->css.cgroup, buf, buflen);
+       css_put(&memcg->css);
+-out:
+-      return buf;
+ }
+ #endif /* CONFIG_MEMCG */
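
After this conversion the tracing path needs no locking at all.
Hand-expanding the new macro for one of its call sites makes that visible
(the wrapper name is the existing start_locking helper in this file; the
expansion itself is illustrative):

/* Illustrative expansion of TRACE_MMAP_LOCK_EVENT(start_locking, mm, write). */
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	char buf[MEMCG_PATH_BUF_SIZE];		/* 256 bytes of kernel stack */

	/* Writes "" when no trace event is registered (reg_refcount == 0),
	 * otherwise the mm's memcg cgroup path. */
	get_mm_memcg_path(mm, buf, sizeof(buf));
	trace_mmap_lock_start_locking(mm, buf, write);
}

Because nothing is shared between contexts, the helper is now safe from
process, softirq, hardirq and NMI context alike, which is what the lockdep
reports quoted above were complaining about.
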
diff --git a/queue-6.10/selftests-landlock-add-cred_transfer-test.patch b/queue-6.10/selftests-landlock-add-cred_transfer-test.patch
new file mode 100644
index 0000000..1734da9
--- /dev/null
@@ -0,0 +1,124 @@
+From cc374782b6ca0fd634482391da977542443d3368 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= <mic@digikod.net>
+Date: Wed, 24 Jul 2024 16:54:26 +0200
+Subject: selftests/landlock: Add cred_transfer test
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Mickaël Salaün <mic@digikod.net>
+
+commit cc374782b6ca0fd634482391da977542443d3368 upstream.
+
+Check that keyctl(KEYCTL_SESSION_TO_PARENT) preserves the parent's
+restrictions.
+
+Fixes: e1199815b47b ("selftests/landlock: Add user space tests")
+Co-developed-by: Jann Horn <jannh@google.com>
+Signed-off-by: Jann Horn <jannh@google.com>
+Link: https://lore.kernel.org/r/20240724.Ood5aige9she@digikod.net
+Signed-off-by: Mickaël Salaün <mic@digikod.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/landlock/base_test.c |   74 +++++++++++++++++++++++++++
+ tools/testing/selftests/landlock/config      |    1 
+ 2 files changed, 75 insertions(+)
+
+--- a/tools/testing/selftests/landlock/base_test.c
++++ b/tools/testing/selftests/landlock/base_test.c
+@@ -9,6 +9,7 @@
+ #define _GNU_SOURCE
+ #include <errno.h>
+ #include <fcntl.h>
++#include <linux/keyctl.h>
+ #include <linux/landlock.h>
+ #include <string.h>
+ #include <sys/prctl.h>
+@@ -326,4 +327,77 @@ TEST(ruleset_fd_transfer)
+       ASSERT_EQ(EXIT_SUCCESS, WEXITSTATUS(status));
+ }
++TEST(cred_transfer)
++{
++      struct landlock_ruleset_attr ruleset_attr = {
++              .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR,
++      };
++      int ruleset_fd, dir_fd;
++      pid_t child;
++      int status;
++
++      drop_caps(_metadata);
++
++      dir_fd = open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
++      EXPECT_LE(0, dir_fd);
++      EXPECT_EQ(0, close(dir_fd));
++
++      /* Denies opening directories. */
++      ruleset_fd =
++              landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
++      ASSERT_LE(0, ruleset_fd);
++      EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
++      ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0));
++      EXPECT_EQ(0, close(ruleset_fd));
++
++      /* Checks ruleset enforcement. */
++      EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
++      EXPECT_EQ(EACCES, errno);
++
++      /* Needed for KEYCTL_SESSION_TO_PARENT permission checks */
++      EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL, 0,
++                            0, 0))
++      {
++              TH_LOG("Failed to join session keyring: %s", strerror(errno));
++      }
++
++      child = fork();
++      ASSERT_LE(0, child);
++      if (child == 0) {
++              /* Checks ruleset enforcement. */
++              EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
++              EXPECT_EQ(EACCES, errno);
++
++              /*
++               * KEYCTL_SESSION_TO_PARENT is a no-op unless we have a
++               * different session keyring in the child, so make that happen.
++               */
++              EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING,
++                                    NULL, 0, 0, 0));
++
++              /*
++               * KEYCTL_SESSION_TO_PARENT installs credentials on the parent
++               * that never go through the cred_prepare hook, this path uses
++               * cred_transfer instead.
++               */
++              EXPECT_EQ(0, syscall(__NR_keyctl, KEYCTL_SESSION_TO_PARENT, 0,
++                                   0, 0, 0));
++
++              /* Re-checks ruleset enforcement. */
++              EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
++              EXPECT_EQ(EACCES, errno);
++
++              _exit(_metadata->exit_code);
++              return;
++      }
++
++      EXPECT_EQ(child, waitpid(child, &status, 0));
++      EXPECT_EQ(1, WIFEXITED(status));
++      EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status));
++
++      /* Re-checks ruleset enforcement. */
++      EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
++      EXPECT_EQ(EACCES, errno);
++}
++
+ TEST_HARNESS_MAIN
+--- a/tools/testing/selftests/landlock/config
++++ b/tools/testing/selftests/landlock/config
+@@ -2,6 +2,7 @@ CONFIG_CGROUPS=y
+ CONFIG_CGROUP_SCHED=y
+ CONFIG_INET=y
+ CONFIG_IPV6=y
++CONFIG_KEYS=y
+ CONFIG_NET=y
+ CONFIG_NET_NS=y
+ CONFIG_OVERLAY_FS=y
diff --git a/queue-6.10/series b/queue-6.10/series
index 2bac512b7679ea0e73fb1428ae0f358df38f8d37..eec45fbbd3d8091b79efce6a2f4b0ab530c1fcca 100644
@@ -529,3 +529,15 @@ remoteproc-k3-r5-fix-ipc-only-mode-detection.patch
 mailbox-omap-fix-mailbox-interrupt-sharing.patch
 mailbox-imx-fix-txdb_v2-channel-race-condition.patch
 mailbox-mtk-cmdq-move-devm_mbox_controller_register-.patch
+selftests-landlock-add-cred_transfer-test.patch
+landlock-don-t-lose-track-of-restrictions-on-cred_transfer.patch
+mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch
+mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch
+hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch
+mm-hugetlb-fix-possible-recursive-locking-detected-warning.patch
+mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch
+mm-mmap_lock-replace-get_memcg_path_buf-with-on-stack-buffer.patch
+mm-mglru-fix-overshooting-shrinker-memory.patch
+mm-mglru-fix-ineffective-protection-calculation.patch
+x86-efistub-avoid-returning-efi_success-on-error.patch
+x86-efistub-revert-to-heap-allocated-boot_params-for-pe-entrypoint.patch
diff --git a/queue-6.10/x86-efistub-avoid-returning-efi_success-on-error.patch b/queue-6.10/x86-efistub-avoid-returning-efi_success-on-error.patch
new file mode 100644
index 0000000..a36d691
--- /dev/null
@@ -0,0 +1,40 @@
+From fb318ca0a522295edd6d796fb987e99ec41f0ee5 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Thu, 4 Jul 2024 10:59:23 +0200
+Subject: x86/efistub: Avoid returning EFI_SUCCESS on error
+
+From: Ard Biesheuvel <ardb@kernel.org>
+
+commit fb318ca0a522295edd6d796fb987e99ec41f0ee5 upstream.
+
+The fail label is only used in a situation where the previous EFI API
+call succeeded, and so status will be set to EFI_SUCCESS.  Fix this by
+dropping the goto entirely and calling efi_exit() with the correct error
+code.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/efi/libstub/x86-stub.c |    5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/drivers/firmware/efi/libstub/x86-stub.c
++++ b/drivers/firmware/efi/libstub/x86-stub.c
+@@ -501,16 +501,13 @@ efi_status_t __efiapi efi_pe_entry(efi_h
+       /* Convert unicode cmdline to ascii */
+       cmdline_ptr = efi_convert_cmdline(image, &options_size);
+       if (!cmdline_ptr)
+-              goto fail;
++              efi_exit(handle, EFI_OUT_OF_RESOURCES);
+       efi_set_u64_split((unsigned long)cmdline_ptr, &hdr->cmd_line_ptr,
+                         &boot_params.ext_cmd_line_ptr);
+       efi_stub_entry(handle, sys_table_arg, &boot_params);
+       /* not reached */
+-
+-fail:
+-      efi_exit(handle, status);
+ }
+ static void add_e820ext(struct boot_params *params,
diff --git a/queue-6.10/x86-efistub-revert-to-heap-allocated-boot_params-for-pe-entrypoint.patch b/queue-6.10/x86-efistub-revert-to-heap-allocated-boot_params-for-pe-entrypoint.patch
new file mode 100644
index 0000000..4b14184
--- /dev/null
@@ -0,0 +1,76 @@
+From ae835a96d72cd025421910edb0e8faf706998727 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 22 Mar 2024 18:11:32 +0100
+Subject: x86/efistub: Revert to heap allocated boot_params for PE entrypoint
+
+From: Ard Biesheuvel <ardb@kernel.org>
+
+commit ae835a96d72cd025421910edb0e8faf706998727 upstream.
+
+This is a partial revert of commit
+
+  8117961d98f ("x86/efi: Disregard setup header of loaded image")
+
+which triggers boot issues on older Dell laptops. As it turns out,
+switching back to a heap allocation for the struct boot_params
+constructed by the EFI stub works around this, even though it is unclear
+why.
+
+Cc: Christian Heusel <christian@heusel.eu>
+Reported-by: <mavrix#kernel@simplelogin.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/efi/libstub/x86-stub.c |   20 +++++++++++++++-----
+ 1 file changed, 15 insertions(+), 5 deletions(-)
+
+--- a/drivers/firmware/efi/libstub/x86-stub.c
++++ b/drivers/firmware/efi/libstub/x86-stub.c
+@@ -469,11 +469,12 @@ void __noreturn efi_stub_entry(efi_handl
+ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
+                                  efi_system_table_t *sys_table_arg)
+ {
+-      static struct boot_params boot_params __page_aligned_bss;
+-      struct setup_header *hdr = &boot_params.hdr;
+       efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
++      struct boot_params *boot_params;
++      struct setup_header *hdr;
+       int options_size = 0;
+       efi_status_t status;
++      unsigned long alloc;
+       char *cmdline_ptr;
+       if (efi_is_native())
+@@ -491,6 +492,13 @@ efi_status_t __efiapi efi_pe_entry(efi_h
+               efi_exit(handle, status);
+       }
++      status = efi_allocate_pages(PARAM_SIZE, &alloc, ULONG_MAX);
++      if (status != EFI_SUCCESS)
++              efi_exit(handle, status);
++
++      boot_params = memset((void *)alloc, 0x0, PARAM_SIZE);
++      hdr         = &boot_params->hdr;
++
+       /* Assign the setup_header fields that the kernel actually cares about */
+       hdr->root_flags = 1;
+       hdr->vid_mode   = 0xffff;
+@@ -500,13 +508,15 @@ efi_status_t __efiapi efi_pe_entry(efi_h
+       /* Convert unicode cmdline to ascii */
+       cmdline_ptr = efi_convert_cmdline(image, &options_size);
+-      if (!cmdline_ptr)
++      if (!cmdline_ptr) {
++              efi_free(PARAM_SIZE, alloc);
+               efi_exit(handle, EFI_OUT_OF_RESOURCES);
++      }
+       efi_set_u64_split((unsigned long)cmdline_ptr, &hdr->cmd_line_ptr,
+-                        &boot_params.ext_cmd_line_ptr);
++                        &boot_params->ext_cmd_line_ptr);
+-      efi_stub_entry(handle, sys_table_arg, &boot_params);
++      efi_stub_entry(handle, sys_table_arg, boot_params);
+       /* not reached */
+ }