From: Greg Kroah-Hartman Date: Mon, 29 Jul 2024 07:44:25 +0000 (+0200) Subject: 6.10-stable patches X-Git-Tag: v6.1.103~98 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=e726df225c02e30ea3c5289a6e9c10ca90f26630;p=thirdparty%2Fkernel%2Fstable-queue.git 6.10-stable patches added patches: hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch landlock-don-t-lose-track-of-restrictions-on-cred_transfer.patch mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch mm-hugetlb-fix-possible-recursive-locking-detected-warning.patch mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch mm-mglru-fix-ineffective-protection-calculation.patch mm-mglru-fix-overshooting-shrinker-memory.patch mm-mmap_lock-replace-get_memcg_path_buf-with-on-stack-buffer.patch selftests-landlock-add-cred_transfer-test.patch x86-efistub-avoid-returning-efi_success-on-error.patch x86-efistub-revert-to-heap-allocated-boot_params-for-pe-entrypoint.patch --- diff --git a/queue-6.10/hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch b/queue-6.10/hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch new file mode 100644 index 00000000000..008f3d15d38 --- /dev/null +++ b/queue-6.10/hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch @@ -0,0 +1,131 @@ +From 003af997c8a945493859dd1a2d015cc9387ff27a Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski +Date: Fri, 21 Jun 2024 15:00:50 -0400 +Subject: hugetlb: force allocating surplus hugepages on mempolicy allowed nodes + +From: Aristeu Rozanski + +commit 003af997c8a945493859dd1a2d015cc9387ff27a upstream. + +When trying to allocate a hugepage with no reserved ones free, it may be +allowed in case a number of overcommit hugepages was configured (using +/proc/sys/vm/nr_overcommit_hugepages) and that number wasn't reached. +This allows for a behavior of having extra hugepages allocated +dynamically, if there're resources for it. Some sysadmins even prefer not +reserving any hugepages and setting a big number of overcommit hugepages. + +But while attempting to allocate overcommit hugepages in a multi node +system (either NUMA or mempolicy/cpuset) said allocations might randomly +fail even when there're resources available for the allocation. + +This happens due to allowed_mems_nr() only accounting for the number of +free hugepages in the nodes the current process belongs to and the surplus +hugepage allocation is done so it can be allocated in any node. In case +one or more of the requested surplus hugepages are allocated in a +different node, the whole allocation will fail due allowed_mems_nr() +returning a lower value. + +So allocate surplus hugepages in one of the nodes the current process +belongs to. + +Easy way to reproduce this issue is to use a 2+ NUMA nodes system: + + # echo 0 >/proc/sys/vm/nr_hugepages + # echo 1 >/proc/sys/vm/nr_overcommit_hugepages + # numactl -m0 ./tools/testing/selftests/mm/map_hugetlb 2 + +Repeating the execution of map_hugetlb test application will eventually +fail when the hugepage ends up allocated in a different node. 
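+
+A minimal stand-in for the map_hugetlb selftest, in case the selftests
+are not built (a sketch only, assuming a 2MB default huge page size;
+the file name is arbitrary): map one anonymous huge page and fault it
+in while bound to a single node, as in the numactl invocation above.
+
+/* gcc -o hugetlb-repro hugetlb-repro.c
+ * numactl -m0 ./hugetlb-repro        (with the sysctls set as above)
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#define LEN (2UL * 1024 * 1024)		/* assumes 2MB default huge pages */
+
+int main(void)
+{
+	void *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
+
+	if (p == MAP_FAILED) {
+		/* Before the fix, this intermittently fails with ENOMEM when
+		 * the surplus page lands on a node outside the mempolicy and
+		 * allowed_mems_nr() rejects the reservation.
+		 */
+		perror("mmap(MAP_HUGETLB)");
+		return EXIT_FAILURE;
+	}
+	memset(p, 0, LEN);	/* actually touch the huge page */
+	munmap(p, LEN);
+	puts("huge page mapped");
+	return EXIT_SUCCESS;
+}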
+ +[aris@ruivo.org: v2] + Link: https://lkml.kernel.org/r/20240701212343.GG844599@cathedrallabs.org +Link: https://lkml.kernel.org/r/20240621190050.mhxwb65zn37doegp@redhat.com +Signed-off-by: Aristeu Rozanski +Cc: Muchun Song +Cc: Aristeu Rozanski +Cc: David Hildenbrand +Cc: Vishal Moola +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/hugetlb.c | 47 ++++++++++++++++++++++++++++------------------- + 1 file changed, 28 insertions(+), 19 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -2586,6 +2586,23 @@ struct folio *alloc_hugetlb_folio_nodema + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); + } + ++static nodemask_t *policy_mbind_nodemask(gfp_t gfp) ++{ ++#ifdef CONFIG_NUMA ++ struct mempolicy *mpol = get_task_policy(current); ++ ++ /* ++ * Only enforce MPOL_BIND policy which overlaps with cpuset policy ++ * (from policy_nodemask) specifically for hugetlb case ++ */ ++ if (mpol->mode == MPOL_BIND && ++ (apply_policy_zone(mpol, gfp_zone(gfp)) && ++ cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) ++ return &mpol->nodes; ++#endif ++ return NULL; ++} ++ + /* + * Increase the hugetlb pool such that it can accommodate a reservation + * of size 'delta'. +@@ -2599,6 +2616,8 @@ static int gather_surplus_pages(struct h + long i; + long needed, allocated; + bool alloc_ok = true; ++ int node; ++ nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); + + lockdep_assert_held(&hugetlb_lock); + needed = (h->resv_huge_pages + delta) - h->free_huge_pages; +@@ -2613,8 +2632,15 @@ static int gather_surplus_pages(struct h + retry: + spin_unlock_irq(&hugetlb_lock); + for (i = 0; i < needed; i++) { +- folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), +- NUMA_NO_NODE, NULL); ++ folio = NULL; ++ for_each_node_mask(node, cpuset_current_mems_allowed) { ++ if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) { ++ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), ++ node, NULL); ++ if (folio) ++ break; ++ } ++ } + if (!folio) { + alloc_ok = false; + break; +@@ -4840,23 +4866,6 @@ static int __init default_hugepagesz_set + } + __setup("default_hugepagesz=", default_hugepagesz_setup); + +-static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +-{ +-#ifdef CONFIG_NUMA +- struct mempolicy *mpol = get_task_policy(current); +- +- /* +- * Only enforce MPOL_BIND policy which overlaps with cpuset policy +- * (from policy_nodemask) specifically for hugetlb case +- */ +- if (mpol->mode == MPOL_BIND && +- (apply_policy_zone(mpol, gfp_zone(gfp)) && +- cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) +- return &mpol->nodes; +-#endif +- return NULL; +-} +- + static unsigned int allowed_mems_nr(struct hstate *h) + { + int node; diff --git a/queue-6.10/landlock-don-t-lose-track-of-restrictions-on-cred_transfer.patch b/queue-6.10/landlock-don-t-lose-track-of-restrictions-on-cred_transfer.patch new file mode 100644 index 00000000000..76236deda78 --- /dev/null +++ b/queue-6.10/landlock-don-t-lose-track-of-restrictions-on-cred_transfer.patch @@ -0,0 +1,72 @@ +From 39705a6c29f8a2b93cf5b99528a55366c50014d1 Mon Sep 17 00:00:00 2001 +From: Jann Horn +Date: Wed, 24 Jul 2024 14:49:01 +0200 +Subject: landlock: Don't lose track of restrictions on cred_transfer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Jann Horn + +commit 39705a6c29f8a2b93cf5b99528a55366c50014d1 upstream. 
+ +When a process' cred struct is replaced, this _almost_ always invokes +the cred_prepare LSM hook; but in one special case (when +KEYCTL_SESSION_TO_PARENT updates the parent's credentials), the +cred_transfer LSM hook is used instead. Landlock only implements the +cred_prepare hook, not cred_transfer, so KEYCTL_SESSION_TO_PARENT causes +all information on Landlock restrictions to be lost. + +This basically means that a process with the ability to use the fork() +and keyctl() syscalls can get rid of all Landlock restrictions on +itself. + +Fix it by adding a cred_transfer hook that does the same thing as the +existing cred_prepare hook. (Implemented by having hook_cred_prepare() +call hook_cred_transfer() so that the two functions are less likely to +accidentally diverge in the future.) + +Cc: stable@kernel.org +Fixes: 385975dca53e ("landlock: Set up the security framework and manage credentials") +Signed-off-by: Jann Horn +Link: https://lore.kernel.org/r/20240724-landlock-houdini-fix-v1-1-df89a4560ca3@google.com +Signed-off-by: Mickaël Salaün +Signed-off-by: Greg Kroah-Hartman +--- + security/landlock/cred.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +--- a/security/landlock/cred.c ++++ b/security/landlock/cred.c +@@ -14,8 +14,8 @@ + #include "ruleset.h" + #include "setup.h" + +-static int hook_cred_prepare(struct cred *const new, +- const struct cred *const old, const gfp_t gfp) ++static void hook_cred_transfer(struct cred *const new, ++ const struct cred *const old) + { + struct landlock_ruleset *const old_dom = landlock_cred(old)->domain; + +@@ -23,6 +23,12 @@ static int hook_cred_prepare(struct cred + landlock_get_ruleset(old_dom); + landlock_cred(new)->domain = old_dom; + } ++} ++ ++static int hook_cred_prepare(struct cred *const new, ++ const struct cred *const old, const gfp_t gfp) ++{ ++ hook_cred_transfer(new, old); + return 0; + } + +@@ -36,6 +42,7 @@ static void hook_cred_free(struct cred * + + static struct security_hook_list landlock_hooks[] __ro_after_init = { + LSM_HOOK_INIT(cred_prepare, hook_cred_prepare), ++ LSM_HOOK_INIT(cred_transfer, hook_cred_transfer), + LSM_HOOK_INIT(cred_free, hook_cred_free), + }; + diff --git a/queue-6.10/mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch b/queue-6.10/mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch new file mode 100644 index 00000000000..c262d9a66d8 --- /dev/null +++ b/queue-6.10/mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch @@ -0,0 +1,197 @@ +From d659b715e94ac039803d7601505d3473393fc0be Mon Sep 17 00:00:00 2001 +From: Gavin Shan +Date: Mon, 15 Jul 2024 10:04:23 +1000 +Subject: mm/huge_memory: avoid PMD-size page cache if needed + +From: Gavin Shan + +commit d659b715e94ac039803d7601505d3473393fc0be upstream. + +xarray can't support arbitrary page cache size. the largest and supported +page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71 +("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray"). However, +it's possible to have 512MB page cache in the huge memory's collapsing +path on ARM64 system whose base page size is 64KB. 512MB page cache is +breaking the limitation and a warning is raised when the xarray entry is +split as shown in the following example. 
+ +[root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize +KernelPageSize: 64 kB +[root@dhcp-10-26-1-207 ~]# cat /tmp/test.c + : +int main(int argc, char **argv) +{ + const char *filename = TEST_XFS_FILENAME; + int fd = 0; + void *buf = (void *)-1, *p; + int pgsize = getpagesize(); + int ret = 0; + + if (pgsize != 0x10000) { + fprintf(stdout, "System with 64KB base page size is required!\n"); + return -EPERM; + } + + system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb"); + system("echo 1 > /proc/sys/vm/drop_caches"); + + /* Open the xfs file */ + fd = open(filename, O_RDONLY); + assert(fd > 0); + + /* Create VMA */ + buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0); + assert(buf != (void *)-1); + fprintf(stdout, "mapped buffer at 0x%p\n", buf); + + /* Populate VMA */ + ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE); + assert(ret == 0); + ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ); + assert(ret == 0); + + /* Collapse VMA */ + ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE); + assert(ret == 0); + ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE); + if (ret) { + fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno); + goto out; + } + + /* Split xarray entry. Write permission is needed */ + munmap(buf, TEST_MEM_SIZE); + buf = (void *)-1; + close(fd); + fd = open(filename, O_RDWR); + assert(fd > 0); + fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + TEST_MEM_SIZE - pgsize, pgsize); +out: + if (buf != (void *)-1) + munmap(buf, TEST_MEM_SIZE); + if (fd > 0) + close(fd); + + return ret; +} + +[root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test +[root@dhcp-10-26-1-207 ~]# /tmp/test + ------------[ cut here ]------------ + WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128 + Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \ + nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \ + nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \ + ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse \ + xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net \ + sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio + CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9 + Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024 + pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) + pc : xas_split_alloc+0xf8/0x128 + lr : split_huge_page_to_list_to_order+0x1c4/0x780 + sp : ffff8000ac32f660 + x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0 + x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d + x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000 + x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000 + x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000 + x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 + x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c + x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8 + x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40 + x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000 + Call trace: + xas_split_alloc+0xf8/0x128 + split_huge_page_to_list_to_order+0x1c4/0x780 + truncate_inode_partial_folio+0xdc/0x160 + truncate_inode_pages_range+0x1b4/0x4a8 + truncate_pagecache_range+0x84/0xa0 + xfs_flush_unmap_range+0x70/0x90 [xfs] + xfs_file_fallocate+0xfc/0x4d8 [xfs] + vfs_fallocate+0x124/0x2f0 + 
ksys_fallocate+0x4c/0xa0 + __arm64_sys_fallocate+0x24/0x38 + invoke_syscall.constprop.0+0x7c/0xd8 + do_el0_svc+0xb4/0xd0 + el0_svc+0x44/0x1d8 + el0t_64_sync_handler+0x134/0x150 + el0t_64_sync+0x17c/0x180 + +Fix it by correcting the supported page cache orders, different sets for +DAX and other files. With it corrected, 512MB page cache becomes +disallowed on all non-DAX files on ARM64 system where the base page size +is 64KB. After this patch is applied, the test program fails with error +-EINVAL returned from __thp_vma_allowable_orders() and the madvise() +system call to collapse the page caches. + +Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com +Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") +Signed-off-by: Gavin Shan +Acked-by: David Hildenbrand +Reviewed-by: Ryan Roberts +Acked-by: Zi Yan +Cc: Baolin Wang +Cc: Barry Song +Cc: Don Dutile +Cc: Matthew Wilcox (Oracle) +Cc: Peter Xu +Cc: Ryan Roberts +Cc: William Kucharski +Cc: [5.17+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/huge_mm.h | 12 +++++++++--- + mm/huge_memory.c | 12 ++++++++++-- + 2 files changed, 19 insertions(+), 5 deletions(-) + +--- a/include/linux/huge_mm.h ++++ b/include/linux/huge_mm.h +@@ -72,14 +72,20 @@ extern struct kobj_attribute shmem_enabl + #define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1))) + + /* +- * Mask of all large folio orders supported for file THP. ++ * Mask of all large folio orders supported for file THP. Folios in a DAX ++ * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to ++ * it. + */ +-#define THP_ORDERS_ALL_FILE (BIT(PMD_ORDER) | BIT(PUD_ORDER)) ++#define THP_ORDERS_ALL_FILE_DAX \ ++ (BIT(PMD_ORDER) | BIT(PUD_ORDER)) ++#define THP_ORDERS_ALL_FILE_DEFAULT \ ++ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) + + /* + * Mask of all large folio orders supported for THP. + */ +-#define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE) ++#define THP_ORDERS_ALL \ ++ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT) + + #define TVA_SMAPS (1 << 0) /* Will be used for procfs */ + #define TVA_IN_PF (1 << 1) /* Page fault handler */ +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders + bool smaps = tva_flags & TVA_SMAPS; + bool in_pf = tva_flags & TVA_IN_PF; + bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; ++ unsigned long supported_orders; ++ + /* Check the intersection of requested and supported orders. */ +- orders &= vma_is_anonymous(vma) ? 
+- THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; ++ if (vma_is_anonymous(vma)) ++ supported_orders = THP_ORDERS_ALL_ANON; ++ else if (vma_is_dax(vma)) ++ supported_orders = THP_ORDERS_ALL_FILE_DAX; ++ else ++ supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; ++ ++ orders &= supported_orders; + if (!orders) + return 0; + diff --git a/queue-6.10/mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch b/queue-6.10/mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch new file mode 100644 index 00000000000..b942f305450 --- /dev/null +++ b/queue-6.10/mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch @@ -0,0 +1,48 @@ +From d9592025000b3cf26c742f3505da7b83aedc26d5 Mon Sep 17 00:00:00 2001 +From: Yang Shi +Date: Fri, 12 Jul 2024 08:58:55 -0700 +Subject: mm: huge_memory: use !CONFIG_64BIT to relax huge page alignment on 32 bit machines + +From: Yang Shi + +commit d9592025000b3cf26c742f3505da7b83aedc26d5 upstream. + +Yves-Alexis Perez reported commit 4ef9ad19e176 ("mm: huge_memory: don't +force huge page alignment on 32 bit") didn't work for x86_32 [1]. It is +because x86_32 uses CONFIG_X86_32 instead of CONFIG_32BIT. + +!CONFIG_64BIT should cover all 32 bit machines. + +[1] https://lore.kernel.org/linux-mm/CAHbLzkr1LwH3pcTgM+aGQ31ip2bKqiqEQ8=FQB+t2c3dhNKNHA@mail.gmail.com/ + +Link: https://lkml.kernel.org/r/20240712155855.1130330-1-yang@os.amperecomputing.com +Fixes: 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit") +Signed-off-by: Yang Shi +Reported-by: Yves-Alexis Perez +Tested-by: Yves-Alexis Perez +Acked-by: David Hildenbrand +Cc: Ben Hutchings +Cc: Christoph Lameter +Cc: Jiri Slaby +Cc: Matthew Wilcox (Oracle) +Cc: Rik van Riel +Cc: Salvatore Bonaccorso +Cc: Suren Baghdasaryan +Cc: [6.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/huge_memory.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -857,7 +857,7 @@ static unsigned long __thp_get_unmapped_ + loff_t off_align = round_up(off, size); + unsigned long len_pad, ret, off_sub; + +- if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall()) ++ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) + return 0; + + if (off_end <= off_align || (off_end - off_align) < size) diff --git a/queue-6.10/mm-hugetlb-fix-possible-recursive-locking-detected-warning.patch b/queue-6.10/mm-hugetlb-fix-possible-recursive-locking-detected-warning.patch new file mode 100644 index 00000000000..48fc6b623fc --- /dev/null +++ b/queue-6.10/mm-hugetlb-fix-possible-recursive-locking-detected-warning.patch @@ -0,0 +1,100 @@ +From 667574e873b5f77a220b2a93329689f36fb56d5d Mon Sep 17 00:00:00 2001 +From: Miaohe Lin +Date: Fri, 12 Jul 2024 11:13:14 +0800 +Subject: mm/hugetlb: fix possible recursive locking detected warning + +From: Miaohe Lin + +commit 667574e873b5f77a220b2a93329689f36fb56d5d upstream. 
+ +When tries to demote 1G hugetlb folios, a lockdep warning is observed: + +============================================ +WARNING: possible recursive locking detected +6.10.0-rc6-00452-ga4d0275fa660-dirty #79 Not tainted +-------------------------------------------- +bash/710 is trying to acquire lock: +ffffffff8f0a7850 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0x244/0x460 + +but task is already holding lock: +ffffffff8f0a6f48 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0xae/0x460 + +other info that might help us debug this: + Possible unsafe locking scenario: + + CPU0 + ---- + lock(&h->resize_lock); + lock(&h->resize_lock); + + *** DEADLOCK *** + + May be due to missing lock nesting notation + +4 locks held by bash/710: + #0: ffff8f118439c3f0 (sb_writers#5){.+.+}-{0:0}, at: ksys_write+0x64/0xe0 + #1: ffff8f11893b9e88 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0xf8/0x1d0 + #2: ffff8f1183dc4428 (kn->active#98){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1d0 + #3: ffffffff8f0a6f48 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0xae/0x460 + +stack backtrace: +CPU: 3 PID: 710 Comm: bash Not tainted 6.10.0-rc6-00452-ga4d0275fa660-dirty #79 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 +Call Trace: + + dump_stack_lvl+0x68/0xa0 + __lock_acquire+0x10f2/0x1ca0 + lock_acquire+0xbe/0x2d0 + __mutex_lock+0x6d/0x400 + demote_store+0x244/0x460 + kernfs_fop_write_iter+0x12c/0x1d0 + vfs_write+0x380/0x540 + ksys_write+0x64/0xe0 + do_syscall_64+0xb9/0x1d0 + entry_SYSCALL_64_after_hwframe+0x77/0x7f +RIP: 0033:0x7fa61db14887 +RSP: 002b:00007ffc56c48358 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 +RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fa61db14887 +RDX: 0000000000000002 RSI: 000055a030050220 RDI: 0000000000000001 +RBP: 000055a030050220 R08: 00007fa61dbd1460 R09: 000000007fffffff +R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 +R13: 00007fa61dc1b780 R14: 00007fa61dc17600 R15: 00007fa61dc16a00 + + +Lockdep considers this an AA deadlock because the different resize_lock +mutexes reside in the same lockdep class, but this is a false positive. +Place them in distinct classes to avoid these warnings. 
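+
+For context, mutex_init() hides a single static lock_class_key per call
+site, so every hstate initialised from hugetlb_add_hstate() lands in the
+same lockdep class; __mutex_init() accepts a caller-supplied key
+instead. A rough sketch of the pattern (hypothetical struct name, kernel
+context only, mirroring the hunk below):
+
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+
+struct demo_state {			/* stand-in for struct hstate */
+	struct mutex resize_lock;
+	/*
+	 * One key per instance puts each mutex in its own lockdep class,
+	 * so nesting two different instances is no longer flagged as an
+	 * AA deadlock. Keys must live in static storage (hstates[] is a
+	 * static array) unless registered with lockdep_register_key().
+	 */
+	struct lock_class_key resize_key;
+};
+
+static void demo_state_init(struct demo_state *s)
+{
+	__mutex_init(&s->resize_lock, "resize mutex", &s->resize_key);
+}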
+ +Link: https://lkml.kernel.org/r/20240712031314.2570452-1-linmiaohe@huawei.com +Fixes: 8531fc6f52f5 ("hugetlb: add hugetlb demote page support") +Signed-off-by: Miaohe Lin +Acked-by: Muchun Song +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/hugetlb.h | 1 + + mm/hugetlb.c | 2 +- + 2 files changed, 2 insertions(+), 1 deletion(-) + +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -681,6 +681,7 @@ HPAGEFLAG(RawHwpUnreliable, raw_hwp_unre + /* Defines one hugetlb page size */ + struct hstate { + struct mutex resize_lock; ++ struct lock_class_key resize_key; + int next_nid_to_alloc; + int next_nid_to_free; + unsigned int order; +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -4643,7 +4643,7 @@ void __init hugetlb_add_hstate(unsigned + BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); + h = &hstates[hugetlb_max_hstate++]; +- mutex_init(&h->resize_lock); ++ __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); + h->order = order; + h->mask = ~(huge_page_size(h) - 1); + for (i = 0; i < MAX_NUMNODES; ++i) diff --git a/queue-6.10/mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch b/queue-6.10/mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch new file mode 100644 index 00000000000..4208c3512c4 --- /dev/null +++ b/queue-6.10/mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch @@ -0,0 +1,51 @@ +From 8b671fe1a879923ecfb72dda6caf01460dd885ef Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Thu, 11 Jul 2024 13:19:56 -0600 +Subject: mm/mglru: fix div-by-zero in vmpressure_calc_level() + +From: Yu Zhao + +commit 8b671fe1a879923ecfb72dda6caf01460dd885ef upstream. + +evict_folios() uses a second pass to reclaim folios that have gone through +page writeback and become clean before it finishes the first pass, since +folio_rotate_reclaimable() cannot handle those folios due to the +isolation. + +The second pass tries to avoid potential double counting by deducting +scan_control->nr_scanned. However, this can result in underflow of +nr_scanned, under a condition where shrink_folio_list() does not increment +nr_scanned, i.e., when folio_trylock() fails. + +The underflow can cause the divisor, i.e., scale=scanned+reclaimed in +vmpressure_calc_level(), to become zero, resulting in the following crash: + + [exception RIP: vmpressure_work_fn+101] + process_one_work at ffffffffa3313f2b + +Since scan_control->nr_scanned has no established semantics, the potential +double counting has minimal risks. Therefore, fix the problem by not +deducting scan_control->nr_scanned in evict_folios(). 
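+
+The arithmetic behind the crash is plain unsigned wraparound; a
+simplified userspace sketch (illustrative numbers, not the real
+accounting) shows how a "negative" scanned delta can make
+scanned + reclaimed land exactly on zero, the value that
+vmpressure_calc_level() later divides by:
+
+#include <stdio.h>
+
+int main(void)
+{
+	unsigned long scanned = 0, reclaimed = 1;
+	unsigned long scale;
+
+	/* The second pass deducts a folio that shrink_folio_list() never
+	 * counted (folio_trylock() failed), so the delta goes "negative".
+	 */
+	scanned -= 1;				/* wraps to ULONG_MAX */
+	scale = scanned + reclaimed;		/* wraps again, to 0 */
+
+	printf("scanned=%lu scale=%lu\n", scanned, scale);
+	if (!scale)
+		printf("vmpressure_calc_level() would divide by zero\n");
+	return 0;
+}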
+ +Link: https://lkml.kernel.org/r/20240711191957.939105-1-yuzhao@google.com +Fixes: 359a5e1416ca ("mm: multi-gen LRU: retry folios written back while isolated") +Reported-by: Wei Xu +Signed-off-by: Yu Zhao +Cc: Alexander Motin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/vmscan.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4582,7 +4582,6 @@ retry: + + /* retry folios that may have missed folio_rotate_reclaimable() */ + list_move(&folio->lru, &clean); +- sc->nr_scanned -= folio_nr_pages(folio); + } + + spin_lock_irq(&lruvec->lru_lock); diff --git a/queue-6.10/mm-mglru-fix-ineffective-protection-calculation.patch b/queue-6.10/mm-mglru-fix-ineffective-protection-calculation.patch new file mode 100644 index 00000000000..17f85f75506 --- /dev/null +++ b/queue-6.10/mm-mglru-fix-ineffective-protection-calculation.patch @@ -0,0 +1,183 @@ +From 30d77b7eef019fa4422980806e8b7cdc8674493e Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Fri, 12 Jul 2024 17:29:56 -0600 +Subject: mm/mglru: fix ineffective protection calculation + +From: Yu Zhao + +commit 30d77b7eef019fa4422980806e8b7cdc8674493e upstream. + +mem_cgroup_calculate_protection() is not stateless and should only be used +as part of a top-down tree traversal. shrink_one() traverses the per-node +memcg LRU instead of the root_mem_cgroup tree, and therefore it should not +call mem_cgroup_calculate_protection(). + +The existing misuse in shrink_one() can cause ineffective protection of +sub-trees that are grandchildren of root_mem_cgroup. Fix it by reusing +lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to +calculate the protection. + +Previously lru_gen_age_node() opportunistically skips the first pass, +i.e., when scan_control->priority is DEF_PRIORITY. On the second pass, +lruvec_is_sizable() uses appropriate scan_control->priority, set by +set_initial_priority() from lru_gen_shrink_node(), to decide whether a +memcg is too small to reclaim from. + +Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup tree. +So it should call set_initial_priority() upfront, to make sure +lruvec_is_sizable() uses appropriate scan_control->priority on the first +pass. Otherwise, lruvec_is_reclaimable() can return false negatives and +result in premature OOM kills when min_ttl_ms is used. + +Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@google.com +Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists") +Signed-off-by: Yu Zhao +Reported-by: T.J. Mercier +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/vmscan.c | 82 +++++++++++++++++++++++++++--------------------------------- + 1 file changed, 38 insertions(+), 44 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3900,6 +3900,32 @@ done: + * working set protection + ******************************************************************************/ + ++static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ int priority; ++ unsigned long reclaimable; ++ ++ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) ++ return; ++ /* ++ * Determine the initial priority based on ++ * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, ++ * where reclaimed_to_scanned_ratio = inactive / total. 
++ */ ++ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); ++ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) ++ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); ++ ++ /* round down reclaimable and round up sc->nr_to_reclaim */ ++ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); ++ ++ /* ++ * The estimation is based on LRU pages only, so cap it to prevent ++ * overshoots of shrinker objects by large margins. ++ */ ++ sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); ++} ++ + static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) + { + int gen, type, zone; +@@ -3933,19 +3959,17 @@ static bool lruvec_is_reclaimable(struct + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MIN_SEQ(lruvec); + +- /* see the comment on lru_gen_folio */ +- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); +- birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); +- +- if (time_is_after_jiffies(birth + min_ttl)) ++ if (mem_cgroup_below_min(NULL, memcg)) + return false; + + if (!lruvec_is_sizable(lruvec, sc)) + return false; + +- mem_cgroup_calculate_protection(NULL, memcg); ++ /* see the comment on lru_gen_folio */ ++ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); ++ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + +- return !mem_cgroup_below_min(NULL, memcg); ++ return time_is_before_jiffies(birth + min_ttl); + } + + /* to protect the working set of the last N jiffies */ +@@ -3955,23 +3979,20 @@ static void lru_gen_age_node(struct pgli + { + struct mem_cgroup *memcg; + unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); ++ bool reclaimable = !min_ttl; + + VM_WARN_ON_ONCE(!current_is_kswapd()); + +- /* check the order to exclude compaction-induced reclaim */ +- if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) +- return; ++ set_initial_priority(pgdat, sc); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + +- if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { +- mem_cgroup_iter_break(NULL, memcg); +- return; +- } ++ mem_cgroup_calculate_protection(NULL, memcg); + +- cond_resched(); ++ if (!reclaimable) ++ reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + /* +@@ -3979,7 +4000,7 @@ static void lru_gen_age_node(struct pgli + * younger than min_ttl. However, another possibility is all memcgs are + * either too small or below min. + */ +- if (mutex_trylock(&oom_lock)) { ++ if (!reclaimable && mutex_trylock(&oom_lock)) { + struct oom_control oc = { + .gfp_mask = sc->gfp_mask, + }; +@@ -4771,8 +4792,7 @@ static int shrink_one(struct lruvec *lru + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + +- mem_cgroup_calculate_protection(NULL, memcg); +- ++ /* lru_gen_age_node() called mem_cgroup_calculate_protection() */ + if (mem_cgroup_below_min(NULL, memcg)) + return MEMCG_LRU_YOUNG; + +@@ -4896,32 +4916,6 @@ static void lru_gen_shrink_lruvec(struct + blk_finish_plug(&plug); + } + +-static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) +-{ +- int priority; +- unsigned long reclaimable; +- +- if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) +- return; +- /* +- * Determine the initial priority based on +- * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, +- * where reclaimed_to_scanned_ratio = inactive / total. 
+- */ +- reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); +- if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) +- reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); +- +- /* round down reclaimable and round up sc->nr_to_reclaim */ +- priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); +- +- /* +- * The estimation is based on LRU pages only, so cap it to prevent +- * overshoots of shrinker objects by large margins. +- */ +- sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); +-} +- + static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) + { + struct blk_plug plug; diff --git a/queue-6.10/mm-mglru-fix-overshooting-shrinker-memory.patch b/queue-6.10/mm-mglru-fix-overshooting-shrinker-memory.patch new file mode 100644 index 00000000000..a6f7b0e24bc --- /dev/null +++ b/queue-6.10/mm-mglru-fix-overshooting-shrinker-memory.patch @@ -0,0 +1,89 @@ +From 3f74e6bd3b84a8b6bb3cc51609c89e5b9d58eed7 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Thu, 11 Jul 2024 13:19:57 -0600 +Subject: mm/mglru: fix overshooting shrinker memory + +From: Yu Zhao + +commit 3f74e6bd3b84a8b6bb3cc51609c89e5b9d58eed7 upstream. + +set_initial_priority() tries to jump-start global reclaim by estimating +the priority based on cold/hot LRU pages. The estimation does not account +for shrinker objects, and it cannot do so because their sizes can be in +different units other than page. + +If shrinker objects are the majority, e.g., on TrueNAS SCALE 24.04.0 where +ZFS ARC can use almost all system memory, set_initial_priority() can +vastly underestimate how much memory ARC shrinker can evict and assign +extreme low values to scan_control->priority, resulting in overshoots of +shrinker objects. + +To reproduce the problem, using TrueNAS SCALE 24.04.0 with 32GB DRAM, a +test ZFS pool and the following commands: + + fio --name=mglru.file --numjobs=36 --ioengine=io_uring \ + --directory=/root/test-zfs-pool/ --size=1024m --buffered=1 \ + --rw=randread --random_distribution=random \ + --time_based --runtime=1h & + + for ((i = 0; i < 20; i++)) + do + sleep 120 + fio --name=mglru.anon --numjobs=16 --ioengine=mmap \ + --filename=/dev/zero --size=1024m --fadvise_hint=0 \ + --rw=randrw --random_distribution=random \ + --time_based --runtime=1m + done + +To fix the problem: +1. Cap scan_control->priority at or above DEF_PRIORITY/2, to prevent + the jump-start from being overly aggressive. +2. Account for the progress from mm_account_reclaimed_pages(), to + prevent kswapd_shrink_node() from raising the priority + unnecessarily. + +Link: https://lkml.kernel.org/r/20240711191957.939105-2-yuzhao@google.com +Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists") +Signed-off-by: Yu Zhao +Reported-by: Alexander Motin +Cc: Wei Xu +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/vmscan.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4915,7 +4915,11 @@ static void set_initial_priority(struct + /* round down reclaimable and round up sc->nr_to_reclaim */ + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); + +- sc->priority = clamp(priority, 0, DEF_PRIORITY); ++ /* ++ * The estimation is based on LRU pages only, so cap it to prevent ++ * overshoots of shrinker objects by large margins. 
++ */ ++ sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); + } + + static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +@@ -6701,6 +6705,7 @@ static bool kswapd_shrink_node(pg_data_t + { + struct zone *zone; + int z; ++ unsigned long nr_reclaimed = sc->nr_reclaimed; + + /* Reclaim a number of pages proportional to the number of zones */ + sc->nr_to_reclaim = 0; +@@ -6728,7 +6733,8 @@ static bool kswapd_shrink_node(pg_data_t + if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) + sc->order = 0; + +- return sc->nr_scanned >= sc->nr_to_reclaim; ++ /* account for progress from mm_account_reclaimed_pages() */ ++ return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim; + } + + /* Page allocator PCP high watermark is lowered if reclaim is active. */ diff --git a/queue-6.10/mm-mmap_lock-replace-get_memcg_path_buf-with-on-stack-buffer.patch b/queue-6.10/mm-mmap_lock-replace-get_memcg_path_buf-with-on-stack-buffer.patch new file mode 100644 index 00000000000..8b9335a1864 --- /dev/null +++ b/queue-6.10/mm-mmap_lock-replace-get_memcg_path_buf-with-on-stack-buffer.patch @@ -0,0 +1,262 @@ +From 7d6be67cfdd4a53cea7147313ca13c531e3a470f Mon Sep 17 00:00:00 2001 +From: Tetsuo Handa +Date: Fri, 21 Jun 2024 10:08:41 +0900 +Subject: mm: mmap_lock: replace get_memcg_path_buf() with on-stack buffer + +From: Tetsuo Handa + +commit 7d6be67cfdd4a53cea7147313ca13c531e3a470f upstream. + +Commit 2b5067a8143e ("mm: mmap_lock: add tracepoints around lock +acquisition") introduced TRACE_MMAP_LOCK_EVENT() macro using +preempt_disable() in order to let get_mm_memcg_path() return a percpu +buffer exclusively used by normal, softirq, irq and NMI contexts +respectively. + +Commit 832b50725373 ("mm: mmap_lock: use local locks instead of disabling +preemption") replaced preempt_disable() with local_lock(&memcg_paths.lock) +based on an argument that preempt_disable() has to be avoided because +get_mm_memcg_path() might sleep if PREEMPT_RT=y. + +But syzbot started reporting + + inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. + +and + + inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. + +messages, for local_lock() does not disable IRQ. + +We could replace local_lock() with local_lock_irqsave() in order to +suppress these messages. But this patch instead replaces percpu buffers +with on-stack buffer, for the size of each buffer returned by +get_memcg_path_buf() is only 256 bytes which is tolerable for allocating +from current thread's kernel stack memory. + +Link: https://lkml.kernel.org/r/ef22d289-eadb-4ed9-863b-fbc922b33d8d@I-love.SAKURA.ne.jp +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=40905bca570ae6784745 +Fixes: 832b50725373 ("mm: mmap_lock: use local locks instead of disabling preemption") +Signed-off-by: Tetsuo Handa +Reviewed-by: Axel Rasmussen +Cc: Nicolas Saenz Julienne +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/mmap_lock.c | 175 ++++++--------------------------------------------------- + 1 file changed, 20 insertions(+), 155 deletions(-) + +--- a/mm/mmap_lock.c ++++ b/mm/mmap_lock.c +@@ -19,14 +19,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_relea + + #ifdef CONFIG_MEMCG + +-/* +- * Our various events all share the same buffer (because we don't want or need +- * to allocate a set of buffers *per event type*), so we need to protect against +- * concurrent _reg() and _unreg() calls, and count how many _reg() calls have +- * been made. 
+- */ +-static DEFINE_MUTEX(reg_lock); +-static int reg_refcount; /* Protected by reg_lock. */ ++static atomic_t reg_refcount; + + /* + * Size of the buffer for memcg path names. Ignoring stack trace support, +@@ -34,136 +27,22 @@ static int reg_refcount; /* Protected by + */ + #define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL + +-/* +- * How many contexts our trace events might be called in: normal, softirq, irq, +- * and NMI. +- */ +-#define CONTEXT_COUNT 4 +- +-struct memcg_path { +- local_lock_t lock; +- char __rcu *buf; +- local_t buf_idx; +-}; +-static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = { +- .lock = INIT_LOCAL_LOCK(lock), +- .buf_idx = LOCAL_INIT(0), +-}; +- +-static char **tmp_bufs; +- +-/* Called with reg_lock held. */ +-static void free_memcg_path_bufs(void) +-{ +- struct memcg_path *memcg_path; +- int cpu; +- char **old = tmp_bufs; +- +- for_each_possible_cpu(cpu) { +- memcg_path = per_cpu_ptr(&memcg_paths, cpu); +- *(old++) = rcu_dereference_protected(memcg_path->buf, +- lockdep_is_held(®_lock)); +- rcu_assign_pointer(memcg_path->buf, NULL); +- } +- +- /* Wait for inflight memcg_path_buf users to finish. */ +- synchronize_rcu(); +- +- old = tmp_bufs; +- for_each_possible_cpu(cpu) { +- kfree(*(old++)); +- } +- +- kfree(tmp_bufs); +- tmp_bufs = NULL; +-} +- + int trace_mmap_lock_reg(void) + { +- int cpu; +- char *new; +- +- mutex_lock(®_lock); +- +- /* If the refcount is going 0->1, proceed with allocating buffers. */ +- if (reg_refcount++) +- goto out; +- +- tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs), +- GFP_KERNEL); +- if (tmp_bufs == NULL) +- goto out_fail; +- +- for_each_possible_cpu(cpu) { +- new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); +- if (new == NULL) +- goto out_fail_free; +- rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new); +- /* Don't need to wait for inflights, they'd have gotten NULL. */ +- } +- +-out: +- mutex_unlock(®_lock); ++ atomic_inc(®_refcount); + return 0; +- +-out_fail_free: +- free_memcg_path_bufs(); +-out_fail: +- /* Since we failed, undo the earlier ref increment. */ +- --reg_refcount; +- +- mutex_unlock(®_lock); +- return -ENOMEM; + } + + void trace_mmap_lock_unreg(void) + { +- mutex_lock(®_lock); +- +- /* If the refcount is going 1->0, proceed with freeing buffers. */ +- if (--reg_refcount) +- goto out; +- +- free_memcg_path_bufs(); +- +-out: +- mutex_unlock(®_lock); +-} +- +-static inline char *get_memcg_path_buf(void) +-{ +- struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths); +- char *buf; +- int idx; +- +- rcu_read_lock(); +- buf = rcu_dereference(memcg_path->buf); +- if (buf == NULL) { +- rcu_read_unlock(); +- return NULL; +- } +- idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) - +- MEMCG_PATH_BUF_SIZE; +- return &buf[idx]; ++ atomic_dec(®_refcount); + } + +-static inline void put_memcg_path_buf(void) +-{ +- local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx); +- rcu_read_unlock(); +-} +- +-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ +- do { \ +- const char *memcg_path; \ +- local_lock(&memcg_paths.lock); \ +- memcg_path = get_mm_memcg_path(mm); \ +- trace_mmap_lock_##type(mm, \ +- memcg_path != NULL ? memcg_path : "", \ +- ##__VA_ARGS__); \ +- if (likely(memcg_path != NULL)) \ +- put_memcg_path_buf(); \ +- local_unlock(&memcg_paths.lock); \ ++#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) 
\ ++ do { \ ++ char buf[MEMCG_PATH_BUF_SIZE]; \ ++ get_mm_memcg_path(mm, buf, sizeof(buf)); \ ++ trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ + } while (0) + + #else /* !CONFIG_MEMCG */ +@@ -185,37 +64,23 @@ void trace_mmap_lock_unreg(void) + #ifdef CONFIG_TRACING + #ifdef CONFIG_MEMCG + /* +- * Write the given mm_struct's memcg path to a percpu buffer, and return a +- * pointer to it. If the path cannot be determined, or no buffer was available +- * (because the trace event is being unregistered), NULL is returned. +- * +- * Note: buffers are allocated per-cpu to avoid locking, so preemption must be +- * disabled by the caller before calling us, and re-enabled only after the +- * caller is done with the pointer. +- * +- * The caller must call put_memcg_path_buf() once the buffer is no longer +- * needed. This must be done while preemption is still disabled. ++ * Write the given mm_struct's memcg path to a buffer. If the path cannot be ++ * determined or the trace event is being unregistered, empty string is written. + */ +-static const char *get_mm_memcg_path(struct mm_struct *mm) ++static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) + { +- char *buf = NULL; +- struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); ++ struct mem_cgroup *memcg; + ++ buf[0] = '\0'; ++ /* No need to get path if no trace event is registered. */ ++ if (!atomic_read(®_refcount)) ++ return; ++ memcg = get_mem_cgroup_from_mm(mm); + if (memcg == NULL) +- goto out; +- if (unlikely(memcg->css.cgroup == NULL)) +- goto out_put; +- +- buf = get_memcg_path_buf(); +- if (buf == NULL) +- goto out_put; +- +- cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE); +- +-out_put: ++ return; ++ if (memcg->css.cgroup) ++ cgroup_path(memcg->css.cgroup, buf, buflen); + css_put(&memcg->css); +-out: +- return buf; + } + + #endif /* CONFIG_MEMCG */ diff --git a/queue-6.10/selftests-landlock-add-cred_transfer-test.patch b/queue-6.10/selftests-landlock-add-cred_transfer-test.patch new file mode 100644 index 00000000000..1734da9b5bd --- /dev/null +++ b/queue-6.10/selftests-landlock-add-cred_transfer-test.patch @@ -0,0 +1,124 @@ +From cc374782b6ca0fd634482391da977542443d3368 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= +Date: Wed, 24 Jul 2024 16:54:26 +0200 +Subject: selftests/landlock: Add cred_transfer test +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Mickaël Salaün + +commit cc374782b6ca0fd634482391da977542443d3368 upstream. + +Check that keyctl(KEYCTL_SESSION_TO_PARENT) preserves the parent's +restrictions. 
+ +Fixes: e1199815b47b ("selftests/landlock: Add user space tests") +Co-developed-by: Jann Horn +Signed-off-by: Jann Horn +Link: https://lore.kernel.org/r/20240724.Ood5aige9she@digikod.net +Signed-off-by: Mickaël Salaün +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/landlock/base_test.c | 74 +++++++++++++++++++++++++++ + tools/testing/selftests/landlock/config | 1 + 2 files changed, 75 insertions(+) + +--- a/tools/testing/selftests/landlock/base_test.c ++++ b/tools/testing/selftests/landlock/base_test.c +@@ -9,6 +9,7 @@ + #define _GNU_SOURCE + #include + #include ++#include + #include + #include + #include +@@ -326,4 +327,77 @@ TEST(ruleset_fd_transfer) + ASSERT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); + } + ++TEST(cred_transfer) ++{ ++ struct landlock_ruleset_attr ruleset_attr = { ++ .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR, ++ }; ++ int ruleset_fd, dir_fd; ++ pid_t child; ++ int status; ++ ++ drop_caps(_metadata); ++ ++ dir_fd = open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC); ++ EXPECT_LE(0, dir_fd); ++ EXPECT_EQ(0, close(dir_fd)); ++ ++ /* Denies opening directories. */ ++ ruleset_fd = ++ landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); ++ ASSERT_LE(0, ruleset_fd); ++ EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); ++ ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0)); ++ EXPECT_EQ(0, close(ruleset_fd)); ++ ++ /* Checks ruleset enforcement. */ ++ EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); ++ EXPECT_EQ(EACCES, errno); ++ ++ /* Needed for KEYCTL_SESSION_TO_PARENT permission checks */ ++ EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL, 0, ++ 0, 0)) ++ { ++ TH_LOG("Failed to join session keyring: %s", strerror(errno)); ++ } ++ ++ child = fork(); ++ ASSERT_LE(0, child); ++ if (child == 0) { ++ /* Checks ruleset enforcement. */ ++ EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); ++ EXPECT_EQ(EACCES, errno); ++ ++ /* ++ * KEYCTL_SESSION_TO_PARENT is a no-op unless we have a ++ * different session keyring in the child, so make that happen. ++ */ ++ EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, ++ NULL, 0, 0, 0)); ++ ++ /* ++ * KEYCTL_SESSION_TO_PARENT installs credentials on the parent ++ * that never go through the cred_prepare hook, this path uses ++ * cred_transfer instead. ++ */ ++ EXPECT_EQ(0, syscall(__NR_keyctl, KEYCTL_SESSION_TO_PARENT, 0, ++ 0, 0, 0)); ++ ++ /* Re-checks ruleset enforcement. */ ++ EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); ++ EXPECT_EQ(EACCES, errno); ++ ++ _exit(_metadata->exit_code); ++ return; ++ } ++ ++ EXPECT_EQ(child, waitpid(child, &status, 0)); ++ EXPECT_EQ(1, WIFEXITED(status)); ++ EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); ++ ++ /* Re-checks ruleset enforcement. 
*/ ++ EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); ++ EXPECT_EQ(EACCES, errno); ++} ++ + TEST_HARNESS_MAIN +--- a/tools/testing/selftests/landlock/config ++++ b/tools/testing/selftests/landlock/config +@@ -2,6 +2,7 @@ CONFIG_CGROUPS=y + CONFIG_CGROUP_SCHED=y + CONFIG_INET=y + CONFIG_IPV6=y ++CONFIG_KEYS=y + CONFIG_NET=y + CONFIG_NET_NS=y + CONFIG_OVERLAY_FS=y diff --git a/queue-6.10/series b/queue-6.10/series index 2bac512b767..eec45fbbd3d 100644 --- a/queue-6.10/series +++ b/queue-6.10/series @@ -529,3 +529,15 @@ remoteproc-k3-r5-fix-ipc-only-mode-detection.patch mailbox-omap-fix-mailbox-interrupt-sharing.patch mailbox-imx-fix-txdb_v2-channel-race-condition.patch mailbox-mtk-cmdq-move-devm_mbox_controller_register-.patch +selftests-landlock-add-cred_transfer-test.patch +landlock-don-t-lose-track-of-restrictions-on-cred_transfer.patch +mm-huge_memory-use-config_64bit-to-relax-huge-page-alignment-on-32-bit-machines.patch +mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch +hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch +mm-hugetlb-fix-possible-recursive-locking-detected-warning.patch +mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch +mm-mmap_lock-replace-get_memcg_path_buf-with-on-stack-buffer.patch +mm-mglru-fix-overshooting-shrinker-memory.patch +mm-mglru-fix-ineffective-protection-calculation.patch +x86-efistub-avoid-returning-efi_success-on-error.patch +x86-efistub-revert-to-heap-allocated-boot_params-for-pe-entrypoint.patch diff --git a/queue-6.10/x86-efistub-avoid-returning-efi_success-on-error.patch b/queue-6.10/x86-efistub-avoid-returning-efi_success-on-error.patch new file mode 100644 index 00000000000..a36d6912996 --- /dev/null +++ b/queue-6.10/x86-efistub-avoid-returning-efi_success-on-error.patch @@ -0,0 +1,40 @@ +From fb318ca0a522295edd6d796fb987e99ec41f0ee5 Mon Sep 17 00:00:00 2001 +From: Ard Biesheuvel +Date: Thu, 4 Jul 2024 10:59:23 +0200 +Subject: x86/efistub: Avoid returning EFI_SUCCESS on error + +From: Ard Biesheuvel + +commit fb318ca0a522295edd6d796fb987e99ec41f0ee5 upstream. + +The fail label is only used in a situation where the previous EFI API +call succeeded, and so status will be set to EFI_SUCCESS. Fix this, by +dropping the goto entirely, and call efi_exit() with the correct error +code. 
+ +Signed-off-by: Ard Biesheuvel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/firmware/efi/libstub/x86-stub.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/drivers/firmware/efi/libstub/x86-stub.c ++++ b/drivers/firmware/efi/libstub/x86-stub.c +@@ -501,16 +501,13 @@ efi_status_t __efiapi efi_pe_entry(efi_h + /* Convert unicode cmdline to ascii */ + cmdline_ptr = efi_convert_cmdline(image, &options_size); + if (!cmdline_ptr) +- goto fail; ++ efi_exit(handle, EFI_OUT_OF_RESOURCES); + + efi_set_u64_split((unsigned long)cmdline_ptr, &hdr->cmd_line_ptr, + &boot_params.ext_cmd_line_ptr); + + efi_stub_entry(handle, sys_table_arg, &boot_params); + /* not reached */ +- +-fail: +- efi_exit(handle, status); + } + + static void add_e820ext(struct boot_params *params, diff --git a/queue-6.10/x86-efistub-revert-to-heap-allocated-boot_params-for-pe-entrypoint.patch b/queue-6.10/x86-efistub-revert-to-heap-allocated-boot_params-for-pe-entrypoint.patch new file mode 100644 index 00000000000..4b141842a5c --- /dev/null +++ b/queue-6.10/x86-efistub-revert-to-heap-allocated-boot_params-for-pe-entrypoint.patch @@ -0,0 +1,76 @@ +From ae835a96d72cd025421910edb0e8faf706998727 Mon Sep 17 00:00:00 2001 +From: Ard Biesheuvel +Date: Fri, 22 Mar 2024 18:11:32 +0100 +Subject: x86/efistub: Revert to heap allocated boot_params for PE entrypoint + +From: Ard Biesheuvel + +commit ae835a96d72cd025421910edb0e8faf706998727 upstream. + +This is a partial revert of commit + + 8117961d98f ("x86/efi: Disregard setup header of loaded image") + +which triggers boot issues on older Dell laptops. As it turns out, +switching back to a heap allocation for the struct boot_params +constructed by the EFI stub works around this, even though it is unclear +why. + +Cc: Christian Heusel +Reported-by: +Signed-off-by: Ard Biesheuvel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/firmware/efi/libstub/x86-stub.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +--- a/drivers/firmware/efi/libstub/x86-stub.c ++++ b/drivers/firmware/efi/libstub/x86-stub.c +@@ -469,11 +469,12 @@ void __noreturn efi_stub_entry(efi_handl + efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, + efi_system_table_t *sys_table_arg) + { +- static struct boot_params boot_params __page_aligned_bss; +- struct setup_header *hdr = &boot_params.hdr; + efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; ++ struct boot_params *boot_params; ++ struct setup_header *hdr; + int options_size = 0; + efi_status_t status; ++ unsigned long alloc; + char *cmdline_ptr; + + if (efi_is_native()) +@@ -491,6 +492,13 @@ efi_status_t __efiapi efi_pe_entry(efi_h + efi_exit(handle, status); + } + ++ status = efi_allocate_pages(PARAM_SIZE, &alloc, ULONG_MAX); ++ if (status != EFI_SUCCESS) ++ efi_exit(handle, status); ++ ++ boot_params = memset((void *)alloc, 0x0, PARAM_SIZE); ++ hdr = &boot_params->hdr; ++ + /* Assign the setup_header fields that the kernel actually cares about */ + hdr->root_flags = 1; + hdr->vid_mode = 0xffff; +@@ -500,13 +508,15 @@ efi_status_t __efiapi efi_pe_entry(efi_h + + /* Convert unicode cmdline to ascii */ + cmdline_ptr = efi_convert_cmdline(image, &options_size); +- if (!cmdline_ptr) ++ if (!cmdline_ptr) { ++ efi_free(PARAM_SIZE, alloc); + efi_exit(handle, EFI_OUT_OF_RESOURCES); ++ } + + efi_set_u64_split((unsigned long)cmdline_ptr, &hdr->cmd_line_ptr, +- &boot_params.ext_cmd_line_ptr); ++ &boot_params->ext_cmd_line_ptr); + +- efi_stub_entry(handle, sys_table_arg, &boot_params); ++ efi_stub_entry(handle, sys_table_arg, 
boot_params); + /* not reached */ + } +