git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.11-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 18 Oct 2024 08:28:35 +0000 (10:28 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 18 Oct 2024 08:28:35 +0000 (10:28 +0200)
added patches:
fat-fix-uninitialized-variable.patch
lib-alloc_tag_module_unload-must-wait-for-pending-kfree_rcu-calls.patch
maple_tree-correct-tree-corruption-on-spanning-store.patch
mm-damon-tests-sysfs-kunit.h-fix-memory-leak-in-damon_sysfs_test_add_targets.patch
mm-khugepaged-fix-the-arguments-order-in-khugepaged_collapse_file-trace-point.patch
mm-mglru-only-clear-kswapd_failures-if-reclaimable.patch
mm-mremap-fix-move_normal_pmd-retract_page_tables-race.patch
mm-swapfile-skip-hugetlb-pages-for-unuse_vma.patch
nilfs2-propagate-directory-read-errors-from-nilfs_find_entry.patch
selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch
selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch

12 files changed:
queue-6.11/fat-fix-uninitialized-variable.patch [new file with mode: 0644]
queue-6.11/lib-alloc_tag_module_unload-must-wait-for-pending-kfree_rcu-calls.patch [new file with mode: 0644]
queue-6.11/maple_tree-correct-tree-corruption-on-spanning-store.patch [new file with mode: 0644]
queue-6.11/mm-damon-tests-sysfs-kunit.h-fix-memory-leak-in-damon_sysfs_test_add_targets.patch [new file with mode: 0644]
queue-6.11/mm-khugepaged-fix-the-arguments-order-in-khugepaged_collapse_file-trace-point.patch [new file with mode: 0644]
queue-6.11/mm-mglru-only-clear-kswapd_failures-if-reclaimable.patch [new file with mode: 0644]
queue-6.11/mm-mremap-fix-move_normal_pmd-retract_page_tables-race.patch [new file with mode: 0644]
queue-6.11/mm-swapfile-skip-hugetlb-pages-for-unuse_vma.patch [new file with mode: 0644]
queue-6.11/nilfs2-propagate-directory-read-errors-from-nilfs_find_entry.patch [new file with mode: 0644]
queue-6.11/selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch [new file with mode: 0644]
queue-6.11/selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch [new file with mode: 0644]
queue-6.11/series

diff --git a/queue-6.11/fat-fix-uninitialized-variable.patch b/queue-6.11/fat-fix-uninitialized-variable.patch
new file mode 100644 (file)
index 0000000..edea2c3
--- /dev/null
@@ -0,0 +1,36 @@
+From 963a7f4d3b90ee195b895ca06b95757fcba02d1a Mon Sep 17 00:00:00 2001
+From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+Date: Fri, 4 Oct 2024 15:03:49 +0900
+Subject: fat: fix uninitialized variable
+
+From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+
+commit 963a7f4d3b90ee195b895ca06b95757fcba02d1a upstream.
+
+syzbot produced this with a corrupted fs image.  In theory, however, an I/O
+error could also trigger it.
+
+This affects just an error report, so it should not be a serious problem.
+
+Link: https://lkml.kernel.org/r/87r08wjsnh.fsf@mail.parknet.co.jp
+Link: https://lkml.kernel.org/r/66ff2c95.050a0220.49194.03e9.GAE@google.com
+Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+Reported-by: syzbot+ef0d7bc412553291aa86@syzkaller.appspotmail.com
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/fat/namei_vfat.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/fat/namei_vfat.c
++++ b/fs/fat/namei_vfat.c
+@@ -1037,7 +1037,7 @@ error_inode:
+       if (corrupt < 0) {
+               fat_fs_error(new_dir->i_sb,
+                            "%s: Filesystem corrupted (i_pos %lld)",
+-                           __func__, sinfo.i_pos);
++                           __func__, new_i_pos);
+       }
+       goto out;
+ }
diff --git a/queue-6.11/lib-alloc_tag_module_unload-must-wait-for-pending-kfree_rcu-calls.patch b/queue-6.11/lib-alloc_tag_module_unload-must-wait-for-pending-kfree_rcu-calls.patch
new file mode 100644 (file)
index 0000000..8362f11
--- /dev/null
@@ -0,0 +1,66 @@
+From dc783ba4b9df3fb3e76e968b2cbeb9960069263c Mon Sep 17 00:00:00 2001
+From: Florian Westphal <fw@strlen.de>
+Date: Mon, 7 Oct 2024 22:52:24 +0200
+Subject: lib: alloc_tag_module_unload must wait for pending kfree_rcu calls
+
+From: Florian Westphal <fw@strlen.de>
+
+commit dc783ba4b9df3fb3e76e968b2cbeb9960069263c upstream.
+
+Ben Greear reports the following splat:
+ ------------[ cut here ]------------
+ net/netfilter/nf_nat_core.c:1114 module nf_nat func:nf_nat_register_fn has 256 allocated at module unload
+ WARNING: CPU: 1 PID: 10421 at lib/alloc_tag.c:168 alloc_tag_module_unload+0x22b/0x3f0
+ Modules linked in: nf_nat(-) btrfs ufs qnx4 hfsplus hfs minix vfat msdos fat
+...
+ Hardware name: Default string Default string/SKYBAY, BIOS 5.12 08/04/2020
+ RIP: 0010:alloc_tag_module_unload+0x22b/0x3f0
+  codetag_unload_module+0x19b/0x2a0
+  ? codetag_load_module+0x80/0x80
+
+The nf_nat module exit calls kfree_rcu on those addresses, but the free
+operation is likely still pending by the time alloc_tag checks for leaks.
+
+Waiting for outstanding kfree_rcu operations to complete before checking
+resolves this warning.
+
+Reproducer:
+unshare -n iptables-nft -t nat -A PREROUTING -p tcp
+grep nf_nat /proc/allocinfo # will list 4 allocations
+rmmod nft_chain_nat
+rmmod nf_nat                # will WARN.
+
+[akpm@linux-foundation.org: add comment]
+Link: https://lkml.kernel.org/r/20241007205236.11847-1-fw@strlen.de
+Fixes: a473573964e5 ("lib: code tagging module support")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Reported-by: Ben Greear <greearb@candelatech.com>
+Closes: https://lore.kernel.org/netdev/bdaaef9d-4364-4171-b82b-bcfc12e207eb@candelatech.com/
+Cc: Uladzislau Rezki <urezki@gmail.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Kent Overstreet <kent.overstreet@linux.dev>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/codetag.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/lib/codetag.c b/lib/codetag.c
+index afa8a2d4f317..d1fbbb7c2ec3 100644
+--- a/lib/codetag.c
++++ b/lib/codetag.c
+@@ -228,6 +228,9 @@ bool codetag_unload_module(struct module *mod)
+       if (!mod)
+               return true;
++      /* await any module's kfree_rcu() operations to complete */
++      kvfree_rcu_barrier();
++
+       mutex_lock(&codetag_lock);
+       list_for_each_entry(cttype, &codetag_types, link) {
+               struct codetag_module *found = NULL;
+-- 
+2.47.0
+
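
The pattern the fix relies on, in isolation: objects released with kfree_rcu() are only guaranteed to be gone after a barrier, so any leak accounting at module unload must first wait for pending RCU frees. Below is a minimal stand-alone sketch with hypothetical struct and function names; kfree_rcu() and kvfree_rcu_barrier() are the real kernel APIs.

  #include <linux/slab.h>
  #include <linux/rcupdate.h>

  struct nat_hook {                      /* hypothetical example struct */
          struct rcu_head rcu;           /* required by kfree_rcu()     */
          int id;
  };

  static struct nat_hook *active_hook;

  static void example_module_exit(void)  /* illustrative only */
  {
          /* Queues the free; the memory is not reclaimed immediately. */
          kfree_rcu(active_hook, rcu);

          /*
           * Without this barrier, a leak check that walks the allocation
           * tags right now could still see the queued object and emit a
           * false positive like the splat quoted above.  This is the call
           * codetag_unload_module() now makes before checking.
           */
          kvfree_rcu_barrier();
  }

kvfree_rcu() batching can defer frees outside the regular call_rcu() path, which is presumably why the fix uses kvfree_rcu_barrier() rather than rcu_barrier().
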
diff --git a/queue-6.11/maple_tree-correct-tree-corruption-on-spanning-store.patch b/queue-6.11/maple_tree-correct-tree-corruption-on-spanning-store.patch
new file mode 100644 (file)
index 0000000..e31ba2c
--- /dev/null
@@ -0,0 +1,245 @@
+From bea07fd63192b61209d48cbb81ef474cc3ee4c62 Mon Sep 17 00:00:00 2001
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Date: Mon, 7 Oct 2024 16:28:32 +0100
+Subject: maple_tree: correct tree corruption on spanning store
+
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+
+commit bea07fd63192b61209d48cbb81ef474cc3ee4c62 upstream.
+
+Patch series "maple_tree: correct tree corruption on spanning store", v3.
+
+There has been a nasty yet subtle maple tree corruption bug that appears
+to have been in existence since the inception of the algorithm.
+
+This bug seems far more likely to happen since commit f8d112a4e657
+("mm/mmap: avoid zeroing vma tree in mmap_region()"), which is the point
+at which reports started to be submitted concerning this bug.
+
+We were made definitely aware of the bug thanks to the kind efforts of
+Bert Karwatzki who helped enormously in my being able to track this down
+and identify the cause of it.
+
+The bug arises when an attempt is made to perform a spanning store across
+two leaf nodes, where the right leaf node is the rightmost child of the
+shared parent, AND the store completely consumes the right-most node.
+
+This results in mas_wr_spanning_store() mistakenly duplicating the new and
+existing entries at the maximum pivot within the range, and thus maple
+tree corruption.
+
+The fix patch corrects this by detecting this scenario and disallowing the
+mistaken duplicate copy.
+
+The fix patch commit message goes into great detail as to how this occurs.
+
+This series also includes a test which reliably reproduces the issue, and
+asserts that the fix works correctly.
+
+Bert has kindly tested the fix and confirmed it resolved his issues.  Also
+Mikhail Gavrilov kindly reported what appears to be precisely the same
+bug, which this fix should also resolve.
+
+
+This patch (of 2):
+
+There has been a subtle bug present in the maple tree implementation from
+its inception.
+
+This arises from how stores are performed - when a store occurs, it will
+overwrite overlapping ranges and adjust the tree as necessary to
+accommodate this.
+
+A range may always ultimately span two leaf nodes.  In this instance we
+walk the two leaf nodes, determine which elements are not overwritten to
+the left and to the right of the start and end of the ranges respectively
+and then rebalance the tree to contain these entries and the newly
+inserted one.
+
+This kind of store is dubbed a 'spanning store' and is implemented by
+mas_wr_spanning_store().
+
+In order to reach this stage, mas_store_gfp() invokes
+mas_wr_preallocate(), mas_wr_store_type() and mas_wr_walk() in turn to
+walk the tree and update the object (mas) to traverse to the location
+where the write should be performed, determining its store type.
+
+When a spanning store is required, this function returns false stopping at
+the parent node which contains the target range, and mas_wr_store_type()
+marks the mas->store_type as wr_spanning_store to denote this fact.
+
+When we go to perform the store in mas_wr_spanning_store(), we first
+determine the elements AFTER the END of the range we wish to store (that
+is, to the right of the entry to be inserted) - we do this by walking to
+the NEXT pivot in the tree (i.e.  r_mas.last + 1), starting at the node we
+have just determined contains the range over which we intend to write.
+
+We then turn our attention to the entries to the left of the entry we are
+inserting, whose state is represented by l_mas, and copy these into a 'big
+node', which is a special node which contains enough slots to contain two
+leaf node's worth of data.
+
+We then copy the entry we wish to store immediately after this - the copy
+and the insertion of the new entry is performed by mas_store_b_node().
+
+After this we copy the elements to the right of the end of the range which
+we are inserting, if we have not exceeded the length of the node (i.e.
+r_mas.offset <= r_mas.end).
+
+Herein lies the bug - under very specific circumstances, this logic can
+break and corrupt the maple tree.
+
+Consider the following tree:
+
+Height
+  0                             Root Node
+                                 /      \
+                 pivot = 0xffff /        \ pivot = ULONG_MAX
+                               /          \
+  1                       A [-----]       ...
+                             /   \
+             pivot = 0x4fff /     \ pivot = 0xffff
+                           /       \
+  2 (LEAVES)          B [-----]  [-----] C
+                                      ^--- Last pivot 0xffff.
+
+Now imagine we wish to store an entry in the range [0x4000, 0xffff] (note
+that all ranges expressed in maple tree code are inclusive):
+
+1. mas_store_gfp() descends the tree, finds node A at <=0xffff, then
+   determines that this is a spanning store across nodes B and C. The mas
+   state is set such that the current node from which we traverse further
+   is node A.
+
+2. In mas_wr_spanning_store() we try to find elements to the right of pivot
+   0xffff by searching for an index of 0x10000:
+
+    - mas_wr_walk_index() invokes mas_wr_walk_descend() and
+      mas_wr_node_walk() in turn.
+
+        - mas_wr_node_walk() loops over entries in node A until EITHER it
+          finds an entry whose pivot equals or exceeds 0x10000 OR it
+          reaches the final entry.
+
+        - Since no entry has a pivot equal to or exceeding 0x10000, pivot
+          0xffff is selected, leading to node C.
+
+    - mas_wr_walk_traverse() resets the mas state to traverse node C. We
+      loop around and invoke mas_wr_walk_descend() and mas_wr_node_walk()
+      in turn once again.
+
+         - Again, we reach the last entry in node C, which has a pivot of
+           0xffff.
+
+3. We then copy the elements to the left of 0x4000 in node B to the big
+   node via mas_store_b_node(), and insert the new [0x4000, 0xffff] entry
+   too.
+
+4. We determine whether we have any entries to copy from the right of the
+   end of the range via the check r_mas.offset <= r_mas.end - and with
+   r_mas set up at the entry at pivot 0xffff, that check passes, so we
+   DUPLICATE the entry at pivot 0xffff.
+
+5. BUG! The maple tree is corrupted with a duplicate entry.
+
+This requires a very specific set of circumstances - we must be spanning
+the last element in a leaf node, which is the last element in the parent
+node.
+
+That is, we must be performing a spanning store across two leaf nodes with
+a range that ends at that shared pivot.
+
+A potential solution to this problem would simply be to reset the walk
+each time we traverse r_mas; however, given the rarity of this situation,
+it seems that would be rather inefficient.
+
+Instead, this patch detects if the right hand node is populated, i.e.  has
+anything we need to copy.
+
+We do so by only copying elements from the right of the entry being
+inserted when the maximum value present exceeds the last, rather than
+basing this on offset position.
+
+The patch also updates some comments and eliminates the unused bool return
+value in mas_wr_walk_index().
+
+The work performed in commit f8d112a4e657 ("mm/mmap: avoid zeroing vma
+tree in mmap_region()") seems to have made the probability of this event
+much more likely, which is the point at which reports started to be
+submitted concerning this bug.
+
+The motivation for this change arose from Bert Karwatzki's report of
+encountering mm instability after the release of kernel v6.12-rc1 which,
+after the use of CONFIG_DEBUG_VM_MAPLE_TREE and similar configuration
+options, was identified as maple tree corruption.
+
+After Bert very generously provided his time and ability to reproduce this
+event consistently, I was able to finally identify that the issue
+discussed in this commit message was occurring for him.
+
+Link: https://lkml.kernel.org/r/cover.1728314402.git.lorenzo.stoakes@oracle.com
+Link: https://lkml.kernel.org/r/48b349a2a0f7c76e18772712d0997a5e12ab0a3b.1728314403.git.lorenzo.stoakes@oracle.com
+Fixes: 54a611b60590 ("Maple Tree: add new data structure")
+Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Reported-by: Bert Karwatzki <spasswolf@web.de>
+Closes: https://lore.kernel.org/all/20241001023402.3374-1-spasswolf@web.de/
+Tested-by: Bert Karwatzki <spasswolf@web.de>
+Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
+Closes: https://lore.kernel.org/all/CABXGCsOPwuoNOqSMmAvWO2Fz4TEmPnjFj-b7iF+XFRu1h7-+Dg@mail.gmail.com/
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
+Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
+Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/maple_tree.c |   12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/lib/maple_tree.c
++++ b/lib/maple_tree.c
+@@ -2196,6 +2196,8 @@ static inline void mas_node_or_none(stru
+ /*
+  * mas_wr_node_walk() - Find the correct offset for the index in the @mas.
++ *                      If @mas->index cannot be found within the containing
++ *                      node, we traverse to the last entry in the node.
+  * @wr_mas: The maple write state
+  *
+  * Uses mas_slot_locked() and does not need to worry about dead nodes.
+@@ -3609,7 +3611,7 @@ static bool mas_wr_walk(struct ma_wr_sta
+       return true;
+ }
+-static bool mas_wr_walk_index(struct ma_wr_state *wr_mas)
++static void mas_wr_walk_index(struct ma_wr_state *wr_mas)
+ {
+       struct ma_state *mas = wr_mas->mas;
+@@ -3618,11 +3620,9 @@ static bool mas_wr_walk_index(struct ma_
+               wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
+                                                 mas->offset);
+               if (ma_is_leaf(wr_mas->type))
+-                      return true;
++                      return;
+               mas_wr_walk_traverse(wr_mas);
+-
+       }
+-      return true;
+ }
+ /*
+  * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs.
+@@ -3853,8 +3853,8 @@ static inline int mas_wr_spanning_store(
+       memset(&b_node, 0, sizeof(struct maple_big_node));
+       /* Copy l_mas and store the value in b_node. */
+       mas_store_b_node(&l_wr_mas, &b_node, l_mas.end);
+-      /* Copy r_mas into b_node. */
+-      if (r_mas.offset <= r_mas.end)
++      /* Copy r_mas into b_node if there is anything to copy. */
++      if (r_mas.max > r_mas.last)
+               mas_mab_cp(&r_mas, r_mas.offset, r_mas.end,
+                          &b_node, b_node.b_end + 1);
+       else
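
For readers who want to poke at the scenario through the public maple tree API, the sketch below builds a multi-leaf tree and then performs a store whose range ends exactly on an existing pivot - the shape described in the message above. It is illustrative only: whether this exact sequence lands on the corrupting layout depends on node occupancy, and the regression test added alongside the fix constructs the precise tree. DEFINE_MTREE(), mtree_store_range() and mtree_destroy() are the real kernel APIs; the ranges are taken from the example in the commit message.

  #include <linux/maple_tree.h>
  #include <linux/xarray.h>              /* xa_mk_value() */
  #include <linux/gfp.h>

  static DEFINE_MTREE(tree);

  static void spanning_store_example(void)
  {
          unsigned long i;

          /* Populate adjacent small ranges to force several leaf nodes. */
          for (i = 0; i <= 0xffff; i += 0x1000)
                  mtree_store_range(&tree, i, i + 0xfff,
                                    xa_mk_value(i), GFP_KERNEL);

          /*
           * A store whose range ends exactly on an existing pivot that is
           * the last pivot of both a leaf and its parent is the shape that
           * used to duplicate the rightmost entry.
           */
          mtree_store_range(&tree, 0x4000, 0xffff,
                            xa_mk_value(0x4000), GFP_KERNEL);

          mtree_destroy(&tree);
  }
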
diff --git a/queue-6.11/mm-damon-tests-sysfs-kunit.h-fix-memory-leak-in-damon_sysfs_test_add_targets.patch b/queue-6.11/mm-damon-tests-sysfs-kunit.h-fix-memory-leak-in-damon_sysfs_test_add_targets.patch
new file mode 100644 (file)
index 0000000..0bfb742
--- /dev/null
@@ -0,0 +1,48 @@
+From 2d6a1c835685de3b0c8e8dc871f60f4ef92ab01a Mon Sep 17 00:00:00 2001
+From: Jinjie Ruan <ruanjinjie@huawei.com>
+Date: Thu, 10 Oct 2024 20:53:23 +0800
+Subject: mm/damon/tests/sysfs-kunit.h: fix memory leak in damon_sysfs_test_add_targets()
+
+From: Jinjie Ruan <ruanjinjie@huawei.com>
+
+commit 2d6a1c835685de3b0c8e8dc871f60f4ef92ab01a upstream.
+
+The sysfs_target->regions allocated in damon_sysfs_regions_alloc() is not
+freed in damon_sysfs_test_add_targets(), which causes the following memory
+leak.  Free it to fix this.
+
+       unreferenced object 0xffffff80c2a8db80 (size 96):
+         comm "kunit_try_catch", pid 187, jiffies 4294894363
+         hex dump (first 32 bytes):
+           00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+           00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+         backtrace (crc 0):
+           [<0000000001e3714d>] kmemleak_alloc+0x34/0x40
+           [<000000008e6835c1>] __kmalloc_cache_noprof+0x26c/0x2f4
+           [<000000001286d9f8>] damon_sysfs_test_add_targets+0x1cc/0x738
+           [<0000000032ef8f77>] kunit_try_run_case+0x13c/0x3ac
+           [<00000000f3edea23>] kunit_generic_run_threadfn_adapter+0x80/0xec
+           [<00000000adf936cf>] kthread+0x2e8/0x374
+           [<0000000041bb1628>] ret_from_fork+0x10/0x20
+
+Link: https://lkml.kernel.org/r/20241010125323.3127187-1-ruanjinjie@huawei.com
+Fixes: b8ee5575f763 ("mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()")
+Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
+Reviewed-by: SeongJae Park <sj@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/damon/sysfs-test.h |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/damon/sysfs-test.h
++++ b/mm/damon/sysfs-test.h
+@@ -67,6 +67,7 @@ static void damon_sysfs_test_add_targets
+       damon_destroy_ctx(ctx);
+       kfree(sysfs_targets->targets_arr);
+       kfree(sysfs_targets);
++      kfree(sysfs_target->regions);
+       kfree(sysfs_target);
+ }
diff --git a/queue-6.11/mm-khugepaged-fix-the-arguments-order-in-khugepaged_collapse_file-trace-point.patch b/queue-6.11/mm-khugepaged-fix-the-arguments-order-in-khugepaged_collapse_file-trace-point.patch
new file mode 100644 (file)
index 0000000..faa58e0
--- /dev/null
@@ -0,0 +1,75 @@
+From 37f0b47c5143c2957909ced44fc09ffb118c99f7 Mon Sep 17 00:00:00 2001
+From: Yang Shi <yang@os.amperecomputing.com>
+Date: Fri, 11 Oct 2024 18:17:02 -0700
+Subject: mm: khugepaged: fix the arguments order in khugepaged_collapse_file trace point
+
+From: Yang Shi <yang@os.amperecomputing.com>
+
+commit 37f0b47c5143c2957909ced44fc09ffb118c99f7 upstream.
+
+The "addr" and "is_shmem" arguments have different order in TP_PROTO and
+TP_ARGS.  This resulted in the incorrect trace result:
+
+text-hugepage-644429 [276] 392092.878683: mm_khugepaged_collapse_file:
+mm=0xffff20025d52c440, hpage_pfn=0x200678c00, index=512, addr=1, is_shmem=0,
+filename=text-hugepage, nr=512, result=failed
+
+The value of "addr" is wrong because it was treated as bool value, the
+type of is_shmem.
+
+Fix the order in TP_PROTO to keep "addr" is before "is_shmem" since the
+original patch review suggested this order to achieve best packing.
+
+And use "lx" for "addr" instead of "ld" in TP_printk because address is
+typically shown in hex.
+
+After the fix, the trace result looks correct:
+
+text-hugepage-7291  [004]   128.627251: mm_khugepaged_collapse_file:
+mm=0xffff0001328f9500, hpage_pfn=0x20016ea00, index=512, addr=0x400000,
+is_shmem=0, filename=text-hugepage, nr=512, result=failed
+
+Link: https://lkml.kernel.org/r/20241012011702.1084846-1-yang@os.amperecomputing.com
+Fixes: 4c9473e87e75 ("mm/khugepaged: add tracepoint to collapse_file()")
+Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
+Cc: Gautam Menghani <gautammenghani201@gmail.com>
+Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
+Cc: <stable@vger.kernel.org>    [6.2+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/trace/events/huge_memory.h |    4 ++--
+ mm/khugepaged.c                    |    2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+--- a/include/trace/events/huge_memory.h
++++ b/include/trace/events/huge_memory.h
+@@ -208,7 +208,7 @@ TRACE_EVENT(mm_khugepaged_scan_file,
+ TRACE_EVENT(mm_khugepaged_collapse_file,
+       TP_PROTO(struct mm_struct *mm, struct folio *new_folio, pgoff_t index,
+-                      bool is_shmem, unsigned long addr, struct file *file,
++                      unsigned long addr, bool is_shmem, struct file *file,
+                       int nr, int result),
+       TP_ARGS(mm, new_folio, index, addr, is_shmem, file, nr, result),
+       TP_STRUCT__entry(
+@@ -233,7 +233,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file,
+               __entry->result = result;
+       ),
+-      TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%ld, is_shmem=%d, filename=%s, nr=%d, result=%s",
++      TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%lx, is_shmem=%d, filename=%s, nr=%d, result=%s",
+               __entry->mm,
+               __entry->hpfn,
+               __entry->index,
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -2219,7 +2219,7 @@ rollback:
+       folio_put(new_folio);
+ out:
+       VM_BUG_ON(!list_empty(&pagelist));
+-      trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result);
++      trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result);
+       return result;
+ }
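
The underlying C behaviour is easy to reproduce outside the tracing macros: when a prototype and its call site disagree about the order of a bool and an unsigned long, the compiler silently converts both values instead of warning. A stand-alone illustration with hypothetical names (not kernel code):

  #include <stdbool.h>
  #include <stdio.h>

  /* Prototype says (is_shmem, addr)... */
  static void record(bool is_shmem, unsigned long addr)
  {
          printf("addr=%#lx, is_shmem=%d\n", addr, is_shmem);
  }

  int main(void)
  {
          unsigned long addr = 0x400000;
          bool is_shmem = false;

          /*
           * ...but the caller passes (addr, is_shmem).  addr is squeezed
           * through the bool parameter and records as 1, while is_shmem
           * widens into the unsigned long slot and records as 0 -- the
           * same silent conversion the trace point suffered from.
           */
          record(addr, is_shmem);
          return 0;
  }
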
diff --git a/queue-6.11/mm-mglru-only-clear-kswapd_failures-if-reclaimable.patch b/queue-6.11/mm-mglru-only-clear-kswapd_failures-if-reclaimable.patch
new file mode 100644 (file)
index 0000000..2913ed6
--- /dev/null
@@ -0,0 +1,51 @@
+From b130ba4a6259f6b64d8af15e9e7ab1e912bcb7ad Mon Sep 17 00:00:00 2001
+From: Wei Xu <weixugc@google.com>
+Date: Mon, 14 Oct 2024 22:12:11 +0000
+Subject: mm/mglru: only clear kswapd_failures if reclaimable
+
+From: Wei Xu <weixugc@google.com>
+
+commit b130ba4a6259f6b64d8af15e9e7ab1e912bcb7ad upstream.
+
+lru_gen_shrink_node() unconditionally clears kswapd_failures, which can
+prevent kswapd from sleeping and cause 100% kswapd cpu usage even when
+kswapd repeatedly fails to make progress in reclaim.
+
+Only clear kswapd_failures in lru_gen_shrink_node() if reclaim makes some
+progress, similar to shrink_node().
+
+I happened to run into this problem in one of my tests recently.  It
+requires a combination of several conditions: the allocator needs to
+allocate the right number of pages such that it can wake up kswapd
+without itself being OOM killed; there is no memory for kswapd to
+reclaim (my test disables swap and cleans the page cache first); and no
+other process frees enough memory at the same time.
+
+Link: https://lkml.kernel.org/r/20241014221211.832591-1-weixugc@google.com
+Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
+Signed-off-by: Wei Xu <weixugc@google.com>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Brian Geffon <bgeffon@google.com>
+Cc: Jan Alexander Steffens <heftig@archlinux.org>
+Cc: Suleiman Souhlal <suleiman@google.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4940,8 +4940,8 @@ static void lru_gen_shrink_node(struct p
+       blk_finish_plug(&plug);
+ done:
+-      /* kswapd should never fail */
+-      pgdat->kswapd_failures = 0;
++      if (sc->nr_reclaimed > reclaimed)
++              pgdat->kswapd_failures = 0;
+ }
+ /******************************************************************************
diff --git a/queue-6.11/mm-mremap-fix-move_normal_pmd-retract_page_tables-race.patch b/queue-6.11/mm-mremap-fix-move_normal_pmd-retract_page_tables-race.patch
new file mode 100644 (file)
index 0000000..5821cbf
--- /dev/null
@@ -0,0 +1,134 @@
+From 6fa1066fc5d00cb9f1b0e83b7ff6ef98d26ba2aa Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Mon, 7 Oct 2024 23:42:04 +0200
+Subject: mm/mremap: fix move_normal_pmd/retract_page_tables race
+
+From: Jann Horn <jannh@google.com>
+
+commit 6fa1066fc5d00cb9f1b0e83b7ff6ef98d26ba2aa upstream.
+
+In mremap(), move_page_tables() looks at the type of the PMD entry and the
+specified address range to figure out by which method the next chunk of
+page table entries should be moved.
+
+At that point, the mmap_lock is held in write mode, but no rmap locks are
+held yet.  For PMD entries that point to page tables and are fully covered
+by the source address range, move_pgt_entry(NORMAL_PMD, ...) is called,
+which first takes rmap locks, then does move_normal_pmd().
+move_normal_pmd() takes the necessary page table locks at source and
+destination, then moves an entire page table from the source to the
+destination.
+
+The problem is: The rmap locks, which protect against concurrent page
+table removal by retract_page_tables() in the THP code, are only taken
+after the PMD entry has been read and it has been decided how to move it.
+So we can race as follows (with two processes that have mappings of the
+same tmpfs file that is stored on a tmpfs mount with huge=advise); note
+that process A accesses page tables through the MM while process B does it
+through the file rmap:
+
+process A                      process B
+=========                      =========
+mremap
+  mremap_to
+    move_vma
+      move_page_tables
+        get_old_pmd
+        alloc_new_pmd
+                      *** PREEMPT ***
+                               madvise(MADV_COLLAPSE)
+                                 do_madvise
+                                   madvise_walk_vmas
+                                     madvise_vma_behavior
+                                       madvise_collapse
+                                         hpage_collapse_scan_file
+                                           collapse_file
+                                             retract_page_tables
+                                               i_mmap_lock_read(mapping)
+                                               pmdp_collapse_flush
+                                               i_mmap_unlock_read(mapping)
+        move_pgt_entry(NORMAL_PMD, ...)
+          take_rmap_locks
+          move_normal_pmd
+          drop_rmap_locks
+
+When this happens, move_normal_pmd() can end up creating bogus PMD entries
+in the line `pmd_populate(mm, new_pmd, pmd_pgtable(pmd))`.  The effect
+depends on arch-specific and machine-specific details; on x86, you can end
+up with physical page 0 mapped as a page table, which is likely
+exploitable for user->kernel privilege escalation.
+
+Fix the race by letting process B recheck that the PMD still points to a
+page table after the rmap locks have been taken.  Otherwise, we bail and
+let the caller fall back to the PTE-level copying path, which will then
+bail immediately at the pmd_none() check.
+
+Bug reachability: Reaching this bug requires that you can create
+shmem/file THP mappings - anonymous THP uses different code that doesn't
+zap stuff under rmap locks.  File THP is gated on an experimental config
+flag (CONFIG_READ_ONLY_THP_FOR_FS), so on normal distro kernels you need
+shmem THP to hit this bug.  As far as I know, getting shmem THP normally
+requires that you can mount your own tmpfs with the right mount flags,
+which would require creating your own user+mount namespace; though I don't
+know if some distros maybe enable shmem THP by default or something like
+that.
+
+Bug impact: This issue can likely be used for user->kernel privilege
+escalation when it is reachable.
+
+Link: https://lkml.kernel.org/r/20241007-move_normal_pmd-vs-collapse-fix-2-v1-1-5ead9631f2ea@google.com
+Fixes: 1d65b771bc08 ("mm/khugepaged: retract_page_tables() without mmap or vma lock")
+Signed-off-by: Jann Horn <jannh@google.com>
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Co-developed-by: David Hildenbrand <david@redhat.com>
+Closes: https://project-zero.issues.chromium.org/371047675
+Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>
+Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joel Fernandes <joel@joelfernandes.org>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mremap.c |   11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -238,6 +238,7 @@ static bool move_normal_pmd(struct vm_ar
+ {
+       spinlock_t *old_ptl, *new_ptl;
+       struct mm_struct *mm = vma->vm_mm;
++      bool res = false;
+       pmd_t pmd;
+       if (!arch_supports_page_table_move())
+@@ -277,19 +278,25 @@ static bool move_normal_pmd(struct vm_ar
+       if (new_ptl != old_ptl)
+               spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+-      /* Clear the pmd */
+       pmd = *old_pmd;
++
++      /* Racing with collapse? */
++      if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
++              goto out_unlock;
++      /* Clear the pmd */
+       pmd_clear(old_pmd);
++      res = true;
+       VM_BUG_ON(!pmd_none(*new_pmd));
+       pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
+       flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
++out_unlock:
+       if (new_ptl != old_ptl)
+               spin_unlock(new_ptl);
+       spin_unlock(old_ptl);
+-      return true;
++      return res;
+ }
+ #else
+ static inline bool move_normal_pmd(struct vm_area_struct *vma,
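
The shape of the fix is the classic revalidate-under-lock pattern: state observed before the protecting lock was taken must be re-checked once the lock is held, and the operation abandoned if it changed in the meantime. A generic userspace sketch of that idiom, with hypothetical names (it mirrors the structure of the fix, not the kernel code itself):

  #include <pthread.h>
  #include <stdbool.h>

  struct table {
          pthread_mutex_t lock;
          void *entries;          /* may be detached by another thread */
  };

  /* Returns false if the entries vanished while the lock was not held. */
  static bool move_entries(struct table *t, void **dst)
  {
          bool res = false;

          pthread_mutex_lock(&t->lock);
          /* Revalidate: any observation made before locking may be stale. */
          if (!t->entries)
                  goto out_unlock;

          *dst = t->entries;
          t->entries = NULL;
          res = true;
  out_unlock:
          pthread_mutex_unlock(&t->lock);
          return res;
  }
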
diff --git a/queue-6.11/mm-swapfile-skip-hugetlb-pages-for-unuse_vma.patch b/queue-6.11/mm-swapfile-skip-hugetlb-pages-for-unuse_vma.patch
new file mode 100644 (file)
index 0000000..b47305f
--- /dev/null
@@ -0,0 +1,46 @@
+From 7528c4fb1237512ee18049f852f014eba80bbe8d Mon Sep 17 00:00:00 2001
+From: Liu Shixin <liushixin2@huawei.com>
+Date: Tue, 15 Oct 2024 09:45:21 +0800
+Subject: mm/swapfile: skip HugeTLB pages for unuse_vma
+
+From: Liu Shixin <liushixin2@huawei.com>
+
+commit 7528c4fb1237512ee18049f852f014eba80bbe8d upstream.
+
+I got a bad pud error and lost a 1GB HugeTLB when calling swapoff.  The
+problem can be reproduced by the following steps:
+
+ 1. Allocate an anonymous 1GB HugeTLB and some other anonymous memory.
+ 2. Swapout the above anonymous memory.
+ 3. Run swapoff and we will get a bad pud error in the kernel log:
+
+  mm/pgtable-generic.c:42: bad pud 00000000743d215d(84000001400000e7)
+
+Using ftrace, we can tell that pud_clear_bad() is called by
+pud_none_or_clear_bad() in unuse_pud_range().  The HugeTLB pages will
+therefore never be freed, because they have been lost from the page
+table.  Skip HugeTLB pages for unuse_vma() to fix this.
+
+Link: https://lkml.kernel.org/r/20241015014521.570237-1-liushixin2@huawei.com
+Fixes: 0fe6e20b9c4c ("hugetlb, rmap: add reverse mapping for hugepage")
+Signed-off-by: Liu Shixin <liushixin2@huawei.com>
+Acked-by: Muchun Song <muchun.song@linux.dev>
+Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/swapfile.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -2106,7 +2106,7 @@ static int unuse_mm(struct mm_struct *mm
+       mmap_read_lock(mm);
+       for_each_vma(vmi, vma) {
+-              if (vma->anon_vma) {
++              if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
+                       ret = unuse_vma(vma, type);
+                       if (ret)
+                               break;
diff --git a/queue-6.11/nilfs2-propagate-directory-read-errors-from-nilfs_find_entry.patch b/queue-6.11/nilfs2-propagate-directory-read-errors-from-nilfs_find_entry.patch
new file mode 100644 (file)
index 0000000..da34110
--- /dev/null
@@ -0,0 +1,227 @@
+From 08cfa12adf888db98879dbd735bc741360a34168 Mon Sep 17 00:00:00 2001
+From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Date: Fri, 4 Oct 2024 12:35:31 +0900
+Subject: nilfs2: propagate directory read errors from nilfs_find_entry()
+
+From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+
+commit 08cfa12adf888db98879dbd735bc741360a34168 upstream.
+
+Syzbot reported that a task hang occurs in vcs_open() during a fuzzing
+test for nilfs2.
+
+The root cause of this problem is that nilfs_find_entry(), which
+searches for directory entries, ignores errors when loading a directory
+page/folio via nilfs_get_folio() fails.
+
+If the filesystem image is corrupted, the i_size of the directory
+inode is large, and the directory page/folio is successfully read but
+fails the sanity check (for example when it is zero-filled),
+nilfs_check_folio() may continue to spit out error messages in bursts.
+
+Fix this issue by propagating the error to the callers when loading a
+page/folio fails in nilfs_find_entry().
+
+The current interface of nilfs_find_entry() and its callers is outdated
+and cannot propagate error codes such as -EIO and -ENOMEM returned via
+nilfs_find_entry(), so fix it together.
+
+Link: https://lkml.kernel.org/r/20241004033640.6841-1-konishi.ryusuke@gmail.com
+Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations")
+Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Reported-by: Lizhi Xu <lizhi.xu@windriver.com>
+Closes: https://lkml.kernel.org/r/20240927013806.3577931-1-lizhi.xu@windriver.com
+Reported-by: syzbot+8a192e8d090fa9a31135@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=8a192e8d090fa9a31135
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/nilfs2/dir.c   |   48 +++++++++++++++++++++++++-----------------------
+ fs/nilfs2/namei.c |   39 ++++++++++++++++++++++++++-------------
+ fs/nilfs2/nilfs.h |    2 +-
+ 3 files changed, 52 insertions(+), 37 deletions(-)
+
+--- a/fs/nilfs2/dir.c
++++ b/fs/nilfs2/dir.c
+@@ -323,7 +323,7 @@ static int nilfs_readdir(struct file *fi
+  * The folio is mapped and unlocked.  When the caller is finished with
+  * the entry, it should call folio_release_kmap().
+  *
+- * On failure, returns NULL and the caller should ignore foliop.
++ * On failure, returns an error pointer and the caller should ignore foliop.
+  */
+ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir,
+               const struct qstr *qstr, struct folio **foliop)
+@@ -346,22 +346,24 @@ struct nilfs_dir_entry *nilfs_find_entry
+       do {
+               char *kaddr = nilfs_get_folio(dir, n, foliop);
+-              if (!IS_ERR(kaddr)) {
+-                      de = (struct nilfs_dir_entry *)kaddr;
+-                      kaddr += nilfs_last_byte(dir, n) - reclen;
+-                      while ((char *) de <= kaddr) {
+-                              if (de->rec_len == 0) {
+-                                      nilfs_error(dir->i_sb,
+-                                              "zero-length directory entry");
+-                                      folio_release_kmap(*foliop, kaddr);
+-                                      goto out;
+-                              }
+-                              if (nilfs_match(namelen, name, de))
+-                                      goto found;
+-                              de = nilfs_next_entry(de);
++              if (IS_ERR(kaddr))
++                      return ERR_CAST(kaddr);
++
++              de = (struct nilfs_dir_entry *)kaddr;
++              kaddr += nilfs_last_byte(dir, n) - reclen;
++              while ((char *)de <= kaddr) {
++                      if (de->rec_len == 0) {
++                              nilfs_error(dir->i_sb,
++                                          "zero-length directory entry");
++                              folio_release_kmap(*foliop, kaddr);
++                              goto out;
+                       }
+-                      folio_release_kmap(*foliop, kaddr);
++                      if (nilfs_match(namelen, name, de))
++                              goto found;
++                      de = nilfs_next_entry(de);
+               }
++              folio_release_kmap(*foliop, kaddr);
++
+               if (++n >= npages)
+                       n = 0;
+               /* next folio is past the blocks we've got */
+@@ -374,7 +376,7 @@ struct nilfs_dir_entry *nilfs_find_entry
+               }
+       } while (n != start);
+ out:
+-      return NULL;
++      return ERR_PTR(-ENOENT);
+ found:
+       ei->i_dir_start_lookup = n;
+@@ -418,18 +420,18 @@ fail:
+       return NULL;
+ }
+-ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
++int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino)
+ {
+-      ino_t res = 0;
+       struct nilfs_dir_entry *de;
+       struct folio *folio;
+       de = nilfs_find_entry(dir, qstr, &folio);
+-      if (de) {
+-              res = le64_to_cpu(de->inode);
+-              folio_release_kmap(folio, de);
+-      }
+-      return res;
++      if (IS_ERR(de))
++              return PTR_ERR(de);
++
++      *ino = le64_to_cpu(de->inode);
++      folio_release_kmap(folio, de);
++      return 0;
+ }
+ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
+--- a/fs/nilfs2/namei.c
++++ b/fs/nilfs2/namei.c
+@@ -55,12 +55,20 @@ nilfs_lookup(struct inode *dir, struct d
+ {
+       struct inode *inode;
+       ino_t ino;
++      int res;
+       if (dentry->d_name.len > NILFS_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
+-      ino = nilfs_inode_by_name(dir, &dentry->d_name);
+-      inode = ino ? nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL;
++      res = nilfs_inode_by_name(dir, &dentry->d_name, &ino);
++      if (res) {
++              if (res != -ENOENT)
++                      return ERR_PTR(res);
++              inode = NULL;
++      } else {
++              inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
++      }
++
+       return d_splice_alias(inode, dentry);
+ }
+@@ -263,10 +271,11 @@ static int nilfs_do_unlink(struct inode
+       struct folio *folio;
+       int err;
+-      err = -ENOENT;
+       de = nilfs_find_entry(dir, &dentry->d_name, &folio);
+-      if (!de)
++      if (IS_ERR(de)) {
++              err = PTR_ERR(de);
+               goto out;
++      }
+       inode = d_inode(dentry);
+       err = -EIO;
+@@ -362,10 +371,11 @@ static int nilfs_rename(struct mnt_idmap
+       if (unlikely(err))
+               return err;
+-      err = -ENOENT;
+       old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
+-      if (!old_de)
++      if (IS_ERR(old_de)) {
++              err = PTR_ERR(old_de);
+               goto out;
++      }
+       if (S_ISDIR(old_inode->i_mode)) {
+               err = -EIO;
+@@ -382,10 +392,12 @@ static int nilfs_rename(struct mnt_idmap
+               if (dir_de && !nilfs_empty_dir(new_inode))
+                       goto out_dir;
+-              err = -ENOENT;
+-              new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_folio);
+-              if (!new_de)
++              new_de = nilfs_find_entry(new_dir, &new_dentry->d_name,
++                                        &new_folio);
++              if (IS_ERR(new_de)) {
++                      err = PTR_ERR(new_de);
+                       goto out_dir;
++              }
+               nilfs_set_link(new_dir, new_de, new_folio, old_inode);
+               folio_release_kmap(new_folio, new_de);
+               nilfs_mark_inode_dirty(new_dir);
+@@ -440,12 +452,13 @@ out:
+  */
+ static struct dentry *nilfs_get_parent(struct dentry *child)
+ {
+-      unsigned long ino;
++      ino_t ino;
++      int res;
+       struct nilfs_root *root;
+-      ino = nilfs_inode_by_name(d_inode(child), &dotdot_name);
+-      if (!ino)
+-              return ERR_PTR(-ENOENT);
++      res = nilfs_inode_by_name(d_inode(child), &dotdot_name, &ino);
++      if (res)
++              return ERR_PTR(res);
+       root = NILFS_I(d_inode(child))->i_root;
+--- a/fs/nilfs2/nilfs.h
++++ b/fs/nilfs2/nilfs.h
+@@ -233,7 +233,7 @@ static inline __u32 nilfs_mask_flags(umo
+ /* dir.c */
+ int nilfs_add_link(struct dentry *, struct inode *);
+-ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
++int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino);
+ int nilfs_make_empty(struct inode *, struct inode *);
+ struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *,
+               struct folio **);
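
The conversion leans on the usual <linux/err.h> convention of encoding an error code in a pointer return value, so callers can distinguish "not found" from I/O or memory failures. A minimal sketch of that idiom with hypothetical names (ERR_PTR(), IS_ERR() and PTR_ERR() are the real helpers; lookup_thing() merely stands in for nilfs_find_entry()):

  #include <linux/err.h>
  #include <linux/errno.h>

  struct thing { int key; };
  static struct thing cached;

  static struct thing *lookup_thing(int key)
  {
          if (key < 0)
                  return ERR_PTR(-EIO);      /* propagate the real failure  */
          if (key != cached.key)
                  return ERR_PTR(-ENOENT);   /* "not found" is an error too */
          return &cached;
  }

  static int use_thing(int key)
  {
          struct thing *t = lookup_thing(key);

          if (IS_ERR(t))
                  return PTR_ERR(t);         /* -EIO, -ENOENT, ... */
          /* ... use t ... */
          return 0;
  }
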
diff --git a/queue-6.11/selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch b/queue-6.11/selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch
new file mode 100644 (file)
index 0000000..17f1302
--- /dev/null
@@ -0,0 +1,50 @@
+From e142cc87ac4ec618f2ccf5f68aedcd6e28a59d9d Mon Sep 17 00:00:00 2001
+From: Edward Liaw <edliaw@google.com>
+Date: Thu, 3 Oct 2024 21:17:11 +0000
+Subject: selftests/mm: fix deadlock for fork after pthread_create on ARM
+
+From: Edward Liaw <edliaw@google.com>
+
+commit e142cc87ac4ec618f2ccf5f68aedcd6e28a59d9d upstream.
+
+On Android with arm, there is some synchronization needed to avoid a
+deadlock when forking after pthread_create.
+
+Link: https://lkml.kernel.org/r/20241003211716.371786-3-edliaw@google.com
+Fixes: cff294582798 ("selftests/mm: extend and rename uffd pagemap test")
+Signed-off-by: Edward Liaw <edliaw@google.com>
+Cc: Lokesh Gidra <lokeshgidra@google.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/mm/uffd-unit-tests.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/tools/testing/selftests/mm/uffd-unit-tests.c
++++ b/tools/testing/selftests/mm/uffd-unit-tests.c
+@@ -241,6 +241,9 @@ static void *fork_event_consumer(void *d
+       fork_event_args *args = data;
+       struct uffd_msg msg = { 0 };
++      /* Ready for parent thread to fork */
++      pthread_barrier_wait(&ready_for_fork);
++
+       /* Read until a full msg received */
+       while (uffd_read_msg(args->parent_uffd, &msg));
+@@ -308,8 +311,12 @@ static int pagemap_test_fork(int uffd, b
+       /* Prepare a thread to resolve EVENT_FORK */
+       if (with_event) {
++              pthread_barrier_init(&ready_for_fork, NULL, 2);
+               if (pthread_create(&thread, NULL, fork_event_consumer, &args))
+                       err("pthread_create()");
++              /* Wait for child thread to start before forking */
++              pthread_barrier_wait(&ready_for_fork);
++              pthread_barrier_destroy(&ready_for_fork);
+       }
+       child = fork();
diff --git a/queue-6.11/selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch b/queue-6.11/selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch
new file mode 100644 (file)
index 0000000..29a5e8b
--- /dev/null
@@ -0,0 +1,129 @@
+From e61ef21e27e8deed8c474e9f47f4aa7bc37e138c Mon Sep 17 00:00:00 2001
+From: Edward Liaw <edliaw@google.com>
+Date: Thu, 3 Oct 2024 21:17:10 +0000
+Subject: selftests/mm: replace atomic_bool with pthread_barrier_t
+
+From: Edward Liaw <edliaw@google.com>
+
+commit e61ef21e27e8deed8c474e9f47f4aa7bc37e138c upstream.
+
+Patch series "selftests/mm: fix deadlock after pthread_create".
+
+On Android arm, pthread_create followed by a fork caused a deadlock in the
+case where the fork required work to be completed by the created thread.
+
+Update the synchronization primitive to use pthread_barrier instead of
+atomic_bool.
+
+Apply the same fix to the wp-fork-with-event test.
+
+
+This patch (of 2):
+
+Swap the synchronization primitive for a pthread_barrier, so that stdatomic.h
+does not need to be included.
+
+The synchronization is needed on Android ARM64; we see a deadlock with
+pthread_create when the parent thread races forward before the child has a
+chance to start doing work.
+
+Link: https://lkml.kernel.org/r/20241003211716.371786-1-edliaw@google.com
+Link: https://lkml.kernel.org/r/20241003211716.371786-2-edliaw@google.com
+Fixes: cff294582798 ("selftests/mm: extend and rename uffd pagemap test")
+Signed-off-by: Edward Liaw <edliaw@google.com>
+Cc: Lokesh Gidra <lokeshgidra@google.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/mm/uffd-common.c     |    5 +++--
+ tools/testing/selftests/mm/uffd-common.h     |    3 +--
+ tools/testing/selftests/mm/uffd-unit-tests.c |   14 ++++++++------
+ 3 files changed, 12 insertions(+), 10 deletions(-)
+
+--- a/tools/testing/selftests/mm/uffd-common.c
++++ b/tools/testing/selftests/mm/uffd-common.c
+@@ -18,7 +18,7 @@ bool test_uffdio_wp = true;
+ unsigned long long *count_verify;
+ uffd_test_ops_t *uffd_test_ops;
+ uffd_test_case_ops_t *uffd_test_case_ops;
+-atomic_bool ready_for_fork;
++pthread_barrier_t ready_for_fork;
+ static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
+ {
+@@ -519,7 +519,8 @@ void *uffd_poll_thread(void *arg)
+       pollfd[1].fd = pipefd[cpu*2];
+       pollfd[1].events = POLLIN;
+-      ready_for_fork = true;
++      /* Ready for parent thread to fork */
++      pthread_barrier_wait(&ready_for_fork);
+       for (;;) {
+               ret = poll(pollfd, 2, -1);
+--- a/tools/testing/selftests/mm/uffd-common.h
++++ b/tools/testing/selftests/mm/uffd-common.h
+@@ -33,7 +33,6 @@
+ #include <inttypes.h>
+ #include <stdint.h>
+ #include <sys/random.h>
+-#include <stdatomic.h>
+ #include "../kselftest.h"
+ #include "vm_util.h"
+@@ -105,7 +104,7 @@ extern bool map_shared;
+ extern bool test_uffdio_wp;
+ extern unsigned long long *count_verify;
+ extern volatile bool test_uffdio_copy_eexist;
+-extern atomic_bool ready_for_fork;
++extern pthread_barrier_t ready_for_fork;
+ extern uffd_test_ops_t anon_uffd_test_ops;
+ extern uffd_test_ops_t shmem_uffd_test_ops;
+--- a/tools/testing/selftests/mm/uffd-unit-tests.c
++++ b/tools/testing/selftests/mm/uffd-unit-tests.c
+@@ -774,7 +774,7 @@ static void uffd_sigbus_test_common(bool
+       char c;
+       struct uffd_args args = { 0 };
+-      ready_for_fork = false;
++      pthread_barrier_init(&ready_for_fork, NULL, 2);
+       fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+@@ -791,8 +791,9 @@ static void uffd_sigbus_test_common(bool
+       if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+               err("uffd_poll_thread create");
+-      while (!ready_for_fork)
+-              ; /* Wait for the poll_thread to start executing before forking */
++      /* Wait for child thread to start before forking */
++      pthread_barrier_wait(&ready_for_fork);
++      pthread_barrier_destroy(&ready_for_fork);
+       pid = fork();
+       if (pid < 0)
+@@ -833,7 +834,7 @@ static void uffd_events_test_common(bool
+       char c;
+       struct uffd_args args = { 0 };
+-      ready_for_fork = false;
++      pthread_barrier_init(&ready_for_fork, NULL, 2);
+       fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+       if (uffd_register(uffd, area_dst, nr_pages * page_size,
+@@ -844,8 +845,9 @@ static void uffd_events_test_common(bool
+       if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+               err("uffd_poll_thread create");
+-      while (!ready_for_fork)
+-              ; /* Wait for the poll_thread to start executing before forking */
++      /* Wait for child thread to start before forking */
++      pthread_barrier_wait(&ready_for_fork);
++      pthread_barrier_destroy(&ready_for_fork);
+       pid = fork();
+       if (pid < 0)
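
In isolation, the handshake both selftest patches switch to looks like this: the parent and the newly created thread each call pthread_barrier_wait(), so neither can run past the rendezvous point before the other has arrived, and the fork only happens once the worker is known to be up. A stand-alone sketch (not the selftest code itself):

  #include <pthread.h>
  #include <sys/wait.h>
  #include <unistd.h>

  static pthread_barrier_t ready_for_fork;

  static void *worker(void *arg)
  {
          /* Signal that the thread is up and able to service the fork. */
          pthread_barrier_wait(&ready_for_fork);
          /* ... poll the uffd, resolve events, ... */
          return NULL;
  }

  int main(void)
  {
          pthread_t t;

          pthread_barrier_init(&ready_for_fork, NULL, 2);
          if (pthread_create(&t, NULL, worker, NULL))
                  return 1;

          /* Blocks until the worker reaches its pthread_barrier_wait(). */
          pthread_barrier_wait(&ready_for_fork);
          pthread_barrier_destroy(&ready_for_fork);

          if (fork() == 0)
                  _exit(0);       /* child: the worker is known to be running */

          wait(NULL);
          pthread_join(t, NULL);
          return 0;
  }
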
diff --git a/queue-6.11/series b/queue-6.11/series
index 924c2f32608e0c8778fadc9c34c545bd1c15d12a..2f1b7464c5c280da48a70dffd69d35472ba23dc3 100644 (file)
--- a/queue-6.11/series
@@ -18,3 +18,14 @@ arm64-probes-fix-uprobes-for-big-endian-kernels.patch
 net-macb-avoid-20s-boot-delay-by-skipping-mdio-bus-registration-for-fixed-link-phy.patch
 net-microchip-vcap-api-fix-memory-leaks-in-vcap_api_encode_rule_test.patch
 selftests-mptcp-join-test-for-prohibited-mpc-to-port-based-endp.patch
+maple_tree-correct-tree-corruption-on-spanning-store.patch
+nilfs2-propagate-directory-read-errors-from-nilfs_find_entry.patch
+fat-fix-uninitialized-variable.patch
+lib-alloc_tag_module_unload-must-wait-for-pending-kfree_rcu-calls.patch
+selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch
+selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch
+mm-mremap-fix-move_normal_pmd-retract_page_tables-race.patch
+mm-khugepaged-fix-the-arguments-order-in-khugepaged_collapse_file-trace-point.patch
+mm-mglru-only-clear-kswapd_failures-if-reclaimable.patch
+mm-swapfile-skip-hugetlb-pages-for-unuse_vma.patch
+mm-damon-tests-sysfs-kunit.h-fix-memory-leak-in-damon_sysfs_test_add_targets.patch