From: Greg Kroah-Hartman
Date: Sun, 8 Sep 2024 12:13:28 +0000 (+0200)
Subject: 6.10-stable patches
X-Git-Tag: v4.19.322~87
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=8a267fdc319392a8f98c23dffb51f0a4ce040093;p=thirdparty%2Fkernel%2Fstable-queue.git

6.10-stable patches

added patches:
	fscache-delete-fscache_cookie_lru_timer-when-fscache-exits-to-avoid-uaf.patch
	userfaultfd-don-t-bug_on-if-khugepaged-yanks-our-page-table.patch
	userfaultfd-fix-checks-for-huge-pmds.patch
---
diff --git a/queue-6.10/fscache-delete-fscache_cookie_lru_timer-when-fscache-exits-to-avoid-uaf.patch b/queue-6.10/fscache-delete-fscache_cookie_lru_timer-when-fscache-exits-to-avoid-uaf.patch
new file mode 100644
index 00000000000..ed6a960f928
--- /dev/null
+++ b/queue-6.10/fscache-delete-fscache_cookie_lru_timer-when-fscache-exits-to-avoid-uaf.patch
@@ -0,0 +1,69 @@
+From 72a6e22c604c95ddb3b10b5d3bb85b6ff4dbc34f Mon Sep 17 00:00:00 2001
+From: Baokun Li
+Date: Mon, 26 Aug 2024 19:20:56 +0800
+Subject: fscache: delete fscache_cookie_lru_timer when fscache exits to avoid UAF
+
+From: Baokun Li
+
+commit 72a6e22c604c95ddb3b10b5d3bb85b6ff4dbc34f upstream.
+
+The fscache_cookie_lru_timer is initialized when the fscache module
+is inserted, but is not deleted when the fscache module is removed.
+If timer_reduce() is called before removing the fscache module,
+the fscache_cookie_lru_timer will be added to the timer list of
+the current cpu. Afterwards, a use-after-free will be triggered
+in the softIRQ after removing the fscache module, as follows:
+
+==================================================================
+BUG: unable to handle page fault for address: fffffbfff803c9e9
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0000) - not-present page
+PGD 21ffea067 P4D 21ffea067 PUD 21ffe6067 PMD 110a7c067 PTE 0
+Oops: Oops: 0000 [#1] PREEMPT SMP KASAN PTI
+CPU: 1 UID: 0 PID: 0 Comm: swapper/1 Tainted: G W 6.11.0-rc3 #855
+Tainted: [W]=WARN
+RIP: 0010:__run_timer_base.part.0+0x254/0x8a0
+Call Trace:
+ <IRQ>
+ tmigr_handle_remote_up+0x627/0x810
+ __walk_groups.isra.0+0x47/0x140
+ tmigr_handle_remote+0x1fa/0x2f0
+ handle_softirqs+0x180/0x590
+ irq_exit_rcu+0x84/0xb0
+ sysvec_apic_timer_interrupt+0x6e/0x90
+ </IRQ>
+ <TASK>
+ asm_sysvec_apic_timer_interrupt+0x1a/0x20
+RIP: 0010:default_idle+0xf/0x20
+ default_idle_call+0x38/0x60
+ do_idle+0x2b5/0x300
+ cpu_startup_entry+0x54/0x60
+ start_secondary+0x20d/0x280
+ common_startup_64+0x13e/0x148
+ </TASK>
+Modules linked in: [last unloaded: netfs]
+==================================================================
+
+Therefore delete fscache_cookie_lru_timer when removing the fscache module.
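As an illustration of the pattern being fixed: a timer owned by an unloadable module has to be paired with a shutdown on the module's exit path. A minimal sketch of that pairing follows (hypothetical module, not the fscache code itself; timer_setup(), timer_reduce() and timer_shutdown_sync() are the real kernel APIs, every other name is made up for illustration):

#include <linux/module.h>
#include <linux/timer.h>

static struct timer_list demo_lru_timer;

static void demo_lru_worker(struct timer_list *t)
{
	/* Runs from the timer softirq; must not outlive the module. */
}

static int __init demo_init(void)
{
	timer_setup(&demo_lru_timer, demo_lru_worker, 0);
	return 0;
}

static void __exit demo_exit(void)
{
	/*
	 * Without this, a timer armed earlier (for example via
	 * timer_reduce()) stays queued on a CPU's timer wheel and can
	 * fire after the module text is gone, which is the
	 * use-after-free described above.
	 */
	timer_shutdown_sync(&demo_lru_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");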
+
+Fixes: 12bb21a29c19 ("fscache: Implement cookie user counting and resource pinning")
+Cc: stable@kernel.org
+Signed-off-by: Baokun Li
+Link: https://lore.kernel.org/r/20240826112056.2458299-1-libaokun@huaweicloud.com
+Acked-by: David Howells
+Signed-off-by: Christian Brauner
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/netfs/fscache_main.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/netfs/fscache_main.c
++++ b/fs/netfs/fscache_main.c
+@@ -103,6 +103,7 @@ void __exit fscache_exit(void)
+ 
+ 	kmem_cache_destroy(fscache_cookie_jar);
+ 	fscache_proc_cleanup();
++	timer_shutdown_sync(&fscache_cookie_lru_timer);
+ 	destroy_workqueue(fscache_wq);
+ 	pr_notice("FS-Cache unloaded\n");
+ }
diff --git a/queue-6.10/series b/queue-6.10/series
index 4cceb06307d..99904d1e95a 100644
--- a/queue-6.10/series
+++ b/queue-6.10/series
@@ -63,3 +63,6 @@ tracing-osnoise-use-a-cpumask-to-know-what-threads-are-kthreads.patch
 tracing-timerlat-only-clear-timer-if-a-kthread-exists.patch
 tracing-avoid-possible-softlockup-in-tracing_iter_reset.patch
 tracing-timerlat-add-interface_lock-around-clearing-of-kthread-in-stop_kthread.patch
+userfaultfd-don-t-bug_on-if-khugepaged-yanks-our-page-table.patch
+userfaultfd-fix-checks-for-huge-pmds.patch
+fscache-delete-fscache_cookie_lru_timer-when-fscache-exits-to-avoid-uaf.patch
diff --git a/queue-6.10/userfaultfd-don-t-bug_on-if-khugepaged-yanks-our-page-table.patch b/queue-6.10/userfaultfd-don-t-bug_on-if-khugepaged-yanks-our-page-table.patch
new file mode 100644
index 00000000000..f293cdcb9c5
--- /dev/null
+++ b/queue-6.10/userfaultfd-don-t-bug_on-if-khugepaged-yanks-our-page-table.patch
@@ -0,0 +1,49 @@
+From 4828d207dc5161dc7ddf9a4f6dcfd80c7dd7d20a Mon Sep 17 00:00:00 2001
+From: Jann Horn
+Date: Tue, 13 Aug 2024 22:25:22 +0200
+Subject: userfaultfd: don't BUG_ON() if khugepaged yanks our page table
+
+From: Jann Horn
+
+commit 4828d207dc5161dc7ddf9a4f6dcfd80c7dd7d20a upstream.
+
+Since khugepaged was changed to allow retracting page tables in file
+mappings without holding the mmap lock, these BUG_ON()s are wrong - get
+rid of them.
+
+We could also remove the preceding "if (unlikely(...))" block, but then we
+could reach pte_offset_map_lock() with transhuge pages not just for file
+mappings but also for anonymous mappings - which would probably be fine
+but I think is not necessarily expected.
+
+Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-2-5efa61078a41@google.com
+Fixes: 1d65b771bc08 ("mm/khugepaged: retract_page_tables() without mmap or vma lock")
+Signed-off-by: Jann Horn
+Reviewed-by: Qi Zheng
+Acked-by: David Hildenbrand
+Cc: Andrea Arcangeli
+Cc: Hugh Dickins
+Cc: Pavel Emelyanov
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/userfaultfd.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/mm/userfaultfd.c
++++ b/mm/userfaultfd.c
+@@ -805,9 +805,10 @@ retry:
+ 			err = -EFAULT;
+ 			break;
+ 		}
+-
+-		BUG_ON(pmd_none(*dst_pmd));
+-		BUG_ON(pmd_trans_huge(*dst_pmd));
++		/*
++		 * For shmem mappings, khugepaged is allowed to remove page
++		 * tables under us; pte_offset_map_lock() will deal with that.
++		 */
+ 
+ 		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
+ 				       src_addr, flags, &folio);
diff --git a/queue-6.10/userfaultfd-fix-checks-for-huge-pmds.patch b/queue-6.10/userfaultfd-fix-checks-for-huge-pmds.patch
new file mode 100644
index 00000000000..cdb6c3ba457
--- /dev/null
+++ b/queue-6.10/userfaultfd-fix-checks-for-huge-pmds.patch
@@ -0,0 +1,142 @@
+From 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 Mon Sep 17 00:00:00 2001
+From: Jann Horn
+Date: Tue, 13 Aug 2024 22:25:21 +0200
+Subject: userfaultfd: fix checks for huge PMDs
+
+From: Jann Horn
+
+commit 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 upstream.
+
+Patch series "userfaultfd: fix races around pmd_trans_huge() check", v2.
+
+The pmd_trans_huge() code in mfill_atomic() is wrong in three different
+ways depending on kernel version:
+
+1. The pmd_trans_huge() check is racy and can lead to a BUG_ON() (if you hit
+   the right two race windows) - I've tested this in a kernel build with
+   some extra mdelay() calls. See the commit message for a description
+   of the race scenario.
+   On older kernels (before 6.5), I think the same bug can even
+   theoretically lead to accessing transhuge page contents as a page table
+   if you hit the right 5 narrow race windows (I haven't tested this case).
+2. As pointed out by Qi Zheng, pmd_trans_huge() is not sufficient for
+   detecting PMDs that don't point to page tables.
+   On older kernels (before 6.5), you'd just have to win a single fairly
+   wide race to hit this.
+   I've tested this on 6.1 stable by racing migration (with a mdelay()
+   patched into try_to_migrate()) against UFFDIO_ZEROPAGE - on my x86
+   VM, that causes a kernel oops in ptlock_ptr().
+3. On newer kernels (>=6.5), for shmem mappings, khugepaged is allowed
+   to yank page tables out from under us (though I haven't tested that),
+   so I think the BUG_ON() checks in mfill_atomic() are just wrong.
+
+I decided to write two separate fixes for these (one fix for bugs 1+2, one
+fix for bug 3), so that the first fix can be backported to kernels
+affected by bugs 1+2.
+
+
+This patch (of 2):
+
+This fixes two issues.
+
+I discovered that the following race can occur:
+
+  mfill_atomic                other thread
+  ============                ============
+
+  pmdp_get_lockless() [reads none pmd]
+
+
+
+    __pte_alloc [no-op]
+
+
+  BUG_ON(pmd_none(*dst_pmd))
+
+I have experimentally verified this in a kernel with extra mdelay() calls;
+the BUG_ON(pmd_none(*dst_pmd)) triggers.
+
+On kernels newer than commit 0d940a9b270b ("mm/pgtable: allow
+pte_offset_map[_lock]() to fail"), this can't lead to anything worse than
+a BUG_ON(), since the page table access helpers are actually designed to
+deal with page tables concurrently disappearing; but on older kernels
+(<=6.4), I think we could probably theoretically race past the two
+BUG_ON() checks and end up treating a hugepage as a page table.
+
+The second issue is that, as Qi Zheng pointed out, there are other types
+of huge PMDs that pmd_trans_huge() can't catch: devmap PMDs and swap PMDs
+(in particular, migration PMDs).
+
+On <=6.4, this is worse than the first issue: If mfill_atomic() runs on a
+PMD that contains a migration entry (which just requires winning a single,
+fairly wide race), it will pass the PMD to pte_offset_map_lock(), which
+assumes that the PMD points to a page table.
+
+Breakage follows: First, the kernel tries to take the PTE lock (which will
+crash or maybe worse if there is no "struct page" for the address bits in
+the migration entry PMD - I think at least on X86 there usually is no
+corresponding "struct page" thanks to the PTE inversion mitigation, amd64
+looks different).
+
+If that didn't crash, the kernel would next try to write a PTE into what
+it wrongly thinks is a page table.
+
+As part of fixing these issues, get rid of the check for pmd_trans_huge()
+before __pte_alloc() - that's redundant, we're going to have to check for
+that after the __pte_alloc() anyway.
+
+Backport note: pmdp_get_lockless() is pmd_read_atomic() in older kernels.
+
+Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-0-5efa61078a41@google.com
+Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-1-5efa61078a41@google.com
+Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
+Signed-off-by: Jann Horn
+Acked-by: David Hildenbrand
+Cc: Andrea Arcangeli
+Cc: Hugh Dickins
+Cc: Jann Horn
+Cc: Pavel Emelyanov
+Cc: Qi Zheng
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/userfaultfd.c | 22 ++++++++++++----------
+ 1 file changed, 12 insertions(+), 10 deletions(-)
+
+--- a/mm/userfaultfd.c
++++ b/mm/userfaultfd.c
+@@ -787,21 +787,23 @@ retry:
+ 		}
+ 
+ 		dst_pmdval = pmdp_get_lockless(dst_pmd);
+-		/*
+-		 * If the dst_pmd is mapped as THP don't
+-		 * override it and just be strict.
+-		 */
+-		if (unlikely(pmd_trans_huge(dst_pmdval))) {
+-			err = -EEXIST;
+-			break;
+-		}
+ 		if (unlikely(pmd_none(dst_pmdval)) &&
+ 		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
+ 			err = -ENOMEM;
+ 			break;
+ 		}
+-		/* If an huge pmd materialized from under us fail */
+-		if (unlikely(pmd_trans_huge(*dst_pmd))) {
++		dst_pmdval = pmdp_get_lockless(dst_pmd);
++		/*
++		 * If the dst_pmd is THP don't override it and just be strict.
++		 * (This includes the case where the PMD used to be THP and
++		 * changed back to none after __pte_alloc().)
++		 */
++		if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
++			     pmd_devmap(dst_pmdval))) {
++			err = -EEXIST;
++			break;
++		}
++		if (unlikely(pmd_bad(dst_pmdval))) {
+ 			err = -EFAULT;
+ 			break;
+ 		}
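To summarize the ordering that the hunk above establishes: the PMD value is re-read after the possible __pte_alloc(), and only a present, non-huge, non-devmap, non-bad PMD is allowed to reach pte_offset_map_lock(). A condensed sketch of that control flow, pulled out of the surrounding mfill_atomic() loop; the wrapper function name is hypothetical, while pmdp_get_lockless(), __pte_alloc() and the pmd_*() helpers are the real ones:

#include <linux/mm.h>
#include <linux/pgtable.h>

/* Condensed sketch of the PMD validation order used by the fix above. */
static int mfill_check_dst_pmd(struct mm_struct *dst_mm, pmd_t *dst_pmd)
{
	pmd_t dst_pmdval = pmdp_get_lockless(dst_pmd);

	/* Allocate a page table if there is none yet. */
	if (unlikely(pmd_none(dst_pmdval)) &&
	    unlikely(__pte_alloc(dst_mm, dst_pmd)))
		return -ENOMEM;

	/*
	 * Re-read the PMD: it may have changed while the page table lock
	 * was not held (THP fault, migration entry, khugepaged, ...).
	 */
	dst_pmdval = pmdp_get_lockless(dst_pmd);

	/* Huge or non-present PMDs must never reach pte_offset_map_lock(). */
	if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
		     pmd_devmap(dst_pmdval)))
		return -EEXIST;

	/* Anything else that is not a sane page table pointer. */
	if (unlikely(pmd_bad(dst_pmdval)))
		return -EFAULT;

	return 0;
}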
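The earlier userfaultfd patch in this queue (the BUG_ON() removal) relies on the property the series description mentions: since commit 0d940a9b270b, pte_offset_map_lock() simply fails when the page table has been removed underneath the caller, for example by khugepaged retracting it. A rough sketch of how a caller handles that; the function name and the -EAGAIN retry convention are illustrative, pte_offset_map_lock() and pte_unmap_unlock() are the real helpers:

#include <linux/mm.h>

/* Illustrative only: coping with a page table that vanished under us. */
static int demo_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
			    unsigned long dst_addr)
{
	spinlock_t *ptl;
	pte_t *dst_pte;

	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		return -EAGAIN;	/* page table is gone; let the caller retry */

	/* ... install or examine the PTE here ... */

	pte_unmap_unlock(dst_pte, ptl);
	return 0;
}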