git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob - queue-5.15/x86-mm-pat-fix-vm_pat-handling-in-cow-mappings.patch
5.15-stable patches
[thirdparty/kernel/stable-queue.git] / queue-5.15 / x86-mm-pat-fix-vm_pat-handling-in-cow-mappings.patch
1 From 04c35ab3bdae7fefbd7c7a7355f29fa03a035221 Mon Sep 17 00:00:00 2001
2 From: David Hildenbrand <david@redhat.com>
3 Date: Wed, 3 Apr 2024 23:21:30 +0200
4 Subject: x86/mm/pat: fix VM_PAT handling in COW mappings
5
6 From: David Hildenbrand <david@redhat.com>
7
8 commit 04c35ab3bdae7fefbd7c7a7355f29fa03a035221 upstream.
9
10 PAT handling won't do the right thing in COW mappings: the first PTE (or,
11 in fact, all PTEs) can be replaced during write faults to point at anon
12 folios. Reliably recovering the correct PFN and cachemode using
13 follow_phys() from PTEs will not work in COW mappings.
14
15 Using follow_phys(), we might just get the address+protection of the anon
16 folio (which is very wrong), or fail on swap/nonswap entries, failing
17 follow_phys() and triggering a WARN_ON_ONCE() in untrack_pfn() and
18 track_pfn_copy(), not properly calling free_pfn_range().
19
20 In free_pfn_range(), we either wouldn't call memtype_free() or would call
21 it with the wrong range, possibly leaking memory.
22
23 To fix that, let's update follow_phys() to refuse returning anon folios,
24 and fall back to using the stored PFN inside vma->vm_pgoff for COW mappings
25 if we run into that.
26
27 We will now properly handle untrack_pfn() with COW mappings, where we
28 don't need the cachemode. We'll have to fail fork()->track_pfn_copy() if
29 the first page was replaced by an anon folio, though: we'd have to store
30 the cachemode in the VMA to make this work, likely growing the VMA size.
31
32 For now, let's keep it simple and let track_pfn_copy() just fail in that
33 case: it would have failed in the past with swap/nonswap entries already,
34 and it would have done the wrong thing with anon folios.
35
36 Simple reproducer to trigger the WARN_ON_ONCE() in untrack_pfn():
37
38 <--- C reproducer --->
39 #include <stdio.h>
40 #include <sys/mman.h>
41 #include <unistd.h>
42 #include <liburing.h>
43
44 int main(void)
45 {
46 struct io_uring_params p = {};
47 int ring_fd;
48 size_t size;
49 char *map;
50
51 ring_fd = io_uring_setup(1, &p);
52 if (ring_fd < 0) {
53 perror("io_uring_setup");
54 return 1;
55 }
56 size = p.sq_off.array + p.sq_entries * sizeof(unsigned);
57
58 /* Map the submission queue ring MAP_PRIVATE */
59 map = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE,
60 ring_fd, IORING_OFF_SQ_RING);
61 if (map == MAP_FAILED) {
62 perror("mmap");
63 return 1;
64 }
65
66 /* We have at least one page. Let's COW it. */
67 *map = 0;
68 pause();
69 return 0;
70 }
71 <--- C reproducer --->
72
73 On a system with 16 GiB RAM and swap configured:
74 # ./iouring &
75 # memhog 16G
76 # killall iouring
77 [ 301.552930] ------------[ cut here ]------------
78 [ 301.553285] WARNING: CPU: 7 PID: 1402 at arch/x86/mm/pat/memtype.c:1060 untrack_pfn+0xf4/0x100
79 [ 301.553989] Modules linked in: binfmt_misc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_g
80 [ 301.558232] CPU: 7 PID: 1402 Comm: iouring Not tainted 6.7.5-100.fc38.x86_64 #1
81 [ 301.558772] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebu4
82 [ 301.559569] RIP: 0010:untrack_pfn+0xf4/0x100
83 [ 301.559893] Code: 75 c4 eb cf 48 8b 43 10 8b a8 e8 00 00 00 3b 6b 28 74 b8 48 8b 7b 30 e8 ea 1a f7 000
84 [ 301.561189] RSP: 0018:ffffba2c0377fab8 EFLAGS: 00010282
85 [ 301.561590] RAX: 00000000ffffffea RBX: ffff9208c8ce9cc0 RCX: 000000010455e047
86 [ 301.562105] RDX: 07fffffff0eb1e0a RSI: 0000000000000000 RDI: ffff9208c391d200
87 [ 301.562628] RBP: 0000000000000000 R08: ffffba2c0377fab8 R09: 0000000000000000
88 [ 301.563145] R10: ffff9208d2292d50 R11: 0000000000000002 R12: 00007fea890e0000
89 [ 301.563669] R13: 0000000000000000 R14: ffffba2c0377fc08 R15: 0000000000000000
90 [ 301.564186] FS: 0000000000000000(0000) GS:ffff920c2fbc0000(0000) knlGS:0000000000000000
91 [ 301.564773] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
92 [ 301.565197] CR2: 00007fea88ee8a20 CR3: 00000001033a8000 CR4: 0000000000750ef0
93 [ 301.565725] PKRU: 55555554
94 [ 301.565944] Call Trace:
95 [ 301.566148] <TASK>
96 [ 301.566325] ? untrack_pfn+0xf4/0x100
97 [ 301.566618] ? __warn+0x81/0x130
98 [ 301.566876] ? untrack_pfn+0xf4/0x100
99 [ 301.567163] ? report_bug+0x171/0x1a0
100 [ 301.567466] ? handle_bug+0x3c/0x80
101 [ 301.567743] ? exc_invalid_op+0x17/0x70
102 [ 301.568038] ? asm_exc_invalid_op+0x1a/0x20
103 [ 301.568363] ? untrack_pfn+0xf4/0x100
104 [ 301.568660] ? untrack_pfn+0x65/0x100
105 [ 301.568947] unmap_single_vma+0xa6/0xe0
106 [ 301.569247] unmap_vmas+0xb5/0x190
107 [ 301.569532] exit_mmap+0xec/0x340
108 [ 301.569801] __mmput+0x3e/0x130
109 [ 301.570051] do_exit+0x305/0xaf0
110 ...
111
112 Link: https://lkml.kernel.org/r/20240403212131.929421-3-david@redhat.com
113 Signed-off-by: David Hildenbrand <david@redhat.com>
114 Reported-by: Wupeng Ma <mawupeng1@huawei.com>
115 Closes: https://lkml.kernel.org/r/20240227122814.3781907-1-mawupeng1@huawei.com
116 Fixes: b1a86e15dc03 ("x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines")
117 Fixes: 5899329b1910 ("x86: PAT: implement track/untrack of pfnmap regions for x86 - v3")
118 Acked-by: Ingo Molnar <mingo@kernel.org>
119 Cc: Dave Hansen <dave.hansen@linux.intel.com>
120 Cc: Andy Lutomirski <luto@kernel.org>
121 Cc: Peter Zijlstra <peterz@infradead.org>
122 Cc: Thomas Gleixner <tglx@linutronix.de>
123 Cc: Borislav Petkov <bp@alien8.de>
124 Cc: "H. Peter Anvin" <hpa@zytor.com>
125 Cc: <stable@vger.kernel.org>
126 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
127 Signed-off-by: David Hildenbrand <david@redhat.com>
128 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
129 ---
130 arch/x86/mm/pat/memtype.c | 49 ++++++++++++++++++++++++++++++++--------------
131 mm/memory.c | 4 +++
132 2 files changed, 39 insertions(+), 14 deletions(-)
133
134 --- a/arch/x86/mm/pat/memtype.c
135 +++ b/arch/x86/mm/pat/memtype.c
136 @@ -989,6 +989,38 @@ static void free_pfn_range(u64 paddr, un
137 memtype_free(paddr, paddr + size);
138 }
139
140 +static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
141 + pgprot_t *pgprot)
142 +{
143 + unsigned long prot;
144 +
145 + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
146 +
147 + /*
148 + * We need the starting PFN and cachemode used for track_pfn_remap()
149 + * that covered the whole VMA. For most mappings, we can obtain that
150 + * information from the page tables. For COW mappings, we might now
151 + * suddenly have anon folios mapped and follow_phys() will fail.
152 + *
153 + * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
154 + * detect the PFN. If we need the cachemode as well, we're out of luck
155 + * for now and have to fail fork().
156 + */
157 + if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
158 + if (pgprot)
159 + *pgprot = __pgprot(prot);
160 + return 0;
161 + }
162 + if (is_cow_mapping(vma->vm_flags)) {
163 + if (pgprot)
164 + return -EINVAL;
165 + *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
166 + return 0;
167 + }
168 + WARN_ON_ONCE(1);
169 + return -EINVAL;
170 +}
171 +
172 /*
173 * track_pfn_copy is called when vma that is covering the pfnmap gets
174 * copied through copy_page_range().
175 @@ -999,20 +1031,13 @@ static void free_pfn_range(u64 paddr, un
176 int track_pfn_copy(struct vm_area_struct *vma)
177 {
178 resource_size_t paddr;
179 - unsigned long prot;
180 unsigned long vma_size = vma->vm_end - vma->vm_start;
181 pgprot_t pgprot;
182
183 if (vma->vm_flags & VM_PAT) {
184 - /*
185 - * reserve the whole chunk covered by vma. We need the
186 - * starting address and protection from pte.
187 - */
188 - if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
189 - WARN_ON_ONCE(1);
190 + if (get_pat_info(vma, &paddr, &pgprot))
191 return -EINVAL;
192 - }
193 - pgprot = __pgprot(prot);
194 + /* reserve the whole chunk covered by vma. */
195 return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
196 }
197
198 @@ -1087,7 +1112,6 @@ void untrack_pfn(struct vm_area_struct *
199 unsigned long size)
200 {
201 resource_size_t paddr;
202 - unsigned long prot;
203
204 if (vma && !(vma->vm_flags & VM_PAT))
205 return;
206 @@ -1095,11 +1119,8 @@ void untrack_pfn(struct vm_area_struct *
207 /* free the chunk starting from pfn or the whole chunk */
208 paddr = (resource_size_t)pfn << PAGE_SHIFT;
209 if (!paddr && !size) {
210 - if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
211 - WARN_ON_ONCE(1);
212 + if (get_pat_info(vma, &paddr, NULL))
213 return;
214 - }
215 -
216 size = vma->vm_end - vma->vm_start;
217 }
218 free_pfn_range(paddr, size);
219 --- a/mm/memory.c
220 +++ b/mm/memory.c
221 @@ -5086,6 +5086,10 @@ int follow_phys(struct vm_area_struct *v
222 goto out;
223 pte = *ptep;
224
225 + /* Never return PFNs of anon folios in COW mappings. */
226 + if (vm_normal_page(vma, address, pte))
227 + goto unlock;
228 +
229 if ((flags & FOLL_WRITE) && !pte_write(pte))
230 goto unlock;
231