// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/secretmem.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>

#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

static inline void sanity_check_pinned_pages(struct page **pages,
					     unsigned long npages)
{
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return;

	/*
	 * We only pin anonymous pages if they are exclusive. Once pinned, we
	 * can no longer turn them possibly shared and PageAnonExclusive() will
	 * stick around until the page is freed.
	 *
	 * We'd like to verify that our pinned anonymous pages are still mapped
	 * exclusively. The issue with anon THP is that we don't know how
	 * they are/were mapped when pinning them. However, for anon
	 * THP we can assume that either the given page (PTE-mapped THP) or
	 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
	 * neither is the case, there is certainly something wrong.
	 */
	for (; npages; npages--, pages++) {
		struct page *page = *pages;
		struct folio *folio = page_folio(page);

		if (!folio_test_anon(folio))
			continue;
		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
		else
			/* Either a PTE-mapped or a PMD-mapped THP. */
			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
				       !PageAnonExclusive(page), page);
	}
}

/*
 * Return the folio with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct folio *try_get_folio(struct page *page, int refs)
{
	struct folio *folio;

retry:
	folio = page_folio(page);
	if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
		return NULL;
	if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
		return NULL;

	/*
	 * At this point we have a stable reference to the folio; but it
	 * could be that between calling page_folio() and the refcount
	 * increment, the folio was split, in which case we'd end up
	 * holding a reference on a folio that has nothing to do with the page
	 * we were given anymore.
	 * So now that the folio is stable, recheck that the page still
	 * belongs to this folio.
	 */
	if (unlikely(page_folio(page) != folio)) {
		if (!put_devmap_managed_page_refs(&folio->page, refs))
			folio_put_refs(folio, refs);
		goto retry;
	}

	return folio;
}

/**
 * try_grab_folio() - Attempt to get or pin a folio.
 * @page:  pointer to page to be grabbed
 * @refs:  the value to (effectively) add to the folio's refcount
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: folio's refcount will be incremented by @refs.
 *
 *    FOLL_PIN on large folios: folio's refcount will be incremented by
 *    @refs, and its pincount will be incremented by @refs.
 *
 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
 *    @refs * GUP_PIN_COUNTING_BIAS.
 *
 * Return: The folio containing @page (with refcount appropriately
 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 * a likely bug in the caller, so a warning is also emitted.
 */
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
	struct folio *folio;

	if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
		return NULL;

	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
		return NULL;

	folio = try_get_folio(page, refs);

	if (flags & FOLL_GET)
		return folio;

	/* FOLL_PIN is set */
	if (!folio)
		return NULL;

	/*
	 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
	 * right zone, so fail and let the caller fall back to the slow
	 * path.
	 */
	if (unlikely((flags & FOLL_LONGTERM) &&
		     !folio_is_longterm_pinnable(folio))) {
		if (!put_devmap_managed_page_refs(&folio->page, refs))
			folio_put_refs(folio, refs);
		return NULL;
	}

	/*
	 * When pinning a large folio, use an exact count to track it.
	 *
	 * However, be sure to *also* increment the normal folio
	 * refcount field at least once, so that the folio really
	 * is pinned.  That's why the refcount from the earlier
	 * try_get_folio() is left intact.
	 */
	if (folio_test_large(folio))
		atomic_add(refs, &folio->_pincount);
	else
		folio_ref_add(folio,
				refs * (GUP_PIN_COUNTING_BIAS - 1));
	/*
	 * Adjust the pincount before re-checking the PTE for changes.
	 * This is essentially a smp_mb() and is paired with a memory
	 * barrier in page_try_share_anon_rmap().
	 */
	smp_mb__after_atomic();

	node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);

	return folio;
}
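
/*
 * Illustrative sketch, not part of the original file: roughly how a
 * gup-fast style caller uses try_grab_folio() with refs=1, re-checking the
 * PTE afterwards because no locks are held ("ptep"/"pte" are assumed to
 * come from the surrounding page table walk):
 *
 *	folio = try_grab_folio(page, 1, FOLL_PIN);
 *	if (!folio)
 *		goto pte_unmap;			// fall back to the slow path
 *	if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
 *		gup_put_folio(folio, 1, FOLL_PIN);	// PTE changed under us
 *		goto pte_unmap;
 *	}
 */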

static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
	if (flags & FOLL_PIN) {
		node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
		if (folio_test_large(folio))
			atomic_sub(refs, &folio->_pincount);
		else
			refs *= GUP_PIN_COUNTING_BIAS;
	}

	if (!put_devmap_managed_page_refs(&folio->page, refs))
		folio_put_refs(folio, refs);
}

/**
 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 * @page:  pointer to page to be grabbed
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 * time. Cases: please see the try_grab_folio() documentation, with
 * "refs=1".
 *
 * Return: 0 for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). A negative error code for failure:
 *
 *   -ENOMEM	FOLL_GET or FOLL_PIN was set, but the page could not
 *		be grabbed.
 */
int __must_check try_grab_page(struct page *page, unsigned int flags)
{
	struct folio *folio = page_folio(page);

	if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
		return -ENOMEM;

	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
		return -EREMOTEIO;

	if (flags & FOLL_GET)
		folio_ref_inc(folio);
	else if (flags & FOLL_PIN) {
		/*
		 * Similar to try_grab_folio(): be sure to *also*
		 * increment the normal page refcount field at least once,
		 * so that the page really is pinned.
		 */
		if (folio_test_large(folio)) {
			folio_ref_add(folio, 1);
			atomic_add(1, &folio->_pincount);
		} else {
			folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
		}

		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
	}

	return 0;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:            pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	sanity_check_pinned_pages(&page, 1);
	gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);
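
/*
 * Illustrative pairing, not part of the original file: a caller that pins a
 * single user page for DMA and releases it with unpin_user_page().  "uaddr"
 * and do_dma() are placeholders:
 *
 *	struct page *page;
 *
 *	if (pin_user_pages_fast(uaddr, 1, FOLL_WRITE, &page) != 1)
 *		return -EFAULT;
 *	do_dma(page);			// device writes into the page
 *	unpin_user_page(page);		// never plain put_page() for pins
 */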

static inline struct folio *gup_folio_range_next(struct page *start,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct page *next = nth_page(start, i);
	struct folio *folio = page_folio(next);
	unsigned int nr = 1;

	if (folio_test_large(folio))
		nr = min_t(unsigned int, npages - i,
			   folio_nr_pages(folio) - folio_page_idx(folio, next));

	*ntails = nr;
	return folio;
}

static inline struct folio *gup_folio_next(struct page **list,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct folio *folio = page_folio(list[i]);
	unsigned int nr;

	for (nr = i + 1; nr < npages; nr++) {
		if (page_folio(list[nr]) != folio)
			break;
	}

	*ntails = nr - i;
	return folio;
}

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * page_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
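
/*
 * Illustrative sketch, not part of the original file: a read-into-user-memory
 * style completion path, where the pinned pages were written to and must
 * therefore be dirtied before release ("uaddr", "nr" and "pages" are assumed
 * to be set up by the caller):
 *
 *	npinned = pin_user_pages_fast(uaddr, nr, FOLL_WRITE, pages);
 *	if (npinned <= 0)
 *		return npinned ? npinned : -EFAULT;
 *	... DMA or kernel writes into the pinned pages ...
 *	unpin_user_pages_dirty_lock(pages, npinned, true);
 */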

/**
 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 * gup-pinned page range
 *
 * @page:  the starting page of a range maybe marked dirty, and definitely released.
 * @npages: number of consecutive pages to release.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page range" refers to a range of pages that has had one of the
 * pin_user_pages() variants called on that page.
 *
 * For the page ranges defined by [page .. page+npages], make that range (or
 * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
 * page range was previously listed as clean.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 *
 */
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
				      bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_range_next(page, npages, i, &nr);
		if (make_dirty && !folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);

static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * Don't perform any sanity checks because we might have raced with
	 * fork() and some anonymous pages might now actually be shared --
	 * which is why we're unpinning after all.
	 */
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be marked dirty and released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * If this WARN_ON() fires, then the system *might* be leaking pages (by
	 * leaving them pinned), but probably not. More likely, gup/pup returned
	 * a hard -ERRNO error to the caller, who erroneously passed it here.
	 */
	if (WARN_ON(IS_ERR_VALUE(npages)))
		return;

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages);
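
/*
 * Illustrative error handling, not part of the original file: per the
 * WARN_ON() above, a negative gup/pup return value must never be passed in;
 * unpin only the pages that were actually pinned:
 *
 *	ret = pin_user_pages_fast(uaddr, nr, 0, pages);
 *	if (ret < 0)
 *		return ret;		// nothing pinned, nothing to unpin
 *	...
 *	unpin_user_pages(pages, ret);	// ret may be less than nr
 */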

/*
 * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
 * lifecycle.  Avoid setting the bit unless necessary, or it might cause write
 * cache bouncing on large SMP machines for concurrent pinned gups.
 */
static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
{
	if (!test_bit(MMF_HAS_PINNED, mm_flags))
		set_bit(MMF_HAS_PINNED, mm_flags);
}

#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) &&
			(vma_is_anonymous(vma) || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	if (flags & FOLL_TOUCH) {
		pte_t orig_entry = ptep_get(pte);
		pte_t entry = orig_entry;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(orig_entry, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
static inline bool can_follow_write_pte(pte_t pte, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pte is writable, we can write to the page. */
	if (pte_write(pte))
		return true;

	/* Maybe FOLL_FORCE is set to override it? */
	if (!(flags & FOLL_FORCE))
		return false;

	/* But FOLL_FORCE has no effect on shared mappings */
	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
		return false;

	/* ... or read-only private ones */
	if (!(vma->vm_flags & VM_MAYWRITE))
		return false;

	/* ... or already writable ones that just need to take a write fault */
	if (vma->vm_flags & VM_WRITE)
		return false;

	/*
	 * See can_change_pte_writable(): we broke COW and could map the page
	 * writable if we have an exclusive anonymous page ...
	 */
	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
		return false;

	/* ... and a write-fault isn't required for other reasons. */
	if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
		return false;
	return !userfaultfd_pte_wp(vma, pte);
}
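
/*
 * Worked example, not part of the original file: a ptrace-style
 * FOLL_FORCE|FOLL_WRITE poke into a read-only private mapping of an
 * executable.  Such a VMA typically has VM_MAYWRITE set but VM_WRITE and
 * VM_SHARED clear, so the checks above boil down to: proceed only once COW
 * has been broken and the PTE maps an exclusive anonymous page, and neither
 * soft-dirty nor uffd-wp tracking still requires a write fault.
 */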

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int ret;

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return ERR_PTR(-EINVAL);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!ptep)
		return no_page_table(vma, flags);
	pte = ptep_get(ptep);
	if (!pte_present(pte))
		goto no_page;
	if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
		goto no_page;

	page = vm_normal_page(vma, address, pte);

	/*
	 * We only care about anon pages in can_follow_write_pte() and don't
	 * have to worry about pte_devmap() because they are never anon.
	 */
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pte(pte, page, vma, flags)) {
		page = NULL;
		goto out;
	}

	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
		/*
		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
		 * case since they are only valid while holding the pgmap
		 * reference.
		 */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (*pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
		page = ERR_PTR(-EMLINK);
		goto out;
	}

	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
		       !PageAnonExclusive(page), page);

	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
	ret = try_grab_page(page, flags);
	if (unlikely(ret)) {
		page = ERR_PTR(ret);
		goto out;
	}

	/*
	 * We need to make the page accessible if and only if we are going
	 * to access its content (the FOLL_PIN case).  Please see
	 * Documentation/core-api/pin_user_pages.rst for details.
	 */
	if (flags & FOLL_PIN) {
		ret = arch_make_page_accessible(page);
		if (ret) {
			unpin_user_page(page);
			page = ERR_PTR(ret);
			goto out;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	pmdval = pmdp_get_lockless(pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags);
	if (!pmd_present(pmdval))
		return no_page_table(vma, flags);
	if (pmd_devmap(pmdval)) {
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(pmdval)))
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

	if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
		return no_page_table(vma, flags);

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_present(*pmd))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags);
	}
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	if (flags & FOLL_SPLIT_PMD) {
		spin_unlock(ptl);
		split_huge_pmd(vma, pmd, address);
		/* If pmd was left empty, stuff a page table in there quickly */
		return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	ctx->page_mask = HPAGE_PMD_NR - 1;
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pud_t *pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pud = pud_offset(p4dp, address);
	if (pud_none(*pud))
		return no_page_table(vma, flags);
	if (pud_devmap(*pud)) {
		ptl = pud_lock(mm, pud);
		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (unlikely(pud_bad(*pud)))
		return no_page_table(vma, flags);

	return follow_pmd_mask(vma, address, pud, flags, ctx);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	p4d_t *p4d;

	p4d = p4d_offset(pgdp, address);
	if (p4d_none(*p4d))
		return no_page_table(vma, flags);
	BUILD_BUG_ON(p4d_huge(*p4d));
	if (unlikely(p4d_bad(*p4d)))
		return no_page_table(vma, flags);

	return follow_pud_mask(vma, address, p4d, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * When getting an anonymous page and the caller has to trigger unsharing
 * of a shared anonymous page first, -EMLINK is returned. The caller should
 * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
 * relevant with FOLL_PIN and !FOLL_WRITE.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	ctx->page_mask = 0;

	/*
	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
	 * special hugetlb page table walking code.  This eliminates the
	 * need to check for hugetlb entries in the general walking code.
	 *
	 * hugetlb_follow_page_mask is only for follow_page() handling here.
	 * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
	 */
	if (is_vm_hugetlb_page(vma)) {
		page = hugetlb_follow_page_mask(vma, address, flags);
		if (!page)
			page = no_page_table(vma, flags);
		return page;
	}

	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return no_page_table(vma, flags);

	return follow_p4d_mask(vma, address, pgd, flags, ctx);
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	struct follow_page_context ctx = { NULL };
	struct page *page;

	if (vma_is_secretmem(vma))
		return NULL;

	if (WARN_ON_ONCE(foll_flags & FOLL_PIN))
		return NULL;

	page = follow_page_mask(vma, address, foll_flags, &ctx);
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return page;
}
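
/*
 * Illustrative caller sketch, not part of the original file: follow_page()
 * expects the mmap_lock to be held, and with FOLL_GET the returned page
 * carries a reference the caller must drop ("mm" and "addr" are
 * placeholders):
 *
 *	mmap_read_lock(mm);
 *	vma = vma_lookup(mm, addr);
 *	page = vma ? follow_page(vma, addr, FOLL_GET | FOLL_DUMP) : NULL;
 *	if (!IS_ERR_OR_NULL(page)) {
 *		... inspect the page ...
 *		put_page(page);
 *	}
 *	mmap_read_unlock(mm);
 */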

static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t entry;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	if (address > TASK_SIZE)
		pgd = pgd_offset_k(address);
	else
		pgd = pgd_offset_gate(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	pte = pte_offset_map(pmd, address);
	if (!pte)
		return -EFAULT;
	entry = ptep_get(pte);
	if (pte_none(entry))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, entry);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
			goto unmap;
		*page = pte_page(entry);
	}
	ret = try_grab_page(*page, gup_flags);
	if (unlikely(ret))
		goto unmap;
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_lock must be held on entry.  If @flags has FOLL_UNLOCKABLE but not
 * FOLL_NOWAIT, the mmap_lock may be released.  If it is, *@locked will be set
 * to 0 and -EBUSY returned.
 */
static int faultin_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, bool unshare,
		int *locked)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	if (*flags & FOLL_NOFAULT)
		return -EFAULT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (*flags & FOLL_UNLOCKABLE) {
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
		/*
		 * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
		 * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
		 * That's because some callers may not be prepared to
		 * handle early exits caused by non-fatal signals.
		 */
		if (*flags & FOLL_INTERRUPTIBLE)
			fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
	}
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		/*
		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
		 * can co-exist
		 */
		fault_flags |= FAULT_FLAG_TRIED;
	}
	if (unshare) {
		fault_flags |= FAULT_FLAG_UNSHARE;
		/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
		VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
	}

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
		 * mmap lock in the page fault handler. Sanity check this.
		 */
		WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
		*locked = 0;

		/*
		 * We should do the same as VM_FAULT_RETRY, but let's not
		 * return -EBUSY since that's not reflecting the reality of
		 * what has happened - we've just fully completed a page
		 * fault, with the mmap lock released.  Use -EAGAIN to show
		 * that we want to take the mmap lock _again_.
		 */
		return -EAGAIN;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, *flags);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*locked = 0;
		return -EBUSY;
	}

	return 0;
}

/*
 * Writing to file-backed mappings which require folio dirty tracking using GUP
 * is a fundamentally broken operation, as kernel write access to GUP mappings
 * does not adhere to the semantics expected by a file system.
 *
 * Consider the following scenario:-
 *
 * 1. A folio is written to via GUP which write-faults the memory, notifying
 *    the file system and dirtying the folio.
 * 2. Later, writeback is triggered, resulting in the folio being cleaned and
 *    the PTE being marked read-only.
 * 3. The GUP caller writes to the folio, as it is mapped read/write via the
 *    direct mapping.
 * 4. The GUP caller, now done with the page, unpins it and sets it dirty
 *    (though it does not have to).
 *
 * This results in both data being written to a folio without writenotify, and
 * the folio being dirtied unexpectedly (if the caller decides to do so).
 */
static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
					  unsigned long gup_flags)
{
	/*
	 * If we aren't pinning then no problematic write can occur. A long term
	 * pin is the most egregious case so this is the case we disallow.
	 */
	if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
	    (FOLL_PIN | FOLL_LONGTERM))
		return true;

	/*
	 * If the VMA does not require dirty tracking then no problematic write
	 * can occur either.
	 */
	return !vma_needs_dirty_tracking(vma);
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);
	bool vma_anon = vma_is_anonymous(vma);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if ((gup_flags & FOLL_ANON) && !vma_anon)
		return -EFAULT;

	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
		return -EOPNOTSUPP;

	if (vma_is_secretmem(vma))
		return -EFAULT;

	if (write) {
		if (!vma_anon &&
		    !writable_file_mapping_allowed(vma, gup_flags))
			return -EFAULT;

		if (!(vm_flags & VM_WRITE)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
			if (is_vm_hugetlb_page(vma))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @locked:     whether we're still with the mmap_lock held
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 * -- 0 return value is possible when the fault would need to be retried.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * Must be called with mmap_lock held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
 * be released. If this happens *@locked will be set to 0 on return.
 *
 * A caller using such a combination of @gup_flags must therefore hold the
 * mmap_lock for reading only, and recognize when it's been released. Otherwise,
 * it must be held for either reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		int *locked)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	struct follow_page_context ctx = { NULL };

	if (!nr_pages)
		return 0;

	start = untagged_addr_remote(mm, start);

	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

	do {
		struct page *page;
		unsigned int foll_flags = gup_flags;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			vma = find_extend_vma(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &pages[i] : NULL);
				if (ret)
					goto out;
				ctx.page_mask = 0;
				goto next_page;
			}

			if (!vma) {
				ret = -EFAULT;
				goto out;
			}
			ret = check_vma_flags(vma, gup_flags);
			if (ret)
				goto out;

			if (is_vm_hugetlb_page(vma)) {
				i = follow_hugetlb_page(mm, vma, pages,
							&start, &nr_pages, i,
							gup_flags, locked);
				if (!*locked) {
					/*
					 * We've got a VM_FAULT_RETRY
					 * and we've lost mmap_lock.
					 * We must stop here.
					 */
					BUG_ON(gup_flags & FOLL_NOWAIT);
					goto out;
				}
				continue;
			}
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, foll_flags, &ctx);
		if (!page || PTR_ERR(page) == -EMLINK) {
			ret = faultin_page(vma, start, &foll_flags,
					   PTR_ERR(page) == -EMLINK, locked);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
			case -EAGAIN:
				ret = 0;
				fallthrough;
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page. If the caller expects **pages to be
			 * filled in, bail out now, because that can't be done
			 * for this page.
			 */
			if (pages) {
				ret = PTR_ERR(page);
				goto out;
			}

			goto next_page;
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
		if (pages) {
			pages[i] = page;
			flush_anon_page(vma, page, start);
			flush_dcache_page(page);
			ctx.page_mask = 0;
		}
next_page:
		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;
		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return i ? i : ret;
}
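
/*
 * Illustrative sketch, not part of the original file, of the "may return
 * fewer pages than requested" contract documented above, using the public
 * get_user_pages_fast() wrapper rather than this internal helper:
 *
 *	got = get_user_pages_fast(uaddr, nr, FOLL_WRITE, pages);
 *	if (got < 0)
 *		return got;			// nothing was pinned
 *	if (got < nr) {				// partial success
 *		while (got--)
 *			put_page(pages[got]);	// release and bail out (or retry)
 *		return -EFAULT;
 *	}
 */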

static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

/**
 * fixup_user_fault() - manually resolve a user page fault
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 * @unlocked:	did we unlock the mmap_lock while retrying, maybe NULL if caller
 *		does not allow retry. If NULL, the caller must guarantee
 *		that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_lock. So it has not the
 * same semantics wrt the @mm->mmap_lock as does filemap_fault().
 */
int fixup_user_fault(struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags,
		     bool *unlocked)
{
	struct vm_area_struct *vma;
	vm_fault_t ret;

	address = untagged_addr_remote(mm, address);

	if (unlocked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

retry:
	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start)
		return -EFAULT;

	if (!vma_permits_fault(vma, fault_flags))
		return -EFAULT;

	if ((fault_flags & FAULT_FLAG_KILLABLE) &&
	    fatal_signal_pending(current))
		return -EINTR;

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * NOTE: it's a pity that we need to retake the lock here
		 * to pair with the unlock() in the callers. Ideally we
		 * could tell the callers so they do not need to unlock.
		 */
		mmap_read_lock(mm);
		*unlocked = true;
		return 0;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, 0);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		mmap_read_lock(mm);
		*unlocked = true;
		fault_flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
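
/*
 * Illustrative sketch, not part of the original file, modelled on the futex
 * usage mentioned above: an atomic user access failed under
 * pagefault_disable(), so the fault is resolved manually and the access is
 * retried by the caller ("uaddr" is a placeholder):
 *
 *	mmap_read_lock(mm);
 *	ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE,
 *			       NULL);
 *	mmap_read_unlock(mm);
 *	return ret < 0 ? ret : 0;
 */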

/*
 * GUP always responds to fatal signals.  When FOLL_INTERRUPTIBLE is
 * specified, it'll also respond to generic signals.  The caller of GUP
 * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
 */
static bool gup_signal_pending(unsigned int flags)
{
	if (fatal_signal_pending(current))
		return true;

	if (!(flags & FOLL_INTERRUPTIBLE))
		return false;

	return signal_pending(current);
}

/*
 * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
 * the caller. This function may drop the mmap_lock. If it does so, then it will
 * set (*locked = 0).
 *
 * (*locked == 0) means that the caller expects this function to acquire and
 * drop the mmap_lock. Therefore, the value of *locked will still be zero when
 * the function returns, even though it may have changed temporarily during
 * function execution.
 *
 * Please note that this function, unlike __get_user_pages(), will not return 0
 * for nr_pages > 0, unless FOLL_NOWAIT is used.
 */
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
						unsigned long start,
						unsigned long nr_pages,
						struct page **pages,
						int *locked,
						unsigned int flags)
{
	long ret, pages_done;
	bool must_unlock = false;

	/*
	 * The internal caller expects GUP to manage the lock internally and the
	 * lock must be released when this returns.
	 */
	if (!*locked) {
		if (mmap_read_lock_killable(mm))
			return -EAGAIN;
		must_unlock = true;
		*locked = 1;
	}
	else
		mmap_assert_locked(mm);

	if (flags & FOLL_PIN)
		mm_set_has_pinned_flag(&mm->flags);

	/*
	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
	 * carelessly failed to specify FOLL_GET), so keep doing that, but only
	 * for FOLL_GET, not for the newer FOLL_PIN.
	 *
	 * FOLL_PIN always expects pages to be non-null, but no need to assert
	 * that here, as any failures will be obvious enough.
	 */
	if (pages && !(flags & FOLL_PIN))
		flags |= FOLL_GET;

	pages_done = 0;
	for (;;) {
		ret = __get_user_pages(mm, start, nr_pages, flags, pages,
				       locked);
		if (!(flags & FOLL_UNLOCKABLE)) {
			/* VM_FAULT_RETRY couldn't trigger, bypass */
			pages_done = ret;
			break;
		}

		/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
		if (!*locked) {
			BUG_ON(ret < 0);
			BUG_ON(ret >= nr_pages);
		}

		if (ret > 0) {
			nr_pages -= ret;
			pages_done += ret;
			if (!nr_pages)
				break;
		}
		if (*locked) {
			/*
			 * VM_FAULT_RETRY didn't trigger or it was a
			 * FOLL_NOWAIT.
			 */
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/*
		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
		 * For the prefault case (!pages) we only update counts.
		 */
		if (likely(pages))
			pages += ret;
		start += ret << PAGE_SHIFT;

		/* The lock was temporarily dropped, so we must unlock later */
		must_unlock = true;

retry:
		/*
		 * Repeat on the address that fired VM_FAULT_RETRY
		 * with both FAULT_FLAG_ALLOW_RETRY and
		 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
		 * by fatal signals of even common signals, depending on
		 * the caller's request. So we need to check it before we
		 * start trying again otherwise it can loop forever.
		 */
		if (gup_signal_pending(flags)) {
			if (!pages_done)
				pages_done = -EINTR;
			break;
		}

		ret = mmap_read_lock_killable(mm);
		if (ret) {
			BUG_ON(ret > 0);
			if (!pages_done)
				pages_done = ret;
			break;
		}

		*locked = 1;
		ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
				       pages, locked);
		if (!*locked) {
			/* Continue to retry until we succeeded */
			BUG_ON(ret != 0);
			goto retry;
		}
		if (ret != 1) {
			BUG_ON(ret > 1);
			if (!pages_done)
				pages_done = ret;
			break;
		}
		nr_pages--;
		pages_done++;
		if (!nr_pages)
			break;
		if (likely(pages))
			pages++;
		start += PAGE_SIZE;
	}
	if (must_unlock && *locked) {
		/*
		 * We either temporarily dropped the lock, or the caller
		 * requested that we both acquire and drop the lock. Either way,
		 * we must now unlock, and notify the caller of that state.
		 */
		mmap_read_unlock(mm);
		*locked = 0;
	}
	return pages_done;
}
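
/*
 * Illustrative sketch, not part of the original file, of the two *locked
 * conventions described above, as seen from an internal caller:
 *
 *	int locked = 0;		// GUP acquires and drops mmap_lock itself
 *	ret = __get_user_pages_locked(mm, start, nr, pages, &locked, flags);
 *
 *	int locked = 1;		// caller already holds mmap_lock for reading
 *	mmap_read_lock(mm);
 *	ret = __get_user_pages_locked(mm, start, nr, pages, &locked, flags);
 *	if (locked)		// it may have been dropped internally
 *		mmap_read_unlock(mm);
 */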

/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @locked: whether the mmap_lock is still held
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * Return either number of pages pinned in the vma, or a negative error
 * code on error.
 *
 * vma->vm_mm->mmap_lock must be held.
 *
 * If @locked is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @locked is non-NULL, it must be held for read only and may be
 * released.  If it's released, *@locked will be set to 0.
 */
long populate_vma_page_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, int *locked)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int local_locked = 1;
	int gup_flags;
	long ret;

	VM_BUG_ON(!PAGE_ALIGNED(start));
	VM_BUG_ON(!PAGE_ALIGNED(end));
	VM_BUG_ON_VMA(start < vma->vm_start, vma);
	VM_BUG_ON_VMA(end > vma->vm_end, vma);
	mmap_assert_locked(mm);

	/*
	 * Rightly or wrongly, the VM_LOCKONFAULT case has never used
	 * faultin_page() to break COW, so it has no work to do here.
	 */
	if (vma->vm_flags & VM_LOCKONFAULT)
		return nr_pages;

	gup_flags = FOLL_TOUCH;
	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't COW
	 * and we would not want to dirty them for nothing.
	 */
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;

	/*
	 * We want mlock to succeed for regions that have any permissions
	 * other than PROT_NONE.
	 */
	if (vma_is_accessible(vma))
		gup_flags |= FOLL_FORCE;

	if (locked)
		gup_flags |= FOLL_UNLOCKABLE;

	/*
	 * We made sure addr is within a VMA, so the following will
	 * not result in a stack expansion that recurses back here.
	 */
	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
			       NULL, locked ? locked : &local_locked);
	lru_add_drain();
	return ret;
}
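
/*
 * Illustrative per-VMA usage, not part of the original file, mirroring what
 * __mm_populate() below does for an mlock()ed or MAP_POPULATE'd range:
 *
 *	int locked = 1;
 *
 *	mmap_read_lock(mm);
 *	ret = populate_vma_page_range(vma, vma->vm_start, vma->vm_end, &locked);
 *	if (locked)
 *		mmap_read_unlock(mm);
 */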
1571
4ca9b385
DH
1572/*
1573 * faultin_vma_page_range() - populate (prefault) page tables inside the
1574 * given VMA range readable/writable
1575 *
1576 * This takes care of mlocking the pages, too, if VM_LOCKED is set.
1577 *
1578 * @vma: target vma
1579 * @start: start address
1580 * @end: end address
1581 * @write: whether to prefault readable or writable
1582 * @locked: whether the mmap_lock is still held
1583 *
1584 * Returns either number of processed pages in the vma, or a negative error
1585 * code on error (see __get_user_pages()).
1586 *
1587 * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
6e4382c7 1588 * covered by the VMA. If it's released, *@locked will be set to 0.
4ca9b385
DH
1589 */
1590long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
1591 unsigned long end, bool write, int *locked)
1592{
1593 struct mm_struct *mm = vma->vm_mm;
1594 unsigned long nr_pages = (end - start) / PAGE_SIZE;
1595 int gup_flags;
ece369c7 1596 long ret;
4ca9b385
DH
1597
1598 VM_BUG_ON(!PAGE_ALIGNED(start));
1599 VM_BUG_ON(!PAGE_ALIGNED(end));
1600 VM_BUG_ON_VMA(start < vma->vm_start, vma);
1601 VM_BUG_ON_VMA(end > vma->vm_end, vma);
1602 mmap_assert_locked(mm);
1603
1604 /*
1605 * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
1606 * the page dirty with FOLL_WRITE -- which doesn't make a
1607 * difference with !FOLL_FORCE, because the page is writable
1608 * in the page table.
1609 * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
1610 * a poisoned page.
4ca9b385
DH
1611 * !FOLL_FORCE: Require proper access permissions.
1612 */
f04740f5 1613 gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
4ca9b385
DH
1614 if (write)
1615 gup_flags |= FOLL_WRITE;
1616
1617 /*
eb2faa51
DH
1618 * We want to report -EINVAL instead of -EFAULT for any permission
1619 * problems or incompatible mappings.
4ca9b385 1620 */
eb2faa51
DH
1621 if (check_vma_flags(vma, gup_flags))
1622 return -EINVAL;
1623
ece369c7 1624 ret = __get_user_pages(mm, start, nr_pages, gup_flags,
b2cac248 1625 NULL, locked);
ece369c7
HD
1626 lru_add_drain();
1627 return ret;
4ca9b385
DH
1628}
1629
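/*
 * Editor's sketch -- not part of gup.c: prefaulting a page-aligned,
 * VMA-covered range writable, in the spirit of MADV_POPULATE_WRITE.
 * example_prefault_range() is hypothetical; the faultin_vma_page_range()
 * call follows the contract documented above.
 */
static long __maybe_unused example_prefault_range(struct vm_area_struct *vma,
						  unsigned long start,
						  unsigned long end)
{
	int locked = 1;
	long ret;

	mmap_assert_locked(vma->vm_mm);
	ret = faultin_vma_page_range(vma, start, end, true, &locked);
	if (!locked)		/* mmap_lock was dropped; caller must revalidate */
		return -EAGAIN;
	return ret;
}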
d3649f68
CH
1630/*
1631 * __mm_populate - populate and/or mlock pages within a range of address space.
1632 *
1633 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1634 * flags. VMAs must be already marked with the desired vm_flags, and
c1e8d7c6 1635 * mmap_lock must not be held.
d3649f68
CH
1636 */
1637int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1638{
1639 struct mm_struct *mm = current->mm;
1640 unsigned long end, nstart, nend;
1641 struct vm_area_struct *vma = NULL;
1642 int locked = 0;
1643 long ret = 0;
1644
1645 end = start + len;
1646
1647 for (nstart = start; nstart < end; nstart = nend) {
1648 /*
1649 * We want to fault in pages for [nstart; end) address range.
1650 * Find first corresponding VMA.
1651 */
1652 if (!locked) {
1653 locked = 1;
d8ed45c5 1654 mmap_read_lock(mm);
c4d1a92d 1655 vma = find_vma_intersection(mm, nstart, end);
d3649f68 1656 } else if (nstart >= vma->vm_end)
c4d1a92d
LH
1657 vma = find_vma_intersection(mm, vma->vm_end, end);
1658
1659 if (!vma)
d3649f68
CH
1660 break;
1661 /*
1662 * Set [nstart; nend) to intersection of desired address
1663 * range with the first VMA. Also, skip undesirable VMA types.
1664 */
1665 nend = min(end, vma->vm_end);
1666 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1667 continue;
1668 if (nstart < vma->vm_start)
1669 nstart = vma->vm_start;
1670 /*
1671 * Now fault in a range of pages. populate_vma_page_range()
1672 * double checks the vma flags, so that it won't mlock pages
1673 * if the vma was already munlocked.
1674 */
1675 ret = populate_vma_page_range(vma, nstart, nend, &locked);
1676 if (ret < 0) {
1677 if (ignore_errors) {
1678 ret = 0;
1679 continue; /* continue at next VMA */
1680 }
1681 break;
1682 }
1683 nend = nstart + ret * PAGE_SIZE;
1684 ret = 0;
1685 }
1686 if (locked)
d8ed45c5 1687 mmap_read_unlock(mm);
d3649f68
CH
1688 return ret; /* 0 or negative error code */
1689}
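/*
 * Editor's sketch -- mirrors the small mm_populate() wrapper in
 * include/linux/mm.h rather than adding anything new: this is how the
 * mmap(MAP_POPULATE) path drives __mm_populate() with errors ignored.
 */
static inline void example_mm_populate(unsigned long addr, unsigned long len)
{
	/* Ignore errors, as MAP_POPULATE does. */
	(void)__mm_populate(addr, len, 1);
}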
050a9adc 1690#else /* CONFIG_MMU */
64019a2e 1691static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
050a9adc 1692 unsigned long nr_pages, struct page **pages,
b2cac248 1693 int *locked, unsigned int foll_flags)
050a9adc
CH
1694{
1695 struct vm_area_struct *vma;
b2a72dff 1696 bool must_unlock = false;
050a9adc 1697 unsigned long vm_flags;
24dc20c7 1698 long i;
050a9adc 1699
b2a72dff
JG
1700 if (!nr_pages)
1701 return 0;
1702
1703 /*
1704 * The internal caller expects GUP to manage the lock internally and the
1705 * lock must be released when this returns.
1706 */
9a863a6a 1707 if (!*locked) {
b2a72dff
JG
1708 if (mmap_read_lock_killable(mm))
1709 return -EAGAIN;
1710 must_unlock = true;
1711 *locked = 1;
1712 }
1713
050a9adc
CH
1714 /* calculate required read or write permissions.
1715 * If FOLL_FORCE is set, we only require the "MAY" flags.
1716 */
1717 vm_flags = (foll_flags & FOLL_WRITE) ?
1718 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1719 vm_flags &= (foll_flags & FOLL_FORCE) ?
1720 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1721
1722 for (i = 0; i < nr_pages; i++) {
1723 vma = find_vma(mm, start);
1724 if (!vma)
b2a72dff 1725 break;
050a9adc
CH
1726
1727 /* protect what we can, including chardevs */
1728 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1729 !(vm_flags & vma->vm_flags))
b2a72dff 1730 break;
050a9adc
CH
1731
1732 if (pages) {
396a400b 1733 pages[i] = virt_to_page((void *)start);
050a9adc
CH
1734 if (pages[i])
1735 get_page(pages[i]);
1736 }
b2cac248 1737
050a9adc
CH
1738 start = (start + PAGE_SIZE) & PAGE_MASK;
1739 }
1740
b2a72dff
JG
1741 if (must_unlock && *locked) {
1742 mmap_read_unlock(mm);
1743 *locked = 0;
1744 }
050a9adc 1745
050a9adc
CH
1746 return i ? : -EFAULT;
1747}
1748#endif /* !CONFIG_MMU */
d3649f68 1749
bb523b40
AG
1750/**
1751 * fault_in_writeable - fault in userspace address range for writing
1752 * @uaddr: start of address range
1753 * @size: size of address range
1754 *
1755 * Returns the number of bytes not faulted in (like copy_to_user() and
1756 * copy_from_user()).
1757 */
1758size_t fault_in_writeable(char __user *uaddr, size_t size)
1759{
1760 char __user *start = uaddr, *end;
1761
1762 if (unlikely(size == 0))
1763 return 0;
677b2a8c
CL
1764 if (!user_write_access_begin(uaddr, size))
1765 return size;
bb523b40 1766 if (!PAGE_ALIGNED(uaddr)) {
677b2a8c 1767 unsafe_put_user(0, uaddr, out);
bb523b40
AG
1768 uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
1769 }
1770 end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
1771 if (unlikely(end < start))
1772 end = NULL;
1773 while (uaddr != end) {
677b2a8c 1774 unsafe_put_user(0, uaddr, out);
bb523b40
AG
1775 uaddr += PAGE_SIZE;
1776 }
1777
1778out:
677b2a8c 1779 user_write_access_end();
bb523b40
AG
1780 if (size > uaddr - start)
1781 return size - (uaddr - start);
1782 return 0;
1783}
1784EXPORT_SYMBOL(fault_in_writeable);
1785
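/*
 * Editor's sketch -- not part of gup.c: the usual retry pattern around
 * fault_in_writeable().  Real users (e.g. filesystems that copy to user
 * space with page faults disabled or while holding locks) track partial
 * progress more carefully; example_copy_to_user() is hypothetical.
 */
static ssize_t __maybe_unused example_copy_to_user(char __user *ubuf,
						   const void *kbuf, size_t len)
{
	size_t left;

	do {
		pagefault_disable();
		left = copy_to_user(ubuf, kbuf, len);
		pagefault_enable();
		if (!left)
			return len;
		/*
		 * fault_in_writeable() returns the number of bytes *not*
		 * faulted in; give up once it makes no progress at all.
		 */
	} while (fault_in_writeable(ubuf, len) != len);

	return -EFAULT;
}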
da32b581
CM
1786/**
1787 * fault_in_subpage_writeable - fault in an address range for writing
1788 * @uaddr: start of address range
1789 * @size: size of address range
1790 *
1791 * Fault in a user address range for writing while checking for permissions at
1792 * sub-page granularity (e.g. arm64 MTE). This function should be used when
1793 * the caller cannot guarantee forward progress of a copy_to_user() loop.
1794 *
1795 * Returns the number of bytes not faulted in (like copy_to_user() and
1796 * copy_from_user()).
1797 */
1798size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
1799{
1800 size_t faulted_in;
1801
1802 /*
1803 * Attempt faulting in at page granularity first for page table
1804 * permission checking. The arch-specific probe_subpage_writeable()
1805 * functions may not check for this.
1806 */
1807 faulted_in = size - fault_in_writeable(uaddr, size);
1808 if (faulted_in)
1809 faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
1810
1811 return size - faulted_in;
1812}
1813EXPORT_SYMBOL(fault_in_subpage_writeable);
1814
cdd591fc
AG
1815/*
1816 * fault_in_safe_writeable - fault in an address range for writing
1817 * @uaddr: start of address range
1818 * @size: length of address range
1819 *
fe673d3f
LT
1820 * Faults in an address range for writing. This is primarily useful when we
1821 * already know that some or all of the pages in the address range aren't in
1822 * memory.
cdd591fc 1823 *
fe673d3f 1824 * Unlike fault_in_writeable(), this function is non-destructive.
cdd591fc
AG
1825 *
1826 * Note that we don't pin or otherwise hold the pages referenced that we fault
1827 * in. There's no guarantee that they'll stay in memory for any duration of
1828 * time.
1829 *
1830 * Returns the number of bytes not faulted in, like copy_to_user() and
1831 * copy_from_user().
1832 */
1833size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
1834{
fe673d3f 1835 unsigned long start = (unsigned long)uaddr, end;
cdd591fc 1836 struct mm_struct *mm = current->mm;
fe673d3f 1837 bool unlocked = false;
cdd591fc 1838
fe673d3f
LT
1839 if (unlikely(size == 0))
1840 return 0;
cdd591fc 1841 end = PAGE_ALIGN(start + size);
fe673d3f 1842 if (end < start)
cdd591fc 1843 end = 0;
cdd591fc 1844
fe673d3f
LT
1845 mmap_read_lock(mm);
1846 do {
1847 if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
cdd591fc 1848 break;
fe673d3f
LT
1849 start = (start + PAGE_SIZE) & PAGE_MASK;
1850 } while (start != end);
1851 mmap_read_unlock(mm);
1852
1853 if (size > (unsigned long)uaddr - start)
1854 return size - ((unsigned long)uaddr - start);
1855 return 0;
cdd591fc
AG
1856}
1857EXPORT_SYMBOL(fault_in_safe_writeable);
1858
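/*
 * Editor's sketch -- not part of gup.c: fault_in_safe_writeable() is the
 * prefault helper to use when the destination must not be clobbered with
 * zeroes, e.g. before retrying a read into a user buffer.  The wrapper is
 * hypothetical.
 */
static int __maybe_unused example_prefault_read_buffer(char __user *ubuf,
							size_t len)
{
	/* Returns the number of bytes that could not be faulted in. */
	if (fault_in_safe_writeable(ubuf, len))
		return -EFAULT;
	return 0;
}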
bb523b40
AG
1859/**
1860 * fault_in_readable - fault in userspace address range for reading
1861 * @uaddr: start of user address range
1862 * @size: size of user address range
1863 *
1864 * Returns the number of bytes not faulted in (like copy_to_user() and
1865 * copy_from_user()).
1866 */
1867size_t fault_in_readable(const char __user *uaddr, size_t size)
1868{
1869 const char __user *start = uaddr, *end;
1870 volatile char c;
1871
1872 if (unlikely(size == 0))
1873 return 0;
677b2a8c
CL
1874 if (!user_read_access_begin(uaddr, size))
1875 return size;
bb523b40 1876 if (!PAGE_ALIGNED(uaddr)) {
677b2a8c 1877 unsafe_get_user(c, uaddr, out);
bb523b40
AG
1878 uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
1879 }
1880 end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
1881 if (unlikely(end < start))
1882 end = NULL;
1883 while (uaddr != end) {
677b2a8c 1884 unsafe_get_user(c, uaddr, out);
bb523b40
AG
1885 uaddr += PAGE_SIZE;
1886 }
1887
1888out:
677b2a8c 1889 user_read_access_end();
bb523b40
AG
1890 (void)c;
1891 if (size > uaddr - start)
1892 return size - (uaddr - start);
1893 return 0;
1894}
1895EXPORT_SYMBOL(fault_in_readable);
1896
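/*
 * Editor's sketch -- not part of gup.c: prefaulting the source of a write
 * before taking locks that forbid page faults.  example_write_prepare() is
 * hypothetical; only the "returns bytes not faulted in" semantics come from
 * fault_in_readable() above.
 */
static int __maybe_unused example_write_prepare(const char __user *ubuf,
						size_t len)
{
	if (fault_in_readable(ubuf, len) == len)
		return -EFAULT;		/* nothing could be faulted in */
	return 0;
}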
8f942eea
JH
1897/**
1898 * get_dump_page() - pin user page in memory while writing it to core dump
1899 * @addr: user address
1900 *
1901 * Returns struct page pointer of user page pinned for dump,
1902 * to be freed afterwards by put_page().
1903 *
1904 * Returns NULL on any kind of failure - a hole must then be inserted into
1905 * the corefile, to preserve alignment with its headers; and also returns
1906 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
f0953a1b 1907 * allowing a hole to be left in the corefile to save disk space.
8f942eea 1908 *
7f3bfab5 1909 * Called without mmap_lock (takes and releases the mmap_lock by itself).
8f942eea
JH
1910 */
1911#ifdef CONFIG_ELF_CORE
1912struct page *get_dump_page(unsigned long addr)
1913{
8f942eea 1914 struct page *page;
b2a72dff 1915 int locked = 0;
7f3bfab5 1916 int ret;
8f942eea 1917
b2cac248 1918 ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
7f3bfab5 1919 FOLL_FORCE | FOLL_DUMP | FOLL_GET);
7f3bfab5 1920 return (ret == 1) ? page : NULL;
8f942eea
JH
1921}
1922#endif /* CONFIG_ELF_CORE */
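/*
 * Editor's sketch -- loosely modelled on fs/coredump.c, not part of gup.c:
 * writing one user page to a core file, emitting a hole when
 * get_dump_page() returns NULL.  Assumes <linux/coredump.h> and
 * <linux/highmem.h>; example_dump_one_page() is hypothetical and returns
 * non-zero on success like dump_emit().
 */
#ifdef CONFIG_ELF_CORE
static int __maybe_unused example_dump_one_page(struct coredump_params *cprm,
						unsigned long addr)
{
	struct page *page = get_dump_page(addr);
	void *kaddr;
	int ok;

	if (!page)
		return dump_skip(cprm, PAGE_SIZE);	/* leave a hole */

	kaddr = kmap_local_page(page);
	ok = dump_emit(cprm, kaddr, PAGE_SIZE);
	kunmap_local(kaddr);
	put_page(page);
	return ok;
}
#endif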
1923
d1e153fe 1924#ifdef CONFIG_MIGRATION
f68749ec 1925/*
67e139b0 1926 * Returns the number of collected pages. Return value is always >= 0.
f68749ec 1927 */
67e139b0
AP
1928static unsigned long collect_longterm_unpinnable_pages(
1929 struct list_head *movable_page_list,
1930 unsigned long nr_pages,
1931 struct page **pages)
9a4e9f3b 1932{
67e139b0 1933 unsigned long i, collected = 0;
1b7f7e58 1934 struct folio *prev_folio = NULL;
67e139b0 1935 bool drain_allow = true;
9a4e9f3b 1936
83c02c23 1937 for (i = 0; i < nr_pages; i++) {
1b7f7e58 1938 struct folio *folio = page_folio(pages[i]);
f9f38f78 1939
1b7f7e58 1940 if (folio == prev_folio)
83c02c23 1941 continue;
1b7f7e58 1942 prev_folio = folio;
f9f38f78 1943
67e139b0
AP
1944 if (folio_is_longterm_pinnable(folio))
1945 continue;
b05a79d4 1946
67e139b0 1947 collected++;
b05a79d4 1948
67e139b0 1949 if (folio_is_device_coherent(folio))
f9f38f78
CH
1950 continue;
1951
1b7f7e58 1952 if (folio_test_hugetlb(folio)) {
6aa3a920 1953 isolate_hugetlb(folio, movable_page_list);
f9f38f78
CH
1954 continue;
1955 }
9a4e9f3b 1956
1b7f7e58 1957 if (!folio_test_lru(folio) && drain_allow) {
f9f38f78
CH
1958 lru_add_drain_all();
1959 drain_allow = false;
1960 }
1961
be2d5756 1962 if (!folio_isolate_lru(folio))
f9f38f78 1963 continue;
67e139b0
AP
1964
1965 list_add_tail(&folio->lru, movable_page_list);
1b7f7e58
MWO
1966 node_stat_mod_folio(folio,
1967 NR_ISOLATED_ANON + folio_is_file_lru(folio),
1968 folio_nr_pages(folio));
9a4e9f3b
AK
1969 }
1970
67e139b0
AP
1971 return collected;
1972}
1973
1974/*
1975 * Unpins all pages and migrates device coherent pages and movable_page_list.
 1976 * Returns -EAGAIN if all pages were successfully migrated, or -errno on failure
1977 * (or partial success).
1978 */
1979static int migrate_longterm_unpinnable_pages(
1980 struct list_head *movable_page_list,
1981 unsigned long nr_pages,
1982 struct page **pages)
1983{
1984 int ret;
1985 unsigned long i;
6e7f34eb 1986
b05a79d4 1987 for (i = 0; i < nr_pages; i++) {
67e139b0
AP
1988 struct folio *folio = page_folio(pages[i]);
1989
1990 if (folio_is_device_coherent(folio)) {
1991 /*
1992 * Migration will fail if the page is pinned, so convert
1993 * the pin on the source page to a normal reference.
1994 */
1995 pages[i] = NULL;
1996 folio_get(folio);
1997 gup_put_folio(folio, 1, FOLL_PIN);
1998
1999 if (migrate_device_coherent_page(&folio->page)) {
2000 ret = -EBUSY;
2001 goto err;
2002 }
2003
b05a79d4 2004 continue;
67e139b0 2005 }
b05a79d4 2006
67e139b0
AP
2007 /*
2008 * We can't migrate pages with unexpected references, so drop
2009 * the reference obtained by __get_user_pages_locked().
2010 * Migrating pages have been added to movable_page_list after
2011 * calling folio_isolate_lru() which takes a reference so the
2012 * page won't be freed if it's migrating.
2013 */
f6d299ec 2014 unpin_user_page(pages[i]);
67e139b0 2015 pages[i] = NULL;
f68749ec 2016 }
f9f38f78 2017
67e139b0 2018 if (!list_empty(movable_page_list)) {
f9f38f78
CH
2019 struct migration_target_control mtc = {
2020 .nid = NUMA_NO_NODE,
2021 .gfp_mask = GFP_USER | __GFP_NOWARN,
2022 };
2023
67e139b0
AP
2024 if (migrate_pages(movable_page_list, alloc_migration_target,
2025 NULL, (unsigned long)&mtc, MIGRATE_SYNC,
2026 MR_LONGTERM_PIN, NULL)) {
f9f38f78 2027 ret = -ENOMEM;
67e139b0
AP
2028 goto err;
2029 }
9a4e9f3b
AK
2030 }
2031
67e139b0
AP
2032 putback_movable_pages(movable_page_list);
2033
2034 return -EAGAIN;
2035
2036err:
2037 for (i = 0; i < nr_pages; i++)
2038 if (pages[i])
2039 unpin_user_page(pages[i]);
2040 putback_movable_pages(movable_page_list);
24a95998 2041
67e139b0
AP
2042 return ret;
2043}
2044
2045/*
2046 * Check whether all pages are *allowed* to be pinned. Rather confusingly, all
2047 * pages in the range are required to be pinned via FOLL_PIN, before calling
2048 * this routine.
2049 *
2050 * If any pages in the range are not allowed to be pinned, then this routine
2051 * will migrate those pages away, unpin all the pages in the range and return
2052 * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
2053 * call this routine again.
2054 *
2055 * If an error other than -EAGAIN occurs, this indicates a migration failure.
2056 * The caller should give up, and propagate the error back up the call stack.
2057 *
2058 * If everything is OK and all pages in the range are allowed to be pinned, then
2059 * this routine leaves all pages pinned and returns zero for success.
2060 */
2061static long check_and_migrate_movable_pages(unsigned long nr_pages,
2062 struct page **pages)
2063{
2064 unsigned long collected;
2065 LIST_HEAD(movable_page_list);
2066
2067 collected = collect_longterm_unpinnable_pages(&movable_page_list,
2068 nr_pages, pages);
2069 if (!collected)
2070 return 0;
2071
2072 return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages,
2073 pages);
9a4e9f3b
AK
2074}
2075#else
f68749ec 2076static long check_and_migrate_movable_pages(unsigned long nr_pages,
f6d299ec 2077 struct page **pages)
9a4e9f3b 2078{
24a95998 2079 return 0;
9a4e9f3b 2080}
d1e153fe 2081#endif /* CONFIG_MIGRATION */
9a4e9f3b 2082
2bb6d283 2083/*
932f4a63
IW
2084 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
2085 * allows us to process the FOLL_LONGTERM flag.
2bb6d283 2086 */
64019a2e 2087static long __gup_longterm_locked(struct mm_struct *mm,
932f4a63
IW
2088 unsigned long start,
2089 unsigned long nr_pages,
2090 struct page **pages,
53b2d09b 2091 int *locked,
932f4a63 2092 unsigned int gup_flags)
2bb6d283 2093{
f68749ec 2094 unsigned int flags;
24a95998 2095 long rc, nr_pinned_pages;
2bb6d283 2096
f68749ec 2097 if (!(gup_flags & FOLL_LONGTERM))
b2cac248 2098 return __get_user_pages_locked(mm, start, nr_pages, pages,
53b2d09b 2099 locked, gup_flags);
67e139b0 2100
f68749ec
PT
2101 flags = memalloc_pin_save();
2102 do {
24a95998 2103 nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
b2cac248 2104 pages, locked,
24a95998
AP
2105 gup_flags);
2106 if (nr_pinned_pages <= 0) {
2107 rc = nr_pinned_pages;
f68749ec 2108 break;
24a95998 2109 }
d64e2dbc
JG
2110
2111 /* FOLL_LONGTERM implies FOLL_PIN */
f6d299ec 2112 rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
24a95998 2113 } while (rc == -EAGAIN);
f68749ec 2114 memalloc_pin_restore(flags);
24a95998 2115 return rc ? rc : nr_pinned_pages;
2bb6d283 2116}
932f4a63 2117
d64e2dbc
JG
2118/*
2119 * Check that the given flags are valid for the exported gup/pup interface, and
2120 * update them with the required flags that the caller must have set.
2121 */
b2cac248
LS
2122static bool is_valid_gup_args(struct page **pages, int *locked,
2123 unsigned int *gup_flags_p, unsigned int to_set)
447f3e45 2124{
d64e2dbc
JG
2125 unsigned int gup_flags = *gup_flags_p;
2126
447f3e45 2127 /*
d64e2dbc
JG
 2128 * These flags are not allowed to be specified externally to the gup
2129 * interfaces:
2130 * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
2131 * - FOLL_REMOTE is internal only and used on follow_page()
f04740f5 2132 * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
447f3e45 2133 */
f04740f5 2134 if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE |
d64e2dbc
JG
2135 FOLL_REMOTE | FOLL_FAST_ONLY)))
2136 return false;
2137
2138 gup_flags |= to_set;
f04740f5
JG
2139 if (locked) {
2140 /* At the external interface locked must be set */
2141 if (WARN_ON_ONCE(*locked != 1))
2142 return false;
2143
2144 gup_flags |= FOLL_UNLOCKABLE;
2145 }
d64e2dbc
JG
2146
2147 /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2148 if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
2149 (FOLL_PIN | FOLL_GET)))
2150 return false;
2151
2152 /* LONGTERM can only be specified when pinning */
2153 if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))
2154 return false;
2155
2156 /* Pages input must be given if using GET/PIN */
2157 if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))
447f3e45 2158 return false;
d64e2dbc 2159
d64e2dbc
JG
2160 /* We want to allow the pgmap to be hot-unplugged at all times */
2161 if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&
2162 (gup_flags & FOLL_PCI_P2PDMA)))
2163 return false;
2164
d64e2dbc 2165 *gup_flags_p = gup_flags;
447f3e45
BS
2166 return true;
2167}
2168
22bf29b6 2169#ifdef CONFIG_MMU
adc8cb40 2170/**
c4237f8b 2171 * get_user_pages_remote() - pin user pages in memory
c4237f8b
JH
2172 * @mm: mm_struct of target mm
2173 * @start: starting user address
2174 * @nr_pages: number of pages from start to pin
2175 * @gup_flags: flags modifying lookup behaviour
2176 * @pages: array that receives pointers to the pages pinned.
2177 * Should be at least nr_pages long. Or NULL, if caller
2178 * only intends to ensure the pages are faulted in.
c4237f8b
JH
2179 * @locked: pointer to lock flag indicating whether lock is held and
2180 * subsequently whether VM_FAULT_RETRY functionality can be
2181 * utilised. Lock must initially be held.
2182 *
2183 * Returns either number of pages pinned (which may be less than the
2184 * number requested), or an error. Details about the return value:
2185 *
2186 * -- If nr_pages is 0, returns 0.
2187 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
2188 * -- If nr_pages is >0, and some pages were pinned, returns the number of
2189 * pages pinned. Again, this may be less than nr_pages.
2190 *
2191 * The caller is responsible for releasing returned @pages, via put_page().
2192 *
c1e8d7c6 2193 * Must be called with mmap_lock held for read or write.
c4237f8b 2194 *
adc8cb40
SJ
2195 * get_user_pages_remote walks a process's page tables and takes a reference
2196 * to each struct page that each user address corresponds to at a given
c4237f8b
JH
2197 * instant. That is, it takes the page that would be accessed if a user
2198 * thread accesses the given user virtual address at that instant.
2199 *
2200 * This does not guarantee that the page exists in the user mappings when
adc8cb40 2201 * get_user_pages_remote returns, and there may even be a completely different
c4237f8b 2202 * page there in some cases (e.g. if mmapped pagecache has been invalidated
5da1a868 2203 * and subsequently re-faulted). However it does guarantee that the page
c4237f8b
JH
2204 * won't be freed completely. And mostly callers simply care that the page
2205 * contains data that was valid *at some point in time*. Typically, an IO
2206 * or similar operation cannot guarantee anything stronger anyway because
2207 * locks can't be held over the syscall boundary.
2208 *
2209 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
2210 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
2211 * be called after the page is finished with, and before put_page is called.
2212 *
adc8cb40
SJ
2213 * get_user_pages_remote is typically used for fewer-copy IO operations,
2214 * to get a handle on the memory by some means other than accesses
2215 * via the user virtual addresses. The pages may be submitted for
2216 * DMA to devices or accessed via their kernel linear mapping (via the
2217 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
c4237f8b
JH
2218 *
2219 * See also get_user_pages_fast, for performance critical applications.
2220 *
adc8cb40 2221 * get_user_pages_remote should be phased out in favor of
c4237f8b 2222 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
adc8cb40 2223 * should use get_user_pages_remote because it cannot pass
c4237f8b
JH
2224 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
2225 */
64019a2e 2226long get_user_pages_remote(struct mm_struct *mm,
c4237f8b
JH
2227 unsigned long start, unsigned long nr_pages,
2228 unsigned int gup_flags, struct page **pages,
ca5e8632 2229 int *locked)
c4237f8b 2230{
9a863a6a
JG
2231 int local_locked = 1;
2232
b2cac248 2233 if (!is_valid_gup_args(pages, locked, &gup_flags,
d64e2dbc 2234 FOLL_TOUCH | FOLL_REMOTE))
eddb1c22
JH
2235 return -EINVAL;
2236
b2cac248 2237 return __get_user_pages_locked(mm, start, nr_pages, pages,
9a863a6a 2238 locked ? locked : &local_locked,
d64e2dbc 2239 gup_flags);
c4237f8b
JH
2240}
2241EXPORT_SYMBOL(get_user_pages_remote);
2242
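/*
 * Editor's sketch -- not part of gup.c: taking a reference on one page of
 * another process, in the style of ptrace/access_remote_vm callers.  The
 * wrapper example_get_remote_page() is hypothetical; release the page with
 * put_page() when done.
 */
static long __maybe_unused example_get_remote_page(struct mm_struct *mm,
						   unsigned long addr,
						   struct page **page)
{
	int locked = 1;
	long ret;

	if (mmap_read_lock_killable(mm))
		return -EINTR;
	ret = get_user_pages_remote(mm, addr, 1, FOLL_GET, page, &locked);
	if (locked)		/* GUP may have dropped the lock already */
		mmap_read_unlock(mm);
	return ret;
}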
eddb1c22 2243#else /* CONFIG_MMU */
64019a2e 2244long get_user_pages_remote(struct mm_struct *mm,
eddb1c22
JH
2245 unsigned long start, unsigned long nr_pages,
2246 unsigned int gup_flags, struct page **pages,
ca5e8632 2247 int *locked)
eddb1c22
JH
2248{
2249 return 0;
2250}
2251#endif /* !CONFIG_MMU */
2252
adc8cb40
SJ
2253/**
2254 * get_user_pages() - pin user pages in memory
2255 * @start: starting user address
2256 * @nr_pages: number of pages from start to pin
2257 * @gup_flags: flags modifying lookup behaviour
2258 * @pages: array that receives pointers to the pages pinned.
2259 * Should be at least nr_pages long. Or NULL, if caller
2260 * only intends to ensure the pages are faulted in.
adc8cb40 2261 *
64019a2e
PX
2262 * This is the same as get_user_pages_remote(), just with a less-flexible
2263 * calling convention where we assume that the mm being operated on belongs to
2264 * the current task, and doesn't allow passing of a locked parameter. We also
2265 * obviously don't pass FOLL_REMOTE in here.
932f4a63
IW
2266 */
2267long get_user_pages(unsigned long start, unsigned long nr_pages,
54d02069 2268 unsigned int gup_flags, struct page **pages)
932f4a63 2269{
9a863a6a
JG
2270 int locked = 1;
2271
b2cac248 2272 if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
eddb1c22
JH
2273 return -EINVAL;
2274
afa3c33e 2275 return __get_user_pages_locked(current->mm, start, nr_pages, pages,
b2cac248 2276 &locked, gup_flags);
932f4a63
IW
2277}
2278EXPORT_SYMBOL(get_user_pages);
2bb6d283 2279
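/*
 * Editor's sketch -- not part of gup.c: the plain get/put life cycle for
 * get_user_pages() on the current mm.  example_get_pages() is hypothetical.
 */
static long __maybe_unused example_get_pages(unsigned long uaddr,
					     unsigned long nr,
					     struct page **pages)
{
	long got, i;

	got = get_user_pages(uaddr, nr, FOLL_WRITE, pages);
	if (got < 0)
		return got;

	/* ... use the pages ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);
	return got;
}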
acc3c8d1 2280/*
d3649f68 2281 * get_user_pages_unlocked() is suitable to replace the form:
acc3c8d1 2282 *
3e4e28c5 2283 * mmap_read_lock(mm);
64019a2e 2284 * get_user_pages(mm, ..., pages, NULL);
3e4e28c5 2285 * mmap_read_unlock(mm);
d3649f68
CH
2286 *
2287 * with:
2288 *
64019a2e 2289 * get_user_pages_unlocked(mm, ..., pages);
d3649f68
CH
2290 *
2291 * It is functionally equivalent to get_user_pages_fast so
2292 * get_user_pages_fast should be used instead if specific gup_flags
2293 * (e.g. FOLL_FORCE) are not required.
acc3c8d1 2294 */
d3649f68
CH
2295long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2296 struct page **pages, unsigned int gup_flags)
acc3c8d1 2297{
b2a72dff 2298 int locked = 0;
acc3c8d1 2299
b2cac248 2300 if (!is_valid_gup_args(pages, NULL, &gup_flags,
f04740f5 2301 FOLL_TOUCH | FOLL_UNLOCKABLE))
d64e2dbc
JG
2302 return -EINVAL;
2303
afa3c33e 2304 return __get_user_pages_locked(current->mm, start, nr_pages, pages,
b2cac248 2305 &locked, gup_flags);
4bbd4c77 2306}
d3649f68 2307EXPORT_SYMBOL(get_user_pages_unlocked);
2667f50e
SC
2308
2309/*
67a929e0 2310 * Fast GUP
2667f50e
SC
2311 *
2312 * get_user_pages_fast attempts to pin user pages by walking the page
2313 * tables directly and avoids taking locks. Thus the walker needs to be
2314 * protected from page table pages being freed from under it, and should
2315 * block any THP splits.
2316 *
2317 * One way to achieve this is to have the walker disable interrupts, and
2318 * rely on IPIs from the TLB flushing code blocking before the page table
2319 * pages are freed. This is unsuitable for architectures that do not need
2320 * to broadcast an IPI when invalidating TLBs.
2321 *
 2322 * Another way to achieve this is to batch up the pages containing page tables
 2323 * that belong to more than one mm_user, then rcu_sched a callback to free those
2324 * pages. Disabling interrupts will allow the fast_gup walker to both block
2325 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
2326 * (which is a relatively rare event). The code below adopts this strategy.
2327 *
2328 * Before activating this code, please be aware that the following assumptions
2329 * are currently made:
2330 *
ff2e6d72 2331 * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
e585513b 2332 * free pages containing page tables, or TLB flushing requires IPI broadcast.
2667f50e 2333 *
2667f50e
SC
2334 * *) ptes can be read atomically by the architecture.
2335 *
2336 * *) access_ok is sufficient to validate userspace address ranges.
2337 *
2338 * The last two assumptions can be relaxed by the addition of helper functions.
2339 *
2340 * This code is based heavily on the PowerPC implementation by Nick Piggin.
2341 */
67a929e0 2342#ifdef CONFIG_HAVE_FAST_GUP
3faa52c0 2343
a6e79df9
LS
2344/*
2345 * Used in the GUP-fast path to determine whether a pin is permitted for a
2346 * specific folio.
2347 *
2348 * This call assumes the caller has pinned the folio, that the lowest page table
2349 * level still points to this folio, and that interrupts have been disabled.
2350 *
2351 * Writing to pinned file-backed dirty tracked folios is inherently problematic
2352 * (see comment describing the writable_file_mapping_allowed() function). We
2353 * therefore try to avoid the most egregious case of a long-term mapping doing
2354 * so.
2355 *
2356 * This function cannot be as thorough as that one as the VMA is not available
2357 * in the fast path, so instead we whitelist known good cases and if in doubt,
2358 * fall back to the slow path.
2359 */
2360static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
2361{
2362 struct address_space *mapping;
2363 unsigned long mapping_flags;
2364
2365 /*
2366 * If we aren't pinning then no problematic write can occur. A long term
2367 * pin is the most egregious case so this is the one we disallow.
2368 */
2369 if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) !=
2370 (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
2371 return true;
2372
2373 /* The folio is pinned, so we can safely access folio fields. */
2374
2375 if (WARN_ON_ONCE(folio_test_slab(folio)))
2376 return false;
2377
2378 /* hugetlb mappings do not require dirty-tracking. */
2379 if (folio_test_hugetlb(folio))
2380 return true;
2381
2382 /*
2383 * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods
2384 * cannot proceed, which means no actions performed under RCU can
2385 * proceed either.
2386 *
2387 * inodes and thus their mappings are freed under RCU, which means the
2388 * mapping cannot be freed beneath us and thus we can safely dereference
2389 * it.
2390 */
2391 lockdep_assert_irqs_disabled();
2392
2393 /*
2394 * However, there may be operations which _alter_ the mapping, so ensure
2395 * we read it once and only once.
2396 */
2397 mapping = READ_ONCE(folio->mapping);
2398
2399 /*
2400 * The mapping may have been truncated, in any case we cannot determine
2401 * if this mapping is safe - fall back to slow path to determine how to
2402 * proceed.
2403 */
2404 if (!mapping)
2405 return false;
2406
2407 /* Anonymous folios pose no problem. */
2408 mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
2409 if (mapping_flags)
2410 return mapping_flags & PAGE_MAPPING_ANON;
2411
2412 /*
2413 * At this point, we know the mapping is non-null and points to an
2414 * address_space object. The only remaining whitelisted file system is
2415 * shmem.
2416 */
2417 return shmem_mapping(mapping);
2418}
2419
790c7369 2420static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
3b78d834 2421 unsigned int flags,
790c7369 2422 struct page **pages)
b59f65fa
KS
2423{
2424 while ((*nr) - nr_start) {
2425 struct page *page = pages[--(*nr)];
2426
2427 ClearPageReferenced(page);
3faa52c0
JH
2428 if (flags & FOLL_PIN)
2429 unpin_user_page(page);
2430 else
2431 put_page(page);
b59f65fa
KS
2432 }
2433}
2434
3010a5ea 2435#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
70cbc3cc
YS
2436/*
2437 * Fast-gup relies on pte change detection to avoid concurrent pgtable
2438 * operations.
2439 *
2440 * To pin the page, fast-gup needs to do below in order:
2441 * (1) pin the page (by prefetching pte), then (2) check pte not changed.
2442 *
2443 * For the rest of pgtable operations where pgtable updates can be racy
2444 * with fast-gup, we need to do (1) clear pte, then (2) check whether page
2445 * is pinned.
2446 *
2447 * Above will work for all pte-level operations, including THP split.
2448 *
2449 * For THP collapse, it's a bit more complicated because fast-gup may be
2450 * walking a pgtable page that is being freed (pte is still valid but pmd
2451 * can be cleared already). To avoid race in such condition, we need to
2452 * also check pmd here to make sure pmd doesn't change (corresponds to
2453 * pmdp_collapse_flush() in the THP collapse code path).
2454 */
2455static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
2456 unsigned long end, unsigned int flags,
2457 struct page **pages, int *nr)
2667f50e 2458{
b59f65fa
KS
2459 struct dev_pagemap *pgmap = NULL;
2460 int nr_start = *nr, ret = 0;
2667f50e 2461 pte_t *ptep, *ptem;
2667f50e
SC
2462
2463 ptem = ptep = pte_offset_map(&pmd, addr);
04dee9e8
HD
2464 if (!ptep)
2465 return 0;
2667f50e 2466 do {
2a4a06da 2467 pte_t pte = ptep_get_lockless(ptep);
b0496fe4
MWO
2468 struct page *page;
2469 struct folio *folio;
2667f50e 2470
0cf45986 2471 if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
e7884f8e
KS
2472 goto pte_unmap;
2473
b798bec4 2474 if (!pte_access_permitted(pte, flags & FOLL_WRITE))
e7884f8e
KS
2475 goto pte_unmap;
2476
b59f65fa 2477 if (pte_devmap(pte)) {
7af75561
IW
2478 if (unlikely(flags & FOLL_LONGTERM))
2479 goto pte_unmap;
2480
b59f65fa
KS
2481 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
2482 if (unlikely(!pgmap)) {
3b78d834 2483 undo_dev_pagemap(nr, nr_start, flags, pages);
b59f65fa
KS
2484 goto pte_unmap;
2485 }
2486 } else if (pte_special(pte))
2667f50e
SC
2487 goto pte_unmap;
2488
2489 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2490 page = pte_page(pte);
2491
b0496fe4
MWO
2492 folio = try_grab_folio(page, 1, flags);
2493 if (!folio)
2667f50e
SC
2494 goto pte_unmap;
2495
1507f512 2496 if (unlikely(page_is_secretmem(page))) {
b0496fe4 2497 gup_put_folio(folio, 1, flags);
1507f512
MR
2498 goto pte_unmap;
2499 }
2500
70cbc3cc 2501 if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
c33c7948 2502 unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
b0496fe4 2503 gup_put_folio(folio, 1, flags);
2667f50e
SC
2504 goto pte_unmap;
2505 }
2506
a6e79df9
LS
2507 if (!folio_fast_pin_allowed(folio, flags)) {
2508 gup_put_folio(folio, 1, flags);
2509 goto pte_unmap;
2510 }
2511
84209e87 2512 if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
a7f22660
DH
2513 gup_put_folio(folio, 1, flags);
2514 goto pte_unmap;
2515 }
2516
f28d4363
CI
2517 /*
2518 * We need to make the page accessible if and only if we are
2519 * going to access its content (the FOLL_PIN case). Please
2520 * see Documentation/core-api/pin_user_pages.rst for
2521 * details.
2522 */
2523 if (flags & FOLL_PIN) {
2524 ret = arch_make_page_accessible(page);
2525 if (ret) {
b0496fe4 2526 gup_put_folio(folio, 1, flags);
f28d4363
CI
2527 goto pte_unmap;
2528 }
2529 }
b0496fe4 2530 folio_set_referenced(folio);
2667f50e
SC
2531 pages[*nr] = page;
2532 (*nr)++;
2667f50e
SC
2533 } while (ptep++, addr += PAGE_SIZE, addr != end);
2534
2535 ret = 1;
2536
2537pte_unmap:
832d7aa0
CH
2538 if (pgmap)
2539 put_dev_pagemap(pgmap);
2667f50e
SC
2540 pte_unmap(ptem);
2541 return ret;
2542}
2543#else
2544
2545/*
2546 * If we can't determine whether or not a pte is special, then fail immediately
2547 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
2548 * to be special.
2549 *
2550 * For a futex to be placed on a THP tail page, get_futex_key requires a
dadbb612 2551 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
2667f50e
SC
2552 * useful to have gup_huge_pmd even if we can't operate on ptes.
2553 */
70cbc3cc
YS
2554static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
2555 unsigned long end, unsigned int flags,
2556 struct page **pages, int *nr)
2667f50e
SC
2557{
2558 return 0;
2559}
3010a5ea 2560#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
2667f50e 2561
17596731 2562#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
b59f65fa 2563static int __gup_device_huge(unsigned long pfn, unsigned long addr,
86dfbed4
JH
2564 unsigned long end, unsigned int flags,
2565 struct page **pages, int *nr)
b59f65fa
KS
2566{
2567 int nr_start = *nr;
2568 struct dev_pagemap *pgmap = NULL;
2569
2570 do {
2571 struct page *page = pfn_to_page(pfn);
2572
2573 pgmap = get_dev_pagemap(pfn, pgmap);
2574 if (unlikely(!pgmap)) {
3b78d834 2575 undo_dev_pagemap(nr, nr_start, flags, pages);
6401c4eb 2576 break;
b59f65fa 2577 }
4003f107
LG
2578
2579 if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
2580 undo_dev_pagemap(nr, nr_start, flags, pages);
2581 break;
2582 }
2583
b59f65fa
KS
2584 SetPageReferenced(page);
2585 pages[*nr] = page;
0f089235 2586 if (unlikely(try_grab_page(page, flags))) {
3faa52c0 2587 undo_dev_pagemap(nr, nr_start, flags, pages);
6401c4eb 2588 break;
3faa52c0 2589 }
b59f65fa
KS
2590 (*nr)++;
2591 pfn++;
2592 } while (addr += PAGE_SIZE, addr != end);
832d7aa0 2593
6401c4eb 2594 put_dev_pagemap(pgmap);
20b7fee7 2595 return addr == end;
b59f65fa
KS
2596}
2597
a9b6de77 2598static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
86dfbed4
JH
2599 unsigned long end, unsigned int flags,
2600 struct page **pages, int *nr)
b59f65fa
KS
2601{
2602 unsigned long fault_pfn;
a9b6de77
DW
2603 int nr_start = *nr;
2604
2605 fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
86dfbed4 2606 if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
a9b6de77 2607 return 0;
b59f65fa 2608
a9b6de77 2609 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
3b78d834 2610 undo_dev_pagemap(nr, nr_start, flags, pages);
a9b6de77
DW
2611 return 0;
2612 }
2613 return 1;
b59f65fa
KS
2614}
2615
a9b6de77 2616static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
86dfbed4
JH
2617 unsigned long end, unsigned int flags,
2618 struct page **pages, int *nr)
b59f65fa
KS
2619{
2620 unsigned long fault_pfn;
a9b6de77
DW
2621 int nr_start = *nr;
2622
2623 fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
86dfbed4 2624 if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
a9b6de77 2625 return 0;
b59f65fa 2626
a9b6de77 2627 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
3b78d834 2628 undo_dev_pagemap(nr, nr_start, flags, pages);
a9b6de77
DW
2629 return 0;
2630 }
2631 return 1;
b59f65fa
KS
2632}
2633#else
a9b6de77 2634static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
86dfbed4
JH
2635 unsigned long end, unsigned int flags,
2636 struct page **pages, int *nr)
b59f65fa
KS
2637{
2638 BUILD_BUG();
2639 return 0;
2640}
2641
a9b6de77 2642static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
86dfbed4
JH
2643 unsigned long end, unsigned int flags,
2644 struct page **pages, int *nr)
b59f65fa
KS
2645{
2646 BUILD_BUG();
2647 return 0;
2648}
2649#endif
2650
a43e9820
JH
2651static int record_subpages(struct page *page, unsigned long addr,
2652 unsigned long end, struct page **pages)
2653{
2654 int nr;
2655
c228afb1
MWO
2656 for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
2657 pages[nr] = nth_page(page, nr);
a43e9820
JH
2658
2659 return nr;
2660}
2661
cbd34da7
CH
2662#ifdef CONFIG_ARCH_HAS_HUGEPD
2663static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
2664 unsigned long sz)
2665{
2666 unsigned long __boundary = (addr + sz) & ~(sz-1);
2667 return (__boundary - 1 < end - 1) ? __boundary : end;
2668}
2669
2670static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
0cd22afd
JH
2671 unsigned long end, unsigned int flags,
2672 struct page **pages, int *nr)
cbd34da7
CH
2673{
2674 unsigned long pte_end;
09a1626e
MWO
2675 struct page *page;
2676 struct folio *folio;
cbd34da7
CH
2677 pte_t pte;
2678 int refs;
2679
2680 pte_end = (addr + sz) & ~(sz-1);
2681 if (pte_end < end)
2682 end = pte_end;
2683
55ca2263 2684 pte = huge_ptep_get(ptep);
cbd34da7 2685
0cd22afd 2686 if (!pte_access_permitted(pte, flags & FOLL_WRITE))
cbd34da7
CH
2687 return 0;
2688
2689 /* hugepages are never "special" */
2690 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2691
09a1626e 2692 page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
a43e9820 2693 refs = record_subpages(page, addr, end, pages + *nr);
cbd34da7 2694
09a1626e
MWO
2695 folio = try_grab_folio(page, refs, flags);
2696 if (!folio)
cbd34da7 2697 return 0;
cbd34da7 2698
c33c7948 2699 if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
09a1626e 2700 gup_put_folio(folio, refs, flags);
cbd34da7
CH
2701 return 0;
2702 }
2703
a6e79df9
LS
2704 if (!folio_fast_pin_allowed(folio, flags)) {
2705 gup_put_folio(folio, refs, flags);
2706 return 0;
2707 }
2708
84209e87 2709 if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
a7f22660
DH
2710 gup_put_folio(folio, refs, flags);
2711 return 0;
2712 }
2713
a43e9820 2714 *nr += refs;
09a1626e 2715 folio_set_referenced(folio);
cbd34da7
CH
2716 return 1;
2717}
2718
2719static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
0cd22afd 2720 unsigned int pdshift, unsigned long end, unsigned int flags,
cbd34da7
CH
2721 struct page **pages, int *nr)
2722{
2723 pte_t *ptep;
2724 unsigned long sz = 1UL << hugepd_shift(hugepd);
2725 unsigned long next;
2726
2727 ptep = hugepte_offset(hugepd, addr, pdshift);
2728 do {
2729 next = hugepte_addr_end(addr, end, sz);
0cd22afd 2730 if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
cbd34da7
CH
2731 return 0;
2732 } while (ptep++, addr = next, addr != end);
2733
2734 return 1;
2735}
2736#else
2737static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
0cd22afd 2738 unsigned int pdshift, unsigned long end, unsigned int flags,
cbd34da7
CH
2739 struct page **pages, int *nr)
2740{
2741 return 0;
2742}
2743#endif /* CONFIG_ARCH_HAS_HUGEPD */
2744
2667f50e 2745static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
0cd22afd
JH
2746 unsigned long end, unsigned int flags,
2747 struct page **pages, int *nr)
2667f50e 2748{
667ed1f7
MWO
2749 struct page *page;
2750 struct folio *folio;
2667f50e
SC
2751 int refs;
2752
b798bec4 2753 if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
2667f50e
SC
2754 return 0;
2755
7af75561
IW
2756 if (pmd_devmap(orig)) {
2757 if (unlikely(flags & FOLL_LONGTERM))
2758 return 0;
86dfbed4
JH
2759 return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
2760 pages, nr);
7af75561 2761 }
b59f65fa 2762
c228afb1 2763 page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
a43e9820 2764 refs = record_subpages(page, addr, end, pages + *nr);
2667f50e 2765
667ed1f7
MWO
2766 folio = try_grab_folio(page, refs, flags);
2767 if (!folio)
2667f50e 2768 return 0;
2667f50e
SC
2769
2770 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
667ed1f7 2771 gup_put_folio(folio, refs, flags);
2667f50e
SC
2772 return 0;
2773 }
2774
a6e79df9
LS
2775 if (!folio_fast_pin_allowed(folio, flags)) {
2776 gup_put_folio(folio, refs, flags);
2777 return 0;
2778 }
84209e87 2779 if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
a7f22660
DH
2780 gup_put_folio(folio, refs, flags);
2781 return 0;
2782 }
2783
a43e9820 2784 *nr += refs;
667ed1f7 2785 folio_set_referenced(folio);
2667f50e
SC
2786 return 1;
2787}
2788
2789static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
86dfbed4
JH
2790 unsigned long end, unsigned int flags,
2791 struct page **pages, int *nr)
2667f50e 2792{
83afb52e
MWO
2793 struct page *page;
2794 struct folio *folio;
2667f50e
SC
2795 int refs;
2796
b798bec4 2797 if (!pud_access_permitted(orig, flags & FOLL_WRITE))
2667f50e
SC
2798 return 0;
2799
7af75561
IW
2800 if (pud_devmap(orig)) {
2801 if (unlikely(flags & FOLL_LONGTERM))
2802 return 0;
86dfbed4
JH
2803 return __gup_device_huge_pud(orig, pudp, addr, end, flags,
2804 pages, nr);
7af75561 2805 }
b59f65fa 2806
c228afb1 2807 page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
a43e9820 2808 refs = record_subpages(page, addr, end, pages + *nr);
2667f50e 2809
83afb52e
MWO
2810 folio = try_grab_folio(page, refs, flags);
2811 if (!folio)
2667f50e 2812 return 0;
2667f50e
SC
2813
2814 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
83afb52e 2815 gup_put_folio(folio, refs, flags);
2667f50e
SC
2816 return 0;
2817 }
2818
a6e79df9
LS
2819 if (!folio_fast_pin_allowed(folio, flags)) {
2820 gup_put_folio(folio, refs, flags);
2821 return 0;
2822 }
2823
84209e87 2824 if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
a7f22660
DH
2825 gup_put_folio(folio, refs, flags);
2826 return 0;
2827 }
2828
a43e9820 2829 *nr += refs;
83afb52e 2830 folio_set_referenced(folio);
2667f50e
SC
2831 return 1;
2832}
2833
f30c59e9 2834static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
b798bec4 2835 unsigned long end, unsigned int flags,
f30c59e9
AK
2836 struct page **pages, int *nr)
2837{
2838 int refs;
2d7919a2
MWO
2839 struct page *page;
2840 struct folio *folio;
f30c59e9 2841
b798bec4 2842 if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
f30c59e9
AK
2843 return 0;
2844
b59f65fa 2845 BUILD_BUG_ON(pgd_devmap(orig));
a43e9820 2846
c228afb1 2847 page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
a43e9820 2848 refs = record_subpages(page, addr, end, pages + *nr);
f30c59e9 2849
2d7919a2
MWO
2850 folio = try_grab_folio(page, refs, flags);
2851 if (!folio)
f30c59e9 2852 return 0;
f30c59e9
AK
2853
2854 if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
2d7919a2 2855 gup_put_folio(folio, refs, flags);
f30c59e9
AK
2856 return 0;
2857 }
2858
31115034
LS
2859 if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
2860 gup_put_folio(folio, refs, flags);
2861 return 0;
2862 }
2863
a6e79df9
LS
2864 if (!folio_fast_pin_allowed(folio, flags)) {
2865 gup_put_folio(folio, refs, flags);
2866 return 0;
2867 }
2868
a43e9820 2869 *nr += refs;
2d7919a2 2870 folio_set_referenced(folio);
f30c59e9
AK
2871 return 1;
2872}
2873
d3f7b1bb 2874static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
b798bec4 2875 unsigned int flags, struct page **pages, int *nr)
2667f50e
SC
2876{
2877 unsigned long next;
2878 pmd_t *pmdp;
2879
d3f7b1bb 2880 pmdp = pmd_offset_lockless(pudp, pud, addr);
2667f50e 2881 do {
1180e732 2882 pmd_t pmd = pmdp_get_lockless(pmdp);
2667f50e
SC
2883
2884 next = pmd_addr_end(addr, end);
84c3fc4e 2885 if (!pmd_present(pmd))
2667f50e
SC
2886 return 0;
2887
414fd080
YZ
2888 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
2889 pmd_devmap(pmd))) {
0cf45986
DH
2890 if (pmd_protnone(pmd) &&
2891 !gup_can_follow_protnone(flags))
2667f50e
SC
2892 return 0;
2893
b798bec4 2894 if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
2667f50e
SC
2895 pages, nr))
2896 return 0;
2897
f30c59e9
AK
2898 } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
2899 /*
 2900 * Architectures may use a different format for hugetlbfs
 2901 * pmds than for THP pmds.
2902 */
2903 if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
b798bec4 2904 PMD_SHIFT, next, flags, pages, nr))
f30c59e9 2905 return 0;
70cbc3cc 2906 } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
2923117b 2907 return 0;
2667f50e
SC
2908 } while (pmdp++, addr = next, addr != end);
2909
2910 return 1;
2911}
2912
d3f7b1bb 2913static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
b798bec4 2914 unsigned int flags, struct page **pages, int *nr)
2667f50e
SC
2915{
2916 unsigned long next;
2917 pud_t *pudp;
2918
d3f7b1bb 2919 pudp = pud_offset_lockless(p4dp, p4d, addr);
2667f50e 2920 do {
e37c6982 2921 pud_t pud = READ_ONCE(*pudp);
2667f50e
SC
2922
2923 next = pud_addr_end(addr, end);
15494520 2924 if (unlikely(!pud_present(pud)))
2667f50e 2925 return 0;
fcd0ccd8 2926 if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
b798bec4 2927 if (!gup_huge_pud(pud, pudp, addr, next, flags,
f30c59e9
AK
2928 pages, nr))
2929 return 0;
2930 } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
2931 if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
b798bec4 2932 PUD_SHIFT, next, flags, pages, nr))
2667f50e 2933 return 0;
d3f7b1bb 2934 } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
2667f50e
SC
2935 return 0;
2936 } while (pudp++, addr = next, addr != end);
2937
2938 return 1;
2939}
2940
d3f7b1bb 2941static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
b798bec4 2942 unsigned int flags, struct page **pages, int *nr)
c2febafc
KS
2943{
2944 unsigned long next;
2945 p4d_t *p4dp;
2946
d3f7b1bb 2947 p4dp = p4d_offset_lockless(pgdp, pgd, addr);
c2febafc
KS
2948 do {
2949 p4d_t p4d = READ_ONCE(*p4dp);
2950
2951 next = p4d_addr_end(addr, end);
2952 if (p4d_none(p4d))
2953 return 0;
2954 BUILD_BUG_ON(p4d_huge(p4d));
2955 if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
2956 if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
b798bec4 2957 P4D_SHIFT, next, flags, pages, nr))
c2febafc 2958 return 0;
d3f7b1bb 2959 } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
c2febafc
KS
2960 return 0;
2961 } while (p4dp++, addr = next, addr != end);
2962
2963 return 1;
2964}
2965
5b65c467 2966static void gup_pgd_range(unsigned long addr, unsigned long end,
b798bec4 2967 unsigned int flags, struct page **pages, int *nr)
5b65c467
KS
2968{
2969 unsigned long next;
2970 pgd_t *pgdp;
2971
2972 pgdp = pgd_offset(current->mm, addr);
2973 do {
2974 pgd_t pgd = READ_ONCE(*pgdp);
2975
2976 next = pgd_addr_end(addr, end);
2977 if (pgd_none(pgd))
2978 return;
2979 if (unlikely(pgd_huge(pgd))) {
b798bec4 2980 if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
5b65c467
KS
2981 pages, nr))
2982 return;
2983 } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
2984 if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
b798bec4 2985 PGDIR_SHIFT, next, flags, pages, nr))
5b65c467 2986 return;
d3f7b1bb 2987 } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
5b65c467
KS
2988 return;
2989 } while (pgdp++, addr = next, addr != end);
2990}
050a9adc
CH
2991#else
2992static inline void gup_pgd_range(unsigned long addr, unsigned long end,
2993 unsigned int flags, struct page **pages, int *nr)
2994{
2995}
2996#endif /* CONFIG_HAVE_FAST_GUP */
5b65c467
KS
2997
2998#ifndef gup_fast_permitted
2999/*
dadbb612 3000 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
5b65c467
KS
3001 * we need to fall back to the slow version:
3002 */
26f4c328 3003static bool gup_fast_permitted(unsigned long start, unsigned long end)
5b65c467 3004{
26f4c328 3005 return true;
5b65c467
KS
3006}
3007#endif
3008
c28b1fc7
JG
3009static unsigned long lockless_pages_from_mm(unsigned long start,
3010 unsigned long end,
3011 unsigned int gup_flags,
3012 struct page **pages)
3013{
3014 unsigned long flags;
3015 int nr_pinned = 0;
57efa1fe 3016 unsigned seq;
c28b1fc7
JG
3017
3018 if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
3019 !gup_fast_permitted(start, end))
3020 return 0;
3021
57efa1fe
JG
3022 if (gup_flags & FOLL_PIN) {
3023 seq = raw_read_seqcount(&current->mm->write_protect_seq);
3024 if (seq & 1)
3025 return 0;
3026 }
3027
c28b1fc7
JG
3028 /*
3029 * Disable interrupts. The nested form is used, in order to allow full,
3030 * general purpose use of this routine.
3031 *
3032 * With interrupts disabled, we block page table pages from being freed
3033 * from under us. See struct mmu_table_batch comments in
3034 * include/asm-generic/tlb.h for more details.
3035 *
3036 * We do not adopt an rcu_read_lock() here as we also want to block IPIs
3037 * that come from THPs splitting.
3038 */
3039 local_irq_save(flags);
3040 gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
3041 local_irq_restore(flags);
57efa1fe
JG
3042
3043 /*
3044 * When pinning pages for DMA there could be a concurrent write protect
 3045 * from fork() via copy_page_range(); in this case always fail fast GUP.
3046 */
3047 if (gup_flags & FOLL_PIN) {
3048 if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
b6a2619c 3049 unpin_user_pages_lockless(pages, nr_pinned);
57efa1fe 3050 return 0;
b6a2619c
DH
3051 } else {
3052 sanity_check_pinned_pages(pages, nr_pinned);
57efa1fe
JG
3053 }
3054 }
c28b1fc7
JG
3055 return nr_pinned;
3056}
3057
3058static int internal_get_user_pages_fast(unsigned long start,
3059 unsigned long nr_pages,
eddb1c22
JH
3060 unsigned int gup_flags,
3061 struct page **pages)
2667f50e 3062{
c28b1fc7
JG
3063 unsigned long len, end;
3064 unsigned long nr_pinned;
b2a72dff 3065 int locked = 0;
c28b1fc7 3066 int ret;
2667f50e 3067
f4000fdf 3068 if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
376a34ef 3069 FOLL_FORCE | FOLL_PIN | FOLL_GET |
4003f107
LG
3070 FOLL_FAST_ONLY | FOLL_NOFAULT |
3071 FOLL_PCI_P2PDMA)))
817be129
CH
3072 return -EINVAL;
3073
a458b76a
AA
3074 if (gup_flags & FOLL_PIN)
3075 mm_set_has_pinned_flag(&current->mm->flags);
008cfe44 3076
f81cd178 3077 if (!(gup_flags & FOLL_FAST_ONLY))
da1c55f1 3078 might_lock_read(&current->mm->mmap_lock);
f81cd178 3079
f455c854 3080 start = untagged_addr(start) & PAGE_MASK;
c28b1fc7
JG
3081 len = nr_pages << PAGE_SHIFT;
3082 if (check_add_overflow(start, len, &end))
c61611f7 3083 return 0;
6014bc27
LT
3084 if (end > TASK_SIZE_MAX)
3085 return -EFAULT;
96d4f267 3086 if (unlikely(!access_ok((void __user *)start, len)))
c61611f7 3087 return -EFAULT;
73e10a61 3088
c28b1fc7
JG
3089 nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
3090 if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
3091 return nr_pinned;
2667f50e 3092
c28b1fc7
JG
3093 /* Slow path: try to get the remaining pages with get_user_pages */
3094 start += nr_pinned << PAGE_SHIFT;
3095 pages += nr_pinned;
b2a72dff 3096 ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
b2cac248 3097 pages, &locked,
f04740f5 3098 gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
c28b1fc7
JG
3099 if (ret < 0) {
3100 /*
3101 * The caller has to unpin the pages we already pinned so
3102 * returning -errno is not an option
3103 */
3104 if (nr_pinned)
3105 return nr_pinned;
3106 return ret;
2667f50e 3107 }
c28b1fc7 3108 return ret + nr_pinned;
2667f50e 3109}
c28b1fc7 3110
dadbb612
SJ
3111/**
3112 * get_user_pages_fast_only() - pin user pages in memory
3113 * @start: starting user address
3114 * @nr_pages: number of pages from start to pin
3115 * @gup_flags: flags modifying pin behaviour
3116 * @pages: array that receives pointers to the pages pinned.
3117 * Should be at least nr_pages long.
3118 *
9e1f0580
JH
3119 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
3120 * the regular GUP.
9e1f0580
JH
3121 *
3122 * If the architecture does not support this function, simply return with no
3123 * pages pinned.
3124 *
3125 * Careful, careful! COW breaking can go either way, so a non-write
3126 * access can get ambiguous page results. If you call this function without
3127 * 'write' set, you'd better be sure that you're ok with that ambiguity.
3128 */
dadbb612
SJ
3129int get_user_pages_fast_only(unsigned long start, int nr_pages,
3130 unsigned int gup_flags, struct page **pages)
9e1f0580 3131{
9e1f0580
JH
3132 /*
3133 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
3134 * because gup fast is always a "pin with a +1 page refcount" request.
376a34ef
JH
3135 *
3136 * FOLL_FAST_ONLY is required in order to match the API description of
3137 * this routine: no fall back to regular ("slow") GUP.
9e1f0580 3138 */
b2cac248 3139 if (!is_valid_gup_args(pages, NULL, &gup_flags,
d64e2dbc
JG
3140 FOLL_GET | FOLL_FAST_ONLY))
3141 return -EINVAL;
9e1f0580 3142
9198a919 3143 return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
9e1f0580 3144}
dadbb612 3145EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
9e1f0580 3146
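/*
 * Editor's sketch -- not part of gup.c: the "try the atomic-safe fast path,
 * then fall back" pattern (the futex code does something similar).  All
 * names here are hypothetical; release the page with put_page() when done.
 */
static int __maybe_unused example_get_one_page(unsigned long addr,
					       struct page **page)
{
	int ret;

	/* Never falls back to the sleeping slow path. */
	ret = get_user_pages_fast_only(addr, 1, FOLL_WRITE, page);
	if (ret == 1)
		return 0;

	/* Slow path: may take mmap_lock and sleep. */
	ret = get_user_pages_fast(addr, 1, FOLL_WRITE, page);
	return ret == 1 ? 0 : (ret < 0 ? ret : -EFAULT);
}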
eddb1c22
JH
3147/**
3148 * get_user_pages_fast() - pin user pages in memory
3faa52c0
JH
3149 * @start: starting user address
3150 * @nr_pages: number of pages from start to pin
3151 * @gup_flags: flags modifying pin behaviour
3152 * @pages: array that receives pointers to the pages pinned.
3153 * Should be at least nr_pages long.
eddb1c22 3154 *
c1e8d7c6 3155 * Attempt to pin user pages in memory without taking mm->mmap_lock.
eddb1c22
JH
3156 * If not successful, it will fall back to taking the lock and
3157 * calling get_user_pages().
3158 *
3159 * Returns number of pages pinned. This may be fewer than the number requested.
3160 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
3161 * -errno.
3162 */
3163int get_user_pages_fast(unsigned long start, int nr_pages,
3164 unsigned int gup_flags, struct page **pages)
3165{
94202f12
JH
3166 /*
3167 * The caller may or may not have explicitly set FOLL_GET; either way is
3168 * OK. However, internally (within mm/gup.c), gup fast variants must set
3169 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
3170 * request.
3171 */
b2cac248 3172 if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
d64e2dbc 3173 return -EINVAL;
eddb1c22
JH
3174 return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
3175}
050a9adc 3176EXPORT_SYMBOL_GPL(get_user_pages_fast);

/**
 * pin_user_pages_fast() - pin user pages in memory without taking locks
 *
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @gup_flags: flags modifying pin behaviour
 * @pages: array that receives pointers to the pages pinned.
 *         Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
 * get_user_pages_fast() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for further details.
 */
int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages)
{
        if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
                return -EINVAL;
        return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
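
/*
 * Editor's illustrative sketch, not part of gup.c: pinning a user buffer as
 * a DMA target with pin_user_pages_fast(). The DMA setup is elided and the
 * helper name is hypothetical; the point is that FOLL_PIN pins are paired
 * with unpin_user_pages(), per Documentation/core-api/pin_user_pages.rst.
 */
static inline int example_dma_to_user_buffer(unsigned long start, int nr_pages,
                                             struct page **pages)
{
        int pinned;

        pinned = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
        if (pinned <= 0)
                return pinned;

        /* ... map the pinned pages and run DMA into them ... */

        unpin_user_pages(pages, pinned);
        return pinned;
}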

/**
 * pin_user_pages_remote() - pin pages of a remote process
 *
 * @mm: mm_struct of target mm
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @gup_flags: flags modifying lookup behaviour
 * @pages: array that receives pointers to the pages pinned.
 *         Should be at least nr_pages long.
 * @locked: pointer to lock flag indicating whether lock is held and
 *          subsequently whether VM_FAULT_RETRY functionality can be
 *          utilised. Lock must initially be held.
 *
 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
 * get_user_pages_remote() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 */
long pin_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked)
{
        int local_locked = 1;

        if (!is_valid_gup_args(pages, locked, &gup_flags,
                               FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
                return 0;
        return __gup_longterm_locked(mm, start, nr_pages, pages,
                                     locked ? locked : &local_locked,
                                     gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_remote);
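
/*
 * Editor's illustrative sketch, not part of gup.c: pinning pages of another
 * process from a caller that holds a reference to the target mm. The helper
 * and its locking pattern are hypothetical but follow the documented
 * contract: mmap_lock is held on entry, and *locked reports whether it is
 * still held on return (it may have been dropped for VM_FAULT_RETRY
 * handling).
 */
static inline long example_pin_remote(struct mm_struct *mm, unsigned long start,
                                      unsigned long nr_pages,
                                      struct page **pages)
{
        int locked = 1;
        long pinned;

        mmap_read_lock(mm);
        pinned = pin_user_pages_remote(mm, start, nr_pages, FOLL_WRITE,
                                       pages, &locked);
        if (locked)
                mmap_read_unlock(mm);

        if (pinned > 0) {
                /* ... access the remote pages ... */
                unpin_user_pages(pages, pinned);
        }
        return pinned;
}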

/**
 * pin_user_pages() - pin user pages in memory for use by other devices
 *
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @gup_flags: flags modifying lookup behaviour
 * @pages: array that receives pointers to the pages pinned.
 *         Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
 * FOLL_PIN is set.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 */
long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages)
{
        int locked = 1;

        if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
                return 0;
        return __gup_longterm_locked(current->mm, start, nr_pages,
                                     pages, &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
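
/*
 * Editor's illustrative sketch, not part of gup.c: a long-lived pin of a
 * buffer in the current process, of the kind a device doing long-term DMA
 * might take. The helper is hypothetical; FOLL_LONGTERM asks GUP to migrate
 * pages out of regions such as CMA/ZONE_MOVABLE before pinning them.
 */
static inline long example_longterm_pin(unsigned long start,
                                        unsigned long nr_pages,
                                        struct page **pages)
{
        long pinned;

        pinned = pin_user_pages(start, nr_pages,
                                FOLL_WRITE | FOLL_LONGTERM, pages);
        if (pinned <= 0)
                return pinned;

        /* ... hand pages to the device; unpin_user_pages() on teardown ... */
        return pinned;
}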

/*
 * pin_user_pages_unlocked() is the FOLL_PIN variant of
 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
 * FOLL_PIN and rejects FOLL_GET.
 */
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                             struct page **pages, unsigned int gup_flags)
{
        int locked = 0;

        if (!is_valid_gup_args(pages, NULL, &gup_flags,
                               FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
                return 0;

        return __gup_longterm_locked(current->mm, start, nr_pages, pages,
                                     &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
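
/*
 * Editor's illustrative sketch, not part of gup.c: the unlocked variant for
 * a caller that does not already hold mmap_lock; the lock is taken and
 * dropped internally as needed. The helper name is hypothetical. Note the
 * argument order here: pages precedes gup_flags.
 */
static inline long example_pin_unlocked(unsigned long start,
                                        unsigned long nr_pages,
                                        struct page **pages)
{
        long pinned;

        pinned = pin_user_pages_unlocked(start, nr_pages, pages, FOLL_WRITE);
        if (pinned > 0) {
                /* ... use the pinned pages ... */
                unpin_user_pages(pages, pinned);
        }
        return pinned;
}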