// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
static __always_inline
struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
				    unsigned long dst_start,
				    unsigned long len)
{
	/*
	 * Make sure that the dst range is both valid and fully within a
	 * single existing vma.
	 */
	struct vm_area_struct *dst_vma;

	dst_vma = find_vma(dst_mm, dst_start);
	if (!range_in_vma(dst_vma, dst_start, dst_start + len))
		return NULL;

	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return NULL;

	return dst_vma;
}
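/*
 * Usage note: the entry points below (mfill_atomic() and, for hugetlb,
 * mfill_atomic_hugetlb()) call find_dst_vma() each time they (re)take
 * mmap_lock for read, so both the "range fits in one vma" check and the
 * "vma is registered with a userfaultfd" check are repeated on every retry.
 */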
/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;

	if (!dst_vma->vm_file)
		return false;

	inode = dst_vma->vm_file->f_inode;
	offset = linear_page_index(dst_vma, dst_addr);
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	return offset >= max_off;
}
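/*
 * Worked example (illustration only): with 4K pages and
 * i_size_read(inode) == 5000, max_off = DIV_ROUND_UP(5000, 4096) = 2,
 * so file offsets 0 and 1 may still be filled while offset 2 and beyond
 * are past EOF and the callers below bail out with -EFAULT.
 */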
/*
 * Install PTEs to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	bool page_in_cache = page_mapping(page);
	spinlock_t *ptl;
	struct folio *folio;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	_dst_pte = pte_mkdirty(_dst_pte);
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable)
		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
	if (flags & MFILL_ATOMIC_WP)
		_dst_pte = pte_mkuffd_wp(_dst_pte);

	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/*
	 * We allow overwriting a pte marker: consider when both MISSING|WP
	 * are registered, we first wr-protect a none pte which has no page
	 * cache page backing it, then access the page.
	 */
	if (!pte_none_mostly(ptep_get(dst_pte)))
		goto out_unlock;

	folio = page_folio(page);
	if (page_in_cache) {
		/* Usually, cache pages are already added to LRU */
		if (newly_allocated)
			folio_add_lru(folio);
		page_add_file_rmap(page, dst_vma, false);
	} else {
		page_add_new_anon_rmap(page, dst_vma, dst_addr);
		folio_add_lru_vma(folio, dst_vma);
	}

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(page));

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}
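/*
 * Order of operations above: build the pte value first (dirty, optionally
 * writable, optionally uffd-wp), then take the pte lock, re-check the file
 * size and that the slot is still (mostly) none, hook the page into
 * rmap/LRU, bump the mm counter, and only then set_pte_at().  Any failure
 * before set_pte_at() leaves the page table untouched.
 */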
static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
				 struct vm_area_struct *dst_vma,
				 unsigned long dst_addr,
				 unsigned long src_addr,
				 uffd_flags_t flags,
				 struct folio **foliop)
{
	void *kaddr;
	int ret;
	struct folio *folio;

	if (!*foliop) {
		ret = -ENOMEM;
		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
					dst_addr, false);
		if (!folio)
			goto out;

		kaddr = kmap_local_folio(folio, 0);
		/*
		 * The read mmap_lock is held here.  Despite the
		 * mmap_lock being read-recursive, a deadlock is still
		 * possible if a writer has taken a lock.  For example:
		 *
		 * process A thread 1 takes read lock on own mmap_lock
		 * process A thread 2 calls mmap, blocks taking write lock
		 * process B thread 1 takes page fault, read lock on own mmap lock
		 * process B thread 2 calls mmap, blocks taking write lock
		 * process A thread 1 blocks taking read lock on process B
		 * process B thread 1 blocks taking read lock on process A
		 *
		 * Disable page faults to prevent potential deadlock
		 * and retry the copy outside the mmap_lock.
		 */
		pagefault_disable();
		ret = copy_from_user(kaddr, (const void __user *) src_addr,
				     PAGE_SIZE);
		pagefault_enable();
		kunmap_local(kaddr);

		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*foliop = folio;
			/* don't free the page */
			goto out;
		}

		flush_dcache_folio(folio);
	} else {
		folio = *foliop;
		*foliop = NULL;
	}

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	ret = -ENOMEM;
	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_release;

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, flags);
	if (ret)
		goto out_release;
out:
	return ret;
out_release:
	folio_put(folio);
	goto out;
}
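/*
 * Note on the -ENOENT path above: if copy_from_user() faults with page
 * faults disabled, the freshly allocated folio is handed back through
 * *foliop and the caller (mfill_atomic()) redoes the copy outside
 * mmap_lock before retrying this page.
 */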
static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;
	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;
	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}
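/*
 * The zeropage variant allocates nothing: it maps the shared zero page
 * through a "special" pte (no rmap, no refcounting), effectively read-only;
 * a later write fault will COW it into a private page as usual.
 */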
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr,
				     uffd_flags_t flags)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct folio *folio;
	struct page *page;
	int ret;

	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
	/* Our caller expects us to return -EFAULT if we failed to find folio */
	if (ret == -ENOENT)
		ret = -EFAULT;
	if (ret)
		goto out;
	if (!folio) {
		ret = -EFAULT;
		goto out;
	}

	page = folio_file_page(folio, pgoff);
	if (PageHWPoison(page)) {
		ret = -EIO;
		goto out_release;
	}

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       page, false, flags);
	if (ret)
		goto out_release;

	folio_unlock(folio);
	ret = 0;
out:
	return ret;
out_release:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}
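/*
 * UFFDIO_CONTINUE resolves a minor fault: the page cache page must already
 * exist (SGP_NOALLOC never allocates), so the only work left is installing
 * page table entries that point at it.
 */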
/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
				   struct vm_area_struct *dst_vma,
				   unsigned long dst_addr,
				   uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;

	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
	if (!pte_none(*dst_pte))
		goto out_unlock;

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}
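/*
 * A PTE_MARKER_POISONED entry makes the range behave as if the memory were
 * hardware-poisoned: subsequent accesses fault and the task gets SIGBUS
 * rather than having pages silently filled in.
 */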
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that this is not necessarily run because the pmd was
	 * missing: the *pmd may already be established, and it may
	 * even be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}
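/*
 * This walks (and allocates, where missing) the upper page table levels:
 * pgd -> p4d -> pud -> pmd.  The pte level is handled later by the caller,
 * which runs __pte_alloc() only if the returned pmd is still none.
 */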
#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_lock held, it will release mmap_lock before returning.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      uffd_flags_t flags)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge page may exist as used
	 * by THP.  Since we cannot reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		mmap_read_unlock(dst_mm);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		err = -ENOENT;
		dst_vma = find_dst_vma(dst_mm, dst_start, len);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		vm_shared = dst_vma->vm_flags & VM_SHARED;
	}

	/*
	 * If not shared, ensure the dst_vma has an anon_vma.
	 */
	err = -ENOMEM;
	if (!vm_shared) {
		if (unlikely(anon_vma_prepare(dst_vma)))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		BUG_ON(dst_addr >= dst_start + len);

		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds.  fault mutex prevents
		 * races with other faulting threads.
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		hugetlb_vma_lock_read(dst_vma);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
			err = -EEXIST;
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
					       src_addr, flags, &folio);

		hugetlb_vma_unlock_read(dst_vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			mmap_read_unlock(dst_mm);
			BUG_ON(!folio);

			err = copy_folio_from_user(folio,
						   (const void __user *)src_addr, true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			mmap_read_lock(dst_mm);

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(folio);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	mmap_read_unlock(dst_mm);
out:
	if (folio)
		folio_put(folio);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
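/*
 * The -ENOENT handling above mirrors the non-hugetlb path: the huge folio
 * was allocated but the source could not be copied with mmap_lock held, so
 * the lock is dropped, the copy is redone with copy_folio_from_user(), and
 * the loop retries with dst_vma looked up again (the mapping may have
 * changed while the lock was released).
 */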
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
				    unsigned long dst_start,
				    unsigned long src_start,
				    unsigned long len,
				    uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						uffd_flags_t flags,
						struct folio **foliop)
{
	ssize_t err;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
						 dst_addr, flags);
	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
					       dst_addr, flags);
	}

	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
						    dst_addr, src_addr,
						    flags, foliop);
		else
			err = mfill_atomic_pte_zeropage(dst_pmd,
							dst_vma, dst_addr);
	} else {
		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
					     dst_addr, src_addr,
					     flags, foliop);
	}

	return err;
}
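/*
 * Dispatch summary: CONTINUE and POISON are handled above irrespective of
 * the vma type; COPY and ZEROPAGE use the anonymous-memory helpers for
 * private mappings and shmem_mfill_atomic_pte() for shared shmem mappings.
 */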
static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
					    unsigned long dst_start,
					    unsigned long src_start,
					    unsigned long len,
					    atomic_t *mmap_changing,
					    uffd_flags_t flags)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
retry:
	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	err = -EAGAIN;
	if (mmap_changing && atomic_read(mmap_changing))
		goto out_unlock;

	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	err = -ENOENT;
	dst_vma = find_dst_vma(dst_mm, dst_start, len);
	if (!dst_vma)
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return mfill_atomic_hugetlb(dst_vma, dst_start,
					    src_start, len, flags);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) &&
	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved in the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (!(dst_vma->vm_flags & VM_SHARED) &&
	    unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
				       src_addr, flags, &folio);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *kaddr;

			mmap_read_unlock(dst_mm);
			BUG_ON(!folio);

			kaddr = kmap_local_folio(folio, 0);
			err = copy_from_user(kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap_local(kaddr);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			flush_dcache_folio(folio);
			goto retry;
		} else
			BUG_ON(folio);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	mmap_read_unlock(dst_mm);
out:
	if (folio)
		folio_put(folio);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
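/*
 * Return convention (shared with the hugetlb variant): the number of bytes
 * actually filled if any progress was made, otherwise the error from the
 * first failing page.  A short count with a pending error lets userspace
 * resume the operation at dst_start + copied.
 */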
ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
			  unsigned long src_start, unsigned long len,
			  atomic_t *mmap_changing, uffd_flags_t flags)
{
	return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}
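/*
 * For illustration only (userspace side, not part of this file):
 * mfill_atomic_copy() is reached via the UFFDIO_COPY ioctl.  A minimal
 * sketch, assuming "uffd" is a userfaultfd with the range registered:
 *
 *	struct uffdio_copy copy = {
 *		.dst  = (unsigned long)fault_addr,   // page-aligned dst address
 *		.src  = (unsigned long)src_buf,      // buffer holding the data
 *		.len  = page_size,
 *		.mode = 0,                           // or UFFDIO_COPY_MODE_WP
 *	};
 *	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
 *		handle_error();                      // copy.copy reports progress
 */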
ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
			      unsigned long len, atomic_t *mmap_changing)
{
	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}
ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
			      unsigned long len, atomic_t *mmap_changing,
			      uffd_flags_t flags)
{
	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
			    unsigned long len, atomic_t *mmap_changing,
			    uffd_flags_t flags)
{
	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}
long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
			"The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected as default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}
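/*
 * enable_wp == true arms the uffd-wp bit so that future writes fault and
 * are reported to the userfaultfd reader; enable_wp == false resolves it
 * and, where possible, restores write permission in the same pass so the
 * next write does not take a spurious fault.
 */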
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
			unsigned long len, bool enable_wp,
			atomic_t *mmap_changing)
{
	unsigned long end = start + len;
	unsigned long _start, _end;
	struct vm_area_struct *dst_vma;
	unsigned long page_mask;
	long err;
	VMA_ITERATOR(vmi, dst_mm, start);

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	err = -EAGAIN;
	if (mmap_changing && atomic_read(mmap_changing))
		goto out_unlock;

	err = -ENOENT;
	for_each_vma_range(vmi, dst_vma, end) {

		if (!userfaultfd_wp(dst_vma)) {
			err = -ENOENT;
			break;
		}

		if (is_vm_hugetlb_page(dst_vma)) {
			err = -EINVAL;
			page_mask = vma_kernel_pagesize(dst_vma) - 1;
			if ((start & page_mask) || (len & page_mask))
				break;
		}

		_start = max(dst_vma->vm_start, start);
		_end = min(dst_vma->vm_end, end);

		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

		/* Return 0 on success, <0 on failures */
		if (err < 0)
			break;
		err = 0;
	}
out_unlock:
	mmap_read_unlock(dst_mm);
	return err;
}