[thirdparty/kernel/stable.git] / mm / userfaultfd.c

/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include "internal.h"

/*
 * mcopy_atomic_pte - map one page of copied user data at @dst_addr.
 *
 * @dst_mm:   mm owning the destination page tables
 * @dst_pmd:  pmd entry covering @dst_addr
 * @dst_vma:  destination vma
 * @dst_addr: destination user address
 * @src_addr: source user address (only read when *@pagep is NULL)
 * @pagep:    page hand-off slot.  If *@pagep is NULL a new page is
 *            allocated and filled from @src_addr.  If it is non-NULL the
 *            page is taken over as-is (and *@pagep is cleared) without
 *            touching @src_addr.
 *
 * Returns 0 on success, or:
 *   -ENOMEM  page allocation or memcg charge failed;
 *   -ENOENT  the in-place copy from @src_addr was incomplete; the
 *            allocated page is handed back through *@pagep and is NOT
 *            freed here, so the caller owns it;
 *   -EFAULT  the vma is file backed and @dst_addr lies beyond i_size;
 *   -EEXIST  the destination pte is already populated.
 */
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr,
			    struct page **pagep)
{
	struct mem_cgroup *memcg;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	void *page_kaddr;
	int ret;
	struct page *page;
	pgoff_t offset, max_off;
	struct inode *inode;

	if (!*pagep) {
		ret = -ENOMEM;
		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
		if (!page)
			goto out;

		page_kaddr = kmap_atomic(page);
		ret = copy_from_user(page_kaddr,
				     (const void __user *) src_addr,
				     PAGE_SIZE);
		kunmap_atomic(page_kaddr);

		/* fallback to copy_from_user outside mmap_sem */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*pagep = page;
			/* don't free the page */
			goto out;
		}

		/*
		 * The data was written through the kernel mapping of the
		 * page.  Flush the D-cache so that, on architectures with
		 * aliasing caches, the user mapping installed below does
		 * not observe stale contents.
		 */
		flush_dcache_page(page);
	} else {
		/* Take over the page supplied by the caller. */
		page = *pagep;
		*pagep = NULL;
	}

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	ret = -ENOMEM;
	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
		goto out_release;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));

	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (dst_vma->vm_file) {
		/* the shmem MAP_PRIVATE case requires checking the i_size */
		inode = dst_vma->vm_file->f_inode;
		offset = linear_page_index(dst_vma, dst_addr);
		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
		ret = -EFAULT;
		if (unlikely(offset >= max_off))
			goto out_release_uncharge_unlock;
	}
	/* Never overwrite an entry that is already populated. */
	ret = -EEXIST;
	if (!pte_none(*dst_pte))
		goto out_release_uncharge_unlock;

	inc_mm_counter(dst_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);

	pte_unmap_unlock(dst_pte, ptl);
	ret = 0;
out:
	return ret;
out_release_uncharge_unlock:
	/* Undo in reverse order: drop the pte lock, then the memcg charge. */
	pte_unmap_unlock(dst_pte, ptl);
	mem_cgroup_cancel_charge(page, memcg, false);
out_release:
	put_page(page);
	goto out;
}

/*
 * mfill_zeropage_pte - map the zero page at @dst_addr.
 *
 * Builds a special pte for my_zero_pfn(@dst_addr) using the vma's page
 * protection and installs it under the pte lock, provided the slot is
 * still empty.
 *
 * Returns 0 on success, -EFAULT if the vma is file backed and @dst_addr
 * lies beyond i_size, or -EEXIST if the pte is already populated.
 */
static int mfill_zeropage_pte(struct mm_struct *dst_mm,
			      pmd_t *dst_pmd,
			      struct vm_area_struct *dst_vma,
			      unsigned long dst_addr)
{
	spinlock_t *ptl;
	pte_t *ptep;
	pte_t zero_pte;
	int ret;

	zero_pte = pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot);
	zero_pte = pte_mkspecial(zero_pte);

	ptep = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);

	if (dst_vma->vm_file) {
		/* the shmem MAP_PRIVATE case requires checking the i_size */
		struct inode *inode = dst_vma->vm_file->f_inode;
		pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
		pgoff_t nr_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

		if (unlikely(pgoff >= nr_pages)) {
			ret = -EFAULT;
			goto out_unlock;
		}
	}

	if (!pte_none(*ptep)) {
		/* Somebody else populated the entry first. */
		ret = -EEXIST;
		goto out_unlock;
	}

	set_pte_at(dst_mm, dst_addr, ptep, zero_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, ptep);
	ret = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
	return ret;
}

/*
 * mm_alloc_pmd - walk down to the pmd for @address, allocating the
 * intermediate p4d/pud/pmd levels as needed.
 *
 * Returns the pmd pointer, or NULL if any level could not be allocated.
 *
 * Note that we didn't run this because the pmd was missing: the *pmd
 * may already be established and in turn it may also be a
 * trans_huge_pmd, so callers must check what they got back.
 */
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd = pgd_offset(mm, address);
	p4d_t *p4d = p4d_alloc(mm, pgd, address);
	pud_t *pud = p4d ? pud_alloc(mm, p4d, address) : NULL;

	return pud ? pmd_alloc(mm, pud, address) : NULL;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_sem held, it will release mmap_sem before returning.
 *
 * @dst_mm:    mm owning the destination range
 * @dst_vma:   hugetlb vma found by the caller for @dst_start
 * @dst_start: destination start, must be huge page aligned
 * @src_start: source user address
 * @len:       length in bytes, must be a multiple of the huge page size
 * @zeropage:  zero-fill request; always rejected with -EINVAL here
 *
 * Returns the number of bytes copied if any progress was made, otherwise
 * a negative errno (-EINVAL, -ENOENT, -ENOMEM, -EEXIST, -EFAULT, -EINTR,
 * or whatever hugetlb_mcopy_atomic_pte() reported).
 *
 * NOTE(review): this routine takes no mmap_changing argument, so the
 * retry path below cannot repeat the caller's mmap_changing check after
 * re-taking mmap_sem - confirm whether that is intended.
 */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	/*
	 * vm_alloc_shared tracks the sharing mode of the vma the pending
	 * page was allocated for; vm_shared tracks the current dst_vma.
	 */
	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	struct hstate *h;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
	 * by THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (zeropage) {
		up_read(&dst_mm->mmap_sem);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_sem and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		err = -ENOENT;
		dst_vma = find_vma(dst_mm, dst_start);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;
		/*
		 * Check the vma is registered in uffd, this is
		 * required to enforce the VM_MAYWRITE check done at
		 * uffd registration time.
		 */
		if (!dst_vma->vm_userfaultfd_ctx.ctx)
			goto out_unlock;

		/* The whole destination range must sit inside this vma. */
		if (dst_start < dst_vma->vm_start ||
		    dst_start + len > dst_vma->vm_end)
			goto out_unlock;

		/* The huge page size must not have changed across the retry. */
		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		vm_shared = dst_vma->vm_flags & VM_SHARED;
	}

	/* Sanity check: the remaining work must still be huge page aligned. */
	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
		    (len - copied) & (vma_hpagesize - 1)))
		goto out_unlock;

	/*
	 * If not shared, ensure the dst_vma has an anon_vma.
	 */
	err = -ENOMEM;
	if (!vm_shared) {
		if (unlikely(anon_vma_prepare(dst_vma)))
			goto out_unlock;
	}

	h = hstate_vma(dst_vma);

	/* Install one huge page per iteration. */
	while (src_addr < src_start + len) {
		pte_t dst_pteval;

		BUG_ON(dst_addr >= dst_start + len);
		VM_BUG_ON(dst_addr & ~huge_page_mask(h));

		/*
		 * Serialize via hugetlb_fault_mutex
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
								idx, dst_addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
		if (!dst_pte) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		/* Refuse to overwrite an already populated entry. */
		err = -EEXIST;
		dst_pteval = huge_ptep_get(dst_pte);
		if (!huge_pte_none(dst_pteval)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
						dst_addr, src_addr, &page);

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		/* Remember which kind of mapping any returned page belongs to. */
		vm_alloc_shared = vm_shared;

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			/*
			 * A page was handed back in 'page' (see the BUG_ON):
			 * fill it from user memory with mmap_sem dropped,
			 * then re-take mmap_sem and revalidate the vma.
			 */
			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			err = copy_huge_page_from_user(page,
						(const void __user *)src_addr,
						pages_per_huge_page(h), true);
			if (unlikely(err)) {
				err = -EFAULT;
				/* mmap_sem is already released: skip out_unlock */
				goto out;
			}
			down_read(&dst_mm->mmap_sem);

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			/* Stop early, keeping the progress made so far. */
			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page) {
		/*
		 * We encountered an error and are about to free a newly
		 * allocated huge page.
		 *
		 * Reservation handling is very subtle, and is different for
		 * private and shared mappings.  See the routine
		 * restore_reserve_on_error for details.  Unfortunately, we
		 * can not call restore_reserve_on_error now as it would
		 * require holding mmap_sem.
		 *
		 * If a reservation for the page existed in the reservation
		 * map of a private mapping, the map was modified to indicate
		 * the reservation was consumed when the page was allocated.
		 * We clear the PagePrivate flag now so that the global
		 * reserve count will not be incremented in free_huge_page.
		 * The reservation map will still indicate the reservation
		 * was consumed and possibly prevent later page allocation.
		 * This is better than leaking a global reservation.  If no
		 * reservation existed, it is still safe to clear PagePrivate
		 * as no adjustments to reservation counts were made during
		 * allocation.
		 *
		 * The reservation map for shared mappings indicates which
		 * pages have reservations.  When a huge page is allocated
		 * for an address with a reservation, no change is made to
		 * the reserve map.  In this case PagePrivate will be set
		 * to indicate that the global reservation count should be
		 * incremented when the page is freed.  This is the desired
		 * behavior.  However, when a huge page is allocated for an
		 * address without a reservation a reservation entry is added
		 * to the reservation map, and PagePrivate will not be set.
		 * When the page is freed, the global reserve count will NOT
		 * be incremented and it will appear as though we have leaked
		 * reserved page.  In this case, set PagePrivate so that the
		 * global reserve count will be incremented to match the
		 * reservation map entry which was created.
		 *
		 * Note that vm_alloc_shared is based on the flags of the vma
		 * for which the page was originally allocated.  dst_vma could
		 * be different or NULL on error.
		 */
		if (vm_alloc_shared)
			SetPagePrivate(page);
		else
			ClearPagePrivate(page);
		put_page(page);
	}
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	/* Partial progress wins over the error that stopped the loop. */
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
				      struct vm_area_struct *dst_vma,
				      unsigned long dst_start,
				      unsigned long src_start,
				      unsigned long len,
				      bool zeropage);
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * mfill_atomic_pte - fill a single pte, dispatching on mapping type.
 *
 * Shared vmas go to the shmem helpers; everything else is filled with
 * anonymous memory by the local helpers.  @zeropage selects zero-fill
 * over copying from @src_addr.  @page is the page hand-off slot passed
 * through to the copy helpers.
 *
 * Returns whatever the selected helper returns.
 */
static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
						pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						struct page **page,
						bool zeropage)
{
	const bool shared = dst_vma->vm_flags & VM_SHARED;

	if (shared) {
		if (zeropage)
			return shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
							dst_vma, dst_addr);

		return shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
					      dst_addr, src_addr, page);
	}

	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
	if (zeropage)
		return mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr);

	return mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				src_addr, page);
}

/*
 * __mcopy_atomic - common worker behind mcopy_atomic() and mfill_zeropage().
 *
 * @dst_mm:        mm owning the destination range
 * @dst_start:     page aligned destination start
 * @src_start:     source user address (callers pass 0 for zeropage)
 * @len:           page aligned, non-zero length in bytes
 * @zeropage:      true to map zero pages instead of copying
 * @mmap_changing: optional flag; when set and true the operation bails
 *                 out with -EAGAIN
 *
 * Fills the range page by page under mmap_sem held for read.  When a
 * page helper reports -ENOENT, mmap_sem is dropped, the page it handed
 * back is filled from user memory, and the whole vma validation is
 * redone before continuing.
 *
 * Returns the number of bytes filled if any progress was made, otherwise
 * a negative errno.
 */
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage,
					      bool *mmap_changing)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
retry:
	down_read(&dst_mm->mmap_sem);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	err = -EAGAIN;
	if (mmap_changing && READ_ONCE(*mmap_changing))
		goto out_unlock;

	/*
	 * Make sure that the dst range is both valid and fully within a
	 * single existing vma.
	 */
	err = -ENOENT;
	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma)
		goto out_unlock;
	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out_unlock;

	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 *
	 * NOTE(review): when reached via the retry label, 'page' and
	 * 'copied' may be non-zero and are not handed to the hugetlb
	 * routine nor released on this return - confirm whether a vma
	 * that turns into hugetlb between retries needs handling.
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
						src_start, len, zeropage);

	/* Only anonymous and shmem vmas are supported beyond this point. */
	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved in the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (!(dst_vma->vm_flags & VM_SHARED) &&
	    unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       src_addr, &page, zeropage);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *page_kaddr;

			/*
			 * A page was handed back in 'page' (see the BUG_ON):
			 * fill it from user memory with mmap_sem dropped.
			 */
			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			page_kaddr = kmap(page);
			err = copy_from_user(page_kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap(page);
			if (unlikely(err)) {
				err = -EFAULT;
				/* mmap_sem is already released: skip out_unlock */
				goto out;
			}
			/*
			 * The data was written through the kernel mapping
			 * of the page.  Flush the D-cache so that, on
			 * architectures with aliasing caches, the user
			 * mapping installed on retry does not observe
			 * stale contents.
			 */
			flush_dcache_page(page);
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			/* Stop early, keeping the progress made so far. */
			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page)
		put_page(page);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	/* Partial progress wins over the error that stopped the loop. */
	return copied ? copied : err;
}

/*
 * mcopy_atomic - copy @len bytes from @src_start into @dst_start of
 * @dst_mm.  Thin wrapper around __mcopy_atomic() with zero-fill disabled;
 * the return value is passed through unchanged.
 */
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
		     unsigned long src_start, unsigned long len,
		     bool *mmap_changing)
{
	const bool zeropage = false;

	return __mcopy_atomic(dst_mm, dst_start, src_start, len, zeropage,
			      mmap_changing);
}

/*
 * mfill_zeropage - zero-fill @len bytes at @start of @dst_mm.  Thin
 * wrapper around __mcopy_atomic() with zero-fill enabled and a source
 * address of 0; the return value is passed through unchanged.
 */
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len, bool *mmap_changing)
{
	const unsigned long no_src = 0;
	const bool zeropage = true;

	return __mcopy_atomic(dst_mm, start, no_src, len, zeropage,
			      mmap_changing);
}
Commit	Line	Data
c1a4de99 AA	1	/*
	2	* mm/userfaultfd.c
	3	*
	4	* Copyright (C) 2015 Red Hat, Inc.
	5	*
	6	* This work is licensed under the terms of the GNU GPL, version 2. See
	7	* the COPYING file in the top-level directory.
	8	*/
	9
	10	#include <linux/mm.h>
174cd4b1	11	#include <linux/sched/signal.h>
c1a4de99 AA	12	#include <linux/pagemap.h>
	13	#include <linux/rmap.h>
	14	#include <linux/swap.h>
	15	#include <linux/swapops.h>
	16	#include <linux/userfaultfd_k.h>
	17	#include <linux/mmu_notifier.h>
60d4d2d2	18	#include <linux/hugetlb.h>
26071ced	19	#include <linux/shmem_fs.h>
c1a4de99 AA	20	#include <asm/tlbflush.h>
	21	#include "internal.h"
	22
	23	static int mcopy_atomic_pte(struct mm_struct *dst_mm,
	24	pmd_t *dst_pmd,
	25	struct vm_area_struct *dst_vma,
	26	unsigned long dst_addr,
b6ebaedb AA	27	unsigned long src_addr,
b6ebaedb AA	28	struct page **pagep)
c1a4de99 AA	29	{
	30	struct mem_cgroup *memcg;
	31	pte_t _dst_pte, *dst_pte;
	32	spinlock_t *ptl;
c1a4de99 AA	33	void *page_kaddr;
c1a4de99 AA	34	int ret;
b6ebaedb	35	struct page *page;
e2a50c1f AA	36	pgoff_t offset, max_off;
e2a50c1f AA	37	struct inode *inode;
c1a4de99	38
b6ebaedb AA	39	if (!*pagep) {
	40	ret = -ENOMEM;
	41	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
	42	if (!page)
	43	goto out;
	44
	45	page_kaddr = kmap_atomic(page);
	46	ret = copy_from_user(page_kaddr,
	47	(const void __user *) src_addr,
	48	PAGE_SIZE);
	49	kunmap_atomic(page_kaddr);
	50
	51	/* fallback to copy_from_user outside mmap_sem */
	52	if (unlikely(ret)) {
9e368259	53	ret = -ENOENT;
b6ebaedb AA	54	*pagep = page;
	55	/* don't free the page */
	56	goto out;
	57	}
	58	} else {
	59	page = *pagep;
	60	*pagep = NULL;
	61	}
c1a4de99 AA	62
	63	/*
	64	* The memory barrier inside __SetPageUptodate makes sure that
	65	* preceeding stores to the page contents become visible before
	66	* the set_pte_at() write.
	67	*/
	68	__SetPageUptodate(page);
	69
	70	ret = -ENOMEM;
f627c2f5	71	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
c1a4de99 AA	72	goto out_release;
	73
	74	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	75	if (dst_vma->vm_flags & VM_WRITE)
	76	_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
	77
c1a4de99	78	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
e2a50c1f AA	79	if (dst_vma->vm_file) {
	80	/* the shmem MAP_PRIVATE case requires checking the i_size */
	81	inode = dst_vma->vm_file->f_inode;
	82	offset = linear_page_index(dst_vma, dst_addr);
	83	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	84	ret = -EFAULT;
	85	if (unlikely(offset >= max_off))
	86	goto out_release_uncharge_unlock;
	87	}
	88	ret = -EEXIST;
c1a4de99 AA	89	if (!pte_none(*dst_pte))
	90	goto out_release_uncharge_unlock;
	91
	92	inc_mm_counter(dst_mm, MM_ANONPAGES);
d281ee61	93	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
f627c2f5	94	mem_cgroup_commit_charge(page, memcg, false, false);
c1a4de99 AA	95	lru_cache_add_active_or_unevictable(page, dst_vma);
	96
	97	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	98
	99	/* No need to invalidate - it was non-present before */
	100	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	101
	102	pte_unmap_unlock(dst_pte, ptl);
	103	ret = 0;
	104	out:
	105	return ret;
	106	out_release_uncharge_unlock:
	107	pte_unmap_unlock(dst_pte, ptl);
f627c2f5	108	mem_cgroup_cancel_charge(page, memcg, false);
c1a4de99	109	out_release:
09cbfeaf	110	put_page(page);
c1a4de99	111	goto out;
c1a4de99 AA	112	}
	113
	114	static int mfill_zeropage_pte(struct mm_struct *dst_mm,
	115	pmd_t *dst_pmd,
	116	struct vm_area_struct *dst_vma,
	117	unsigned long dst_addr)
	118	{
	119	pte_t _dst_pte, *dst_pte;
	120	spinlock_t *ptl;
	121	int ret;
e2a50c1f AA	122	pgoff_t offset, max_off;
e2a50c1f AA	123	struct inode *inode;
c1a4de99 AA	124
	125	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
	126	dst_vma->vm_page_prot));
c1a4de99	127	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
e2a50c1f AA	128	if (dst_vma->vm_file) {
	129	/* the shmem MAP_PRIVATE case requires checking the i_size */
	130	inode = dst_vma->vm_file->f_inode;
	131	offset = linear_page_index(dst_vma, dst_addr);
	132	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	133	ret = -EFAULT;
	134	if (unlikely(offset >= max_off))
	135	goto out_unlock;
	136	}
	137	ret = -EEXIST;
c1a4de99 AA	138	if (!pte_none(*dst_pte))
	139	goto out_unlock;
	140	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	141	/* No need to invalidate - it was non-present before */
	142	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	143	ret = 0;
	144	out_unlock:
	145	pte_unmap_unlock(dst_pte, ptl);
	146	return ret;
	147	}
	148
	149	static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
	150	{
	151	pgd_t *pgd;
c2febafc	152	p4d_t *p4d;
c1a4de99	153	pud_t *pud;
c1a4de99 AA	154
c1a4de99 AA	155	pgd = pgd_offset(mm, address);
c2febafc KS	156	p4d = p4d_alloc(mm, pgd, address);
	157	if (!p4d)
	158	return NULL;
	159	pud = pud_alloc(mm, p4d, address);
	160	if (!pud)
	161	return NULL;
	162	/*
	163	* Note that we didn't run this because the pmd was
	164	* missing, the *pmd may be already established and in
	165	* turn it may also be a trans_huge_pmd.
	166	*/
	167	return pmd_alloc(mm, pud, address);
c1a4de99 AA	168	}
c1a4de99 AA	169
60d4d2d2 MK	170	#ifdef CONFIG_HUGETLB_PAGE
	171	/*
	172	* __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
	173	* called with mmap_sem held, it will release mmap_sem before returning.
	174	*/
	175	static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
	176	struct vm_area_struct *dst_vma,
	177	unsigned long dst_start,
	178	unsigned long src_start,
	179	unsigned long len,
	180	bool zeropage)
	181	{
1c9e8def MK	182	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
1c9e8def MK	183	int vm_shared = dst_vma->vm_flags & VM_SHARED;
60d4d2d2 MK	184	ssize_t err;
	185	pte_t *dst_pte;
	186	unsigned long src_addr, dst_addr;
	187	long copied;
	188	struct page *page;
	189	struct hstate *h;
	190	unsigned long vma_hpagesize;
	191	pgoff_t idx;
	192	u32 hash;
	193	struct address_space *mapping;
	194
	195	/*
	196	* There is no default zero huge page for all huge page sizes as
	197	* supported by hugetlb. A PMD_SIZE huge pages may exist as used
	198	* by THP. Since we can not reliably insert a zero page, this
	199	* feature is not supported.
	200	*/
	201	if (zeropage) {
	202	up_read(&dst_mm->mmap_sem);
	203	return -EINVAL;
	204	}
	205
	206	src_addr = src_start;
	207	dst_addr = dst_start;
	208	copied = 0;
	209	page = NULL;
	210	vma_hpagesize = vma_kernel_pagesize(dst_vma);
	211
	212	/*
	213	* Validate alignment based on huge page size
	214	*/
	215	err = -EINVAL;
	216	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
	217	goto out_unlock;
	218
	219	retry:
	220	/*
	221	* On routine entry dst_vma is set. If we had to drop mmap_sem and
	222	* retry, dst_vma will be set to NULL and we must lookup again.
	223	*/
	224	if (!dst_vma) {
27d02568	225	err = -ENOENT;
60d4d2d2 MK	226	dst_vma = find_vma(dst_mm, dst_start);
	227	if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
	228	goto out_unlock;
60d4d2d2	229	/*
29ec9066 AA	230	* Check the vma is registered in uffd, this is
	231	* required to enforce the VM_MAYWRITE check done at
	232	* uffd registration time.
60d4d2d2	233	*/
27d02568 MR	234	if (!dst_vma->vm_userfaultfd_ctx.ctx)
	235	goto out_unlock;
	236
60d4d2d2 MK	237	if (dst_start < dst_vma->vm_start ||
	238	dst_start + len > dst_vma->vm_end)
	239	goto out_unlock;
1c9e8def	240
27d02568 MR	241	err = -EINVAL;
	242	if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
	243	goto out_unlock;
	244
1c9e8def	245	vm_shared = dst_vma->vm_flags & VM_SHARED;
60d4d2d2 MK	246	}
	247
	248	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
	249	(len - copied) & (vma_hpagesize - 1)))
	250	goto out_unlock;
	251
60d4d2d2	252	/*
1c9e8def	253	* If not shared, ensure the dst_vma has a anon_vma.
60d4d2d2 MK	254	*/
60d4d2d2 MK	255	err = -ENOMEM;
1c9e8def MK	256	if (!vm_shared) {
	257	if (unlikely(anon_vma_prepare(dst_vma)))
	258	goto out_unlock;
	259	}
60d4d2d2 MK	260
	261	h = hstate_vma(dst_vma);
	262
	263	while (src_addr < src_start + len) {
	264	pte_t dst_pteval;
	265
	266	BUG_ON(dst_addr >= dst_start + len);
	267	VM_BUG_ON(dst_addr & ~huge_page_mask(h));
	268
	269	/*
ddeaab32	270	* Serialize via hugetlb_fault_mutex
60d4d2d2	271	*/
b43a9990	272	idx = linear_page_index(dst_vma, dst_addr);
ddeaab32	273	mapping = dst_vma->vm_file->f_mapping;
60d4d2d2 MK	274	hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
	275	idx, dst_addr);
	276	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	277
	278	err = -ENOMEM;
	279	dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
	280	if (!dst_pte) {
	281	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	282	goto out_unlock;
	283	}
	284
	285	err = -EEXIST;
	286	dst_pteval = huge_ptep_get(dst_pte);
	287	if (!huge_pte_none(dst_pteval)) {
	288	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	289	goto out_unlock;
	290	}
	291
	292	err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
	293	dst_addr, src_addr, &page);
	294
	295	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
1c9e8def	296	vm_alloc_shared = vm_shared;
60d4d2d2 MK	297
	298	cond_resched();
	299
9e368259	300	if (unlikely(err == -ENOENT)) {
60d4d2d2 MK	301	up_read(&dst_mm->mmap_sem);
	302	BUG_ON(!page);
	303
	304	err = copy_huge_page_from_user(page,
	305	(const void __user *)src_addr,
810a56b9	306	pages_per_huge_page(h), true);
60d4d2d2 MK	307	if (unlikely(err)) {
	308	err = -EFAULT;
	309	goto out;
	310	}
	311	down_read(&dst_mm->mmap_sem);
	312
	313	dst_vma = NULL;
	314	goto retry;
	315	} else
	316	BUG_ON(page);
	317
	318	if (!err) {
	319	dst_addr += vma_hpagesize;
	320	src_addr += vma_hpagesize;
	321	copied += vma_hpagesize;
	322
	323	if (fatal_signal_pending(current))
	324	err = -EINTR;
	325	}
	326	if (err)
	327	break;
	328	}
	329
	330	out_unlock:
	331	up_read(&dst_mm->mmap_sem);
	332	out:
21205bf8 MK	333	if (page) {
	334	/*
	335	* We encountered an error and are about to free a newly
1c9e8def MK	336	* allocated huge page.
	337	*
	338	* Reservation handling is very subtle, and is different for
	339	* private and shared mappings. See the routine
	340	* restore_reserve_on_error for details. Unfortunately, we
	341	* can not call restore_reserve_on_error now as it would
	342	* require holding mmap_sem.
	343	*
	344	* If a reservation for the page existed in the reservation
	345	* map of a private mapping, the map was modified to indicate
	346	* the reservation was consumed when the page was allocated.
	347	* We clear the PagePrivate flag now so that the global
21205bf8 MK	348	* reserve count will not be incremented in free_huge_page.
	349	* The reservation map will still indicate the reservation
	350	* was consumed and possibly prevent later page allocation.
1c9e8def MK	351	* This is better than leaking a global reservation. If no
	352	* reservation existed, it is still safe to clear PagePrivate
	353	* as no adjustments to reservation counts were made during
	354	* allocation.
	355	*
	356	* The reservation map for shared mappings indicates which
	357	* pages have reservations. When a huge page is allocated
	358	* for an address with a reservation, no change is made to
	359	* the reserve map. In this case PagePrivate will be set
	360	* to indicate that the global reservation count should be
	361	* incremented when the page is freed. This is the desired
	362	* behavior. However, when a huge page is allocated for an
	363	* address without a reservation a reservation entry is added
	364	* to the reservation map, and PagePrivate will not be set.
	365	* When the page is freed, the global reserve count will NOT
	366	* be incremented and it will appear as though we have leaked
	367	* reserved page. In this case, set PagePrivate so that the
	368	* global reserve count will be incremented to match the
	369	* reservation map entry which was created.
	370	*
	371	* Note that vm_alloc_shared is based on the flags of the vma
	372	* for which the page was originally allocated. dst_vma could
	373	* be different or NULL on error.
21205bf8	374	*/
1c9e8def MK	375	if (vm_alloc_shared)
	376	SetPagePrivate(page);
	377	else
	378	ClearPagePrivate(page);
60d4d2d2	379	put_page(page);
21205bf8	380	}
60d4d2d2 MK	381	BUG_ON(copied < 0);
	382	BUG_ON(err > 0);
	383	BUG_ON(!copied && !err);
	384	return copied ? copied : err;
	385	}
	386	#else /* !CONFIG_HUGETLB_PAGE */
	387	/* fail at build time if gcc attempts to use this */
	388	extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
	389	struct vm_area_struct *dst_vma,
	390	unsigned long dst_start,
	391	unsigned long src_start,
	392	unsigned long len,
	393	bool zeropage);
	394	#endif /* CONFIG_HUGETLB_PAGE */
	395
3217d3c7 MR	396	static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
	397	pmd_t *dst_pmd,
	398	struct vm_area_struct *dst_vma,
	399	unsigned long dst_addr,
	400	unsigned long src_addr,
	401	struct page **page,
	402	bool zeropage)
	403	{
	404	ssize_t err;
	405
5b51072e AA	406	/*
	407	* The normal page fault path for a shmem will invoke the
	408	* fault, fill the hole in the file and COW it right away. The
	409	* result generates plain anonymous memory. So when we are
	410	* asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll
	411	* generate anonymous memory directly without actually filling
	412	* the hole. For the MAP_PRIVATE case the robustness check
	413	* only happens in the pagetable (to verify it's still none)
	414	* and not in the radix tree.
	415	*/
	416	if (!(dst_vma->vm_flags & VM_SHARED)) {
3217d3c7 MR	417	if (!zeropage)
	418	err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
	419	dst_addr, src_addr, page);
	420	else
	421	err = mfill_zeropage_pte(dst_mm, dst_pmd,
	422	dst_vma, dst_addr);
	423	} else {
8fb44e54	424	if (!zeropage)
3217d3c7 MR	425	err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
	426	dst_vma, dst_addr,
	427	src_addr, page);
8fb44e54 MR	428	else
	429	err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
	430	dst_vma, dst_addr);
3217d3c7 MR	431	}
	432
	433	return err;
	434	}
	435
c1a4de99 AA	436	static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
	437	unsigned long dst_start,
	438	unsigned long src_start,
	439	unsigned long len,
df2cc96e MR	440	bool zeropage,
df2cc96e MR	441	bool *mmap_changing)
c1a4de99 AA	442	{
	443	struct vm_area_struct *dst_vma;
	444	ssize_t err;
	445	pmd_t *dst_pmd;
	446	unsigned long src_addr, dst_addr;
b6ebaedb AA	447	long copied;
b6ebaedb AA	448	struct page *page;
c1a4de99 AA	449
	450	/*
	451	* Sanitize the command parameters:
	452	*/
	453	BUG_ON(dst_start & ~PAGE_MASK);
	454	BUG_ON(len & ~PAGE_MASK);
	455
	456	/* Does the address range wrap, or is the span zero-sized? */
	457	BUG_ON(src_start + len <= src_start);
	458	BUG_ON(dst_start + len <= dst_start);
	459
b6ebaedb AA	460	src_addr = src_start;
	461	dst_addr = dst_start;
	462	copied = 0;
	463	page = NULL;
	464	retry:
c1a4de99 AA	465	down_read(&dst_mm->mmap_sem);
c1a4de99 AA	466
df2cc96e MR	467	/*
	468	* If memory mappings are changing because of non-cooperative
	469	* operation (e.g. mremap) running in parallel, bail out and
	470	* request the user to retry later
	471	*/
	472	err = -EAGAIN;
	473	if (mmap_changing && READ_ONCE(*mmap_changing))
	474	goto out_unlock;
	475
c1a4de99 AA	476	/*
	477	* Make sure the vma is not shared, that the dst range is
	478	* both valid and fully within a single existing vma.
	479	*/
27d02568	480	err = -ENOENT;
c1a4de99	481	dst_vma = find_vma(dst_mm, dst_start);
26071ced MR	482	if (!dst_vma)
26071ced MR	483	goto out_unlock;
1c9e8def	484	/*
29ec9066 AA	485	* Check the vma is registered in uffd, this is required to
	486	* enforce the VM_MAYWRITE check done at uffd registration
	487	* time.
1c9e8def	488	*/
27d02568	489	if (!dst_vma->vm_userfaultfd_ctx.ctx)
b6ebaedb	490	goto out_unlock;
1c9e8def	491
c1a4de99 AA	492	if (dst_start < dst_vma->vm_start ||
c1a4de99 AA	493	dst_start + len > dst_vma->vm_end)
b6ebaedb	494	goto out_unlock;
c1a4de99	495
27d02568 MR	496	err = -EINVAL;
	497	/*
	498	* shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	499	* it will overwrite vm_ops, so vma_is_anonymous must return false.
	500	*/
	501	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	502	dst_vma->vm_flags & VM_SHARED))
	503	goto out_unlock;
	504
60d4d2d2 MK	505	/*
	506	* If this is a HUGETLB vma, pass off to appropriate routine
	507	*/
	508	if (is_vm_hugetlb_page(dst_vma))
	509	return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
	510	src_start, len, zeropage);
	511
26071ced	512	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
b6ebaedb	513	goto out_unlock;
c1a4de99 AA	514
	515	/*
	516	* Ensure the dst_vma has a anon_vma or this page
	517	* would get a NULL anon_vma when moved in the
	518	* dst_vma.
	519	*/
	520	err = -ENOMEM;
5b51072e AA	521	if (!(dst_vma->vm_flags & VM_SHARED) &&
5b51072e AA	522	unlikely(anon_vma_prepare(dst_vma)))
b6ebaedb	523	goto out_unlock;
c1a4de99	524
b6ebaedb	525	while (src_addr < src_start + len) {
c1a4de99	526	pmd_t dst_pmdval;
b6ebaedb	527
c1a4de99	528	BUG_ON(dst_addr >= dst_start + len);
b6ebaedb	529
c1a4de99 AA	530	dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
	531	if (unlikely(!dst_pmd)) {
	532	err = -ENOMEM;
	533	break;
	534	}
	535
	536	dst_pmdval = pmd_read_atomic(dst_pmd);
	537	/*
	538	* If the dst_pmd is mapped as THP don't
	539	* override it and just be strict.
	540	*/
	541	if (unlikely(pmd_trans_huge(dst_pmdval))) {
	542	err = -EEXIST;
	543	break;
	544	}
	545	if (unlikely(pmd_none(dst_pmdval)) &&
4cf58924	546	unlikely(__pte_alloc(dst_mm, dst_pmd))) {
c1a4de99 AA	547	err = -ENOMEM;
	548	break;
	549	}
	550	/* If an huge pmd materialized from under us fail */
	551	if (unlikely(pmd_trans_huge(*dst_pmd))) {
	552	err = -EFAULT;
	553	break;
	554	}
	555
	556	BUG_ON(pmd_none(*dst_pmd));
	557	BUG_ON(pmd_trans_huge(*dst_pmd));
	558
3217d3c7 MR	559	err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
3217d3c7 MR	560	src_addr, &page, zeropage);
c1a4de99 AA	561	cond_resched();
c1a4de99 AA	562
9e368259	563	if (unlikely(err == -ENOENT)) {
b6ebaedb AA	564	void *page_kaddr;
	565
	566	up_read(&dst_mm->mmap_sem);
	567	BUG_ON(!page);
	568
	569	page_kaddr = kmap(page);
	570	err = copy_from_user(page_kaddr,
	571	(const void __user *) src_addr,
	572	PAGE_SIZE);
	573	kunmap(page);
	574	if (unlikely(err)) {
	575	err = -EFAULT;
	576	goto out;
	577	}
	578	goto retry;
	579	} else
	580	BUG_ON(page);
	581
c1a4de99 AA	582	if (!err) {
	583	dst_addr += PAGE_SIZE;
	584	src_addr += PAGE_SIZE;
	585	copied += PAGE_SIZE;
	586
	587	if (fatal_signal_pending(current))
	588	err = -EINTR;
	589	}
	590	if (err)
	591	break;
	592	}
	593
b6ebaedb	594	out_unlock:
c1a4de99	595	up_read(&dst_mm->mmap_sem);
b6ebaedb AA	596	out:
b6ebaedb AA	597	if (page)
09cbfeaf	598	put_page(page);
c1a4de99 AA	599	BUG_ON(copied < 0);
	600	BUG_ON(err > 0);
	601	BUG_ON(!copied && !err);
	602	return copied ? copied : err;
	603	}
	604
	605	ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
df2cc96e MR	606	unsigned long src_start, unsigned long len,
df2cc96e MR	607	bool *mmap_changing)
c1a4de99	608	{
df2cc96e MR	609	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
df2cc96e MR	610	mmap_changing);
c1a4de99 AA	611	}
	612
	613	ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
df2cc96e	614	unsigned long len, bool *mmap_changing)
c1a4de99	615	{
df2cc96e	616	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
c1a4de99	617	}