+++ /dev/null
-From b43a9990055958e70347c56f90ea2ae32c67334c Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:38 -0800
-Subject: hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit b43a9990055958e70347c56f90ea2ae32c67334c upstream.
-
-While looking at BUGs associated with invalid huge page map counts, it was
-discovered and observed that a huge pte pointer could become 'invalid' and
-point to another task's page table. Consider the following:
-
-A task takes a page fault on a shared hugetlbfs file and calls
-huge_pte_alloc to get a ptep. Suppose the returned ptep points to a
-shared pmd.
-
-Now, another task truncates the hugetlbfs file. As part of truncation, it
-unmaps everyone who has the file mapped. If the range being truncated is
-covered by a shared pmd, huge_pmd_unshare will be called. For all but the
-last user of the shared pmd, huge_pmd_unshare will clear the pud pointing
-to the pmd. If the task in the middle of the page fault is not the last
-user, the ptep returned by huge_pte_alloc now points to another task's
-page table or worse. This leads to bad things such as incorrect page
-map/reference counts or invalid memory references.
-
-To fix, expand the use of i_mmap_rwsem as follows:
-
-- i_mmap_rwsem is held in read mode whenever huge_pmd_share is called.
- huge_pmd_share is only called via huge_pte_alloc, so callers of
- huge_pte_alloc take i_mmap_rwsem before calling. In addition, callers
- of huge_pte_alloc continue to hold the semaphore until finished with the
- ptep.
-
-- i_mmap_rwsem is held in write mode whenever huge_pmd_unshare is
- called.
-
-[mike.kravetz@oracle.com: add explicit check for mapping != null]
-Link: http://lkml.kernel.org/r/20181218223557.5202-2-mike.kravetz@oracle.com
-Fixes: 39dde65c9940 ("shared page table for hugetlb page")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: Colin Ian King <colin.king@canonical.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- mm/hugetlb.c | 61 ++++++++++++++++++++++++++++++++++++++++++----------
- mm/memory-failure.c | 16 +++++++++++--
- mm/migrate.c | 13 ++++++++++-
- mm/rmap.c | 4 +++
- mm/userfaultfd.c | 11 +++++++--
- 5 files changed, 89 insertions(+), 16 deletions(-)
-
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3215,6 +3215,7 @@ int copy_hugetlb_page_range(struct mm_st
- struct page *ptepage;
- unsigned long addr;
- int cow;
-+ struct address_space *mapping = vma->vm_file->f_mapping;
- struct hstate *h = hstate_vma(vma);
- unsigned long sz = huge_page_size(h);
- unsigned long mmun_start; /* For mmu_notifiers */
-@@ -3227,12 +3228,23 @@ int copy_hugetlb_page_range(struct mm_st
- mmun_end = vma->vm_end;
- if (cow)
- mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
-+ else {
-+ /*
-+ * For shared mappings i_mmap_rwsem must be held to call
-+ * huge_pte_alloc, otherwise the returned ptep could go
-+ * away if part of a shared pmd and another thread calls
-+ * huge_pmd_unshare.
-+ */
-+ i_mmap_lock_read(mapping);
-+ }
-
- for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
- spinlock_t *src_ptl, *dst_ptl;
-+
- src_pte = huge_pte_offset(src, addr, sz);
- if (!src_pte)
- continue;
-+
- dst_pte = huge_pte_alloc(dst, addr, sz);
- if (!dst_pte) {
- ret = -ENOMEM;
-@@ -3298,6 +3310,8 @@ int copy_hugetlb_page_range(struct mm_st
-
- if (cow)
- mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
-+ else
-+ i_mmap_unlock_read(mapping);
-
- return ret;
- }
-@@ -3724,14 +3738,18 @@ retry:
- };
-
- /*
-- * hugetlb_fault_mutex must be dropped before
-- * handling userfault. Reacquire after handling
-- * fault to make calling code simpler.
-+ * hugetlb_fault_mutex and i_mmap_rwsem must be
-+ * dropped before handling userfault. Reacquire
-+ * after handling fault to make calling code simpler.
- */
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, address);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
-+
- ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-+
-+ i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- goto out;
- }
-@@ -3884,6 +3902,11 @@ int hugetlb_fault(struct mm_struct *mm,
-
- ptep = huge_pte_offset(mm, address, huge_page_size(h));
- if (ptep) {
-+ /*
-+ * Since we hold no locks, ptep could be stale. That is
-+ * OK as we are only making decisions based on content and
-+ * not actually modifying content here.
-+ */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, mm, ptep);
-@@ -3897,14 +3920,29 @@ int hugetlb_fault(struct mm_struct *mm,
- return VM_FAULT_OOM;
- }
-
-+ /*
-+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-+ * until finished with ptep. This prevents huge_pmd_unshare from
-+ * being called elsewhere and making the ptep no longer valid.
-+ *
-+ * ptep could have already been assigned via huge_pte_offset. That
-+ * is OK, as huge_pte_alloc will return the same value unless
-+ * something changed.
-+ */
- mapping = vma->vm_file->f_mapping;
-- idx = vma_hugecache_offset(h, vma, address);
-+ i_mmap_lock_read(mapping);
-+ ptep = huge_pte_alloc(mm, address, huge_page_size(h));
-+ if (!ptep) {
-+ i_mmap_unlock_read(mapping);
-+ return VM_FAULT_OOM;
-+ }
-
- /*
- * Serialize hugepage allocation and instantiation, so that we don't
- * get spurious allocation failures if two CPUs race to instantiate
- * the same page in the page cache.
- */
-+ idx = vma_hugecache_offset(h, vma, address);
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
-@@ -3992,6 +4030,7 @@ out_ptl:
- }
- out_mutex:
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- /*
- * Generally it's safe to hold refcount during waiting page lock. But
- * here we just wait to defer the next page fault to avoid busy loop and
-@@ -4576,10 +4615,12 @@ void adjust_range_if_pmd_sharing_possibl
- * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
- * and returns the corresponding pte. While this is not necessary for the
- * !shared pmd case because we can allocate the pmd later as well, it makes the
-- * code much cleaner. pmd allocation is essential for the shared case because
-- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
-- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
-- * bad pmd for sharing.
-+ * code much cleaner.
-+ *
-+ * This routine must be called with i_mmap_rwsem held in at least read mode.
-+ * For hugetlbfs, this prevents removal of any page table entries associated
-+ * with the address space. This is important as we are setting up sharing
-+ * based on existing page table entries (mappings).
- */
- pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
- {
-@@ -4596,7 +4637,6 @@ pte_t *huge_pmd_share(struct mm_struct *
- if (!vma_shareable(vma, addr))
- return (pte_t *)pmd_alloc(mm, pud, addr);
-
-- i_mmap_lock_write(mapping);
- vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
- if (svma == vma)
- continue;
-@@ -4626,7 +4666,6 @@ pte_t *huge_pmd_share(struct mm_struct *
- spin_unlock(ptl);
- out:
- pte = (pte_t *)pmd_alloc(mm, pud, addr);
-- i_mmap_unlock_write(mapping);
- return pte;
- }
-
-@@ -4637,7 +4676,7 @@ out:
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
-- * called with page table lock held.
-+ * Called with page table lock held and i_mmap_rwsem held in write mode.
- *
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
---- a/mm/memory-failure.c
-+++ b/mm/memory-failure.c
-@@ -933,7 +933,7 @@ static bool hwpoison_user_mappings(struc
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
- struct address_space *mapping;
- LIST_HEAD(tokill);
-- bool unmap_success;
-+ bool unmap_success = true;
- int kill = 1, forcekill;
- struct page *hpage = *hpagep;
- bool mlocked = PageMlocked(hpage);
-@@ -995,7 +995,19 @@ static bool hwpoison_user_mappings(struc
- if (kill)
- collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
-
-- unmap_success = try_to_unmap(hpage, ttu);
-+ if (!PageHuge(hpage)) {
-+ unmap_success = try_to_unmap(hpage, ttu);
-+ } else if (mapping) {
-+ /*
-+ * For hugetlb pages, try_to_unmap could potentially call
-+ * huge_pmd_unshare. Because of this, take semaphore in
-+ * write mode here and set TTU_RMAP_LOCKED to indicate we
-+ * have taken the lock at this higher level.
-+ */
-+ i_mmap_lock_write(mapping);
-+ unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
-+ i_mmap_unlock_write(mapping);
-+ }
- if (!unmap_success)
- pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
---- a/mm/migrate.c
-+++ b/mm/migrate.c
-@@ -1307,8 +1307,19 @@ static int unmap_and_move_huge_page(new_
- goto put_anon;
-
- if (page_mapped(hpage)) {
-+ struct address_space *mapping = page_mapping(hpage);
-+
-+ /*
-+ * try_to_unmap could potentially call huge_pmd_unshare.
-+ * Because of this, take semaphore in write mode here and
-+ * set TTU_RMAP_LOCKED to let lower levels know we have
-+ * taken the lock.
-+ */
-+ i_mmap_lock_write(mapping);
- try_to_unmap(hpage,
-- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
-+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-+ TTU_RMAP_LOCKED);
-+ i_mmap_unlock_write(mapping);
- page_was_mapped = 1;
- }
-
---- a/mm/rmap.c
-+++ b/mm/rmap.c
-@@ -25,6 +25,7 @@
- * page->flags PG_locked (lock_page)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
- * mapping->i_mmap_rwsem
-+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- * anon_vma->rwsem
- * mm->page_table_lock or pte_lock
- * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
-@@ -1370,6 +1371,9 @@ static bool try_to_unmap_one(struct page
- /*
- * If sharing is possible, start and end will be adjusted
- * accordingly.
-+ *
-+ * If called for a huge page, caller must hold i_mmap_rwsem
-+ * in write mode as it is possible to call huge_pmd_unshare.
- */
- adjust_range_if_pmd_sharing_possible(vma, &start, &end);
- }
---- a/mm/userfaultfd.c
-+++ b/mm/userfaultfd.c
-@@ -268,10 +268,14 @@ retry:
- VM_BUG_ON(dst_addr & ~huge_page_mask(h));
-
- /*
-- * Serialize via hugetlb_fault_mutex
-+ * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
-+ * i_mmap_rwsem ensures the dst_pte remains valid even
-+ * in the case of shared pmds. fault mutex prevents
-+ * races with other faulting threads.
- */
-- idx = linear_page_index(dst_vma, dst_addr);
- mapping = dst_vma->vm_file->f_mapping;
-+ i_mmap_lock_read(mapping);
-+ idx = linear_page_index(dst_vma, dst_addr);
- hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
- idx, dst_addr);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-@@ -280,6 +284,7 @@ retry:
- dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
- if (!dst_pte) {
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- goto out_unlock;
- }
-
-@@ -287,6 +292,7 @@ retry:
- dst_pteval = huge_ptep_get(dst_pte);
- if (!huge_pte_none(dst_pteval)) {
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- goto out_unlock;
- }
-
-@@ -294,6 +300,7 @@ retry:
- dst_addr, src_addr, &page);
-
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- vm_alloc_shared = vm_shared;
-
- cond_resched();
+++ /dev/null
-From c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:42 -0800
-Subject: hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 upstream.
-
-hugetlbfs page faults can race with truncate and hole punch operations.
-Current code in the page fault path attempts to handle this by 'backing
-out' operations if we encounter the race. One obvious omission in the
-current code is removing a page newly added to the page cache. This is
-pretty straight forward to address, but there is a more subtle and
-difficult issue of backing out hugetlb reservations. To handle this
-correctly, the 'reservation state' before page allocation needs to be
-noted so that it can be properly backed out. There are four distinct
-possibilities for reservation state: shared/reserved, shared/no-resv,
-private/reserved and private/no-resv. Backing out a reservation may
-require memory allocation which could fail so that needs to be taken into
-account as well.
-
-Instead of writing the required complicated code for this rare occurrence,
-just eliminate the race. i_mmap_rwsem is now held in read mode for the
-duration of page fault processing. Hold i_mmap_rwsem longer in truncation
-and hole punch code to cover the call to remove_inode_hugepages.
-
-With this modification, code in remove_inode_hugepages checking for races
-becomes 'dead' as it can no longer happen. Remove the dead code and
-expand comments to explain reasoning. Similarly, checks for races with
-truncation in the page fault path can be simplified and removed.
-
-[mike.kravetz@oracle.com: incorporate suggestions from Kirill]
- Link: http://lkml.kernel.org/r/20181222223013.22193-3-mike.kravetz@oracle.com
-Link: http://lkml.kernel.org/r/20181218223557.5202-3-mike.kravetz@oracle.com
-Fixes: ebed4bfc8da8 ("hugetlb: fix absurd HugePages_Rsvd")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- fs/hugetlbfs/inode.c | 61 +++++++++++++++++++++++----------------------------
- mm/hugetlb.c | 21 ++++++++---------
- 2 files changed, 38 insertions(+), 44 deletions(-)
-
---- a/fs/hugetlbfs/inode.c
-+++ b/fs/hugetlbfs/inode.c
-@@ -393,17 +393,16 @@ hugetlb_vmdelete_list(struct rb_root_cac
- * truncation is indicated by end of range being LLONG_MAX
- * In this case, we first scan the range and release found pages.
- * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
-- * maps and global counts. Page faults can not race with truncation
-- * in this routine. hugetlb_no_page() prevents page faults in the
-- * truncated range. It checks i_size before allocation, and again after
-- * with the page table lock for the page held. The same lock must be
-- * acquired to unmap a page.
-+ * maps and global counts.
- * hole punch is indicated if end is not LLONG_MAX
- * In the hole punch case we scan the range and release found pages.
- * Only when releasing a page is the associated region/reserv map
- * deleted. The region/reserv map for ranges without associated
-- * pages are not modified. Page faults can race with hole punch.
-- * This is indicated if we find a mapped page.
-+ * pages are not modified.
-+ *
-+ * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
-+ * races with page faults.
-+ *
- * Note: If the passed end of range value is beyond the end of file, but
- * not LLONG_MAX this routine still performs a hole punch operation.
- */
-@@ -433,32 +432,14 @@ static void remove_inode_hugepages(struc
-
- for (i = 0; i < pagevec_count(&pvec); ++i) {
- struct page *page = pvec.pages[i];
-- u32 hash;
-
- index = page->index;
-- hash = hugetlb_fault_mutex_hash(h, current->mm,
-- &pseudo_vma,
-- mapping, index, 0);
-- mutex_lock(&hugetlb_fault_mutex_table[hash]);
--
- /*
-- * If page is mapped, it was faulted in after being
-- * unmapped in caller. Unmap (again) now after taking
-- * the fault mutex. The mutex will prevent faults
-- * until we finish removing the page.
-- *
-- * This race can only happen in the hole punch case.
-- * Getting here in a truncate operation is a bug.
-+ * A mapped page is impossible as callers should unmap
-+ * all references before calling. And, i_mmap_rwsem
-+ * prevents the creation of additional mappings.
- */
-- if (unlikely(page_mapped(page))) {
-- BUG_ON(truncate_op);
--
-- i_mmap_lock_write(mapping);
-- hugetlb_vmdelete_list(&mapping->i_mmap,
-- index * pages_per_huge_page(h),
-- (index + 1) * pages_per_huge_page(h));
-- i_mmap_unlock_write(mapping);
-- }
-+ VM_BUG_ON(page_mapped(page));
-
- lock_page(page);
- /*
-@@ -480,7 +461,6 @@ static void remove_inode_hugepages(struc
- }
-
- unlock_page(page);
-- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- }
- huge_pagevec_release(&pvec);
- cond_resched();
-@@ -492,9 +472,20 @@ static void remove_inode_hugepages(struc
-
- static void hugetlbfs_evict_inode(struct inode *inode)
- {
-+ struct address_space *mapping = inode->i_mapping;
- struct resv_map *resv_map;
-
-+ /*
-+ * The vfs layer guarantees that there are no other users of this
-+ * inode. Therefore, it would be safe to call remove_inode_hugepages
-+ * without holding i_mmap_rwsem. We acquire and hold here to be
-+ * consistent with other callers. Since there will be no contention
-+ * on the semaphore, overhead is negligible.
-+ */
-+ i_mmap_lock_write(mapping);
- remove_inode_hugepages(inode, 0, LLONG_MAX);
-+ i_mmap_unlock_write(mapping);
-+
- resv_map = (struct resv_map *)inode->i_mapping->private_data;
- /* root inode doesn't have the resv_map, so we should check it */
- if (resv_map)
-@@ -515,8 +506,8 @@ static int hugetlb_vmtruncate(struct ino
- i_mmap_lock_write(mapping);
- if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
- hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
-- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, offset, LLONG_MAX);
-+ i_mmap_unlock_write(mapping);
- return 0;
- }
-
-@@ -542,8 +533,8 @@ static long hugetlbfs_punch_hole(struct
- hugetlb_vmdelete_list(&mapping->i_mmap,
- hole_start >> PAGE_SHIFT,
- hole_end >> PAGE_SHIFT);
-- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, hole_start, hole_end);
-+ i_mmap_unlock_write(mapping);
- inode_unlock(inode);
- }
-
-@@ -620,7 +611,11 @@ static long hugetlbfs_fallocate(struct f
- /* addr is the offset within the file (zero based) */
- addr = index * hpage_size;
-
-- /* mutex taken here, fault path and hole punch */
-+ /*
-+ * fault mutex taken here, protects against fault path
-+ * and hole punch. inode_lock previously taken protects
-+ * against truncation.
-+ */
- hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
- index, addr);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3709,16 +3709,16 @@ static int hugetlb_no_page(struct mm_str
- }
-
- /*
-- * Use page lock to guard against racing truncation
-- * before we get page_table_lock.
-+ * We can not race with truncation due to holding i_mmap_rwsem.
-+ * Check once here for faults beyond end of file.
- */
-+ size = i_size_read(mapping->host) >> huge_page_shift(h);
-+ if (idx >= size)
-+ goto out;
-+
- retry:
- page = find_lock_page(mapping, idx);
- if (!page) {
-- size = i_size_read(mapping->host) >> huge_page_shift(h);
-- if (idx >= size)
-- goto out;
--
- /*
- * Check for page in userfault range
- */
-@@ -3812,9 +3812,6 @@ retry:
- }
-
- ptl = huge_pte_lock(h, mm, ptep);
-- size = i_size_read(mapping->host) >> huge_page_shift(h);
-- if (idx >= size)
-- goto backout;
-
- ret = 0;
- if (!huge_pte_none(huge_ptep_get(ptep)))
-@@ -3922,8 +3919,10 @@ int hugetlb_fault(struct mm_struct *mm,
-
- /*
- * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-- * until finished with ptep. This prevents huge_pmd_unshare from
-- * being called elsewhere and making the ptep no longer valid.
-+ * until finished with ptep. This serves two purposes:
-+ * 1) It prevents huge_pmd_unshare from being called elsewhere
-+ * and making the ptep no longer valid.
-+ * 2) It synchronizes us with file truncation.
- *
- * ptep could have already been assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
mm-hmm-use-devm-semantics-for-hmm_devmem_-add-remove.patch
mm-hmm-mark-hmm_devmem_-add-add_resource-export_symbol_gpl.patch
mm-swap-fix-swapoff-with-ksm-pages.patch
-hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
-hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
sunrpc-fix-cache_head-leak-due-to-queued-request.patch
sunrpc-use-svc_net-in-svcauth_gss_-functions.patch
sunrpc-use-after-free-in-svc_process_common.patch
+++ /dev/null
-From b43a9990055958e70347c56f90ea2ae32c67334c Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:38 -0800
-Subject: hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit b43a9990055958e70347c56f90ea2ae32c67334c upstream.
-
-While looking at BUGs associated with invalid huge page map counts, it was
-discovered and observed that a huge pte pointer could become 'invalid' and
-point to another task's page table. Consider the following:
-
-A task takes a page fault on a shared hugetlbfs file and calls
-huge_pte_alloc to get a ptep. Suppose the returned ptep points to a
-shared pmd.
-
-Now, another task truncates the hugetlbfs file. As part of truncation, it
-unmaps everyone who has the file mapped. If the range being truncated is
-covered by a shared pmd, huge_pmd_unshare will be called. For all but the
-last user of the shared pmd, huge_pmd_unshare will clear the pud pointing
-to the pmd. If the task in the middle of the page fault is not the last
-user, the ptep returned by huge_pte_alloc now points to another task's
-page table or worse. This leads to bad things such as incorrect page
-map/reference counts or invalid memory references.
-
-To fix, expand the use of i_mmap_rwsem as follows:
-
-- i_mmap_rwsem is held in read mode whenever huge_pmd_share is called.
- huge_pmd_share is only called via huge_pte_alloc, so callers of
- huge_pte_alloc take i_mmap_rwsem before calling. In addition, callers
- of huge_pte_alloc continue to hold the semaphore until finished with the
- ptep.
-
-- i_mmap_rwsem is held in write mode whenever huge_pmd_unshare is
- called.
-
-[mike.kravetz@oracle.com: add explicit check for mapping != null]
-Link: http://lkml.kernel.org/r/20181218223557.5202-2-mike.kravetz@oracle.com
-Fixes: 39dde65c9940 ("shared page table for hugetlb page")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: Colin Ian King <colin.king@canonical.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- mm/hugetlb.c | 65 ++++++++++++++++++++++++++++++++++++++++------------
- mm/memory-failure.c | 16 +++++++++++-
- mm/migrate.c | 13 +++++++++-
- mm/rmap.c | 4 +++
- mm/userfaultfd.c | 11 +++++++-
- 5 files changed, 89 insertions(+), 20 deletions(-)
-
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3237,6 +3237,7 @@ int copy_hugetlb_page_range(struct mm_st
- struct page *ptepage;
- unsigned long addr;
- int cow;
-+ struct address_space *mapping = vma->vm_file->f_mapping;
- struct hstate *h = hstate_vma(vma);
- unsigned long sz = huge_page_size(h);
- unsigned long mmun_start; /* For mmu_notifiers */
-@@ -3249,12 +3250,23 @@ int copy_hugetlb_page_range(struct mm_st
- mmun_end = vma->vm_end;
- if (cow)
- mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
-+ else {
-+ /*
-+ * For shared mappings i_mmap_rwsem must be held to call
-+ * huge_pte_alloc, otherwise the returned ptep could go
-+ * away if part of a shared pmd and another thread calls
-+ * huge_pmd_unshare.
-+ */
-+ i_mmap_lock_read(mapping);
-+ }
-
- for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
- spinlock_t *src_ptl, *dst_ptl;
-+
- src_pte = huge_pte_offset(src, addr, sz);
- if (!src_pte)
- continue;
-+
- dst_pte = huge_pte_alloc(dst, addr, sz);
- if (!dst_pte) {
- ret = -ENOMEM;
-@@ -3325,6 +3337,8 @@ int copy_hugetlb_page_range(struct mm_st
-
- if (cow)
- mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
-+ else
-+ i_mmap_unlock_read(mapping);
-
- return ret;
- }
-@@ -3772,14 +3786,18 @@ retry:
- };
-
- /*
-- * hugetlb_fault_mutex must be dropped before
-- * handling userfault. Reacquire after handling
-- * fault to make calling code simpler.
-+ * hugetlb_fault_mutex and i_mmap_rwsem must be
-+ * dropped before handling userfault. Reacquire
-+ * after handling fault to make calling code simpler.
- */
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, haddr);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
-+
- ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-+
-+ i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- goto out;
- }
-@@ -3927,6 +3945,11 @@ vm_fault_t hugetlb_fault(struct mm_struc
-
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
- if (ptep) {
-+ /*
-+ * Since we hold no locks, ptep could be stale. That is
-+ * OK as we are only making decisions based on content and
-+ * not actually modifying content here.
-+ */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, mm, ptep);
-@@ -3934,20 +3957,31 @@ vm_fault_t hugetlb_fault(struct mm_struc
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
-- } else {
-- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
-- if (!ptep)
-- return VM_FAULT_OOM;
- }
-
-+ /*
-+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-+ * until finished with ptep. This prevents huge_pmd_unshare from
-+ * being called elsewhere and making the ptep no longer valid.
-+ *
-+ * ptep could have already been assigned via huge_pte_offset. That
-+ * is OK, as huge_pte_alloc will return the same value unless
-+ * something changed.
-+ */
- mapping = vma->vm_file->f_mapping;
-- idx = vma_hugecache_offset(h, vma, haddr);
-+ i_mmap_lock_read(mapping);
-+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
-+ if (!ptep) {
-+ i_mmap_unlock_read(mapping);
-+ return VM_FAULT_OOM;
-+ }
-
- /*
- * Serialize hugepage allocation and instantiation, so that we don't
- * get spurious allocation failures if two CPUs race to instantiate
- * the same page in the page cache.
- */
-+ idx = vma_hugecache_offset(h, vma, haddr);
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
-@@ -4035,6 +4069,7 @@ out_ptl:
- }
- out_mutex:
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- /*
- * Generally it's safe to hold refcount during waiting page lock. But
- * here we just wait to defer the next page fault to avoid busy loop and
-@@ -4639,10 +4674,12 @@ void adjust_range_if_pmd_sharing_possibl
- * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
- * and returns the corresponding pte. While this is not necessary for the
- * !shared pmd case because we can allocate the pmd later as well, it makes the
-- * code much cleaner. pmd allocation is essential for the shared case because
-- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
-- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
-- * bad pmd for sharing.
-+ * code much cleaner.
-+ *
-+ * This routine must be called with i_mmap_rwsem held in at least read mode.
-+ * For hugetlbfs, this prevents removal of any page table entries associated
-+ * with the address space. This is important as we are setting up sharing
-+ * based on existing page table entries (mappings).
- */
- pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
- {
-@@ -4659,7 +4696,6 @@ pte_t *huge_pmd_share(struct mm_struct *
- if (!vma_shareable(vma, addr))
- return (pte_t *)pmd_alloc(mm, pud, addr);
-
-- i_mmap_lock_write(mapping);
- vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
- if (svma == vma)
- continue;
-@@ -4689,7 +4725,6 @@ pte_t *huge_pmd_share(struct mm_struct *
- spin_unlock(ptl);
- out:
- pte = (pte_t *)pmd_alloc(mm, pud, addr);
-- i_mmap_unlock_write(mapping);
- return pte;
- }
-
-@@ -4700,7 +4735,7 @@ out:
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
-- * called with page table lock held.
-+ * Called with page table lock held and i_mmap_rwsem held in write mode.
- *
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
---- a/mm/memory-failure.c
-+++ b/mm/memory-failure.c
-@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struc
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
- struct address_space *mapping;
- LIST_HEAD(tokill);
-- bool unmap_success;
-+ bool unmap_success = true;
- int kill = 1, forcekill;
- struct page *hpage = *hpagep;
- bool mlocked = PageMlocked(hpage);
-@@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struc
- if (kill)
- collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
-
-- unmap_success = try_to_unmap(hpage, ttu);
-+ if (!PageHuge(hpage)) {
-+ unmap_success = try_to_unmap(hpage, ttu);
-+ } else if (mapping) {
-+ /*
-+ * For hugetlb pages, try_to_unmap could potentially call
-+ * huge_pmd_unshare. Because of this, take semaphore in
-+ * write mode here and set TTU_RMAP_LOCKED to indicate we
-+ * have taken the lock at this higher level.
-+ */
-+ i_mmap_lock_write(mapping);
-+ unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
-+ i_mmap_unlock_write(mapping);
-+ }
- if (!unmap_success)
- pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
---- a/mm/migrate.c
-+++ b/mm/migrate.c
-@@ -1307,8 +1307,19 @@ static int unmap_and_move_huge_page(new_
- goto put_anon;
-
- if (page_mapped(hpage)) {
-+ struct address_space *mapping = page_mapping(hpage);
-+
-+ /*
-+ * try_to_unmap could potentially call huge_pmd_unshare.
-+ * Because of this, take semaphore in write mode here and
-+ * set TTU_RMAP_LOCKED to let lower levels know we have
-+ * taken the lock.
-+ */
-+ i_mmap_lock_write(mapping);
- try_to_unmap(hpage,
-- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
-+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-+ TTU_RMAP_LOCKED);
-+ i_mmap_unlock_write(mapping);
- page_was_mapped = 1;
- }
-
---- a/mm/rmap.c
-+++ b/mm/rmap.c
-@@ -25,6 +25,7 @@
- * page->flags PG_locked (lock_page)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
- * mapping->i_mmap_rwsem
-+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- * anon_vma->rwsem
- * mm->page_table_lock or pte_lock
- * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
-@@ -1374,6 +1375,9 @@ static bool try_to_unmap_one(struct page
- /*
- * If sharing is possible, start and end will be adjusted
- * accordingly.
-+ *
-+ * If called for a huge page, caller must hold i_mmap_rwsem
-+ * in write mode as it is possible to call huge_pmd_unshare.
- */
- adjust_range_if_pmd_sharing_possible(vma, &start, &end);
- }
---- a/mm/userfaultfd.c
-+++ b/mm/userfaultfd.c
-@@ -267,10 +267,14 @@ retry:
- VM_BUG_ON(dst_addr & ~huge_page_mask(h));
-
- /*
-- * Serialize via hugetlb_fault_mutex
-+ * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
-+ * i_mmap_rwsem ensures the dst_pte remains valid even
-+ * in the case of shared pmds. fault mutex prevents
-+ * races with other faulting threads.
- */
-- idx = linear_page_index(dst_vma, dst_addr);
- mapping = dst_vma->vm_file->f_mapping;
-+ i_mmap_lock_read(mapping);
-+ idx = linear_page_index(dst_vma, dst_addr);
- hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
- idx, dst_addr);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-@@ -279,6 +283,7 @@ retry:
- dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
- if (!dst_pte) {
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- goto out_unlock;
- }
-
-@@ -286,6 +291,7 @@ retry:
- dst_pteval = huge_ptep_get(dst_pte);
- if (!huge_pte_none(dst_pteval)) {
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- goto out_unlock;
- }
-
-@@ -293,6 +299,7 @@ retry:
- dst_addr, src_addr, &page);
-
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- vm_alloc_shared = vm_shared;
-
- cond_resched();
+++ /dev/null
-From c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:42 -0800
-Subject: hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 upstream.
-
-hugetlbfs page faults can race with truncate and hole punch operations.
-Current code in the page fault path attempts to handle this by 'backing
-out' operations if we encounter the race. One obvious omission in the
-current code is removing a page newly added to the page cache. This is
-pretty straight forward to address, but there is a more subtle and
-difficult issue of backing out hugetlb reservations. To handle this
-correctly, the 'reservation state' before page allocation needs to be
-noted so that it can be properly backed out. There are four distinct
-possibilities for reservation state: shared/reserved, shared/no-resv,
-private/reserved and private/no-resv. Backing out a reservation may
-require memory allocation which could fail so that needs to be taken into
-account as well.
-
-Instead of writing the required complicated code for this rare occurrence,
-just eliminate the race. i_mmap_rwsem is now held in read mode for the
-duration of page fault processing. Hold i_mmap_rwsem longer in truncation
-and hole punch code to cover the call to remove_inode_hugepages.
-
-With this modification, code in remove_inode_hugepages checking for races
-becomes 'dead' as it can no longer happen. Remove the dead code and
-expand comments to explain reasoning. Similarly, checks for races with
-truncation in the page fault path can be simplified and removed.
-
-[mike.kravetz@oracle.com: incorporate suggestions from Kirill]
- Link: http://lkml.kernel.org/r/20181222223013.22193-3-mike.kravetz@oracle.com
-Link: http://lkml.kernel.org/r/20181218223557.5202-3-mike.kravetz@oracle.com
-Fixes: ebed4bfc8da8 ("hugetlb: fix absurd HugePages_Rsvd")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- fs/hugetlbfs/inode.c | 61 +++++++++++++++++++++++----------------------------
- mm/hugetlb.c | 21 ++++++++---------
- 2 files changed, 38 insertions(+), 44 deletions(-)
-
---- a/fs/hugetlbfs/inode.c
-+++ b/fs/hugetlbfs/inode.c
-@@ -383,17 +383,16 @@ hugetlb_vmdelete_list(struct rb_root_cac
- * truncation is indicated by end of range being LLONG_MAX
- * In this case, we first scan the range and release found pages.
- * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
-- * maps and global counts. Page faults can not race with truncation
-- * in this routine. hugetlb_no_page() prevents page faults in the
-- * truncated range. It checks i_size before allocation, and again after
-- * with the page table lock for the page held. The same lock must be
-- * acquired to unmap a page.
-+ * maps and global counts.
- * hole punch is indicated if end is not LLONG_MAX
- * In the hole punch case we scan the range and release found pages.
- * Only when releasing a page is the associated region/reserv map
- * deleted. The region/reserv map for ranges without associated
-- * pages are not modified. Page faults can race with hole punch.
-- * This is indicated if we find a mapped page.
-+ * pages are not modified.
-+ *
-+ * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
-+ * races with page faults.
-+ *
- * Note: If the passed end of range value is beyond the end of file, but
- * not LLONG_MAX this routine still performs a hole punch operation.
- */
-@@ -423,32 +422,14 @@ static void remove_inode_hugepages(struc
-
- for (i = 0; i < pagevec_count(&pvec); ++i) {
- struct page *page = pvec.pages[i];
-- u32 hash;
-
- index = page->index;
-- hash = hugetlb_fault_mutex_hash(h, current->mm,
-- &pseudo_vma,
-- mapping, index, 0);
-- mutex_lock(&hugetlb_fault_mutex_table[hash]);
--
- /*
-- * If page is mapped, it was faulted in after being
-- * unmapped in caller. Unmap (again) now after taking
-- * the fault mutex. The mutex will prevent faults
-- * until we finish removing the page.
-- *
-- * This race can only happen in the hole punch case.
-- * Getting here in a truncate operation is a bug.
-+ * A mapped page is impossible as callers should unmap
-+ * all references before calling. And, i_mmap_rwsem
-+ * prevents the creation of additional mappings.
- */
-- if (unlikely(page_mapped(page))) {
-- BUG_ON(truncate_op);
--
-- i_mmap_lock_write(mapping);
-- hugetlb_vmdelete_list(&mapping->i_mmap,
-- index * pages_per_huge_page(h),
-- (index + 1) * pages_per_huge_page(h));
-- i_mmap_unlock_write(mapping);
-- }
-+ VM_BUG_ON(page_mapped(page));
-
- lock_page(page);
- /*
-@@ -470,7 +451,6 @@ static void remove_inode_hugepages(struc
- }
-
- unlock_page(page);
-- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- }
- huge_pagevec_release(&pvec);
- cond_resched();
-@@ -482,9 +462,20 @@ static void remove_inode_hugepages(struc
-
- static void hugetlbfs_evict_inode(struct inode *inode)
- {
-+ struct address_space *mapping = inode->i_mapping;
- struct resv_map *resv_map;
-
-+ /*
-+ * The vfs layer guarantees that there are no other users of this
-+ * inode. Therefore, it would be safe to call remove_inode_hugepages
-+ * without holding i_mmap_rwsem. We acquire and hold here to be
-+ * consistent with other callers. Since there will be no contention
-+ * on the semaphore, overhead is negligible.
-+ */
-+ i_mmap_lock_write(mapping);
- remove_inode_hugepages(inode, 0, LLONG_MAX);
-+ i_mmap_unlock_write(mapping);
-+
- resv_map = (struct resv_map *)inode->i_mapping->private_data;
- /* root inode doesn't have the resv_map, so we should check it */
- if (resv_map)
-@@ -505,8 +496,8 @@ static int hugetlb_vmtruncate(struct ino
- i_mmap_lock_write(mapping);
- if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
- hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
-- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, offset, LLONG_MAX);
-+ i_mmap_unlock_write(mapping);
- return 0;
- }
-
-@@ -540,8 +531,8 @@ static long hugetlbfs_punch_hole(struct
- hugetlb_vmdelete_list(&mapping->i_mmap,
- hole_start >> PAGE_SHIFT,
- hole_end >> PAGE_SHIFT);
-- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, hole_start, hole_end);
-+ i_mmap_unlock_write(mapping);
- inode_unlock(inode);
- }
-
-@@ -624,7 +615,11 @@ static long hugetlbfs_fallocate(struct f
- /* addr is the offset within the file (zero based) */
- addr = index * hpage_size;
-
-- /* mutex taken here, fault path and hole punch */
-+ /*
-+ * fault mutex taken here, protects against fault path
-+ * and hole punch. inode_lock previously taken protects
-+ * against truncation.
-+ */
- hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
- index, addr);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3757,16 +3757,16 @@ static vm_fault_t hugetlb_no_page(struct
- }
-
- /*
-- * Use page lock to guard against racing truncation
-- * before we get page_table_lock.
-+ * We can not race with truncation due to holding i_mmap_rwsem.
-+ * Check once here for faults beyond end of file.
- */
-+ size = i_size_read(mapping->host) >> huge_page_shift(h);
-+ if (idx >= size)
-+ goto out;
-+
- retry:
- page = find_lock_page(mapping, idx);
- if (!page) {
-- size = i_size_read(mapping->host) >> huge_page_shift(h);
-- if (idx >= size)
-- goto out;
--
- /*
- * Check for page in userfault range
- */
-@@ -3856,9 +3856,6 @@ retry:
- }
-
- ptl = huge_pte_lock(h, mm, ptep);
-- size = i_size_read(mapping->host) >> huge_page_shift(h);
-- if (idx >= size)
-- goto backout;
-
- ret = 0;
- if (!huge_pte_none(huge_ptep_get(ptep)))
-@@ -3961,8 +3958,10 @@ vm_fault_t hugetlb_fault(struct mm_struc
-
- /*
- * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-- * until finished with ptep. This prevents huge_pmd_unshare from
-- * being called elsewhere and making the ptep no longer valid.
-+ * until finished with ptep. This serves two purposes:
-+ * 1) It prevents huge_pmd_unshare from being called elsewhere
-+ * and making the ptep no longer valid.
-+ * 2) It synchronizes us with file truncation.
- *
- * ptep could have already been assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
mm-hmm-use-devm-semantics-for-hmm_devmem_-add-remove.patch
mm-hmm-mark-hmm_devmem_-add-add_resource-export_symbol_gpl.patch
mm-swap-fix-swapoff-with-ksm-pages.patch
-hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
-hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path.patch
sunrpc-fix-cache_head-leak-due-to-queued-request.patch
sunrpc-use-svc_net-in-svcauth_gss_-functions.patch
+++ /dev/null
-From b43a9990055958e70347c56f90ea2ae32c67334c Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:38 -0800
-Subject: hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit b43a9990055958e70347c56f90ea2ae32c67334c upstream.
-
-While looking at BUGs associated with invalid huge page map counts, it was
-discovered and observed that a huge pte pointer could become 'invalid' and
-point to another task's page table. Consider the following:
-
-A task takes a page fault on a shared hugetlbfs file and calls
-huge_pte_alloc to get a ptep. Suppose the returned ptep points to a
-shared pmd.
-
-Now, another task truncates the hugetlbfs file. As part of truncation, it
-unmaps everyone who has the file mapped. If the range being truncated is
-covered by a shared pmd, huge_pmd_unshare will be called. For all but the
-last user of the shared pmd, huge_pmd_unshare will clear the pud pointing
-to the pmd. If the task in the middle of the page fault is not the last
-user, the ptep returned by huge_pte_alloc now points to another task's
-page table or worse. This leads to bad things such as incorrect page
-map/reference counts or invalid memory references.
-
-To fix, expand the use of i_mmap_rwsem as follows:
-
-- i_mmap_rwsem is held in read mode whenever huge_pmd_share is called.
- huge_pmd_share is only called via huge_pte_alloc, so callers of
- huge_pte_alloc take i_mmap_rwsem before calling. In addition, callers
- of huge_pte_alloc continue to hold the semaphore until finished with the
- ptep.
-
-- i_mmap_rwsem is held in write mode whenever huge_pmd_unshare is
- called.
-
-[mike.kravetz@oracle.com: add explicit check for mapping != null]
-Link: http://lkml.kernel.org/r/20181218223557.5202-2-mike.kravetz@oracle.com
-Fixes: 39dde65c9940 ("shared page table for hugetlb page")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: Colin Ian King <colin.king@canonical.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- mm/hugetlb.c | 65 ++++++++++++++++++++++++++++++++++++++++------------
- mm/memory-failure.c | 16 +++++++++++-
- mm/migrate.c | 13 +++++++++-
- mm/rmap.c | 4 +++
- mm/userfaultfd.c | 11 +++++++-
- 5 files changed, 89 insertions(+), 20 deletions(-)
-
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3238,6 +3238,7 @@ int copy_hugetlb_page_range(struct mm_st
- struct page *ptepage;
- unsigned long addr;
- int cow;
-+ struct address_space *mapping = vma->vm_file->f_mapping;
- struct hstate *h = hstate_vma(vma);
- unsigned long sz = huge_page_size(h);
- unsigned long mmun_start; /* For mmu_notifiers */
-@@ -3250,12 +3251,23 @@ int copy_hugetlb_page_range(struct mm_st
- mmun_end = vma->vm_end;
- if (cow)
- mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
-+ else {
-+ /*
-+ * For shared mappings i_mmap_rwsem must be held to call
-+ * huge_pte_alloc, otherwise the returned ptep could go
-+ * away if part of a shared pmd and another thread calls
-+ * huge_pmd_unshare.
-+ */
-+ i_mmap_lock_read(mapping);
-+ }
-
- for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
- spinlock_t *src_ptl, *dst_ptl;
-+
- src_pte = huge_pte_offset(src, addr, sz);
- if (!src_pte)
- continue;
-+
- dst_pte = huge_pte_alloc(dst, addr, sz);
- if (!dst_pte) {
- ret = -ENOMEM;
-@@ -3326,6 +3338,8 @@ int copy_hugetlb_page_range(struct mm_st
-
- if (cow)
- mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
-+ else
-+ i_mmap_unlock_read(mapping);
-
- return ret;
- }
-@@ -3773,14 +3787,18 @@ retry:
- };
-
- /*
-- * hugetlb_fault_mutex must be dropped before
-- * handling userfault. Reacquire after handling
-- * fault to make calling code simpler.
-+ * hugetlb_fault_mutex and i_mmap_rwsem must be
-+ * dropped before handling userfault. Reacquire
-+ * after handling fault to make calling code simpler.
- */
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, haddr);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
-+
- ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-+
-+ i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- goto out;
- }
-@@ -3928,6 +3946,11 @@ vm_fault_t hugetlb_fault(struct mm_struc
-
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
- if (ptep) {
-+ /*
-+ * Since we hold no locks, ptep could be stale. That is
-+ * OK as we are only making decisions based on content and
-+ * not actually modifying content here.
-+ */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, mm, ptep);
-@@ -3935,20 +3958,31 @@ vm_fault_t hugetlb_fault(struct mm_struc
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
-- } else {
-- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
-- if (!ptep)
-- return VM_FAULT_OOM;
- }
-
-+ /*
-+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-+ * until finished with ptep. This prevents huge_pmd_unshare from
-+ * being called elsewhere and making the ptep no longer valid.
-+ *
-+ * ptep could have already been assigned via huge_pte_offset. That
-+ * is OK, as huge_pte_alloc will return the same value unless
-+ * something changed.
-+ */
- mapping = vma->vm_file->f_mapping;
-- idx = vma_hugecache_offset(h, vma, haddr);
-+ i_mmap_lock_read(mapping);
-+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
-+ if (!ptep) {
-+ i_mmap_unlock_read(mapping);
-+ return VM_FAULT_OOM;
-+ }
-
- /*
- * Serialize hugepage allocation and instantiation, so that we don't
- * get spurious allocation failures if two CPUs race to instantiate
- * the same page in the page cache.
- */
-+ idx = vma_hugecache_offset(h, vma, haddr);
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
-@@ -4036,6 +4070,7 @@ out_ptl:
- }
- out_mutex:
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- /*
- * Generally it's safe to hold refcount during waiting page lock. But
- * here we just wait to defer the next page fault to avoid busy loop and
-@@ -4640,10 +4675,12 @@ void adjust_range_if_pmd_sharing_possibl
- * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
- * and returns the corresponding pte. While this is not necessary for the
- * !shared pmd case because we can allocate the pmd later as well, it makes the
-- * code much cleaner. pmd allocation is essential for the shared case because
-- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
-- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
-- * bad pmd for sharing.
-+ * code much cleaner.
-+ *
-+ * This routine must be called with i_mmap_rwsem held in at least read mode.
-+ * For hugetlbfs, this prevents removal of any page table entries associated
-+ * with the address space. This is important as we are setting up sharing
-+ * based on existing page table entries (mappings).
- */
- pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
- {
-@@ -4660,7 +4697,6 @@ pte_t *huge_pmd_share(struct mm_struct *
- if (!vma_shareable(vma, addr))
- return (pte_t *)pmd_alloc(mm, pud, addr);
-
-- i_mmap_lock_write(mapping);
- vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
- if (svma == vma)
- continue;
-@@ -4690,7 +4726,6 @@ pte_t *huge_pmd_share(struct mm_struct *
- spin_unlock(ptl);
- out:
- pte = (pte_t *)pmd_alloc(mm, pud, addr);
-- i_mmap_unlock_write(mapping);
- return pte;
- }
-
-@@ -4701,7 +4736,7 @@ out:
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
-- * called with page table lock held.
-+ * Called with page table lock held and i_mmap_rwsem held in write mode.
- *
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
---- a/mm/memory-failure.c
-+++ b/mm/memory-failure.c
-@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struc
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
- struct address_space *mapping;
- LIST_HEAD(tokill);
-- bool unmap_success;
-+ bool unmap_success = true;
- int kill = 1, forcekill;
- struct page *hpage = *hpagep;
- bool mlocked = PageMlocked(hpage);
-@@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struc
- if (kill)
- collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
-
-- unmap_success = try_to_unmap(hpage, ttu);
-+ if (!PageHuge(hpage)) {
-+ unmap_success = try_to_unmap(hpage, ttu);
-+ } else if (mapping) {
-+ /*
-+ * For hugetlb pages, try_to_unmap could potentially call
-+ * huge_pmd_unshare. Because of this, take semaphore in
-+ * write mode here and set TTU_RMAP_LOCKED to indicate we
-+ * have taken the lock at this higher level.
-+ */
-+ i_mmap_lock_write(mapping);
-+ unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
-+ i_mmap_unlock_write(mapping);
-+ }
- if (!unmap_success)
- pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
---- a/mm/migrate.c
-+++ b/mm/migrate.c
-@@ -1297,8 +1297,19 @@ static int unmap_and_move_huge_page(new_
- goto put_anon;
-
- if (page_mapped(hpage)) {
-+ struct address_space *mapping = page_mapping(hpage);
-+
-+ /*
-+ * try_to_unmap could potentially call huge_pmd_unshare.
-+ * Because of this, take semaphore in write mode here and
-+ * set TTU_RMAP_LOCKED to let lower levels know we have
-+ * taken the lock.
-+ */
-+ i_mmap_lock_write(mapping);
- try_to_unmap(hpage,
-- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
-+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-+ TTU_RMAP_LOCKED);
-+ i_mmap_unlock_write(mapping);
- page_was_mapped = 1;
- }
-
---- a/mm/rmap.c
-+++ b/mm/rmap.c
-@@ -25,6 +25,7 @@
- * page->flags PG_locked (lock_page)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
- * mapping->i_mmap_rwsem
-+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- * anon_vma->rwsem
- * mm->page_table_lock or pte_lock
- * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
-@@ -1374,6 +1375,9 @@ static bool try_to_unmap_one(struct page
- /*
- * If sharing is possible, start and end will be adjusted
- * accordingly.
-+ *
-+ * If called for a huge page, caller must hold i_mmap_rwsem
-+ * in write mode as it is possible to call huge_pmd_unshare.
- */
- adjust_range_if_pmd_sharing_possible(vma, &start, &end);
- }
---- a/mm/userfaultfd.c
-+++ b/mm/userfaultfd.c
-@@ -267,10 +267,14 @@ retry:
- VM_BUG_ON(dst_addr & ~huge_page_mask(h));
-
- /*
-- * Serialize via hugetlb_fault_mutex
-+ * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
-+ * i_mmap_rwsem ensures the dst_pte remains valid even
-+ * in the case of shared pmds. fault mutex prevents
-+ * races with other faulting threads.
- */
-- idx = linear_page_index(dst_vma, dst_addr);
- mapping = dst_vma->vm_file->f_mapping;
-+ i_mmap_lock_read(mapping);
-+ idx = linear_page_index(dst_vma, dst_addr);
- hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
- idx, dst_addr);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-@@ -279,6 +283,7 @@ retry:
- dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
- if (!dst_pte) {
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- goto out_unlock;
- }
-
-@@ -286,6 +291,7 @@ retry:
- dst_pteval = huge_ptep_get(dst_pte);
- if (!huge_pte_none(dst_pteval)) {
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- goto out_unlock;
- }
-
-@@ -293,6 +299,7 @@ retry:
- dst_addr, src_addr, &page);
-
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+ i_mmap_unlock_read(mapping);
- vm_alloc_shared = vm_shared;
-
- cond_resched();
+++ /dev/null
-From c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:42 -0800
-Subject: hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 upstream.
-
-hugetlbfs page faults can race with truncate and hole punch operations.
-Current code in the page fault path attempts to handle this by 'backing
-out' operations if we encounter the race. One obvious omission in the
-current code is removing a page newly added to the page cache. This is
-pretty straight forward to address, but there is a more subtle and
-difficult issue of backing out hugetlb reservations. To handle this
-correctly, the 'reservation state' before page allocation needs to be
-noted so that it can be properly backed out. There are four distinct
-possibilities for reservation state: shared/reserved, shared/no-resv,
-private/reserved and private/no-resv. Backing out a reservation may
-require memory allocation which could fail so that needs to be taken into
-account as well.
-
-Instead of writing the required complicated code for this rare occurrence,
-just eliminate the race. i_mmap_rwsem is now held in read mode for the
-duration of page fault processing. Hold i_mmap_rwsem longer in truncation
-and hole punch code to cover the call to remove_inode_hugepages.
-
-With this modification, code in remove_inode_hugepages checking for races
-becomes 'dead' as it can no longer happen. Remove the dead code and
-expand comments to explain reasoning. Similarly, checks for races with
-truncation in the page fault path can be simplified and removed.
-
-[mike.kravetz@oracle.com: incorporate suggestions from Kirill]
- Link: http://lkml.kernel.org/r/20181222223013.22193-3-mike.kravetz@oracle.com
-Link: http://lkml.kernel.org/r/20181218223557.5202-3-mike.kravetz@oracle.com
-Fixes: ebed4bfc8da8 ("hugetlb: fix absurd HugePages_Rsvd")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- fs/hugetlbfs/inode.c | 61 +++++++++++++++++++++++----------------------------
- mm/hugetlb.c | 21 ++++++++---------
- 2 files changed, 38 insertions(+), 44 deletions(-)
-
---- a/fs/hugetlbfs/inode.c
-+++ b/fs/hugetlbfs/inode.c
-@@ -383,17 +383,16 @@ hugetlb_vmdelete_list(struct rb_root_cac
- * truncation is indicated by end of range being LLONG_MAX
- * In this case, we first scan the range and release found pages.
- * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
-- * maps and global counts. Page faults can not race with truncation
-- * in this routine. hugetlb_no_page() prevents page faults in the
-- * truncated range. It checks i_size before allocation, and again after
-- * with the page table lock for the page held. The same lock must be
-- * acquired to unmap a page.
-+ * maps and global counts.
- * hole punch is indicated if end is not LLONG_MAX
- * In the hole punch case we scan the range and release found pages.
- * Only when releasing a page is the associated region/reserv map
- * deleted. The region/reserv map for ranges without associated
-- * pages are not modified. Page faults can race with hole punch.
-- * This is indicated if we find a mapped page.
-+ * pages are not modified.
-+ *
-+ * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
-+ * races with page faults.
-+ *
- * Note: If the passed end of range value is beyond the end of file, but
- * not LLONG_MAX this routine still performs a hole punch operation.
- */
-@@ -423,32 +422,14 @@ static void remove_inode_hugepages(struc
-
- for (i = 0; i < pagevec_count(&pvec); ++i) {
- struct page *page = pvec.pages[i];
-- u32 hash;
-
- index = page->index;
-- hash = hugetlb_fault_mutex_hash(h, current->mm,
-- &pseudo_vma,
-- mapping, index, 0);
-- mutex_lock(&hugetlb_fault_mutex_table[hash]);
--
- /*
-- * If page is mapped, it was faulted in after being
-- * unmapped in caller. Unmap (again) now after taking
-- * the fault mutex. The mutex will prevent faults
-- * until we finish removing the page.
-- *
-- * This race can only happen in the hole punch case.
-- * Getting here in a truncate operation is a bug.
-+ * A mapped page is impossible as callers should unmap
-+ * all references before calling. And, i_mmap_rwsem
-+ * prevents the creation of additional mappings.
- */
-- if (unlikely(page_mapped(page))) {
-- BUG_ON(truncate_op);
--
-- i_mmap_lock_write(mapping);
-- hugetlb_vmdelete_list(&mapping->i_mmap,
-- index * pages_per_huge_page(h),
-- (index + 1) * pages_per_huge_page(h));
-- i_mmap_unlock_write(mapping);
-- }
-+ VM_BUG_ON(page_mapped(page));
-
- lock_page(page);
- /*
-@@ -470,7 +451,6 @@ static void remove_inode_hugepages(struc
- }
-
- unlock_page(page);
-- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- }
- huge_pagevec_release(&pvec);
- cond_resched();
-@@ -482,9 +462,20 @@ static void remove_inode_hugepages(struc
-
- static void hugetlbfs_evict_inode(struct inode *inode)
- {
-+ struct address_space *mapping = inode->i_mapping;
- struct resv_map *resv_map;
-
-+ /*
-+ * The vfs layer guarantees that there are no other users of this
-+ * inode. Therefore, it would be safe to call remove_inode_hugepages
-+ * without holding i_mmap_rwsem. We acquire and hold here to be
-+ * consistent with other callers. Since there will be no contention
-+ * on the semaphore, overhead is negligible.
-+ */
-+ i_mmap_lock_write(mapping);
- remove_inode_hugepages(inode, 0, LLONG_MAX);
-+ i_mmap_unlock_write(mapping);
-+
- resv_map = (struct resv_map *)inode->i_mapping->private_data;
- /* root inode doesn't have the resv_map, so we should check it */
- if (resv_map)
-@@ -505,8 +496,8 @@ static int hugetlb_vmtruncate(struct ino
- i_mmap_lock_write(mapping);
- if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
- hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
-- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, offset, LLONG_MAX);
-+ i_mmap_unlock_write(mapping);
- return 0;
- }
-
-@@ -540,8 +531,8 @@ static long hugetlbfs_punch_hole(struct
- hugetlb_vmdelete_list(&mapping->i_mmap,
- hole_start >> PAGE_SHIFT,
- hole_end >> PAGE_SHIFT);
-- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, hole_start, hole_end);
-+ i_mmap_unlock_write(mapping);
- inode_unlock(inode);
- }
-
-@@ -624,7 +615,11 @@ static long hugetlbfs_fallocate(struct f
- /* addr is the offset within the file (zero based) */
- addr = index * hpage_size;
-
-- /* mutex taken here, fault path and hole punch */
-+ /*
-+ * fault mutex taken here, protects against fault path
-+ * and hole punch. inode_lock previously taken protects
-+ * against truncation.
-+ */
- hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
- index, addr);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3758,16 +3758,16 @@ static vm_fault_t hugetlb_no_page(struct
- }
-
- /*
-- * Use page lock to guard against racing truncation
-- * before we get page_table_lock.
-+ * We can not race with truncation due to holding i_mmap_rwsem.
-+ * Check once here for faults beyond end of file.
- */
-+ size = i_size_read(mapping->host) >> huge_page_shift(h);
-+ if (idx >= size)
-+ goto out;
-+
- retry:
- page = find_lock_page(mapping, idx);
- if (!page) {
-- size = i_size_read(mapping->host) >> huge_page_shift(h);
-- if (idx >= size)
-- goto out;
--
- /*
- * Check for page in userfault range
- */
-@@ -3857,9 +3857,6 @@ retry:
- }
-
- ptl = huge_pte_lock(h, mm, ptep);
-- size = i_size_read(mapping->host) >> huge_page_shift(h);
-- if (idx >= size)
-- goto backout;
-
- ret = 0;
- if (!huge_pte_none(huge_ptep_get(ptep)))
-@@ -3962,8 +3959,10 @@ vm_fault_t hugetlb_fault(struct mm_struc
-
- /*
- * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-- * until finished with ptep. This prevents huge_pmd_unshare from
-- * being called elsewhere and making the ptep no longer valid.
-+ * until finished with ptep. This serves two purposes:
-+ * 1) It prevents huge_pmd_unshare from being called elsewhere
-+ * and making the ptep no longer valid.
-+ * 2) It synchronizes us with file truncation.
- *
- * ptep could have already been assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch
mm-devm_memremap_pages-kill-mapping-system-ram-support.patch
mm-devm_memremap_pages-fix-shutdown-handling.patch
-hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
-hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path.patch
sunrpc-fix-cache_head-leak-due-to-queued-request.patch
sunrpc-use-svc_net-in-svcauth_gss_-functions.patch