]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
drop hugetlbfs patches from 4.20.y, 4.19.y, and 4.14.y
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 10 Jan 2019 16:53:09 +0000 (17:53 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 10 Jan 2019 16:53:09 +0000 (17:53 +0100)
queue-4.14/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch [deleted file]
queue-4.14/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch [deleted file]
queue-4.14/series
queue-4.19/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch [deleted file]
queue-4.19/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch [deleted file]
queue-4.19/series
queue-4.20/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch [deleted file]
queue-4.20/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch [deleted file]
queue-4.20/series

diff --git a/queue-4.14/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch b/queue-4.14/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
deleted file mode 100644 (file)
index c8b4b5b..0000000
+++ /dev/null
@@ -1,339 +0,0 @@
-From b43a9990055958e70347c56f90ea2ae32c67334c Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:38 -0800
-Subject: hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit b43a9990055958e70347c56f90ea2ae32c67334c upstream.
-
-While looking at BUGs associated with invalid huge page map counts, it was
-discovered and observed that a huge pte pointer could become 'invalid' and
-point to another task's page table.  Consider the following:
-
-A task takes a page fault on a shared hugetlbfs file and calls
-huge_pte_alloc to get a ptep.  Suppose the returned ptep points to a
-shared pmd.
-
-Now, another task truncates the hugetlbfs file.  As part of truncation, it
-unmaps everyone who has the file mapped.  If the range being truncated is
-covered by a shared pmd, huge_pmd_unshare will be called.  For all but the
-last user of the shared pmd, huge_pmd_unshare will clear the pud pointing
-to the pmd.  If the task in the middle of the page fault is not the last
-user, the ptep returned by huge_pte_alloc now points to another task's
-page table or worse.  This leads to bad things such as incorrect page
-map/reference counts or invalid memory references.
-
-To fix, expand the use of i_mmap_rwsem as follows:
-
-- i_mmap_rwsem is held in read mode whenever huge_pmd_share is called.
-  huge_pmd_share is only called via huge_pte_alloc, so callers of
-  huge_pte_alloc take i_mmap_rwsem before calling.  In addition, callers
-  of huge_pte_alloc continue to hold the semaphore until finished with the
-  ptep.
-
-- i_mmap_rwsem is held in write mode whenever huge_pmd_unshare is
-  called.
-
-[mike.kravetz@oracle.com: add explicit check for mapping != null]
-Link: http://lkml.kernel.org/r/20181218223557.5202-2-mike.kravetz@oracle.com
-Fixes: 39dde65c9940 ("shared page table for hugetlb page")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: Colin Ian King <colin.king@canonical.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- mm/hugetlb.c        |   61 ++++++++++++++++++++++++++++++++++++++++++----------
- mm/memory-failure.c |   16 +++++++++++--
- mm/migrate.c        |   13 ++++++++++-
- mm/rmap.c           |    4 +++
- mm/userfaultfd.c    |   11 +++++++--
- 5 files changed, 89 insertions(+), 16 deletions(-)
-
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3215,6 +3215,7 @@ int copy_hugetlb_page_range(struct mm_st
-       struct page *ptepage;
-       unsigned long addr;
-       int cow;
-+      struct address_space *mapping = vma->vm_file->f_mapping;
-       struct hstate *h = hstate_vma(vma);
-       unsigned long sz = huge_page_size(h);
-       unsigned long mmun_start;       /* For mmu_notifiers */
-@@ -3227,12 +3228,23 @@ int copy_hugetlb_page_range(struct mm_st
-       mmun_end = vma->vm_end;
-       if (cow)
-               mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
-+      else {
-+              /*
-+               * For shared mappings i_mmap_rwsem must be held to call
-+               * huge_pte_alloc, otherwise the returned ptep could go
-+               * away if part of a shared pmd and another thread calls
-+               * huge_pmd_unshare.
-+               */
-+              i_mmap_lock_read(mapping);
-+      }
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
-               spinlock_t *src_ptl, *dst_ptl;
-+
-               src_pte = huge_pte_offset(src, addr, sz);
-               if (!src_pte)
-                       continue;
-+
-               dst_pte = huge_pte_alloc(dst, addr, sz);
-               if (!dst_pte) {
-                       ret = -ENOMEM;
-@@ -3298,6 +3310,8 @@ int copy_hugetlb_page_range(struct mm_st
-       if (cow)
-               mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
-+      else
-+              i_mmap_unlock_read(mapping);
-       return ret;
- }
-@@ -3724,14 +3738,18 @@ retry:
-                       };
-                       /*
--                       * hugetlb_fault_mutex must be dropped before
--                       * handling userfault.  Reacquire after handling
--                       * fault to make calling code simpler.
-+                       * hugetlb_fault_mutex and i_mmap_rwsem must be
-+                       * dropped before handling userfault.  Reacquire
-+                       * after handling fault to make calling code simpler.
-                        */
-                       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
-                                                       idx, address);
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+                      i_mmap_unlock_read(mapping);
-+
-                       ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-+
-+                      i_mmap_lock_read(mapping);
-                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-                       goto out;
-               }
-@@ -3884,6 +3902,11 @@ int hugetlb_fault(struct mm_struct *mm,
-       ptep = huge_pte_offset(mm, address, huge_page_size(h));
-       if (ptep) {
-+              /*
-+               * Since we hold no locks, ptep could be stale.  That is
-+               * OK as we are only making decisions based on content and
-+               * not actually modifying content here.
-+               */
-               entry = huge_ptep_get(ptep);
-               if (unlikely(is_hugetlb_entry_migration(entry))) {
-                       migration_entry_wait_huge(vma, mm, ptep);
-@@ -3897,14 +3920,29 @@ int hugetlb_fault(struct mm_struct *mm,
-                       return VM_FAULT_OOM;
-       }
-+      /*
-+       * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-+       * until finished with ptep.  This prevents huge_pmd_unshare from
-+       * being called elsewhere and making the ptep no longer valid.
-+       *
-+       * ptep could have already be assigned via huge_pte_offset.  That
-+       * is OK, as huge_pte_alloc will return the same value unless
-+       * something changed.
-+       */
-       mapping = vma->vm_file->f_mapping;
--      idx = vma_hugecache_offset(h, vma, address);
-+      i_mmap_lock_read(mapping);
-+      ptep = huge_pte_alloc(mm, address, huge_page_size(h));
-+      if (!ptep) {
-+              i_mmap_unlock_read(mapping);
-+              return VM_FAULT_OOM;
-+      }
-       /*
-        * Serialize hugepage allocation and instantiation, so that we don't
-        * get spurious allocation failures if two CPUs race to instantiate
-        * the same page in the page cache.
-        */
-+      idx = vma_hugecache_offset(h, vma, address);
-       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
-       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-@@ -3992,6 +4030,7 @@ out_ptl:
-       }
- out_mutex:
-       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+      i_mmap_unlock_read(mapping);
-       /*
-        * Generally it's safe to hold refcount during waiting page lock. But
-        * here we just wait to defer the next page fault to avoid busy loop and
-@@ -4576,10 +4615,12 @@ void adjust_range_if_pmd_sharing_possibl
-  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
-  * and returns the corresponding pte. While this is not necessary for the
-  * !shared pmd case because we can allocate the pmd later as well, it makes the
-- * code much cleaner. pmd allocation is essential for the shared case because
-- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
-- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
-- * bad pmd for sharing.
-+ * code much cleaner.
-+ *
-+ * This routine must be called with i_mmap_rwsem held in at least read mode.
-+ * For hugetlbfs, this prevents removal of any page table entries associated
-+ * with the address space.  This is important as we are setting up sharing
-+ * based on existing page table entries (mappings).
-  */
- pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
- {
-@@ -4596,7 +4637,6 @@ pte_t *huge_pmd_share(struct mm_struct *
-       if (!vma_shareable(vma, addr))
-               return (pte_t *)pmd_alloc(mm, pud, addr);
--      i_mmap_lock_write(mapping);
-       vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
-               if (svma == vma)
-                       continue;
-@@ -4626,7 +4666,6 @@ pte_t *huge_pmd_share(struct mm_struct *
-       spin_unlock(ptl);
- out:
-       pte = (pte_t *)pmd_alloc(mm, pud, addr);
--      i_mmap_unlock_write(mapping);
-       return pte;
- }
-@@ -4637,7 +4676,7 @@ out:
-  * indicated by page_count > 1, unmap is achieved by clearing pud and
-  * decrementing the ref count. If count == 1, the pte page is not shared.
-  *
-- * called with page table lock held.
-+ * Called with page table lock held and i_mmap_rwsem held in write mode.
-  *
-  * returns: 1 successfully unmapped a shared pte page
-  *        0 the underlying pte page is not shared, or it is the last user
---- a/mm/memory-failure.c
-+++ b/mm/memory-failure.c
-@@ -933,7 +933,7 @@ static bool hwpoison_user_mappings(struc
-       enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
-       struct address_space *mapping;
-       LIST_HEAD(tokill);
--      bool unmap_success;
-+      bool unmap_success = true;
-       int kill = 1, forcekill;
-       struct page *hpage = *hpagep;
-       bool mlocked = PageMlocked(hpage);
-@@ -995,7 +995,19 @@ static bool hwpoison_user_mappings(struc
-       if (kill)
-               collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
--      unmap_success = try_to_unmap(hpage, ttu);
-+      if (!PageHuge(hpage)) {
-+              unmap_success = try_to_unmap(hpage, ttu);
-+      } else if (mapping) {
-+              /*
-+               * For hugetlb pages, try_to_unmap could potentially call
-+               * huge_pmd_unshare.  Because of this, take semaphore in
-+               * write mode here and set TTU_RMAP_LOCKED to indicate we
-+               * have taken the lock at this higher level.
-+               */
-+              i_mmap_lock_write(mapping);
-+              unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
-+              i_mmap_unlock_write(mapping);
-+      }
-       if (!unmap_success)
-               pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
-                      pfn, page_mapcount(hpage));
---- a/mm/migrate.c
-+++ b/mm/migrate.c
-@@ -1307,8 +1307,19 @@ static int unmap_and_move_huge_page(new_
-               goto put_anon;
-       if (page_mapped(hpage)) {
-+              struct address_space *mapping = page_mapping(hpage);
-+
-+              /*
-+               * try_to_unmap could potentially call huge_pmd_unshare.
-+               * Because of this, take semaphore in write mode here and
-+               * set TTU_RMAP_LOCKED to let lower levels know we have
-+               * taken the lock.
-+               */
-+              i_mmap_lock_write(mapping);
-               try_to_unmap(hpage,
--                      TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
-+                      TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-+                      TTU_RMAP_LOCKED);
-+              i_mmap_unlock_write(mapping);
-               page_was_mapped = 1;
-       }
---- a/mm/rmap.c
-+++ b/mm/rmap.c
-@@ -25,6 +25,7 @@
-  *     page->flags PG_locked (lock_page)
-  *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
-  *         mapping->i_mmap_rwsem
-+ *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
-  *           anon_vma->rwsem
-  *             mm->page_table_lock or pte_lock
-  *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
-@@ -1370,6 +1371,9 @@ static bool try_to_unmap_one(struct page
-               /*
-                * If sharing is possible, start and end will be adjusted
-                * accordingly.
-+               *
-+               * If called for a huge page, caller must hold i_mmap_rwsem
-+               * in write mode as it is possible to call huge_pmd_unshare.
-                */
-               adjust_range_if_pmd_sharing_possible(vma, &start, &end);
-       }
---- a/mm/userfaultfd.c
-+++ b/mm/userfaultfd.c
-@@ -268,10 +268,14 @@ retry:
-               VM_BUG_ON(dst_addr & ~huge_page_mask(h));
-               /*
--               * Serialize via hugetlb_fault_mutex
-+               * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
-+               * i_mmap_rwsem ensures the dst_pte remains valid even
-+               * in the case of shared pmds.  fault mutex prevents
-+               * races with other faulting threads.
-                */
--              idx = linear_page_index(dst_vma, dst_addr);
-               mapping = dst_vma->vm_file->f_mapping;
-+              i_mmap_lock_read(mapping);
-+              idx = linear_page_index(dst_vma, dst_addr);
-               hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
-                                                               idx, dst_addr);
-               mutex_lock(&hugetlb_fault_mutex_table[hash]);
-@@ -280,6 +284,7 @@ retry:
-               dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
-               if (!dst_pte) {
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+                      i_mmap_unlock_read(mapping);
-                       goto out_unlock;
-               }
-@@ -287,6 +292,7 @@ retry:
-               dst_pteval = huge_ptep_get(dst_pte);
-               if (!huge_pte_none(dst_pteval)) {
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+                      i_mmap_unlock_read(mapping);
-                       goto out_unlock;
-               }
-@@ -294,6 +300,7 @@ retry:
-                                               dst_addr, src_addr, &page);
-               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+              i_mmap_unlock_read(mapping);
-               vm_alloc_shared = vm_shared;
-               cond_resched();
diff --git a/queue-4.14/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch b/queue-4.14/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
deleted file mode 100644 (file)
index 1dc6f29..0000000
+++ /dev/null
@@ -1,228 +0,0 @@
-From c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:42 -0800
-Subject: hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 upstream.
-
-hugetlbfs page faults can race with truncate and hole punch operations.
-Current code in the page fault path attempts to handle this by 'backing
-out' operations if we encounter the race.  One obvious omission in the
-current code is removing a page newly added to the page cache.  This is
-pretty straight forward to address, but there is a more subtle and
-difficult issue of backing out hugetlb reservations.  To handle this
-correctly, the 'reservation state' before page allocation needs to be
-noted so that it can be properly backed out.  There are four distinct
-possibilities for reservation state: shared/reserved, shared/no-resv,
-private/reserved and private/no-resv.  Backing out a reservation may
-require memory allocation which could fail so that needs to be taken into
-account as well.
-
-Instead of writing the required complicated code for this rare occurrence,
-just eliminate the race.  i_mmap_rwsem is now held in read mode for the
-duration of page fault processing.  Hold i_mmap_rwsem longer in truncation
-and hole punch code to cover the call to remove_inode_hugepages.
-
-With this modification, code in remove_inode_hugepages checking for races
-becomes 'dead' as it can no longer happen.  Remove the dead code and
-expand comments to explain reasoning.  Similarly, checks for races with
-truncation in the page fault path can be simplified and removed.
-
-[mike.kravetz@oracle.com: incorporate suggestions from Kirill]
-  Link: http://lkml.kernel.org/r/20181222223013.22193-3-mike.kravetz@oracle.com
-Link: http://lkml.kernel.org/r/20181218223557.5202-3-mike.kravetz@oracle.com
-Fixes: ebed4bfc8da8 ("hugetlb: fix absurd HugePages_Rsvd")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- fs/hugetlbfs/inode.c |   61 +++++++++++++++++++++++----------------------------
- mm/hugetlb.c         |   21 ++++++++---------
- 2 files changed, 38 insertions(+), 44 deletions(-)
-
---- a/fs/hugetlbfs/inode.c
-+++ b/fs/hugetlbfs/inode.c
-@@ -393,17 +393,16 @@ hugetlb_vmdelete_list(struct rb_root_cac
-  * truncation is indicated by end of range being LLONG_MAX
-  *    In this case, we first scan the range and release found pages.
-  *    After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
-- *    maps and global counts.  Page faults can not race with truncation
-- *    in this routine.  hugetlb_no_page() prevents page faults in the
-- *    truncated range.  It checks i_size before allocation, and again after
-- *    with the page table lock for the page held.  The same lock must be
-- *    acquired to unmap a page.
-+ *    maps and global counts.
-  * hole punch is indicated if end is not LLONG_MAX
-  *    In the hole punch case we scan the range and release found pages.
-  *    Only when releasing a page is the associated region/reserv map
-  *    deleted.  The region/reserv map for ranges without associated
-- *    pages are not modified.  Page faults can race with hole punch.
-- *    This is indicated if we find a mapped page.
-+ *    pages are not modified.
-+ *
-+ * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
-+ * races with page faults.
-+ *
-  * Note: If the passed end of range value is beyond the end of file, but
-  * not LLONG_MAX this routine still performs a hole punch operation.
-  */
-@@ -433,32 +432,14 @@ static void remove_inode_hugepages(struc
-               for (i = 0; i < pagevec_count(&pvec); ++i) {
-                       struct page *page = pvec.pages[i];
--                      u32 hash;
-                       index = page->index;
--                      hash = hugetlb_fault_mutex_hash(h, current->mm,
--                                                      &pseudo_vma,
--                                                      mapping, index, 0);
--                      mutex_lock(&hugetlb_fault_mutex_table[hash]);
--
-                       /*
--                       * If page is mapped, it was faulted in after being
--                       * unmapped in caller.  Unmap (again) now after taking
--                       * the fault mutex.  The mutex will prevent faults
--                       * until we finish removing the page.
--                       *
--                       * This race can only happen in the hole punch case.
--                       * Getting here in a truncate operation is a bug.
-+                       * A mapped page is impossible as callers should unmap
-+                       * all references before calling.  And, i_mmap_rwsem
-+                       * prevents the creation of additional mappings.
-                        */
--                      if (unlikely(page_mapped(page))) {
--                              BUG_ON(truncate_op);
--
--                              i_mmap_lock_write(mapping);
--                              hugetlb_vmdelete_list(&mapping->i_mmap,
--                                      index * pages_per_huge_page(h),
--                                      (index + 1) * pages_per_huge_page(h));
--                              i_mmap_unlock_write(mapping);
--                      }
-+                      VM_BUG_ON(page_mapped(page));
-                       lock_page(page);
-                       /*
-@@ -480,7 +461,6 @@ static void remove_inode_hugepages(struc
-                       }
-                       unlock_page(page);
--                      mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-               }
-               huge_pagevec_release(&pvec);
-               cond_resched();
-@@ -492,9 +472,20 @@ static void remove_inode_hugepages(struc
- static void hugetlbfs_evict_inode(struct inode *inode)
- {
-+      struct address_space *mapping = inode->i_mapping;
-       struct resv_map *resv_map;
-+      /*
-+       * The vfs layer guarantees that there are no other users of this
-+       * inode.  Therefore, it would be safe to call remove_inode_hugepages
-+       * without holding i_mmap_rwsem.  We acquire and hold here to be
-+       * consistent with other callers.  Since there will be no contention
-+       * on the semaphore, overhead is negligible.
-+       */
-+      i_mmap_lock_write(mapping);
-       remove_inode_hugepages(inode, 0, LLONG_MAX);
-+      i_mmap_unlock_write(mapping);
-+
-       resv_map = (struct resv_map *)inode->i_mapping->private_data;
-       /* root inode doesn't have the resv_map, so we should check it */
-       if (resv_map)
-@@ -515,8 +506,8 @@ static int hugetlb_vmtruncate(struct ino
-       i_mmap_lock_write(mapping);
-       if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
-               hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
--      i_mmap_unlock_write(mapping);
-       remove_inode_hugepages(inode, offset, LLONG_MAX);
-+      i_mmap_unlock_write(mapping);
-       return 0;
- }
-@@ -542,8 +533,8 @@ static long hugetlbfs_punch_hole(struct
-                       hugetlb_vmdelete_list(&mapping->i_mmap,
-                                               hole_start >> PAGE_SHIFT,
-                                               hole_end  >> PAGE_SHIFT);
--              i_mmap_unlock_write(mapping);
-               remove_inode_hugepages(inode, hole_start, hole_end);
-+              i_mmap_unlock_write(mapping);
-               inode_unlock(inode);
-       }
-@@ -620,7 +611,11 @@ static long hugetlbfs_fallocate(struct f
-               /* addr is the offset within the file (zero based) */
-               addr = index * hpage_size;
--              /* mutex taken here, fault path and hole punch */
-+              /*
-+               * fault mutex taken here, protects against fault path
-+               * and hole punch.  inode_lock previously taken protects
-+               * against truncation.
-+               */
-               hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
-                                               index, addr);
-               mutex_lock(&hugetlb_fault_mutex_table[hash]);
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3709,16 +3709,16 @@ static int hugetlb_no_page(struct mm_str
-       }
-       /*
--       * Use page lock to guard against racing truncation
--       * before we get page_table_lock.
-+       * We can not race with truncation due to holding i_mmap_rwsem.
-+       * Check once here for faults beyond end of file.
-        */
-+      size = i_size_read(mapping->host) >> huge_page_shift(h);
-+      if (idx >= size)
-+              goto out;
-+
- retry:
-       page = find_lock_page(mapping, idx);
-       if (!page) {
--              size = i_size_read(mapping->host) >> huge_page_shift(h);
--              if (idx >= size)
--                      goto out;
--
-               /*
-                * Check for page in userfault range
-                */
-@@ -3812,9 +3812,6 @@ retry:
-       }
-       ptl = huge_pte_lock(h, mm, ptep);
--      size = i_size_read(mapping->host) >> huge_page_shift(h);
--      if (idx >= size)
--              goto backout;
-       ret = 0;
-       if (!huge_pte_none(huge_ptep_get(ptep)))
-@@ -3922,8 +3919,10 @@ int hugetlb_fault(struct mm_struct *mm,
-       /*
-        * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
--       * until finished with ptep.  This prevents huge_pmd_unshare from
--       * being called elsewhere and making the ptep no longer valid.
-+       * until finished with ptep.  This serves two purposes:
-+       * 1) It prevents huge_pmd_unshare from being called elsewhere
-+       *    and making the ptep no longer valid.
-+       * 2) It synchronizes us with file truncation.
-        *
-        * ptep could have already be assigned via huge_pte_offset.  That
-        * is OK, as huge_pte_alloc will return the same value unless
index 0ebac38c6f49f3b241573106ca2682af4a7798a6..aa6d500937e34bc8bed8044b30a271d9d807e028 100644 (file)
@@ -64,8 +64,6 @@ mm-devm_memremap_pages-kill-mapping-system-ram-support.patch
 mm-hmm-use-devm-semantics-for-hmm_devmem_-add-remove.patch
 mm-hmm-mark-hmm_devmem_-add-add_resource-export_symbol_gpl.patch
 mm-swap-fix-swapoff-with-ksm-pages.patch
-hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
-hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
 sunrpc-fix-cache_head-leak-due-to-queued-request.patch
 sunrpc-use-svc_net-in-svcauth_gss_-functions.patch
 sunrpc-use-after-free-in-svc_process_common.patch
diff --git a/queue-4.19/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch b/queue-4.19/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
deleted file mode 100644 (file)
index 1a8a9de..0000000
+++ /dev/null
@@ -1,345 +0,0 @@
-From b43a9990055958e70347c56f90ea2ae32c67334c Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:38 -0800
-Subject: hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit b43a9990055958e70347c56f90ea2ae32c67334c upstream.
-
-While looking at BUGs associated with invalid huge page map counts, it was
-discovered and observed that a huge pte pointer could become 'invalid' and
-point to another task's page table.  Consider the following:
-
-A task takes a page fault on a shared hugetlbfs file and calls
-huge_pte_alloc to get a ptep.  Suppose the returned ptep points to a
-shared pmd.
-
-Now, another task truncates the hugetlbfs file.  As part of truncation, it
-unmaps everyone who has the file mapped.  If the range being truncated is
-covered by a shared pmd, huge_pmd_unshare will be called.  For all but the
-last user of the shared pmd, huge_pmd_unshare will clear the pud pointing
-to the pmd.  If the task in the middle of the page fault is not the last
-user, the ptep returned by huge_pte_alloc now points to another task's
-page table or worse.  This leads to bad things such as incorrect page
-map/reference counts or invalid memory references.
-
-To fix, expand the use of i_mmap_rwsem as follows:
-
-- i_mmap_rwsem is held in read mode whenever huge_pmd_share is called.
-  huge_pmd_share is only called via huge_pte_alloc, so callers of
-  huge_pte_alloc take i_mmap_rwsem before calling.  In addition, callers
-  of huge_pte_alloc continue to hold the semaphore until finished with the
-  ptep.
-
-- i_mmap_rwsem is held in write mode whenever huge_pmd_unshare is
-  called.
-
-[mike.kravetz@oracle.com: add explicit check for mapping != null]
-Link: http://lkml.kernel.org/r/20181218223557.5202-2-mike.kravetz@oracle.com
-Fixes: 39dde65c9940 ("shared page table for hugetlb page")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: Colin Ian King <colin.king@canonical.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- mm/hugetlb.c        |   65 ++++++++++++++++++++++++++++++++++++++++------------
- mm/memory-failure.c |   16 +++++++++++-
- mm/migrate.c        |   13 +++++++++-
- mm/rmap.c           |    4 +++
- mm/userfaultfd.c    |   11 +++++++-
- 5 files changed, 89 insertions(+), 20 deletions(-)
-
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3237,6 +3237,7 @@ int copy_hugetlb_page_range(struct mm_st
-       struct page *ptepage;
-       unsigned long addr;
-       int cow;
-+      struct address_space *mapping = vma->vm_file->f_mapping;
-       struct hstate *h = hstate_vma(vma);
-       unsigned long sz = huge_page_size(h);
-       unsigned long mmun_start;       /* For mmu_notifiers */
-@@ -3249,12 +3250,23 @@ int copy_hugetlb_page_range(struct mm_st
-       mmun_end = vma->vm_end;
-       if (cow)
-               mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
-+      else {
-+              /*
-+               * For shared mappings i_mmap_rwsem must be held to call
-+               * huge_pte_alloc, otherwise the returned ptep could go
-+               * away if part of a shared pmd and another thread calls
-+               * huge_pmd_unshare.
-+               */
-+              i_mmap_lock_read(mapping);
-+      }
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
-               spinlock_t *src_ptl, *dst_ptl;
-+
-               src_pte = huge_pte_offset(src, addr, sz);
-               if (!src_pte)
-                       continue;
-+
-               dst_pte = huge_pte_alloc(dst, addr, sz);
-               if (!dst_pte) {
-                       ret = -ENOMEM;
-@@ -3325,6 +3337,8 @@ int copy_hugetlb_page_range(struct mm_st
-       if (cow)
-               mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
-+      else
-+              i_mmap_unlock_read(mapping);
-       return ret;
- }
-@@ -3772,14 +3786,18 @@ retry:
-                       };
-                       /*
--                       * hugetlb_fault_mutex must be dropped before
--                       * handling userfault.  Reacquire after handling
--                       * fault to make calling code simpler.
-+                       * hugetlb_fault_mutex and i_mmap_rwsem must be
-+                       * dropped before handling userfault.  Reacquire
-+                       * after handling fault to make calling code simpler.
-                        */
-                       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
-                                                       idx, haddr);
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+                      i_mmap_unlock_read(mapping);
-+
-                       ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-+
-+                      i_mmap_lock_read(mapping);
-                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-                       goto out;
-               }
-@@ -3927,6 +3945,11 @@ vm_fault_t hugetlb_fault(struct mm_struc
-       ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
-       if (ptep) {
-+              /*
-+               * Since we hold no locks, ptep could be stale.  That is
-+               * OK as we are only making decisions based on content and
-+               * not actually modifying content here.
-+               */
-               entry = huge_ptep_get(ptep);
-               if (unlikely(is_hugetlb_entry_migration(entry))) {
-                       migration_entry_wait_huge(vma, mm, ptep);
-@@ -3934,20 +3957,31 @@ vm_fault_t hugetlb_fault(struct mm_struc
-               } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
-                       return VM_FAULT_HWPOISON_LARGE |
-                               VM_FAULT_SET_HINDEX(hstate_index(h));
--      } else {
--              ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
--              if (!ptep)
--                      return VM_FAULT_OOM;
-       }
-+      /*
-+       * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-+       * until finished with ptep.  This prevents huge_pmd_unshare from
-+       * being called elsewhere and making the ptep no longer valid.
-+       *
-+       * ptep could have already been assigned via huge_pte_offset.  That
-+       * is OK, as huge_pte_alloc will return the same value unless
-+       * something changed.
-+       */
-       mapping = vma->vm_file->f_mapping;
--      idx = vma_hugecache_offset(h, vma, haddr);
-+      i_mmap_lock_read(mapping);
-+      ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
-+      if (!ptep) {
-+              i_mmap_unlock_read(mapping);
-+              return VM_FAULT_OOM;
-+      }
-       /*
-        * Serialize hugepage allocation and instantiation, so that we don't
-        * get spurious allocation failures if two CPUs race to instantiate
-        * the same page in the page cache.
-        */
-+      idx = vma_hugecache_offset(h, vma, haddr);
-       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
-       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-@@ -4035,6 +4069,7 @@ out_ptl:
-       }
- out_mutex:
-       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+      i_mmap_unlock_read(mapping);
-       /*
-        * Generally it's safe to hold refcount during waiting page lock. But
-        * here we just wait to defer the next page fault to avoid busy loop and
-@@ -4639,10 +4674,12 @@ void adjust_range_if_pmd_sharing_possibl
-  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
-  * and returns the corresponding pte. While this is not necessary for the
-  * !shared pmd case because we can allocate the pmd later as well, it makes the
-- * code much cleaner. pmd allocation is essential for the shared case because
-- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
-- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
-- * bad pmd for sharing.
-+ * code much cleaner.
-+ *
-+ * This routine must be called with i_mmap_rwsem held in at least read mode.
-+ * For hugetlbfs, this prevents removal of any page table entries associated
-+ * with the address space.  This is important as we are setting up sharing
-+ * based on existing page table entries (mappings).
-  */
- pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
- {
-@@ -4659,7 +4696,6 @@ pte_t *huge_pmd_share(struct mm_struct *
-       if (!vma_shareable(vma, addr))
-               return (pte_t *)pmd_alloc(mm, pud, addr);
--      i_mmap_lock_write(mapping);
-       vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
-               if (svma == vma)
-                       continue;
-@@ -4689,7 +4725,6 @@ pte_t *huge_pmd_share(struct mm_struct *
-       spin_unlock(ptl);
- out:
-       pte = (pte_t *)pmd_alloc(mm, pud, addr);
--      i_mmap_unlock_write(mapping);
-       return pte;
- }
-@@ -4700,7 +4735,7 @@ out:
-  * indicated by page_count > 1, unmap is achieved by clearing pud and
-  * decrementing the ref count. If count == 1, the pte page is not shared.
-  *
-- * called with page table lock held.
-+ * Called with page table lock held and i_mmap_rwsem held in write mode.
-  *
-  * returns: 1 successfully unmapped a shared pte page
-  *        0 the underlying pte page is not shared, or it is the last user
---- a/mm/memory-failure.c
-+++ b/mm/memory-failure.c
-@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struc
-       enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
-       struct address_space *mapping;
-       LIST_HEAD(tokill);
--      bool unmap_success;
-+      bool unmap_success = true;
-       int kill = 1, forcekill;
-       struct page *hpage = *hpagep;
-       bool mlocked = PageMlocked(hpage);
-@@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struc
-       if (kill)
-               collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
--      unmap_success = try_to_unmap(hpage, ttu);
-+      if (!PageHuge(hpage)) {
-+              unmap_success = try_to_unmap(hpage, ttu);
-+      } else if (mapping) {
-+              /*
-+               * For hugetlb pages, try_to_unmap could potentially call
-+               * huge_pmd_unshare.  Because of this, take semaphore in
-+               * write mode here and set TTU_RMAP_LOCKED to indicate we
-+               * have taken the lock at this higher level.
-+               */
-+              i_mmap_lock_write(mapping);
-+              unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
-+              i_mmap_unlock_write(mapping);
-+      }
-       if (!unmap_success)
-               pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
-                      pfn, page_mapcount(hpage));
---- a/mm/migrate.c
-+++ b/mm/migrate.c
-@@ -1307,8 +1307,19 @@ static int unmap_and_move_huge_page(new_
-               goto put_anon;
-       if (page_mapped(hpage)) {
-+              struct address_space *mapping = page_mapping(hpage);
-+
-+              /*
-+               * try_to_unmap could potentially call huge_pmd_unshare.
-+               * Because of this, take semaphore in write mode here and
-+               * set TTU_RMAP_LOCKED to let lower levels know we have
-+               * taken the lock.
-+               */
-+              i_mmap_lock_write(mapping);
-               try_to_unmap(hpage,
--                      TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
-+                      TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-+                      TTU_RMAP_LOCKED);
-+              i_mmap_unlock_write(mapping);
-               page_was_mapped = 1;
-       }
---- a/mm/rmap.c
-+++ b/mm/rmap.c
-@@ -25,6 +25,7 @@
-  *     page->flags PG_locked (lock_page)
-  *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
-  *         mapping->i_mmap_rwsem
-+ *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
-  *           anon_vma->rwsem
-  *             mm->page_table_lock or pte_lock
-  *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
-@@ -1374,6 +1375,9 @@ static bool try_to_unmap_one(struct page
-               /*
-                * If sharing is possible, start and end will be adjusted
-                * accordingly.
-+               *
-+               * If called for a huge page, caller must hold i_mmap_rwsem
-+               * in write mode as it is possible to call huge_pmd_unshare.
-                */
-               adjust_range_if_pmd_sharing_possible(vma, &start, &end);
-       }
---- a/mm/userfaultfd.c
-+++ b/mm/userfaultfd.c
-@@ -267,10 +267,14 @@ retry:
-               VM_BUG_ON(dst_addr & ~huge_page_mask(h));
-               /*
--               * Serialize via hugetlb_fault_mutex
-+               * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
-+               * i_mmap_rwsem ensures the dst_pte remains valid even
-+               * in the case of shared pmds.  fault mutex prevents
-+               * races with other faulting threads.
-                */
--              idx = linear_page_index(dst_vma, dst_addr);
-               mapping = dst_vma->vm_file->f_mapping;
-+              i_mmap_lock_read(mapping);
-+              idx = linear_page_index(dst_vma, dst_addr);
-               hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
-                                                               idx, dst_addr);
-               mutex_lock(&hugetlb_fault_mutex_table[hash]);
-@@ -279,6 +283,7 @@ retry:
-               dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
-               if (!dst_pte) {
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+                      i_mmap_unlock_read(mapping);
-                       goto out_unlock;
-               }
-@@ -286,6 +291,7 @@ retry:
-               dst_pteval = huge_ptep_get(dst_pte);
-               if (!huge_pte_none(dst_pteval)) {
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+                      i_mmap_unlock_read(mapping);
-                       goto out_unlock;
-               }
-@@ -293,6 +299,7 @@ retry:
-                                               dst_addr, src_addr, &page);
-               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+              i_mmap_unlock_read(mapping);
-               vm_alloc_shared = vm_shared;
-               cond_resched();
diff --git a/queue-4.19/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch b/queue-4.19/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
deleted file mode 100644 (file)
index 1f42b81..0000000
+++ /dev/null
@@ -1,228 +0,0 @@
-From c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:42 -0800
-Subject: hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 upstream.
-
-hugetlbfs page faults can race with truncate and hole punch operations.
-Current code in the page fault path attempts to handle this by 'backing
-out' operations if we encounter the race.  One obvious omission in the
-current code is removing a page newly added to the page cache.  This is
-pretty straight forward to address, but there is a more subtle and
-difficult issue of backing out hugetlb reservations.  To handle this
-correctly, the 'reservation state' before page allocation needs to be
-noted so that it can be properly backed out.  There are four distinct
-possibilities for reservation state: shared/reserved, shared/no-resv,
-private/reserved and private/no-resv.  Backing out a reservation may
-require memory allocation which could fail so that needs to be taken into
-account as well.
-
-Instead of writing the required complicated code for this rare occurrence,
-just eliminate the race.  i_mmap_rwsem is now held in read mode for the
-duration of page fault processing.  Hold i_mmap_rwsem longer in truncation
-and hole punch code to cover the call to remove_inode_hugepages.
-
-With this modification, code in remove_inode_hugepages checking for races
-becomes 'dead' as it can no longer happen.  Remove the dead code and
-expand comments to explain reasoning.  Similarly, checks for races with
-truncation in the page fault path can be simplified and removed.
-
-[mike.kravetz@oracle.com: incorporate suggestions from Kirill]
-  Link: http://lkml.kernel.org/r/20181222223013.22193-3-mike.kravetz@oracle.com
-Link: http://lkml.kernel.org/r/20181218223557.5202-3-mike.kravetz@oracle.com
-Fixes: ebed4bfc8da8 ("hugetlb: fix absurd HugePages_Rsvd")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- fs/hugetlbfs/inode.c |   61 +++++++++++++++++++++++----------------------------
- mm/hugetlb.c         |   21 ++++++++---------
- 2 files changed, 38 insertions(+), 44 deletions(-)
-
---- a/fs/hugetlbfs/inode.c
-+++ b/fs/hugetlbfs/inode.c
-@@ -383,17 +383,16 @@ hugetlb_vmdelete_list(struct rb_root_cac
-  * truncation is indicated by end of range being LLONG_MAX
-  *    In this case, we first scan the range and release found pages.
-  *    After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
-- *    maps and global counts.  Page faults can not race with truncation
-- *    in this routine.  hugetlb_no_page() prevents page faults in the
-- *    truncated range.  It checks i_size before allocation, and again after
-- *    with the page table lock for the page held.  The same lock must be
-- *    acquired to unmap a page.
-+ *    maps and global counts.
-  * hole punch is indicated if end is not LLONG_MAX
-  *    In the hole punch case we scan the range and release found pages.
-  *    Only when releasing a page is the associated region/reserv map
-  *    deleted.  The region/reserv map for ranges without associated
-- *    pages are not modified.  Page faults can race with hole punch.
-- *    This is indicated if we find a mapped page.
-+ *    pages are not modified.
-+ *
-+ * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
-+ * races with page faults.
-+ *
-  * Note: If the passed end of range value is beyond the end of file, but
-  * not LLONG_MAX this routine still performs a hole punch operation.
-  */
-@@ -423,32 +422,14 @@ static void remove_inode_hugepages(struc
-               for (i = 0; i < pagevec_count(&pvec); ++i) {
-                       struct page *page = pvec.pages[i];
--                      u32 hash;
-                       index = page->index;
--                      hash = hugetlb_fault_mutex_hash(h, current->mm,
--                                                      &pseudo_vma,
--                                                      mapping, index, 0);
--                      mutex_lock(&hugetlb_fault_mutex_table[hash]);
--
-                       /*
--                       * If page is mapped, it was faulted in after being
--                       * unmapped in caller.  Unmap (again) now after taking
--                       * the fault mutex.  The mutex will prevent faults
--                       * until we finish removing the page.
--                       *
--                       * This race can only happen in the hole punch case.
--                       * Getting here in a truncate operation is a bug.
-+                       * A mapped page is impossible as callers should unmap
-+                       * all references before calling.  And, i_mmap_rwsem
-+                       * prevents the creation of additional mappings.
-                        */
--                      if (unlikely(page_mapped(page))) {
--                              BUG_ON(truncate_op);
--
--                              i_mmap_lock_write(mapping);
--                              hugetlb_vmdelete_list(&mapping->i_mmap,
--                                      index * pages_per_huge_page(h),
--                                      (index + 1) * pages_per_huge_page(h));
--                              i_mmap_unlock_write(mapping);
--                      }
-+                      VM_BUG_ON(page_mapped(page));
-                       lock_page(page);
-                       /*
-@@ -470,7 +451,6 @@ static void remove_inode_hugepages(struc
-                       }
-                       unlock_page(page);
--                      mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-               }
-               huge_pagevec_release(&pvec);
-               cond_resched();
-@@ -482,9 +462,20 @@ static void remove_inode_hugepages(struc
- static void hugetlbfs_evict_inode(struct inode *inode)
- {
-+      struct address_space *mapping = inode->i_mapping;
-       struct resv_map *resv_map;
-+      /*
-+       * The vfs layer guarantees that there are no other users of this
-+       * inode.  Therefore, it would be safe to call remove_inode_hugepages
-+       * without holding i_mmap_rwsem.  We acquire and hold here to be
-+       * consistent with other callers.  Since there will be no contention
-+       * on the semaphore, overhead is negligible.
-+       */
-+      i_mmap_lock_write(mapping);
-       remove_inode_hugepages(inode, 0, LLONG_MAX);
-+      i_mmap_unlock_write(mapping);
-+
-       resv_map = (struct resv_map *)inode->i_mapping->private_data;
-       /* root inode doesn't have the resv_map, so we should check it */
-       if (resv_map)
-@@ -505,8 +496,8 @@ static int hugetlb_vmtruncate(struct ino
-       i_mmap_lock_write(mapping);
-       if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
-               hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
--      i_mmap_unlock_write(mapping);
-       remove_inode_hugepages(inode, offset, LLONG_MAX);
-+      i_mmap_unlock_write(mapping);
-       return 0;
- }
-@@ -540,8 +531,8 @@ static long hugetlbfs_punch_hole(struct
-                       hugetlb_vmdelete_list(&mapping->i_mmap,
-                                               hole_start >> PAGE_SHIFT,
-                                               hole_end  >> PAGE_SHIFT);
--              i_mmap_unlock_write(mapping);
-               remove_inode_hugepages(inode, hole_start, hole_end);
-+              i_mmap_unlock_write(mapping);
-               inode_unlock(inode);
-       }
-@@ -624,7 +615,11 @@ static long hugetlbfs_fallocate(struct f
-               /* addr is the offset within the file (zero based) */
-               addr = index * hpage_size;
--              /* mutex taken here, fault path and hole punch */
-+              /*
-+               * fault mutex taken here, protects against fault path
-+               * and hole punch.  inode_lock previously taken protects
-+               * against truncation.
-+               */
-               hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
-                                               index, addr);
-               mutex_lock(&hugetlb_fault_mutex_table[hash]);
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3757,16 +3757,16 @@ static vm_fault_t hugetlb_no_page(struct
-       }
-       /*
--       * Use page lock to guard against racing truncation
--       * before we get page_table_lock.
-+       * We can not race with truncation due to holding i_mmap_rwsem.
-+       * Check once here for faults beyond end of file.
-        */
-+      size = i_size_read(mapping->host) >> huge_page_shift(h);
-+      if (idx >= size)
-+              goto out;
-+
- retry:
-       page = find_lock_page(mapping, idx);
-       if (!page) {
--              size = i_size_read(mapping->host) >> huge_page_shift(h);
--              if (idx >= size)
--                      goto out;
--
-               /*
-                * Check for page in userfault range
-                */
-@@ -3856,9 +3856,6 @@ retry:
-       }
-       ptl = huge_pte_lock(h, mm, ptep);
--      size = i_size_read(mapping->host) >> huge_page_shift(h);
--      if (idx >= size)
--              goto backout;
-       ret = 0;
-       if (!huge_pte_none(huge_ptep_get(ptep)))
-@@ -3961,8 +3958,10 @@ vm_fault_t hugetlb_fault(struct mm_struc
-       /*
-        * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
--       * until finished with ptep.  This prevents huge_pmd_unshare from
--       * being called elsewhere and making the ptep no longer valid.
-+       * until finished with ptep.  This serves two purposes:
-+       * 1) It prevents huge_pmd_unshare from being called elsewhere
-+       *    and making the ptep no longer valid.
-+       * 2) It synchronizes us with file truncation.
-        *
-        * ptep could have already been assigned via huge_pte_offset.  That
-        * is OK, as huge_pte_alloc will return the same value unless
index ad3b98af3f35f77c9f395a3cb01dbe12a1b80753..a77f12101358e2c476d0a55dc21b16e2d14cae39 100644 (file)
@@ -86,8 +86,6 @@ mm-devm_memremap_pages-add-memory_device_private-support.patch
 mm-hmm-use-devm-semantics-for-hmm_devmem_-add-remove.patch
 mm-hmm-mark-hmm_devmem_-add-add_resource-export_symbol_gpl.patch
 mm-swap-fix-swapoff-with-ksm-pages.patch
-hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
-hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
 memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path.patch
 sunrpc-fix-cache_head-leak-due-to-queued-request.patch
 sunrpc-use-svc_net-in-svcauth_gss_-functions.patch
diff --git a/queue-4.20/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch b/queue-4.20/hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
deleted file mode 100644 (file)
index e471fe2..0000000
+++ /dev/null
@@ -1,345 +0,0 @@
-From b43a9990055958e70347c56f90ea2ae32c67334c Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:38 -0800
-Subject: hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit b43a9990055958e70347c56f90ea2ae32c67334c upstream.
-
-While looking at BUGs associated with invalid huge page map counts, it was
-discovered and observed that a huge pte pointer could become 'invalid' and
-point to another task's page table.  Consider the following:
-
-A task takes a page fault on a shared hugetlbfs file and calls
-huge_pte_alloc to get a ptep.  Suppose the returned ptep points to a
-shared pmd.
-
-Now, another task truncates the hugetlbfs file.  As part of truncation, it
-unmaps everyone who has the file mapped.  If the range being truncated is
-covered by a shared pmd, huge_pmd_unshare will be called.  For all but the
-last user of the shared pmd, huge_pmd_unshare will clear the pud pointing
-to the pmd.  If the task in the middle of the page fault is not the last
-user, the ptep returned by huge_pte_alloc now points to another task's
-page table or worse.  This leads to bad things such as incorrect page
-map/reference counts or invalid memory references.
-
-To fix, expand the use of i_mmap_rwsem as follows:
-
-- i_mmap_rwsem is held in read mode whenever huge_pmd_share is called.
-  huge_pmd_share is only called via huge_pte_alloc, so callers of
-  huge_pte_alloc take i_mmap_rwsem before calling.  In addition, callers
-  of huge_pte_alloc continue to hold the semaphore until finished with the
-  ptep.
-
-- i_mmap_rwsem is held in write mode whenever huge_pmd_unshare is
-  called.
-
-[mike.kravetz@oracle.com: add explicit check for mapping != null]
-Link: http://lkml.kernel.org/r/20181218223557.5202-2-mike.kravetz@oracle.com
-Fixes: 39dde65c9940 ("shared page table for hugetlb page")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: Colin Ian King <colin.king@canonical.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- mm/hugetlb.c        |   65 ++++++++++++++++++++++++++++++++++++++++------------
- mm/memory-failure.c |   16 +++++++++++-
- mm/migrate.c        |   13 +++++++++-
- mm/rmap.c           |    4 +++
- mm/userfaultfd.c    |   11 +++++++-
- 5 files changed, 89 insertions(+), 20 deletions(-)
-
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3238,6 +3238,7 @@ int copy_hugetlb_page_range(struct mm_st
-       struct page *ptepage;
-       unsigned long addr;
-       int cow;
-+      struct address_space *mapping = vma->vm_file->f_mapping;
-       struct hstate *h = hstate_vma(vma);
-       unsigned long sz = huge_page_size(h);
-       unsigned long mmun_start;       /* For mmu_notifiers */
-@@ -3250,12 +3251,23 @@ int copy_hugetlb_page_range(struct mm_st
-       mmun_end = vma->vm_end;
-       if (cow)
-               mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
-+      else {
-+              /*
-+               * For shared mappings i_mmap_rwsem must be held to call
-+               * huge_pte_alloc, otherwise the returned ptep could go
-+               * away if part of a shared pmd and another thread calls
-+               * huge_pmd_unshare.
-+               */
-+              i_mmap_lock_read(mapping);
-+      }
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
-               spinlock_t *src_ptl, *dst_ptl;
-+
-               src_pte = huge_pte_offset(src, addr, sz);
-               if (!src_pte)
-                       continue;
-+
-               dst_pte = huge_pte_alloc(dst, addr, sz);
-               if (!dst_pte) {
-                       ret = -ENOMEM;
-@@ -3326,6 +3338,8 @@ int copy_hugetlb_page_range(struct mm_st
-       if (cow)
-               mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
-+      else
-+              i_mmap_unlock_read(mapping);
-       return ret;
- }
-@@ -3773,14 +3787,18 @@ retry:
-                       };
-                       /*
--                       * hugetlb_fault_mutex must be dropped before
--                       * handling userfault.  Reacquire after handling
--                       * fault to make calling code simpler.
-+                       * hugetlb_fault_mutex and i_mmap_rwsem must be
-+                       * dropped before handling userfault.  Reacquire
-+                       * after handling fault to make calling code simpler.
-                        */
-                       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
-                                                       idx, haddr);
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+                      i_mmap_unlock_read(mapping);
-+
-                       ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-+
-+                      i_mmap_lock_read(mapping);
-                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-                       goto out;
-               }
-@@ -3928,6 +3946,11 @@ vm_fault_t hugetlb_fault(struct mm_struc
-       ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
-       if (ptep) {
-+              /*
-+               * Since we hold no locks, ptep could be stale.  That is
-+               * OK as we are only making decisions based on content and
-+               * not actually modifying content here.
-+               */
-               entry = huge_ptep_get(ptep);
-               if (unlikely(is_hugetlb_entry_migration(entry))) {
-                       migration_entry_wait_huge(vma, mm, ptep);
-@@ -3935,20 +3958,31 @@ vm_fault_t hugetlb_fault(struct mm_struc
-               } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
-                       return VM_FAULT_HWPOISON_LARGE |
-                               VM_FAULT_SET_HINDEX(hstate_index(h));
--      } else {
--              ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
--              if (!ptep)
--                      return VM_FAULT_OOM;
-       }
-+      /*
-+       * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-+       * until finished with ptep.  This prevents huge_pmd_unshare from
-+       * being called elsewhere and making the ptep no longer valid.
-+       *
-+       * ptep could have already been assigned via huge_pte_offset.  That
-+       * is OK, as huge_pte_alloc will return the same value unless
-+       * something changed.
-+       */
-       mapping = vma->vm_file->f_mapping;
--      idx = vma_hugecache_offset(h, vma, haddr);
-+      i_mmap_lock_read(mapping);
-+      ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
-+      if (!ptep) {
-+              i_mmap_unlock_read(mapping);
-+              return VM_FAULT_OOM;
-+      }
-       /*
-        * Serialize hugepage allocation and instantiation, so that we don't
-        * get spurious allocation failures if two CPUs race to instantiate
-        * the same page in the page cache.
-        */
-+      idx = vma_hugecache_offset(h, vma, haddr);
-       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
-       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-@@ -4036,6 +4070,7 @@ out_ptl:
-       }
- out_mutex:
-       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+      i_mmap_unlock_read(mapping);
-       /*
-        * Generally it's safe to hold refcount during waiting page lock. But
-        * here we just wait to defer the next page fault to avoid busy loop and
-@@ -4640,10 +4675,12 @@ void adjust_range_if_pmd_sharing_possibl
-  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
-  * and returns the corresponding pte. While this is not necessary for the
-  * !shared pmd case because we can allocate the pmd later as well, it makes the
-- * code much cleaner. pmd allocation is essential for the shared case because
-- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
-- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
-- * bad pmd for sharing.
-+ * code much cleaner.
-+ *
-+ * This routine must be called with i_mmap_rwsem held in at least read mode.
-+ * For hugetlbfs, this prevents removal of any page table entries associated
-+ * with the address space.  This is important as we are setting up sharing
-+ * based on existing page table entries (mappings).
-  */
- pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
- {
-@@ -4660,7 +4697,6 @@ pte_t *huge_pmd_share(struct mm_struct *
-       if (!vma_shareable(vma, addr))
-               return (pte_t *)pmd_alloc(mm, pud, addr);
--      i_mmap_lock_write(mapping);
-       vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
-               if (svma == vma)
-                       continue;
-@@ -4690,7 +4726,6 @@ pte_t *huge_pmd_share(struct mm_struct *
-       spin_unlock(ptl);
- out:
-       pte = (pte_t *)pmd_alloc(mm, pud, addr);
--      i_mmap_unlock_write(mapping);
-       return pte;
- }
-@@ -4701,7 +4736,7 @@ out:
-  * indicated by page_count > 1, unmap is achieved by clearing pud and
-  * decrementing the ref count. If count == 1, the pte page is not shared.
-  *
-- * called with page table lock held.
-+ * Called with page table lock held and i_mmap_rwsem held in write mode.
-  *
-  * returns: 1 successfully unmapped a shared pte page
-  *        0 the underlying pte page is not shared, or it is the last user
---- a/mm/memory-failure.c
-+++ b/mm/memory-failure.c
-@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struc
-       enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
-       struct address_space *mapping;
-       LIST_HEAD(tokill);
--      bool unmap_success;
-+      bool unmap_success = true;
-       int kill = 1, forcekill;
-       struct page *hpage = *hpagep;
-       bool mlocked = PageMlocked(hpage);
-@@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struc
-       if (kill)
-               collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
--      unmap_success = try_to_unmap(hpage, ttu);
-+      if (!PageHuge(hpage)) {
-+              unmap_success = try_to_unmap(hpage, ttu);
-+      } else if (mapping) {
-+              /*
-+               * For hugetlb pages, try_to_unmap could potentially call
-+               * huge_pmd_unshare.  Because of this, take semaphore in
-+               * write mode here and set TTU_RMAP_LOCKED to indicate we
-+               * have taken the lock at this higher level.
-+               */
-+              i_mmap_lock_write(mapping);
-+              unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
-+              i_mmap_unlock_write(mapping);
-+      }
-       if (!unmap_success)
-               pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
-                      pfn, page_mapcount(hpage));
---- a/mm/migrate.c
-+++ b/mm/migrate.c
-@@ -1297,8 +1297,19 @@ static int unmap_and_move_huge_page(new_
-               goto put_anon;
-       if (page_mapped(hpage)) {
-+              struct address_space *mapping = page_mapping(hpage);
-+
-+              /*
-+               * try_to_unmap could potentially call huge_pmd_unshare.
-+               * Because of this, take semaphore in write mode here and
-+               * set TTU_RMAP_LOCKED to let lower levels know we have
-+               * taken the lock.
-+               */
-+              i_mmap_lock_write(mapping);
-               try_to_unmap(hpage,
--                      TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
-+                      TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-+                      TTU_RMAP_LOCKED);
-+              i_mmap_unlock_write(mapping);
-               page_was_mapped = 1;
-       }
---- a/mm/rmap.c
-+++ b/mm/rmap.c
-@@ -25,6 +25,7 @@
-  *     page->flags PG_locked (lock_page)
-  *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
-  *         mapping->i_mmap_rwsem
-+ *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
-  *           anon_vma->rwsem
-  *             mm->page_table_lock or pte_lock
-  *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
-@@ -1374,6 +1375,9 @@ static bool try_to_unmap_one(struct page
-               /*
-                * If sharing is possible, start and end will be adjusted
-                * accordingly.
-+               *
-+               * If called for a huge page, caller must hold i_mmap_rwsem
-+               * in write mode as it is possible to call huge_pmd_unshare.
-                */
-               adjust_range_if_pmd_sharing_possible(vma, &start, &end);
-       }
---- a/mm/userfaultfd.c
-+++ b/mm/userfaultfd.c
-@@ -267,10 +267,14 @@ retry:
-               VM_BUG_ON(dst_addr & ~huge_page_mask(h));
-               /*
--               * Serialize via hugetlb_fault_mutex
-+               * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
-+               * i_mmap_rwsem ensures the dst_pte remains valid even
-+               * in the case of shared pmds.  fault mutex prevents
-+               * races with other faulting threads.
-                */
--              idx = linear_page_index(dst_vma, dst_addr);
-               mapping = dst_vma->vm_file->f_mapping;
-+              i_mmap_lock_read(mapping);
-+              idx = linear_page_index(dst_vma, dst_addr);
-               hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
-                                                               idx, dst_addr);
-               mutex_lock(&hugetlb_fault_mutex_table[hash]);
-@@ -279,6 +283,7 @@ retry:
-               dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
-               if (!dst_pte) {
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+                      i_mmap_unlock_read(mapping);
-                       goto out_unlock;
-               }
-@@ -286,6 +291,7 @@ retry:
-               dst_pteval = huge_ptep_get(dst_pte);
-               if (!huge_pte_none(dst_pteval)) {
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+                      i_mmap_unlock_read(mapping);
-                       goto out_unlock;
-               }
-@@ -293,6 +299,7 @@ retry:
-                                               dst_addr, src_addr, &page);
-               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-+              i_mmap_unlock_read(mapping);
-               vm_alloc_shared = vm_shared;
-               cond_resched();
diff --git a/queue-4.20/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch b/queue-4.20/hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
deleted file mode 100644 (file)
index 940622c..0000000
+++ /dev/null
@@ -1,228 +0,0 @@
-From c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 Mon Sep 17 00:00:00 2001
-From: Mike Kravetz <mike.kravetz@oracle.com>
-Date: Fri, 28 Dec 2018 00:39:42 -0800
-Subject: hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race
-
-From: Mike Kravetz <mike.kravetz@oracle.com>
-
-commit c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54 upstream.
-
-hugetlbfs page faults can race with truncate and hole punch operations.
-Current code in the page fault path attempts to handle this by 'backing
-out' operations if we encounter the race.  One obvious omission in the
-current code is removing a page newly added to the page cache.  This is
-pretty straight forward to address, but there is a more subtle and
-difficult issue of backing out hugetlb reservations.  To handle this
-correctly, the 'reservation state' before page allocation needs to be
-noted so that it can be properly backed out.  There are four distinct
-possibilities for reservation state: shared/reserved, shared/no-resv,
-private/reserved and private/no-resv.  Backing out a reservation may
-require memory allocation which could fail so that needs to be taken into
-account as well.
-
-Instead of writing the required complicated code for this rare occurrence,
-just eliminate the race.  i_mmap_rwsem is now held in read mode for the
-duration of page fault processing.  Hold i_mmap_rwsem longer in truncation
-and hole punch code to cover the call to remove_inode_hugepages.
-
-With this modification, code in remove_inode_hugepages checking for races
-becomes 'dead' as it can no longer happen.  Remove the dead code and
-expand comments to explain reasoning.  Similarly, checks for races with
-truncation in the page fault path can be simplified and removed.
-
-[mike.kravetz@oracle.com: incorporate suggestions from Kirill]
-  Link: http://lkml.kernel.org/r/20181222223013.22193-3-mike.kravetz@oracle.com
-Link: http://lkml.kernel.org/r/20181218223557.5202-3-mike.kravetz@oracle.com
-Fixes: ebed4bfc8da8 ("hugetlb: fix absurd HugePages_Rsvd")
-Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
-Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
-Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
-Cc: Andrea Arcangeli <aarcange@redhat.com>
-Cc: Davidlohr Bueso <dave@stgolabs.net>
-Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
-Cc: <stable@vger.kernel.org>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- fs/hugetlbfs/inode.c |   61 +++++++++++++++++++++++----------------------------
- mm/hugetlb.c         |   21 ++++++++---------
- 2 files changed, 38 insertions(+), 44 deletions(-)
-
---- a/fs/hugetlbfs/inode.c
-+++ b/fs/hugetlbfs/inode.c
-@@ -383,17 +383,16 @@ hugetlb_vmdelete_list(struct rb_root_cac
-  * truncation is indicated by end of range being LLONG_MAX
-  *    In this case, we first scan the range and release found pages.
-  *    After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
-- *    maps and global counts.  Page faults can not race with truncation
-- *    in this routine.  hugetlb_no_page() prevents page faults in the
-- *    truncated range.  It checks i_size before allocation, and again after
-- *    with the page table lock for the page held.  The same lock must be
-- *    acquired to unmap a page.
-+ *    maps and global counts.
-  * hole punch is indicated if end is not LLONG_MAX
-  *    In the hole punch case we scan the range and release found pages.
-  *    Only when releasing a page is the associated region/reserv map
-  *    deleted.  The region/reserv map for ranges without associated
-- *    pages are not modified.  Page faults can race with hole punch.
-- *    This is indicated if we find a mapped page.
-+ *    pages are not modified.
-+ *
-+ * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
-+ * races with page faults.
-+ *
-  * Note: If the passed end of range value is beyond the end of file, but
-  * not LLONG_MAX this routine still performs a hole punch operation.
-  */
-@@ -423,32 +422,14 @@ static void remove_inode_hugepages(struc
-               for (i = 0; i < pagevec_count(&pvec); ++i) {
-                       struct page *page = pvec.pages[i];
--                      u32 hash;
-                       index = page->index;
--                      hash = hugetlb_fault_mutex_hash(h, current->mm,
--                                                      &pseudo_vma,
--                                                      mapping, index, 0);
--                      mutex_lock(&hugetlb_fault_mutex_table[hash]);
--
-                       /*
--                       * If page is mapped, it was faulted in after being
--                       * unmapped in caller.  Unmap (again) now after taking
--                       * the fault mutex.  The mutex will prevent faults
--                       * until we finish removing the page.
--                       *
--                       * This race can only happen in the hole punch case.
--                       * Getting here in a truncate operation is a bug.
-+                       * A mapped page is impossible as callers should unmap
-+                       * all references before calling.  And, i_mmap_rwsem
-+                       * prevents the creation of additional mappings.
-                        */
--                      if (unlikely(page_mapped(page))) {
--                              BUG_ON(truncate_op);
--
--                              i_mmap_lock_write(mapping);
--                              hugetlb_vmdelete_list(&mapping->i_mmap,
--                                      index * pages_per_huge_page(h),
--                                      (index + 1) * pages_per_huge_page(h));
--                              i_mmap_unlock_write(mapping);
--                      }
-+                      VM_BUG_ON(page_mapped(page));
-                       lock_page(page);
-                       /*
-@@ -470,7 +451,6 @@ static void remove_inode_hugepages(struc
-                       }
-                       unlock_page(page);
--                      mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-               }
-               huge_pagevec_release(&pvec);
-               cond_resched();
-@@ -482,9 +462,20 @@ static void remove_inode_hugepages(struc
- static void hugetlbfs_evict_inode(struct inode *inode)
- {
-+      struct address_space *mapping = inode->i_mapping;
-       struct resv_map *resv_map;
-+      /*
-+       * The vfs layer guarantees that there are no other users of this
-+       * inode.  Therefore, it would be safe to call remove_inode_hugepages
-+       * without holding i_mmap_rwsem.  We acquire and hold here to be
-+       * consistent with other callers.  Since there will be no contention
-+       * on the semaphore, overhead is negligible.
-+       */
-+      i_mmap_lock_write(mapping);
-       remove_inode_hugepages(inode, 0, LLONG_MAX);
-+      i_mmap_unlock_write(mapping);
-+
-       resv_map = (struct resv_map *)inode->i_mapping->private_data;
-       /* root inode doesn't have the resv_map, so we should check it */
-       if (resv_map)
-@@ -505,8 +496,8 @@ static int hugetlb_vmtruncate(struct ino
-       i_mmap_lock_write(mapping);
-       if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
-               hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
--      i_mmap_unlock_write(mapping);
-       remove_inode_hugepages(inode, offset, LLONG_MAX);
-+      i_mmap_unlock_write(mapping);
-       return 0;
- }
-@@ -540,8 +531,8 @@ static long hugetlbfs_punch_hole(struct
-                       hugetlb_vmdelete_list(&mapping->i_mmap,
-                                               hole_start >> PAGE_SHIFT,
-                                               hole_end  >> PAGE_SHIFT);
--              i_mmap_unlock_write(mapping);
-               remove_inode_hugepages(inode, hole_start, hole_end);
-+              i_mmap_unlock_write(mapping);
-               inode_unlock(inode);
-       }
-@@ -624,7 +615,11 @@ static long hugetlbfs_fallocate(struct f
-               /* addr is the offset within the file (zero based) */
-               addr = index * hpage_size;
--              /* mutex taken here, fault path and hole punch */
-+              /*
-+               * fault mutex taken here, protects against fault path
-+               * and hole punch.  inode_lock previously taken protects
-+               * against truncation.
-+               */
-               hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
-                                               index, addr);
-               mutex_lock(&hugetlb_fault_mutex_table[hash]);
---- a/mm/hugetlb.c
-+++ b/mm/hugetlb.c
-@@ -3758,16 +3758,16 @@ static vm_fault_t hugetlb_no_page(struct
-       }
-       /*
--       * Use page lock to guard against racing truncation
--       * before we get page_table_lock.
-+       * We can not race with truncation due to holding i_mmap_rwsem.
-+       * Check once here for faults beyond end of file.
-        */
-+      size = i_size_read(mapping->host) >> huge_page_shift(h);
-+      if (idx >= size)
-+              goto out;
-+
- retry:
-       page = find_lock_page(mapping, idx);
-       if (!page) {
--              size = i_size_read(mapping->host) >> huge_page_shift(h);
--              if (idx >= size)
--                      goto out;
--
-               /*
-                * Check for page in userfault range
-                */
-@@ -3857,9 +3857,6 @@ retry:
-       }
-       ptl = huge_pte_lock(h, mm, ptep);
--      size = i_size_read(mapping->host) >> huge_page_shift(h);
--      if (idx >= size)
--              goto backout;
-       ret = 0;
-       if (!huge_pte_none(huge_ptep_get(ptep)))
-@@ -3962,8 +3959,10 @@ vm_fault_t hugetlb_fault(struct mm_struc
-       /*
-        * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
--       * until finished with ptep.  This prevents huge_pmd_unshare from
--       * being called elsewhere and making the ptep no longer valid.
-+       * until finished with ptep.  This serves two purposes:
-+       * 1) It prevents huge_pmd_unshare from being called elsewhere
-+       *    and making the ptep no longer valid.
-+       * 2) It synchronizes us with file truncation.
-        *
-        * ptep could have already been assigned via huge_pte_offset.  That
-        * is OK, as huge_pte_alloc will return the same value unless
index f80e1a2e1c2936b098bc0c59387fa9f54944631f..b97dd01b6790795e7961c1fcb5f1fcc195db39b3 100644 (file)
@@ -6,8 +6,6 @@ hwpoison-memory_hotplug-allow-hwpoisoned-pages-to-be-offlined.patch
 mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch
 mm-devm_memremap_pages-kill-mapping-system-ram-support.patch
 mm-devm_memremap_pages-fix-shutdown-handling.patch
-hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
-hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch
 memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path.patch
 sunrpc-fix-cache_head-leak-due-to-queued-request.patch
 sunrpc-use-svc_net-in-svcauth_gss_-functions.patch