From 1b426bac66e6cc83c9f2d92b96e4e72acf43419a Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Mon, 13 May 2019 17:19:41 -0700
Subject: hugetlb: use same fault hash key for shared and private mappings

From: Mike Kravetz <mike.kravetz@oracle.com>

commit 1b426bac66e6cc83c9f2d92b96e4e72acf43419a upstream.

hugetlb uses a fault mutex hash table to prevent page faults of the
same pages concurrently. The key for shared and private mappings is
different. Shared keys off address_space and file index. Private keys
off mm and virtual address. Consider a private mapping of a populated
hugetlbfs file. A fault will map the page from the file and if needed
do a COW to map a writable page.
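
For reference, the pre-patch key selection in hugetlb_fault_mutex_hash()
(a simplified fragment of the code removed by the mm/hugetlb.c hunk
below) is roughly:

	if (vma->vm_flags & VM_SHARED) {
		key[0] = (unsigned long) mapping;	/* file-backed key */
		key[1] = idx;				/* page index within the file */
	} else {
		key[0] = (unsigned long) mm;		/* per-process key */
		key[1] = address >> huge_page_shift(h);	/* address in huge page units */
	}
	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);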

Hugetlbfs hole punch uses the fault mutex to prevent mappings of file
pages. It uses the address_space file index key. However, private
mappings will use a different key and could race with this code to map
the file page. This causes problems (BUG) for the page cache remove
code as it expects the page to be unmapped. A sample stack is:

page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
kernel BUG at mm/filemap.c:169!
...
RIP: 0010:unaccount_page_cache_page+0x1b8/0x200
...
Call Trace:
 __delete_from_page_cache+0x39/0x220
 delete_from_page_cache+0x45/0x70
 remove_inode_hugepages+0x13c/0x380
 ? __add_to_page_cache_locked+0x162/0x380
 hugetlbfs_fallocate+0x403/0x540
 ? _cond_resched+0x15/0x30
 ? __inode_security_revalidate+0x5d/0x70
 ? selinux_file_permission+0x100/0x130
 vfs_fallocate+0x13f/0x270
 ksys_fallocate+0x3c/0x80
 __x64_sys_fallocate+0x1a/0x20
 do_syscall_64+0x5b/0x180
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

There seems to be another potential COW issue/race with this approach
of different private and shared keys as noted in commit 8382d914ebf7
("mm, hugetlb: improve page-fault scalability").

Since every hugetlb mapping (even anon and private) is actually a file
mapping, just use the address_space index key for all mappings. This
results in potentially more hash collisions. However, this should not
be the common case.
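
With a single key scheme, the hole punch path and the page fault path
compute the same hash for a given file page and therefore serialize on
the same mutex. A minimal sketch of a post-patch caller (mirroring the
hunks below):

	/* same key whether the mapping is shared or private */
	hash = hugetlb_fault_mutex_hash(h, mapping, idx, address);
	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	/* ... instantiate or remove the page ... */
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);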

Link: http://lkml.kernel.org/r/20190328234704.27083-3-mike.kravetz@oracle.com
Link: http://lkml.kernel.org/r/20190412165235.t4sscoujczfhuiyt@linux-r8p5
Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>


---
 fs/hugetlbfs/inode.c    |    8 ++------
 include/linux/hugetlb.h |    4 +---
 mm/hugetlb.c            |   19 +++++--------------
 3 files changed, 8 insertions(+), 23 deletions(-)

--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -414,9 +414,7 @@ static void remove_inode_hugepages(struc
 			if (next >= end)
 				break;
 
-			hash = hugetlb_fault_mutex_hash(h, current->mm,
-							&pseudo_vma,
-							mapping, next, 0);
+			hash = hugetlb_fault_mutex_hash(h, mapping, next, 0);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 			lock_page(page);
@@ -569,7 +567,6 @@ static long hugetlbfs_fallocate(struct f
 	struct address_space *mapping = inode->i_mapping;
 	struct hstate *h = hstate_inode(inode);
 	struct vm_area_struct pseudo_vma;
-	struct mm_struct *mm = current->mm;
 	loff_t hpage_size = huge_page_size(h);
 	unsigned long hpage_shift = huge_page_shift(h);
 	pgoff_t start, index, end;
@@ -633,8 +630,7 @@ static long hugetlbfs_fallocate(struct f
 		addr = index * hpage_size;
 
 		/* mutex taken here, fault path and hole punch */
-		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
-						index, addr);
+		hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		/* See if already present in mapping to avoid alloc/free */
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -91,9 +91,7 @@ void putback_active_hugepage(struct page
 void free_huge_page(struct page *page);
 void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve);
 extern struct mutex *hugetlb_fault_mutex_table;
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
-				struct vm_area_struct *vma,
-				struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
 				pgoff_t idx, unsigned long address);
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3703,21 +3703,14 @@ backout_unlocked:
 }
 
 #ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
-			    struct vm_area_struct *vma,
-			    struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
 			    pgoff_t idx, unsigned long address)
 {
 	unsigned long key[2];
 	u32 hash;
 
-	if (vma->vm_flags & VM_SHARED) {
-		key[0] = (unsigned long) mapping;
-		key[1] = idx;
-	} else {
-		key[0] = (unsigned long) mm;
-		key[1] = address >> huge_page_shift(h);
-	}
+	key[0] = (unsigned long) mapping;
+	key[1] = idx;
 
 	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
 
@@ -3728,9 +3721,7 @@ u32 hugetlb_fault_mutex_hash(struct hsta
  * For uniprocesor systems we always use a single mutex, so just
  * return 0 and avoid the hashing overhead.
  */
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
-			    struct vm_area_struct *vma,
-			    struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
 			    pgoff_t idx, unsigned long address)
 {
 	return 0;
@@ -3776,7 +3767,7 @@ int hugetlb_fault(struct mm_struct *mm,
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+	hash = hugetlb_fault_mutex_hash(h, mapping, idx, address);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 	entry = huge_ptep_get(ptep);