]>
Commit | Line | Data |
---|---|---|
21f93658 GKH |
1 | From 1b426bac66e6cc83c9f2d92b96e4e72acf43419a Mon Sep 17 00:00:00 2001 |
2 | From: Mike Kravetz <mike.kravetz@oracle.com> | |
3 | Date: Mon, 13 May 2019 17:19:41 -0700 | |
4 | Subject: hugetlb: use same fault hash key for shared and private mappings | |
5 | ||
6 | From: Mike Kravetz <mike.kravetz@oracle.com> | |
7 | ||
8 | commit 1b426bac66e6cc83c9f2d92b96e4e72acf43419a upstream. | |
9 | ||
10 | hugetlb uses a fault mutex hash table to prevent page faults of the | |
11 | same pages concurrently. The key for shared and private mappings is | |
12 | different. Shared keys off address_space and file index. Private keys | |
13 | off mm and virtual address. Consider a private mapping of a populated | |
14 | hugetlbfs file. A fault will map the page from the file and if needed | |
15 | do a COW to map a writable page. | |
16 | ||
17 | Hugetlbfs hole punch uses the fault mutex to prevent mappings of file | |
18 | pages. It uses the address_space file index key. However, private | |
19 | mappings will use a different key and could race with this code to map | |
20 | the file page. This causes problems (BUG) for the page cache remove | |
21 | code as it expects the page to be unmapped. A sample stack is: | |
22 | ||
23 | page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) | |
24 | kernel BUG at mm/filemap.c:169! | |
25 | ... | |
26 | RIP: 0010:unaccount_page_cache_page+0x1b8/0x200 | |
27 | ... | |
28 | Call Trace: | |
29 | __delete_from_page_cache+0x39/0x220 | |
30 | delete_from_page_cache+0x45/0x70 | |
31 | remove_inode_hugepages+0x13c/0x380 | |
32 | ? __add_to_page_cache_locked+0x162/0x380 | |
33 | hugetlbfs_fallocate+0x403/0x540 | |
34 | ? _cond_resched+0x15/0x30 | |
35 | ? __inode_security_revalidate+0x5d/0x70 | |
36 | ? selinux_file_permission+0x100/0x130 | |
37 | vfs_fallocate+0x13f/0x270 | |
38 | ksys_fallocate+0x3c/0x80 | |
39 | __x64_sys_fallocate+0x1a/0x20 | |
40 | do_syscall_64+0x5b/0x180 | |
41 | entry_SYSCALL_64_after_hwframe+0x44/0xa9 | |
42 | ||
43 | There seems to be another potential COW issue/race with this approach | |
44 | of different private and shared keys as noted in commit 8382d914ebf7 | |
45 | ("mm, hugetlb: improve page-fault scalability"). | |
46 | ||
47 | Since every hugetlb mapping (even anon and private) is actually a file | |
48 | mapping, just use the address_space index key for all mappings. This | |
49 | results in potentially more hash collisions. However, this should not | |
50 | be the common case. | |
51 | ||
52 | Link: http://lkml.kernel.org/r/20190328234704.27083-3-mike.kravetz@oracle.com | |
53 | Link: http://lkml.kernel.org/r/20190412165235.t4sscoujczfhuiyt@linux-r8p5 | |
54 | Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages") | |
55 | Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> | |
56 | Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> | |
57 | Reviewed-by: Davidlohr Bueso <dbueso@suse.de> | |
58 | Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> | |
59 | Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com> | |
60 | Cc: Michal Hocko <mhocko@kernel.org> | |
61 | Cc: <stable@vger.kernel.org> | |
62 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | |
63 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | |
64 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
65 | ||
66 | --- | |
67 | fs/hugetlbfs/inode.c | 7 ++----- | |
68 | include/linux/hugetlb.h | 4 +--- | |
69 | mm/hugetlb.c | 22 ++++++---------------- | |
70 | mm/userfaultfd.c | 3 +-- | |
71 | 4 files changed, 10 insertions(+), 26 deletions(-) | |
72 | ||
73 | --- a/fs/hugetlbfs/inode.c | |
74 | +++ b/fs/hugetlbfs/inode.c | |
75 | @@ -426,9 +426,7 @@ static void remove_inode_hugepages(struc | |
76 | u32 hash; | |
77 | ||
78 | index = page->index; | |
79 | - hash = hugetlb_fault_mutex_hash(h, current->mm, | |
80 | - &pseudo_vma, | |
81 | - mapping, index, 0); | |
82 | + hash = hugetlb_fault_mutex_hash(h, mapping, index, 0); | |
83 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | |
84 | ||
85 | /* | |
86 | @@ -625,8 +623,7 @@ static long hugetlbfs_fallocate(struct f | |
87 | addr = index * hpage_size; | |
88 | ||
89 | /* mutex taken here, fault path and hole punch */ | |
90 | - hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, | |
91 | - index, addr); | |
92 | + hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); | |
93 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | |
94 | ||
95 | /* See if already present in mapping to avoid alloc/free */ | |
96 | --- a/include/linux/hugetlb.h | |
97 | +++ b/include/linux/hugetlb.h | |
98 | @@ -123,9 +123,7 @@ void move_hugetlb_state(struct page *old | |
99 | void free_huge_page(struct page *page); | |
100 | void hugetlb_fix_reserve_counts(struct inode *inode); | |
101 | extern struct mutex *hugetlb_fault_mutex_table; | |
102 | -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, | |
103 | - struct vm_area_struct *vma, | |
104 | - struct address_space *mapping, | |
105 | +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, | |
106 | pgoff_t idx, unsigned long address); | |
107 | ||
108 | pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); | |
109 | --- a/mm/hugetlb.c | |
110 | +++ b/mm/hugetlb.c | |
111 | @@ -3777,8 +3777,7 @@ retry: | |
112 | * handling userfault. Reacquire after handling | |
113 | * fault to make calling code simpler. | |
114 | */ | |
115 | - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, | |
116 | - idx, haddr); | |
117 | + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); | |
118 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | |
119 | ret = handle_userfault(&vmf, VM_UFFD_MISSING); | |
120 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | |
121 | @@ -3886,21 +3885,14 @@ backout_unlocked: | |
122 | } | |
123 | ||
124 | #ifdef CONFIG_SMP | |
125 | -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, | |
126 | - struct vm_area_struct *vma, | |
127 | - struct address_space *mapping, | |
128 | +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, | |
129 | pgoff_t idx, unsigned long address) | |
130 | { | |
131 | unsigned long key[2]; | |
132 | u32 hash; | |
133 | ||
134 | - if (vma->vm_flags & VM_SHARED) { | |
135 | - key[0] = (unsigned long) mapping; | |
136 | - key[1] = idx; | |
137 | - } else { | |
138 | - key[0] = (unsigned long) mm; | |
139 | - key[1] = address >> huge_page_shift(h); | |
140 | - } | |
141 | + key[0] = (unsigned long) mapping; | |
142 | + key[1] = idx; | |
143 | ||
144 | hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); | |
145 | ||
146 | @@ -3911,9 +3903,7 @@ u32 hugetlb_fault_mutex_hash(struct hsta | |
147 | * For uniprocesor systems we always use a single mutex, so just | |
148 | * return 0 and avoid the hashing overhead. | |
149 | */ | |
150 | -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, | |
151 | - struct vm_area_struct *vma, | |
152 | - struct address_space *mapping, | |
153 | +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, | |
154 | pgoff_t idx, unsigned long address) | |
155 | { | |
156 | return 0; | |
157 | @@ -3958,7 +3948,7 @@ vm_fault_t hugetlb_fault(struct mm_struc | |
158 | * get spurious allocation failures if two CPUs race to instantiate | |
159 | * the same page in the page cache. | |
160 | */ | |
161 | - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); | |
162 | + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); | |
163 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | |
164 | ||
165 | entry = huge_ptep_get(ptep); | |
166 | --- a/mm/userfaultfd.c | |
167 | +++ b/mm/userfaultfd.c | |
168 | @@ -271,8 +271,7 @@ retry: | |
169 | */ | |
170 | idx = linear_page_index(dst_vma, dst_addr); | |
171 | mapping = dst_vma->vm_file->f_mapping; | |
172 | - hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, | |
173 | - idx, dst_addr); | |
174 | + hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr); | |
175 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | |
176 | ||
177 | err = -ENOMEM; |