From 017b1660df89f5fb4bfe66c34e35f7d2031100c7 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Fri, 5 Oct 2018 15:51:29 -0700
Subject: mm: migration: fix migration of huge PMD shared pages

From: Mike Kravetz <mike.kravetz@oracle.com>

commit 017b1660df89f5fb4bfe66c34e35f7d2031100c7 upstream.

The page migration code employs try_to_unmap() to try and unmap the source
page.  This is accomplished by using rmap_walk to find all vmas where the
page is mapped.  This search stops when page mapcount is zero.  For shared
PMD huge pages, the page map count is always 1 no matter the number of
mappings.  Shared mappings are tracked via the reference count of the PMD
page.  Therefore, try_to_unmap stops prematurely and does not completely
unmap all mappings of the source page.

This problem can result in data corruption as writes to the original
source page can happen after contents of the page are copied to the target
page.  Hence, data is lost.

This problem was originally seen as DB corruption of shared global areas
after a huge page was soft offlined due to ECC memory errors.  DB
developers noticed they could reproduce the issue by (hotplug) offlining
memory used to back huge pages.  A simple testcase can reproduce the
problem by creating a shared PMD mapping (note that this must be at least
PUD_SIZE in size and PUD_SIZE aligned (1GB on x86)), and using
migrate_pages() to migrate process pages between nodes while continually
writing to the huge pages being migrated.
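
A minimal sketch of such a reproducer follows for illustration.  It is not
the original test program: the fixed mapping address, the node numbers and
the use of an anonymous MAP_HUGETLB mapping (any shared hugetlbfs-backed
mapping behaves the same for PMD sharing) are assumptions, and it needs an
x86-64 machine with at least two NUMA nodes and enough 2MB huge pages
reserved (build with gcc -lnuma):

#define _GNU_SOURCE
#include <numaif.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define PUD_SIZE	(1UL << 30)			/* 1GB on x86 */
#define MAP_ADDR	((void *)(0x40UL << 30))	/* PUD_SIZE aligned */

int main(void)
{
	unsigned long node0 = 1UL << 0, node1 = 1UL << 1;
	char *p;

	/* shared, PUD_SIZE sized and PUD_SIZE aligned hugetlb mapping */
	p = mmap(MAP_ADDR, PUD_SIZE, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	memset(p, 0, PUD_SIZE);		/* populate before forking */

	if (fork() == 0) {
		/* child: continually write to the huge pages */
		for (;;)
			memset(p, 0xaa, PUD_SIZE);
	}

	/* parent: bounce the pages between nodes while the child writes */
	for (;;) {
		migrate_pages(0, 64, &node0, &node1);
		migrate_pages(0, 64, &node1, &node0);
	}
}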

To fix, have the try_to_unmap_one routine check for huge PMD sharing by
calling huge_pmd_unshare for hugetlbfs huge pages.  If it is a shared
mapping it will be 'unshared' which removes the page table entry and drops
the reference on the PMD page.  After this, flush caches and TLB.

mmu notifiers are called before locking page tables, but we can not be
sure of PMD sharing until page tables are locked.  Therefore, check for
the possibility of PMD sharing before locking so that notifiers can
prepare for the worst possible case.
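
As a worked example (hypothetical addresses; x86 values of PUD_SIZE = 1GB
and the matching PUD_MASK assumed): for a 2MB huge page at 0x400020000000
in a VM_MAYSHARE vma spanning the whole PUD region
0x400000000000-0x400040000000, the unadjusted invalidation range would be

  start = 0x400020000000
  end   = 0x400020200000    (just the 2MB page)

Because a shared PMD page can map the entire PUD, the range is widened
before mmu_notifier_invalidate_range_start() to

  start = 0x400020000000 & PUD_MASK = 0x400000000000
  end   = start + PUD_SIZE          = 0x400040000000

so the notifiers, cache flush and TLB flush cover everything a shared PMD
could have mapped.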

Link: http://lkml.kernel.org/r/20180823205917.16297-2-mike.kravetz@oracle.com
[mike.kravetz@oracle.com: make _range_in_vma() a static inline]
Link: http://lkml.kernel.org/r/6063f215-a5c8-2f0c-465a-2c515ddc952d@oracle.com
Fixes: 39dde65c9940 ("shared page table for hugetlb page")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 include/linux/hugetlb.h |   14 ++++++++++++++
 include/linux/mm.h      |    6 ++++++
 mm/hugetlb.c            |   37 +++++++++++++++++++++++++++++++++++--
 mm/rmap.c               |   42 +++++++++++++++++++++++++++++++++++++++---
 4 files changed, 94 insertions(+), 5 deletions(-)

--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -140,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *
 pte_t *huge_pte_offset(struct mm_struct *mm,
		       unsigned long addr, unsigned long sz);
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+				unsigned long *start, unsigned long *end);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
			      int write);
 struct page *follow_huge_pd(struct vm_area_struct *vma,
@@ -170,6 +172,18 @@ static inline unsigned long hugetlb_tota
	return 0;
 }
 
+static inline int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr,
+					pte_t *ptep)
+{
+	return 0;
+}
+
+static inline void adjust_range_if_pmd_sharing_possible(
+				struct vm_area_struct *vma,
+				unsigned long *start, unsigned long *end)
+{
+}
+
 #define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n)	({ BUG(); 0; })
 #define follow_huge_addr(mm, addr, write)	ERR_PTR(-EINVAL)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2463,6 +2463,12 @@ static inline struct vm_area_struct *fin
	return vma;
 }
 
+static inline bool range_in_vma(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	return (vma && vma->vm_start <= start && end <= vma->vm_end);
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 void vma_set_page_prot(struct vm_area_struct *vma);
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4556,13 +4556,41 @@ static bool vma_shareable(struct vm_area
	/*
	 * check on proper vm_flags and page table alignment
	 */
-	if (vma->vm_flags & VM_MAYSHARE &&
-	    vma->vm_start <= base && end <= vma->vm_end)
+	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
		return true;
	return false;
 }
 
 /*
+ * Determine if start,end range within vma could be mapped by shared pmd.
+ * If yes, adjust start and end to cover range associated with possible
+ * shared pmd mappings.
+ */
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+				unsigned long *start, unsigned long *end)
+{
+	unsigned long check_addr = *start;
+
+	if (!(vma->vm_flags & VM_MAYSHARE))
+		return;
+
+	for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
+		unsigned long a_start = check_addr & PUD_MASK;
+		unsigned long a_end = a_start + PUD_SIZE;
+
+		/*
+		 * If sharing is possible, adjust start/end if necessary.
+		 */
+		if (range_in_vma(vma, a_start, a_end)) {
+			if (a_start < *start)
+				*start = a_start;
+			if (a_end > *end)
+				*end = a_end;
+		}
+	}
+}
+
+/*
  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
@@ -4659,6 +4687,11 @@ int huge_pmd_unshare(struct mm_struct *m
 {
	return 0;
 }
+
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+				unsigned long *start, unsigned long *end)
+{
+}
 #define want_pmd_share()	(0)
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page
	}
 
	/*
-	 * We have to assume the worse case ie pmd for invalidation. Note that
-	 * the page can not be free in this function as call of try_to_unmap()
-	 * must hold a reference on the page.
+	 * For THP, we have to assume the worse case ie pmd for invalidation.
+	 * For hugetlb, it could be much worse if we need to do pud
+	 * invalidation in the case of pmd sharing.
+	 *
+	 * Note that the page can not be free in this function as call of
+	 * try_to_unmap() must hold a reference on the page.
	 */
	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+	if (PageHuge(page)) {
+		/*
+		 * If sharing is possible, start and end will be adjusted
+		 * accordingly.
+		 */
+		adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+	}
	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
 
	while (page_vma_mapped_walk(&pvmw)) {
@@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page
		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
		address = pvmw.address;
 
+		if (PageHuge(page)) {
+			if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+				/*
+				 * huge_pmd_unshare unmapped an entire PMD
+				 * page.  There is no way of knowing exactly
+				 * which PMDs may be cached for this mm, so
+				 * we must flush them all.  start/end were
+				 * already adjusted above to cover this range.
+				 */
+				flush_cache_range(vma, start, end);
+				flush_tlb_range(vma, start, end);
+				mmu_notifier_invalidate_range(mm, start, end);
+
+				/*
+				 * The ref count of the PMD page was dropped
+				 * which is part of the way map counting
+				 * is done for shared PMDs.  Return 'true'
+				 * here.  When there is no other sharing,
+				 * huge_pmd_unshare returns false and we will
+				 * unmap the actual page and drop map count
+				 * to zero.
+				 */
+				page_vma_mapped_walk_done(&pvmw);
+				break;
+			}
+		}
 
		if (IS_ENABLED(CONFIG_MIGRATION) &&
		    (flags & TTU_MIGRATION) &&