]>
Commit | Line | Data |
---|---|---|
c9c77b0b GKH |
1 | From a987370f8e7a1677ae385042644326d9cd145a20 Mon Sep 17 00:00:00 2001 |
2 | From: Marc Zyngier <marc.zyngier@arm.com> | |
3 | Date: Tue, 10 Mar 2015 19:06:59 +0000 | |
4 | Subject: arm64: KVM: Fix stage-2 PGD allocation to have per-page refcounting | |
5 | ||
6 | From: Marc Zyngier <marc.zyngier@arm.com> | |
7 | ||
8 | commit a987370f8e7a1677ae385042644326d9cd145a20 upstream. | |
9 | ||
10 | We're using __get_free_pages to allocate the guest's stage-2 | |
11 | PGD. The standard behaviour of this function is to return a set of | |
12 | pages where only the head page has a valid refcount. | |
13 | ||
14 | This behaviour gets us into trouble when we're trying to increment | |
15 | the refcount on a non-head page: | |
16 | ||
17 | page:ffff7c00cfb693c0 count:0 mapcount:0 mapping: (null) index:0x0 | |
18 | flags: 0x4000000000000000() | |
19 | page dumped because: VM_BUG_ON_PAGE((*({ __attribute__((unused)) typeof((&page->_count)->counter) __var = ( typeof((&page->_count)->counter)) 0; (volatile typeof((&page->_count)->counter) *)&((&page->_count)->counter); })) <= 0) | |
20 | BUG: failure at include/linux/mm.h:548/get_page()! | |
21 | Kernel panic - not syncing: BUG! | |
22 | CPU: 1 PID: 1695 Comm: kvm-vcpu-0 Not tainted 4.0.0-rc1+ #3825 | |
23 | Hardware name: APM X-Gene Mustang board (DT) | |
24 | Call trace: | |
25 | [<ffff80000008a09c>] dump_backtrace+0x0/0x13c | |
26 | [<ffff80000008a1e8>] show_stack+0x10/0x1c | |
27 | [<ffff800000691da8>] dump_stack+0x74/0x94 | |
28 | [<ffff800000690d78>] panic+0x100/0x240 | |
29 | [<ffff8000000a0bc4>] stage2_get_pmd+0x17c/0x2bc | |
30 | [<ffff8000000a1dc4>] kvm_handle_guest_abort+0x4b4/0x6b0 | |
31 | [<ffff8000000a420c>] handle_exit+0x58/0x180 | |
32 | [<ffff80000009e7a4>] kvm_arch_vcpu_ioctl_run+0x114/0x45c | |
33 | [<ffff800000099df4>] kvm_vcpu_ioctl+0x2e0/0x754 | |
34 | [<ffff8000001c0a18>] do_vfs_ioctl+0x424/0x5c8 | |
35 | [<ffff8000001c0bfc>] SyS_ioctl+0x40/0x78 | |
36 | CPU0: stopping | |
37 | ||
38 | A possible approach for this is to split the compound page using | |
39 | split_page() at allocation time, and change the teardown path to | |
40 | free one page at a time. It turns out that alloc_pages_exact() and | |
41 | free_pages_exact() do exactly that. | |
42 | ||
43 | While we're at it, the PGD allocation code is reworked to reduce | |
44 | duplication. | |
45 | ||
46 | This has been tested on an X-Gene platform with a 4kB/48bit-VA host | |
47 | kernel, and kvmtool hacked to place memory in the second page of | |
48 | the hardware PGD (PUD for the host kernel). Also regression-tested | |
49 | on a Cubietruck (Cortex-A7). | |
50 | ||
51 | [ Reworked to use alloc_pages_exact() and free_pages_exact() and to | |
52 | return pointers directly instead of by reference as arguments | |
53 | - Christoffer ] | |
54 | ||
55 | Reported-by: Mark Rutland <mark.rutland@arm.com> | |
56 | Signed-off-by: Marc Zyngier <marc.zyngier@arm.com> | |
57 | Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org> | |
58 | Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org> | |
59 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
60 | ||
61 | --- | |
62 | arch/arm/include/asm/kvm_mmu.h | 10 ++--- | |
63 | arch/arm/kvm/mmu.c | 67 ++++++++++++++++++++++++++++----------- | |
64 | arch/arm64/include/asm/kvm_mmu.h | 46 ++------------------------ | |
65 | 3 files changed, 57 insertions(+), 66 deletions(-) | |
66 | ||
67 | --- a/arch/arm/include/asm/kvm_mmu.h | |
68 | +++ b/arch/arm/include/asm/kvm_mmu.h | |
69 | @@ -141,16 +141,14 @@ static inline bool kvm_page_empty(void * | |
70 | ||
71 | #define KVM_PREALLOC_LEVEL 0 | |
72 | ||
73 | -static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd) | |
74 | +static inline void *kvm_get_hwpgd(struct kvm *kvm) | |
75 | { | |
76 | - return 0; | |
77 | + return kvm->arch.pgd; | |
78 | } | |
79 | ||
80 | -static inline void kvm_free_hwpgd(struct kvm *kvm) { } | |
81 | - | |
82 | -static inline void *kvm_get_hwpgd(struct kvm *kvm) | |
83 | +static inline unsigned int kvm_get_hwpgd_size(void) | |
84 | { | |
85 | - return kvm->arch.pgd; | |
86 | + return PTRS_PER_S2_PGD * sizeof(pgd_t); | |
87 | } | |
88 | ||
89 | struct kvm; | |
90 | --- a/arch/arm/kvm/mmu.c | |
91 | +++ b/arch/arm/kvm/mmu.c | |
92 | @@ -593,6 +593,20 @@ int create_hyp_io_mappings(void *from, v | |
93 | __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); | |
94 | } | |
95 | ||
96 | +/* Free the HW pgd, one page at a time */ | |
97 | +static void kvm_free_hwpgd(void *hwpgd) | |
98 | +{ | |
99 | + free_pages_exact(hwpgd, kvm_get_hwpgd_size()); | |
100 | +} | |
101 | + | |
102 | +/* Allocate the HW PGD, making sure that each page gets its own refcount */ | |
103 | +static void *kvm_alloc_hwpgd(void) | |
104 | +{ | |
105 | + unsigned int size = kvm_get_hwpgd_size(); | |
106 | + | |
107 | + return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); | |
108 | +} | |
109 | + | |
110 | /** | |
111 | * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. | |
112 | * @kvm: The KVM struct pointer for the VM. | |
113 | @@ -606,15 +620,31 @@ int create_hyp_io_mappings(void *from, v | |
114 | */ | |
115 | int kvm_alloc_stage2_pgd(struct kvm *kvm) | |
116 | { | |
117 | - int ret; | |
118 | pgd_t *pgd; | |
119 | + void *hwpgd; | |
120 | ||
121 | if (kvm->arch.pgd != NULL) { | |
122 | kvm_err("kvm_arch already initialized?\n"); | |
123 | return -EINVAL; | |
124 | } | |
125 | ||
126 | + hwpgd = kvm_alloc_hwpgd(); | |
127 | + if (!hwpgd) | |
128 | + return -ENOMEM; | |
129 | + | |
130 | + /* When the kernel uses more levels of page tables than the | |
131 | + * guest, we allocate a fake PGD and pre-populate it to point | |
132 | + * to the next-level page table, which will be the real | |
133 | + * initial page table pointed to by the VTTBR. | |
134 | + * | |
135 | + * When KVM_PREALLOC_LEVEL==2, we allocate a single page for | |
136 | + * the PMD and the kernel will use folded pud. | |
137 | + * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD | |
138 | + * pages. | |
139 | + */ | |
140 | if (KVM_PREALLOC_LEVEL > 0) { | |
141 | + int i; | |
142 | + | |
143 | /* | |
144 | * Allocate fake pgd for the page table manipulation macros to | |
145 | * work. This is not used by the hardware and we have no | |
146 | @@ -622,30 +652,32 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm | |
147 | */ | |
148 | pgd = (pgd_t *)kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), | |
149 | GFP_KERNEL | __GFP_ZERO); | |
150 | + | |
151 | + if (!pgd) { | |
152 | + kvm_free_hwpgd(hwpgd); | |
153 | + return -ENOMEM; | |
154 | + } | |
155 | + | |
156 | + /* Plug the HW PGD into the fake one. */ | |
157 | + for (i = 0; i < PTRS_PER_S2_PGD; i++) { | |
158 | + if (KVM_PREALLOC_LEVEL == 1) | |
159 | + pgd_populate(NULL, pgd + i, | |
160 | + (pud_t *)hwpgd + i * PTRS_PER_PUD); | |
161 | + else if (KVM_PREALLOC_LEVEL == 2) | |
162 | + pud_populate(NULL, pud_offset(pgd, 0) + i, | |
163 | + (pmd_t *)hwpgd + i * PTRS_PER_PMD); | |
164 | + } | |
165 | } else { | |
166 | /* | |
167 | * Allocate actual first-level Stage-2 page table used by the | |
168 | * hardware for Stage-2 page table walks. | |
169 | */ | |
170 | - pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, S2_PGD_ORDER); | |
171 | + pgd = (pgd_t *)hwpgd; | |
172 | } | |
173 | ||
174 | - if (!pgd) | |
175 | - return -ENOMEM; | |
176 | - | |
177 | - ret = kvm_prealloc_hwpgd(kvm, pgd); | |
178 | - if (ret) | |
179 | - goto out_err; | |
180 | - | |
181 | kvm_clean_pgd(pgd); | |
182 | kvm->arch.pgd = pgd; | |
183 | return 0; | |
184 | -out_err: | |
185 | - if (KVM_PREALLOC_LEVEL > 0) | |
186 | - kfree(pgd); | |
187 | - else | |
188 | - free_pages((unsigned long)pgd, S2_PGD_ORDER); | |
189 | - return ret; | |
190 | } | |
191 | ||
192 | /** | |
193 | @@ -746,11 +778,10 @@ void kvm_free_stage2_pgd(struct kvm *kvm | |
194 | return; | |
195 | ||
196 | unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); | |
197 | - kvm_free_hwpgd(kvm); | |
198 | + kvm_free_hwpgd(kvm_get_hwpgd(kvm)); | |
199 | if (KVM_PREALLOC_LEVEL > 0) | |
200 | kfree(kvm->arch.pgd); | |
201 | - else | |
202 | - free_pages((unsigned long)kvm->arch.pgd, S2_PGD_ORDER); | |
203 | + | |
204 | kvm->arch.pgd = NULL; | |
205 | } | |
206 | ||
207 | --- a/arch/arm64/include/asm/kvm_mmu.h | |
208 | +++ b/arch/arm64/include/asm/kvm_mmu.h | |
209 | @@ -150,43 +150,6 @@ static inline void kvm_set_s2pmd_writabl | |
210 | #define KVM_PREALLOC_LEVEL (0) | |
211 | #endif | |
212 | ||
213 | -/** | |
214 | - * kvm_prealloc_hwpgd - allocate inital table for VTTBR | |
215 | - * @kvm: The KVM struct pointer for the VM. | |
216 | - * @pgd: The kernel pseudo pgd | |
217 | - * | |
218 | - * When the kernel uses more levels of page tables than the guest, we allocate | |
219 | - * a fake PGD and pre-populate it to point to the next-level page table, which | |
220 | - * will be the real initial page table pointed to by the VTTBR. | |
221 | - * | |
222 | - * When KVM_PREALLOC_LEVEL==2, we allocate a single page for the PMD and | |
223 | - * the kernel will use folded pud. When KVM_PREALLOC_LEVEL==1, we | |
224 | - * allocate 2 consecutive PUD pages. | |
225 | - */ | |
226 | -static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd) | |
227 | -{ | |
228 | - unsigned int i; | |
229 | - unsigned long hwpgd; | |
230 | - | |
231 | - if (KVM_PREALLOC_LEVEL == 0) | |
232 | - return 0; | |
233 | - | |
234 | - hwpgd = __get_free_pages(GFP_KERNEL | __GFP_ZERO, PTRS_PER_S2_PGD_SHIFT); | |
235 | - if (!hwpgd) | |
236 | - return -ENOMEM; | |
237 | - | |
238 | - for (i = 0; i < PTRS_PER_S2_PGD; i++) { | |
239 | - if (KVM_PREALLOC_LEVEL == 1) | |
240 | - pgd_populate(NULL, pgd + i, | |
241 | - (pud_t *)hwpgd + i * PTRS_PER_PUD); | |
242 | - else if (KVM_PREALLOC_LEVEL == 2) | |
243 | - pud_populate(NULL, pud_offset(pgd, 0) + i, | |
244 | - (pmd_t *)hwpgd + i * PTRS_PER_PMD); | |
245 | - } | |
246 | - | |
247 | - return 0; | |
248 | -} | |
249 | - | |
250 | static inline void *kvm_get_hwpgd(struct kvm *kvm) | |
251 | { | |
252 | pgd_t *pgd = kvm->arch.pgd; | |
253 | @@ -203,12 +166,11 @@ static inline void *kvm_get_hwpgd(struct | |
254 | return pmd_offset(pud, 0); | |
255 | } | |
256 | ||
257 | -static inline void kvm_free_hwpgd(struct kvm *kvm) | |
258 | +static inline unsigned int kvm_get_hwpgd_size(void) | |
259 | { | |
260 | - if (KVM_PREALLOC_LEVEL > 0) { | |
261 | - unsigned long hwpgd = (unsigned long)kvm_get_hwpgd(kvm); | |
262 | - free_pages(hwpgd, PTRS_PER_S2_PGD_SHIFT); | |
263 | - } | |
264 | + if (KVM_PREALLOC_LEVEL > 0) | |
265 | + return PTRS_PER_S2_PGD * PAGE_SIZE; | |
266 | + return PTRS_PER_S2_PGD * sizeof(pgd_t); | |
267 | } | |
268 | ||
269 | static inline bool kvm_page_empty(void *ptr) |