From: jbeulich@novell.com
Subject: eliminate scalability issues from initial mapping setup
Patch-mainline: obsolete
References: bnc#417417

Direct Xen to place the initial P->M table outside of the initial
mapping, as otherwise the 1G (implementation) / 2G (theoretical)
restriction on the size of the initial mapping limits the amount
of memory a domain can be handed initially.

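For a rough sense of scale (illustrative numbers, not taken from the
patch): with the usual 8-byte P->M entries on x86-64, a guest with N
pages needs N * sizeof(unsigned long) bytes of P->M table inside the
initial mapping, e.g.

	512 GiB guest = 2^27 pages -> 2^27 * 8 bytes = 1 GiB of P->M table

so the table alone would already exhaust a 1G initial mapping, before
the kernel image, initrd and bootstrap page tables are accounted for.
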
Note that the flags passed to HYPERVISOR_update_va_mapping() from
__make_page_writable() and make_lowmem_page_writable() intentionally
do not include UVMF_ALL. This is intended to be an optimal choice
between the overhead of a potential spurious page fault (as remote
CPUs may still have read-only translations in their TLBs) and the
overhead of cross-processor flushes. Flushing on the local CPU
shouldn't be as expensive (and hence can be viewed as an optimization
avoiding the spurious page fault on the local CPU), but is required
when the functions are used before the page fault handler gets set up.

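As an illustrative sketch (not literal patch content), the two
candidate calls differ only in the flush scope requested from the
hypervisor:

	/* Local INVLPG only; remote CPUs may take one spurious fault. */
	HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG);

	/* Flush on all CPUs; pays a cross-processor flush every time. */
	HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte),
				     UVMF_INVLPG | UVMF_ALL);

The patch deliberately uses the former.
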
--- sle11-2009-07-31.orig/arch/x86/kernel/head64-xen.c	2009-02-16 16:49:32.000000000 +0100
+++ sle11-2009-07-31/arch/x86/kernel/head64-xen.c	2009-03-16 16:40:54.000000000 +0100
@@ -171,6 +171,14 @@ void __init x86_64_start_reservations(ch
 			+ (xen_start_info->nr_pt_frames << PAGE_SHIFT),
 		      "Xen provided");
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		xen_start_info->mfn_list = ~0UL;
+	else if (xen_start_info->mfn_list < __START_KERNEL_map)
+		reserve_early(xen_start_info->first_p2m_pfn << PAGE_SHIFT,
+			      (xen_start_info->first_p2m_pfn
+			       + xen_start_info->nr_p2m_frames) << PAGE_SHIFT,
+			      "INITP2M");
+
 	/*
 	 * At this point everything still needed from the boot loader
 	 * or BIOS or kernel text should be early reserved or marked not
--- sle11-2009-07-31.orig/arch/x86/kernel/head_64-xen.S	2009-03-16 16:40:52.000000000 +0100
+++ sle11-2009-07-31/arch/x86/kernel/head_64-xen.S	2009-03-16 16:40:54.000000000 +0100
@@ -18,6 +18,7 @@
 #include <asm/desc.h>
 #include <asm/segment.h>
 #include <asm/page.h>
+#include <asm/pgtable.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
 #include <asm/dwarf2.h>
@@ -135,6 +136,7 @@ ENTRY(empty_zero_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT)
+	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
--- sle11-2009-07-31.orig/arch/x86/kernel/setup-xen.c	2009-07-31 15:14:31.000000000 +0200
+++ sle11-2009-07-31/arch/x86/kernel/setup-xen.c	2009-07-31 15:14:44.000000000 +0200
@@ -1021,7 +1021,7 @@ void __init setup_arch(char **cmdline_p)
 		difference = xen_start_info->nr_pages - max_pfn;
 
 		set_xen_guest_handle(reservation.extent_start,
-				     ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+				     phys_to_machine_mapping + max_pfn);
 		reservation.nr_extents = difference;
 		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
 					   &reservation);
@@ -1038,14 +1038,86 @@ void __init setup_arch(char **cmdline_p)
 		phys_to_machine_mapping = alloc_bootmem_pages(
 			max_pfn * sizeof(unsigned long));
 		memcpy(phys_to_machine_mapping,
-		       (unsigned long *)xen_start_info->mfn_list,
+		       __va(__pa(xen_start_info->mfn_list)),
 		       p2m_pages * sizeof(unsigned long));
 		memset(phys_to_machine_mapping + p2m_pages, ~0,
 		       (max_pfn - p2m_pages) * sizeof(unsigned long));
-		free_bootmem(
-			__pa(xen_start_info->mfn_list),
-			PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
-					sizeof(unsigned long))));
+
+#ifdef CONFIG_X86_64
+		if (xen_start_info->mfn_list == VMEMMAP_START) {
+			/*
+			 * Since it is well isolated we can (and since it is
+			 * perhaps large we should) also free the page tables
+			 * mapping the initial P->M table.
+			 */
+			unsigned long va = VMEMMAP_START, pa;
+			pgd_t *pgd = pgd_offset_k(va);
+			pud_t *pud_page = pud_offset(pgd, 0);
+
+			BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
+			xen_l4_entry_update(pgd, __pgd(0));
+			for (;;) {
+				pud_t *pud = pud_page + pud_index(va);
+
+				if (pud_none(*pud))
+					va += PUD_SIZE;
+				else if (pud_large(*pud)) {
+					pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+					make_pages_writable(__va(pa),
+						PUD_SIZE >> PAGE_SHIFT,
+						XENFEAT_writable_page_tables);
+					free_bootmem(pa, PUD_SIZE);
+					va += PUD_SIZE;
+				} else {
+					pmd_t *pmd = pmd_offset(pud, va);
+
+					if (pmd_large(*pmd)) {
+						pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+						make_pages_writable(__va(pa),
+							PMD_SIZE >> PAGE_SHIFT,
+							XENFEAT_writable_page_tables);
+						free_bootmem(pa, PMD_SIZE);
+					} else if (!pmd_none(*pmd)) {
+						pte_t *pte = pte_offset_kernel(pmd, va);
+
+						for (i = 0; i < PTRS_PER_PTE; ++i) {
+							if (pte_none(pte[i]))
+								break;
+							pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+							make_page_writable(__va(pa),
+								XENFEAT_writable_page_tables);
+							free_bootmem(pa, PAGE_SIZE);
+						}
+						ClearPagePinned(virt_to_page(pte));
+						make_page_writable(pte,
+							XENFEAT_writable_page_tables);
+						free_bootmem(__pa(pte), PAGE_SIZE);
+					}
+					va += PMD_SIZE;
+					if (pmd_index(va))
+						continue;
+					ClearPagePinned(virt_to_page(pmd));
+					make_page_writable(pmd,
+						XENFEAT_writable_page_tables);
+					free_bootmem(__pa((unsigned long)pmd
+							  & PAGE_MASK),
+						     PAGE_SIZE);
+				}
+				if (!pud_index(va))
+					break;
+			}
+			ClearPagePinned(virt_to_page(pud_page));
+			make_page_writable(pud_page,
+					   XENFEAT_writable_page_tables);
+			free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
+				     PAGE_SIZE);
+		} else if (!WARN_ON(xen_start_info->mfn_list
+				    < __START_KERNEL_map))
+#endif
+			free_bootmem(__pa(xen_start_info->mfn_list),
+				     PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+						     sizeof(unsigned long))));
+
 
 		/*
 		 * Initialise the list of the frames that specify the list of
--- sle11-2009-07-31.orig/arch/x86/mm/init_64-xen.c	2009-03-16 16:40:52.000000000 +0100
+++ sle11-2009-07-31/arch/x86/mm/init_64-xen.c	2009-03-16 16:40:54.000000000 +0100
@@ -157,6 +157,17 @@ static unsigned long __meminitdata table
 static unsigned long __meminitdata table_cur;
 static unsigned long __meminitdata table_top;
 
+static __init unsigned long get_table_cur(void)
+{
+	BUG_ON(!table_cur);
+	if (xen_start_info->mfn_list < __START_KERNEL_map
+	    && table_cur == xen_start_info->first_p2m_pfn) {
+		table_cur += xen_start_info->nr_p2m_frames;
+		table_top += xen_start_info->nr_p2m_frames;
+	}
+	return table_cur++;
+}
+
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -168,8 +179,7 @@ static __ref void *spp_getpage(void)
 	if (after_bootmem)
 		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
 	else if (table_cur < table_top) {
-		ptr = __va(table_cur << PAGE_SHIFT);
-		table_cur++;
+		ptr = __va(get_table_cur() << PAGE_SHIFT);
 		memset(ptr, 0, PAGE_SIZE);
 	} else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -334,8 +344,7 @@ static __ref void *alloc_low_page(unsign
 		return adr;
 	}
 
-	BUG_ON(!table_cur);
-	pfn = table_cur++;
+	pfn = get_table_cur();
 	if (pfn >= table_top)
 		panic("alloc_low_page: ran out of memory");
 
@@ -361,14 +370,29 @@ static inline int __meminit make_readonl
 	/* Make new page tables read-only on the first pass. */
 	if (!xen_feature(XENFEAT_writable_page_tables)
 	    && !max_pfn_mapped
-	    && (paddr >= (table_start << PAGE_SHIFT))
-	    && (paddr < (table_top << PAGE_SHIFT)))
-		readonly = 1;
+	    && (paddr >= (table_start << PAGE_SHIFT))) {
+		unsigned long top = table_top;
+
+		/* Account for the range get_table_cur() skips. */
+		if (xen_start_info->mfn_list < __START_KERNEL_map
+		    && table_cur <= xen_start_info->first_p2m_pfn
+		    && top > xen_start_info->first_p2m_pfn)
+			top += xen_start_info->nr_p2m_frames;
+		if (paddr < (top << PAGE_SHIFT))
+			readonly = 1;
+	}
 	/* Make old page tables read-only. */
 	if (!xen_feature(XENFEAT_writable_page_tables)
 	    && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
 	    && (paddr < (table_cur << PAGE_SHIFT)))
 		readonly = 1;
+	/* Make P->M table (and its page tables) read-only. */
+	if (!xen_feature(XENFEAT_writable_page_tables)
+	    && xen_start_info->mfn_list < __START_KERNEL_map
+	    && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
+	    && paddr < (xen_start_info->first_p2m_pfn
+			+ xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
+		readonly = 1;
 
 	/*
	 * No need for writable mapping of kernel image. This also ensures that
@@ -616,6 +640,12 @@ void __init xen_init_pt(void)
 		__pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
 	memcpy(level2_kernel_pgt, page, PAGE_SIZE);
 
+	/* Copy the initial P->M table mappings if necessary. */
+	addr = pgd_index(xen_start_info->mfn_list);
+	if (addr < pgd_index(__START_KERNEL_map))
+		init_level4_pgt[addr] =
+			((pgd_t *)xen_start_info->pt_base)[addr];
+
 	/* Do an early initialization of the fixmap area. */
 	addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
 	level3_kernel_pgt[pud_index(addr)] =
@@ -676,22 +706,28 @@ static void __init find_early_table_spac
 static void __init xen_finish_init_mapping(void)
 {
 	unsigned long i, start, end;
+	struct mmuext_op mmuext;
 
 	/* Re-vector virtual addresses pointing into the initial
 	   mapping to the just-established permanent ones. */
 	xen_start_info = __va(__pa(xen_start_info));
 	xen_start_info->pt_base = (unsigned long)
 		__va(__pa(xen_start_info->pt_base));
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (!xen_feature(XENFEAT_auto_translated_physmap)
+	    && xen_start_info->mfn_list >= __START_KERNEL_map)
 		phys_to_machine_mapping =
 			__va(__pa(xen_start_info->mfn_list));
-		xen_start_info->mfn_list = (unsigned long)
-			phys_to_machine_mapping;
-	}
 	if (xen_start_info->mod_start)
 		xen_start_info->mod_start = (unsigned long)
 			__va(__pa(xen_start_info->mod_start));
 
+	/* Unpin the no longer used Xen provided page tables. */
+	mmuext.cmd = MMUEXT_UNPIN_TABLE;
+	mmuext.arg1.mfn = pfn_to_mfn(__pa(xen_start_info->pt_base)
+				     >> PAGE_SHIFT);
+	if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
+		BUG();
+
 	/* Destroy the Xen-created mappings beyond the kernel image. */
 	start = PAGE_ALIGN((unsigned long)_end);
 	end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
@@ -948,9 +984,20 @@ unsigned long __init_refok init_memory_m
 
 	__flush_tlb_all();
 
-	if (!after_bootmem && table_top > table_start)
+	if (!after_bootmem && table_top > table_start) {
+		if (xen_start_info->mfn_list < __START_KERNEL_map
+		    && table_start <= xen_start_info->first_p2m_pfn
+		    && table_top > xen_start_info->first_p2m_pfn) {
+			reserve_early(table_start << PAGE_SHIFT,
+				      xen_start_info->first_p2m_pfn
+				      << PAGE_SHIFT,
+				      "PGTABLE");
+			table_start = xen_start_info->first_p2m_pfn
+				      + xen_start_info->nr_p2m_frames;
+		}
 		reserve_early(table_start << PAGE_SHIFT,
 			      table_top << PAGE_SHIFT, "PGTABLE");
+	}
 
 	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
 	       last_map_addr, end);
--- sle11-2009-07-31.orig/arch/x86/mm/pageattr-xen.c	2009-06-29 15:42:17.000000000 +0200
+++ sle11-2009-07-31/arch/x86/mm/pageattr-xen.c	2009-06-29 15:46:57.000000000 +0200
@@ -1262,7 +1262,7 @@ static void __make_page_writable(unsigne
 
 	pte = lookup_address(va, &level);
 	BUG_ON(!pte || level != PG_LEVEL_4K);
-	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
+	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
 		BUG();
 	if (in_secondary_range(va)) {
 		unsigned long pfn = pte_pfn(*pte);
--- sle11-2009-07-31.orig/arch/x86/mm/pgtable-xen.c	2009-04-09 14:54:18.000000000 +0200
+++ sle11-2009-07-31/arch/x86/mm/pgtable-xen.c	2009-03-16 16:40:54.000000000 +0100
@@ -323,7 +323,7 @@ void __init xen_init_pgd_pin(void)
 		if (PTRS_PER_PUD > 1) /* not folded */
 			SetPagePinned(virt_to_page(pud));
 		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-			if (!pud_present(*pud))
+			if (!pud_present(*pud) || pud_large(*pud))
 				continue;
 			pmd = pmd_offset(pud, 0);
 			if (PTRS_PER_PMD > 1) /* not folded */
@@ -334,7 +334,7 @@ void __init xen_init_pgd_pin(void)
 			    && m >= pmd_index(HYPERVISOR_VIRT_START))
 				continue;
 #endif
-			if (!pmd_present(*pmd))
+			if (!pmd_present(*pmd) || pmd_large(*pmd))
 				continue;
 			SetPagePinned(pmd_page(*pmd));
 		}
--- sle11-2009-07-31.orig/arch/x86/mm/pgtable_32-xen.c	2009-06-04 10:21:39.000000000 +0200
+++ sle11-2009-07-31/arch/x86/mm/pgtable_32-xen.c	2009-03-16 16:40:54.000000000 +0100
@@ -188,6 +188,6 @@ void make_lowmem_page_writable(void *va,
 	pte = lookup_address((unsigned long)va, &level);
 	BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
 	rc = HYPERVISOR_update_va_mapping(
-		(unsigned long)va, pte_mkwrite(*pte), 0);
+		(unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
 	BUG_ON(rc);
 }
--- sle11-2009-07-31.orig/include/xen/interface/elfnote.h	2008-11-25 12:35:56.000000000 +0100
+++ sle11-2009-07-31/include/xen/interface/elfnote.h	2009-03-16 16:40:54.000000000 +0100
@@ -162,9 +162,20 @@
 #define XEN_ELFNOTE_SUSPEND_CANCEL 14
 
 /*
+ * The (non-default) location the initial phys-to-machine map should be
+ * placed at by the hypervisor (Dom0) or the tools (DomU).
+ * The kernel must be prepared for this mapping to be established using
+ * large pages, despite such otherwise not being available to guests.
+ * The kernel must also be prepared that the page table pages used for
+ * this mapping may not be accessible through the initial mapping.
+ * (Only x86-64 supports this at present.)
+ */
+#define XEN_ELFNOTE_INIT_P2M 15
+
+/*
  * The number of the highest elfnote defined.
  */
-#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M
 
 /*
  * System information exported through crash notes.
--- sle11-2009-07-31.orig/include/xen/interface/xen.h	2009-05-14 11:17:48.000000000 +0200
+++ sle11-2009-07-31/include/xen/interface/xen.h	2009-03-16 16:40:54.000000000 +0100
@@ -536,6 +536,7 @@ typedef struct shared_info shared_info_t
 *  a. relocated kernel image
 *  b. initial ram disk              [mod_start, mod_len]
 *  c. list of allocated page frames [mfn_list, nr_pages]
+ *     (unless relocated due to XEN_ELFNOTE_INIT_P2M)
 *  d. start_info_t structure        [register ESI (x86)]
 *  e. bootstrap page tables         [pt_base, CR3 (x86)]
 *  f. bootstrap stack               [register ESP (x86)]
@@ -577,6 +578,9 @@ struct start_info {
 	unsigned long mod_start;    /* VIRTUAL address of pre-loaded module. */
 	unsigned long mod_len;      /* Size (bytes) of pre-loaded module.    */
 	int8_t cmd_line[MAX_GUEST_CMDLINE];
+	/* The pfn range here covers both page table and p->m table frames. */
+	unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.   */
+	unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */
 };
 typedef struct start_info start_info_t;