From: jbeulich@novell.com
Subject: eliminate scalability issues from initial mapping setup
Patch-mainline: obsolete
References: bnc#417417

Direct Xen to place the initial P->M table outside of the initial
mapping, as otherwise the 1G (implementation) / 2G (theoretical)
restriction on the size of the initial mapping limits the amount
of memory a domain can be handed initially.
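
As an illustrative aside (not part of the patch): when the hypervisor
honors XEN_ELFNOTE_INIT_P2M, it places the initial P->M table at
VMEMMAP_START, which lies below __START_KERNEL_map, so the guest can
detect the relocation with a simple comparison. The helper name below
is hypothetical; the patch open-codes this check at each use site.

static inline bool init_p2m_relocated(void)
{
	/* Relocated P->M table sits in the vmemmap area, below the
	   initial (kernel) mapping; the default location is within
	   the initial mapping, at or above __START_KERNEL_map. */
	return xen_start_info->mfn_list < __START_KERNEL_map;
}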

Note that the flags passed to HYPERVISOR_update_va_mapping() from
__make_page_writable() and make_lowmem_page_writable() intentionally
do not include UVMF_ALL. This is intended to be an optimal choice
between the overhead of a potential spurious page fault (as remote
CPUs may still have read-only translations in their TLBs) and the
overhead of cross processor flushes. Flushing on the local CPU
shouldn't be as expensive (and hence can be viewed as an optimization
avoiding the spurious page fault on the local CPU), but is required
when the functions are used before the page fault handler gets set up.
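
To make the trade-off concrete, the call pattern these helpers switch
to looks like the sketch below (mirroring the __make_page_writable()
hunk further down; the variable declarations elided by the diff
context are spelled out here). UVMF_INVLPG flushes only the local
CPU's TLB entry, whereas additionally passing UVMF_ALL would broadcast
the flush to every CPU at the cost of cross processor traffic.

	unsigned int level;
	pte_t *pte = lookup_address(va, &level);

	BUG_ON(!pte || level != PG_LEVEL_4K);
	/* Write-enable the PTE; flush only the local TLB entry. */
	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte),
					 UVMF_INVLPG))
		BUG();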

--- sle11-2009-05-14.orig/arch/x86/kernel/head64-xen.c 2009-02-16 16:49:32.000000000 +0100
+++ sle11-2009-05-14/arch/x86/kernel/head64-xen.c 2009-03-16 16:40:54.000000000 +0100
@@ -171,6 +171,14 @@ void __init x86_64_start_reservations(ch
+ (xen_start_info->nr_pt_frames << PAGE_SHIFT),
"Xen provided");

+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ xen_start_info->mfn_list = ~0UL;
+ else if (xen_start_info->mfn_list < __START_KERNEL_map)
+ reserve_early(xen_start_info->first_p2m_pfn << PAGE_SHIFT,
+ (xen_start_info->first_p2m_pfn
+ + xen_start_info->nr_p2m_frames) << PAGE_SHIFT,
+ "INITP2M");
+
/*
* At this point everything still needed from the boot loader
* or BIOS or kernel text should be early reserved or marked not
--- sle11-2009-05-14.orig/arch/x86/kernel/head_64-xen.S 2009-03-16 16:40:52.000000000 +0100
+++ sle11-2009-05-14/arch/x86/kernel/head_64-xen.S 2009-03-16 16:40:54.000000000 +0100
@@ -18,6 +18,7 @@
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/page.h>
+#include <asm/pgtable.h>
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/dwarf2.h>
@@ -135,6 +136,7 @@ ENTRY(empty_zero_page)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT)
+ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
--- sle11-2009-05-14.orig/arch/x86/kernel/setup-xen.c 2009-02-16 17:05:16.000000000 +0100
+++ sle11-2009-05-14/arch/x86/kernel/setup-xen.c 2009-03-16 16:40:54.000000000 +0100
@@ -1022,7 +1022,7 @@ void __init setup_arch(char **cmdline_p)
difference = xen_start_info->nr_pages - max_pfn;

set_xen_guest_handle(reservation.extent_start,
- ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+ phys_to_machine_mapping + max_pfn);
reservation.nr_extents = difference;
ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
&reservation);
@@ -1039,14 +1039,86 @@ void __init setup_arch(char **cmdline_p)
phys_to_machine_mapping = alloc_bootmem_pages(
max_pfn * sizeof(unsigned long));
memcpy(phys_to_machine_mapping,
- (unsigned long *)xen_start_info->mfn_list,
+ __va(__pa(xen_start_info->mfn_list)),
p2m_pages * sizeof(unsigned long));
memset(phys_to_machine_mapping + p2m_pages, ~0,
(max_pfn - p2m_pages) * sizeof(unsigned long));
- free_bootmem(
- __pa(xen_start_info->mfn_list),
- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
- sizeof(unsigned long))));
+
+#ifdef CONFIG_X86_64
+ if (xen_start_info->mfn_list == VMEMMAP_START) {
+ /*
+ * Since it is well isolated we can (and since it is
+ * perhaps large we should) also free the page tables
+ * mapping the initial P->M table.
+ */
+ unsigned long va = VMEMMAP_START, pa;
+ pgd_t *pgd = pgd_offset_k(va);
+ pud_t *pud_page = pud_offset(pgd, 0);
+
+ BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
+ xen_l4_entry_update(pgd, __pgd(0));
+ for(;;) {
+ pud_t *pud = pud_page + pud_index(va);
+
+ if (pud_none(*pud))
+ va += PUD_SIZE;
+ else if (pud_large(*pud)) {
+ pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+ make_pages_writable(__va(pa),
+ PUD_SIZE >> PAGE_SHIFT,
+ XENFEAT_writable_page_tables);
+ free_bootmem(pa, PUD_SIZE);
+ va += PUD_SIZE;
+ } else {
+ pmd_t *pmd = pmd_offset(pud, va);
+
+ if (pmd_large(*pmd)) {
+ pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+ make_pages_writable(__va(pa),
+ PMD_SIZE >> PAGE_SHIFT,
+ XENFEAT_writable_page_tables);
+ free_bootmem(pa, PMD_SIZE);
+ } else if (!pmd_none(*pmd)) {
+ pte_t *pte = pte_offset_kernel(pmd, va);
+
+ for (i = 0; i < PTRS_PER_PTE; ++i) {
+ if (pte_none(pte[i]))
+ break;
+ pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+ make_page_writable(__va(pa),
+ XENFEAT_writable_page_tables);
+ free_bootmem(pa, PAGE_SIZE);
+ }
+ ClearPagePinned(virt_to_page(pte));
+ make_page_writable(pte,
+ XENFEAT_writable_page_tables);
+ free_bootmem(__pa(pte), PAGE_SIZE);
+ }
+ va += PMD_SIZE;
+ if (pmd_index(va))
+ continue;
+ ClearPagePinned(virt_to_page(pmd));
+ make_page_writable(pmd,
+ XENFEAT_writable_page_tables);
+ free_bootmem(__pa((unsigned long)pmd
+ & PAGE_MASK),
+ PAGE_SIZE);
+ }
+ if (!pud_index(va))
+ break;
+ }
+ ClearPagePinned(virt_to_page(pud_page));
+ make_page_writable(pud_page,
+ XENFEAT_writable_page_tables);
+ free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
+ PAGE_SIZE);
+ } else if (!WARN_ON(xen_start_info->mfn_list
+ < __START_KERNEL_map))
+#endif
+ free_bootmem(__pa(xen_start_info->mfn_list),
+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+ sizeof(unsigned long))));
+

/*
* Initialise the list of the frames that specify the list of
--- sle11-2009-05-14.orig/arch/x86/mm/init_64-xen.c 2009-03-16 16:40:52.000000000 +0100
+++ sle11-2009-05-14/arch/x86/mm/init_64-xen.c 2009-03-16 16:40:54.000000000 +0100
@@ -157,6 +157,17 @@ static unsigned long __meminitdata table
static unsigned long __meminitdata table_cur;
static unsigned long __meminitdata table_top;

+static __init unsigned long get_table_cur(void)
+{
+ BUG_ON(!table_cur);
+ if (xen_start_info->mfn_list < __START_KERNEL_map
+ && table_cur == xen_start_info->first_p2m_pfn) {
+ table_cur += xen_start_info->nr_p2m_frames;
+ table_top += xen_start_info->nr_p2m_frames;
+ }
+ return table_cur++;
+}
+
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -168,8 +179,7 @@ static __ref void *spp_getpage(void)
if (after_bootmem)
ptr = (void *) get_zeroed_page(GFP_ATOMIC);
else if (table_cur < table_top) {
- ptr = __va(table_cur << PAGE_SHIFT);
- table_cur++;
+ ptr = __va(get_table_cur() << PAGE_SHIFT);
memset(ptr, 0, PAGE_SIZE);
} else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -334,8 +344,7 @@ static __ref void *alloc_low_page(unsign
return adr;
}

- BUG_ON(!table_cur);
- pfn = table_cur++;
+ pfn = get_table_cur();
if (pfn >= table_top)
panic("alloc_low_page: ran out of memory");

@@ -361,14 +370,29 @@ static inline int __meminit make_readonl
/* Make new page tables read-only on the first pass. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& !max_pfn_mapped
- && (paddr >= (table_start << PAGE_SHIFT))
- && (paddr < (table_top << PAGE_SHIFT)))
- readonly = 1;
+ && (paddr >= (table_start << PAGE_SHIFT))) {
+ unsigned long top = table_top;
+
+ /* Account for the range get_table_cur() skips. */
+ if (xen_start_info->mfn_list < __START_KERNEL_map
+ && table_cur <= xen_start_info->first_p2m_pfn
+ && top > xen_start_info->first_p2m_pfn)
+ top += xen_start_info->nr_p2m_frames;
+ if (paddr < (top << PAGE_SHIFT))
+ readonly = 1;
+ }
/* Make old page tables read-only. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
&& (paddr < (table_cur << PAGE_SHIFT)))
readonly = 1;
+ /* Make P->M table (and its page tables) read-only. */
+ if (!xen_feature(XENFEAT_writable_page_tables)
+ && xen_start_info->mfn_list < __START_KERNEL_map
+ && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
+ && paddr < (xen_start_info->first_p2m_pfn
+ + xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
+ readonly = 1;

/*
* No need for writable mapping of kernel image. This also ensures that
@@ -616,6 +640,12 @@ void __init xen_init_pt(void)
__pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
memcpy(level2_kernel_pgt, page, PAGE_SIZE);

+ /* Copy the initial P->M table mappings if necessary. */
+ addr = pgd_index(xen_start_info->mfn_list);
+ if (addr < pgd_index(__START_KERNEL_map))
+ init_level4_pgt[addr] =
+ ((pgd_t *)xen_start_info->pt_base)[addr];
+
/* Do an early initialization of the fixmap area. */
addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
level3_kernel_pgt[pud_index(addr)] =
@@ -676,22 +706,28 @@ static void __init find_early_table_spac
static void __init xen_finish_init_mapping(void)
{
unsigned long i, start, end;
+ struct mmuext_op mmuext;

/* Re-vector virtual addresses pointing into the initial
mapping to the just-established permanent ones. */
xen_start_info = __va(__pa(xen_start_info));
xen_start_info->pt_base = (unsigned long)
__va(__pa(xen_start_info->pt_base));
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (!xen_feature(XENFEAT_auto_translated_physmap)
+ && xen_start_info->mfn_list >= __START_KERNEL_map)
phys_to_machine_mapping =
__va(__pa(xen_start_info->mfn_list));
- xen_start_info->mfn_list = (unsigned long)
- phys_to_machine_mapping;
- }
if (xen_start_info->mod_start)
xen_start_info->mod_start = (unsigned long)
__va(__pa(xen_start_info->mod_start));

+ /* Unpin the no longer used Xen provided page tables. */
+ mmuext.cmd = MMUEXT_UNPIN_TABLE;
+ mmuext.arg1.mfn = pfn_to_mfn(__pa(xen_start_info->pt_base)
+ >> PAGE_SHIFT);
+ if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
+ BUG();
+
/* Destroy the Xen-created mappings beyond the kernel image. */
start = PAGE_ALIGN((unsigned long)_end);
end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
@@ -948,9 +984,20 @@ unsigned long __init_refok init_memory_m

__flush_tlb_all();

- if (!after_bootmem && table_top > table_start)
+ if (!after_bootmem && table_top > table_start) {
+ if (xen_start_info->mfn_list < __START_KERNEL_map
+ && table_start <= xen_start_info->first_p2m_pfn
+ && table_top > xen_start_info->first_p2m_pfn) {
+ reserve_early(table_start << PAGE_SHIFT,
+ xen_start_info->first_p2m_pfn
+ << PAGE_SHIFT,
+ "PGTABLE");
+ table_start = xen_start_info->first_p2m_pfn
+ + xen_start_info->nr_p2m_frames;
+ }
reserve_early(table_start << PAGE_SHIFT,
table_top << PAGE_SHIFT, "PGTABLE");
+ }

printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
last_map_addr, end);
--- sle11-2009-05-14.orig/arch/x86/mm/pageattr-xen.c 2009-03-16 16:40:01.000000000 +0100
+++ sle11-2009-05-14/arch/x86/mm/pageattr-xen.c 2009-03-16 16:40:54.000000000 +0100
@@ -1251,7 +1251,7 @@ static void __make_page_writable(unsigne

pte = lookup_address(va, &level);
BUG_ON(!pte || level != PG_LEVEL_4K);
- if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
+ if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
BUG();
if (in_secondary_range(va)) {
unsigned long pfn = pte_pfn(*pte);
--- sle11-2009-05-14.orig/arch/x86/mm/pgtable-xen.c 2009-04-09 14:54:18.000000000 +0200
+++ sle11-2009-05-14/arch/x86/mm/pgtable-xen.c 2009-03-16 16:40:54.000000000 +0100
@@ -323,7 +323,7 @@ void __init xen_init_pgd_pin(void)
if (PTRS_PER_PUD > 1) /* not folded */
SetPagePinned(virt_to_page(pud));
for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
- if (!pud_present(*pud))
+ if (!pud_present(*pud) || pud_large(*pud))
continue;
pmd = pmd_offset(pud, 0);
if (PTRS_PER_PMD > 1) /* not folded */
@@ -334,7 +334,7 @@ void __init xen_init_pgd_pin(void)
&& m >= pmd_index(HYPERVISOR_VIRT_START))
continue;
#endif
- if (!pmd_present(*pmd))
+ if (!pmd_present(*pmd) || pmd_large(*pmd))
continue;
SetPagePinned(pmd_page(*pmd));
}
--- sle11-2009-05-14.orig/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:38:16.000000000 +0100
+++ sle11-2009-05-14/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:40:54.000000000 +0100
@@ -188,6 +188,6 @@ void make_lowmem_page_writable(void *va,
pte = lookup_address((unsigned long)va, &level);
BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
rc = HYPERVISOR_update_va_mapping(
- (unsigned long)va, pte_mkwrite(*pte), 0);
+ (unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
BUG_ON(rc);
}
--- sle11-2009-05-14.orig/include/xen/interface/elfnote.h 2008-11-25 12:35:56.000000000 +0100
+++ sle11-2009-05-14/include/xen/interface/elfnote.h 2009-03-16 16:40:54.000000000 +0100
@@ -162,9 +162,20 @@
#define XEN_ELFNOTE_SUSPEND_CANCEL 14

/*
+ * The (non-default) location the initial phys-to-machine map should be
+ * placed at by the hypervisor (Dom0) or the tools (DomU).
+ * The kernel must be prepared for this mapping to be established using
+ * large pages, despite such otherwise not being available to guests.
+ * The kernel must also be prepared that the page table pages used for
+ * this mapping may not be accessible through the initial mapping.
+ * (Only x86-64 supports this at present.)
+ */
+#define XEN_ELFNOTE_INIT_P2M 15
+
+/*
* The number of the highest elfnote defined.
*/
-#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M

/*
* System information exported through crash notes.
--- sle11-2009-05-14.orig/include/xen/interface/xen.h 2009-05-14 11:17:48.000000000 +0200
+++ sle11-2009-05-14/include/xen/interface/xen.h 2009-03-16 16:40:54.000000000 +0100
@@ -536,6 +536,7 @@ typedef struct shared_info shared_info_t
* a. relocated kernel image
* b. initial ram disk [mod_start, mod_len]
* c. list of allocated page frames [mfn_list, nr_pages]
+ * (unless relocated due to XEN_ELFNOTE_INIT_P2M)
* d. start_info_t structure [register ESI (x86)]
* e. bootstrap page tables [pt_base, CR3 (x86)]
* f. bootstrap stack [register ESP (x86)]
@@ -577,6 +578,9 @@ struct start_info {
unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
int8_t cmd_line[MAX_GUEST_CMDLINE];
+ /* The pfn range here covers both page table and p->m table frames. */
+ unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */
+ unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */
};
typedef struct start_info start_info_t;