From: jbeulich@novell.com Subject: don't require order-1 allocations for pgd-s Patch-mainline: obsolete At the same time remove the useless user mode pair of init_level4_pgt. Index: head-2008-12-01/arch/x86/kernel/cpu/common_64-xen.c =================================================================== --- head-2008-12-01.orig/arch/x86/kernel/cpu/common_64-xen.c 2008-12-01 12:13:15.000000000 +0100 +++ head-2008-12-01/arch/x86/kernel/cpu/common_64-xen.c 2008-12-01 12:13:27.000000000 +0100 @@ -530,8 +530,7 @@ static void __init_refok switch_pt(int c #ifdef CONFIG_XEN if (cpu == 0) xen_init_pt(); - xen_pt_switch(__pa_symbol(init_level4_pgt)); - xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt))); + xen_pt_switch(init_level4_pgt); #endif } Index: head-2008-12-01/arch/x86/kernel/head_64-xen.S =================================================================== --- head-2008-12-01.orig/arch/x86/kernel/head_64-xen.S 2008-12-01 11:49:07.000000000 +0100 +++ head-2008-12-01/arch/x86/kernel/head_64-xen.S 2008-12-01 12:13:27.000000000 +0100 @@ -44,14 +44,6 @@ ENTRY(name) NEXT_PAGE(init_level4_pgt) .fill 512,8,0 - /* - * We update two pgd entries to make kernel and user pgd consistent - * at pgd_populate(). It can be used for kernel modules. So we place - * this page here for those cases to avoid memory corruption. - * We also use this page to establish the initial mapping for the - * vsyscall area. - */ - .fill 512,8,0 NEXT_PAGE(level3_kernel_pgt) .fill 512,8,0 Index: head-2008-12-01/arch/x86/mm/hypervisor.c =================================================================== --- head-2008-12-01.orig/arch/x86/mm/hypervisor.c 2008-12-01 12:13:22.000000000 +0100 +++ head-2008-12-01/arch/x86/mm/hypervisor.c 2008-12-01 12:13:27.000000000 +0100 @@ -422,7 +422,7 @@ void xen_l3_entry_update(pud_t *ptr, pud #endif #ifdef CONFIG_X86_64 -void xen_l4_entry_update(pgd_t *ptr, int user, pgd_t val) +void xen_l4_entry_update(pgd_t *ptr, pgd_t val) { mmu_update_t u[2]; struct page *page = NULL; @@ -435,8 +435,10 @@ void xen_l4_entry_update(pgd_t *ptr, int } u[0].ptr = virt_to_machine(ptr); u[0].val = __pgd_val(val); - if (user) { - u[1].ptr = virt_to_machine(__user_pgd(ptr)); + if (((unsigned long)ptr & ~PAGE_MASK) + < pgd_index(__HYPERVISOR_VIRT_START) * sizeof(*ptr) + && (ptr = __user_pgd(ptr)) != NULL) { + u[1].ptr = virt_to_machine(ptr); u[1].val = __pgd_val(val); do_lN_entry_update(u, 2, page); } else @@ -444,21 +446,25 @@ void xen_l4_entry_update(pgd_t *ptr, int } #endif /* CONFIG_X86_64 */ -void xen_pt_switch(unsigned long ptr) +#ifdef CONFIG_X86_64 +void xen_pt_switch(pgd_t *pgd) { struct mmuext_op op; op.cmd = MMUEXT_NEW_BASEPTR; - op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + op.arg1.mfn = pfn_to_mfn(__pa(pgd) >> PAGE_SHIFT); BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } -void xen_new_user_pt(unsigned long ptr) +void xen_new_user_pt(pgd_t *pgd) { struct mmuext_op op; + + pgd = __user_pgd(pgd); op.cmd = MMUEXT_NEW_USER_BASEPTR; - op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + op.arg1.mfn = pgd ? pfn_to_mfn(__pa(pgd) >> PAGE_SHIFT) : 0; BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } +#endif void xen_tlb_flush(void) { @@ -526,28 +532,38 @@ void xen_invlpg_mask(cpumask_t *mask, un void xen_pgd_pin(pgd_t *pgd) { struct mmuext_op op[NR_PGD_PIN_OPS]; + unsigned int nr = NR_PGD_PIN_OPS; op[0].cmd = MMUEXT_PIN_L3_TABLE; op[0].arg1.mfn = pfn_to_mfn(__pa(pgd) >> PAGE_SHIFT); #ifdef CONFIG_X86_64 op[1].cmd = op[0].cmd = MMUEXT_PIN_L4_TABLE; - op[1].arg1.mfn = pfn_to_mfn(__pa(__user_pgd(pgd)) >> PAGE_SHIFT); + pgd = __user_pgd(pgd); + if (pgd) + op[1].arg1.mfn = pfn_to_mfn(__pa(pgd) >> PAGE_SHIFT); + else + nr = 1; #endif - if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0) + if (HYPERVISOR_mmuext_op(op, nr, NULL, DOMID_SELF) < 0) BUG(); } void xen_pgd_unpin(pgd_t *pgd) { struct mmuext_op op[NR_PGD_PIN_OPS]; + unsigned int nr = NR_PGD_PIN_OPS; op[0].cmd = MMUEXT_UNPIN_TABLE; op[0].arg1.mfn = pfn_to_mfn(__pa(pgd) >> PAGE_SHIFT); #ifdef CONFIG_X86_64 - op[1].cmd = MMUEXT_UNPIN_TABLE; - op[1].arg1.mfn = pfn_to_mfn(__pa(__user_pgd(pgd)) >> PAGE_SHIFT); + pgd = __user_pgd(pgd); + if (pgd) { + op[1].cmd = MMUEXT_UNPIN_TABLE; + op[1].arg1.mfn = pfn_to_mfn(__pa(pgd) >> PAGE_SHIFT); + } else + nr = 1; #endif - if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0) + if (HYPERVISOR_mmuext_op(op, nr, NULL, DOMID_SELF) < 0) BUG(); } Index: head-2008-12-01/arch/x86/mm/init_64-xen.c =================================================================== --- head-2008-12-01.orig/arch/x86/mm/init_64-xen.c 2008-12-01 12:13:22.000000000 +0100 +++ head-2008-12-01/arch/x86/mm/init_64-xen.c 2008-12-01 12:13:27.000000000 +0100 @@ -604,9 +604,6 @@ void __init xen_init_pt(void) __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE); memcpy(level2_kernel_pgt, page, PAGE_SIZE); - __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] = - __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE); - /* Do an early initialization of the fixmap area. */ addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); level3_kernel_pgt[pud_index(addr)] = @@ -616,8 +613,6 @@ void __init xen_init_pt(void) early_make_page_readonly(init_level4_pgt, XENFEAT_writable_page_tables); - early_make_page_readonly(__user_pgd(init_level4_pgt), - XENFEAT_writable_page_tables); early_make_page_readonly(level3_kernel_pgt, XENFEAT_writable_page_tables); early_make_page_readonly(level3_user_pgt, Index: head-2008-12-01/arch/x86/mm/pgtable-xen.c =================================================================== --- head-2008-12-01.orig/arch/x86/mm/pgtable-xen.c 2008-12-01 12:13:22.000000000 +0100 +++ head-2008-12-01/arch/x86/mm/pgtable-xen.c 2008-12-01 12:13:27.000000000 +0100 @@ -270,9 +270,11 @@ static void pgd_walk(pgd_t *pgd_base, pg BUG(); seq = 0; } + pgd = __user_pgd(pgd_base); + BUG_ON(!pgd); MULTI_update_va_mapping(mcl + seq, - (unsigned long)__user_pgd(pgd_base), - pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags), + (unsigned long)pgd, + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, flags), 0); MULTI_update_va_mapping(mcl + seq + 1, (unsigned long)pgd_base, @@ -658,12 +660,29 @@ static void pgd_prepopulate_pmd(struct m } } +static inline pgd_t *user_pgd_alloc(pgd_t *pgd) +{ #ifdef CONFIG_X86_64 -/* We allocate two contiguous pages for kernel and user. */ -#define PGD_ORDER 1 -#else -#define PGD_ORDER 0 + if (pgd) { + pgd_t *upgd = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); + + if (upgd) + virt_to_page(pgd)->index = (long)upgd; + else { + free_page((unsigned long)pgd); + pgd = NULL; + } + } +#endif + return pgd; +} + +static inline void user_pgd_free(pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + free_page(virt_to_page(pgd)->index); #endif +} pgd_t *pgd_alloc(struct mm_struct *mm) { @@ -671,7 +690,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *pmds[PREALLOCATED_PMDS]; unsigned long flags; - pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER); + pgd = user_pgd_alloc((void *)__get_free_page(GFP_KERNEL|__GFP_ZERO)); if (pgd == NULL) goto out; @@ -710,7 +729,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) out_free_pmds: free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb)); out_free_pgd: - free_pages((unsigned long)pgd, PGD_ORDER); + user_pgd_free(pgd); + free_page((unsigned long)pgd); out: return NULL; } @@ -729,7 +749,8 @@ void pgd_free(struct mm_struct *mm, pgd_ pgd_mop_up_pmds(mm, pgd); paravirt_pgd_free(mm, pgd); - free_pages((unsigned long)pgd, PGD_ORDER); + user_pgd_free(pgd); + free_page((unsigned long)pgd); } /* blktap and gntdev need this, as otherwise they would implicitly (and Index: head-2008-12-01/drivers/xen/core/machine_reboot.c =================================================================== --- head-2008-12-01.orig/drivers/xen/core/machine_reboot.c 2008-12-01 12:13:13.000000000 +0100 +++ head-2008-12-01/drivers/xen/core/machine_reboot.c 2008-12-01 12:13:27.000000000 +0100 @@ -199,8 +199,7 @@ static int take_machine_down(void *_susp * in fast-suspend mode as that implies a new enough Xen. */ if (!suspend->fast_suspend) - xen_new_user_pt(__pa(__user_pgd( - current->active_mm->pgd))); + xen_new_user_pt(current->active_mm->pgd); #endif } Index: head-2008-12-01/include/asm-x86/mach-xen/asm/hypervisor.h =================================================================== --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2008-12-01 12:13:22.000000000 +0100 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/hypervisor.h 2008-12-01 12:13:27.000000000 +0100 @@ -85,8 +85,8 @@ void do_hypervisor_callback(struct pt_re * be MACHINE addresses. */ -void xen_pt_switch(unsigned long ptr); -void xen_new_user_pt(unsigned long ptr); /* x86_64 only */ +void xen_pt_switch(pgd_t *); +void xen_new_user_pt(pgd_t *); /* x86_64 only */ void xen_load_gs(unsigned int selector); /* x86_64 only */ void xen_tlb_flush(void); void xen_invlpg(unsigned long ptr); @@ -94,7 +94,7 @@ void xen_invlpg(unsigned long ptr); void xen_l1_entry_update(pte_t *ptr, pte_t val); void xen_l2_entry_update(pmd_t *ptr, pmd_t val); void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */ -void xen_l4_entry_update(pgd_t *ptr, int user, pgd_t val); /* x86_64 only */ +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */ void xen_pgd_pin(pgd_t *); void xen_pgd_unpin(pgd_t *); Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_64.h =================================================================== --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2008-12-01 11:49:07.000000000 +0100 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_64.h 2008-12-01 12:13:27.000000000 +0100 @@ -46,6 +46,7 @@ static inline void switch_mm(struct mm_s { unsigned cpu = smp_processor_id(); struct mmuext_op _op[3], *op = _op; + pgd_t *upgd; if (likely(prev != next)) { BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && @@ -64,9 +65,11 @@ static inline void switch_mm(struct mm_s op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); op++; - /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */ + /* xen_new_user_pt(next->pgd) */ op->cmd = MMUEXT_NEW_USER_BASEPTR; - op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT); + upgd = __user_pgd(next->pgd); + op->arg1.mfn = likely(upgd) + ? pfn_to_mfn(__pa(upgd) >> PAGE_SHIFT) : 0; op++; if (unlikely(next->context.ldt != prev->context.ldt)) { @@ -90,7 +93,7 @@ static inline void switch_mm(struct mm_s * to make sure to use no freed page tables. */ load_cr3(next->pgd); - xen_new_user_pt(__pa(__user_pgd(next->pgd))); + xen_new_user_pt(next->pgd); load_LDT_nolock(&next->context); } } Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgalloc.h =================================================================== --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2008-12-01 12:13:06.000000000 +0100 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgalloc.h 2008-12-01 12:13:27.000000000 +0100 @@ -106,15 +106,13 @@ static inline void pud_populate(struct m #endif /* CONFIG_X86_PAE */ #if PAGETABLE_LEVELS > 3 -#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) - static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { pgd_t ent = __pgd(_PAGE_TABLE | __pa(pud)); paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); if (unlikely(PagePinned(virt_to_page(pgd)))) - xen_l4_entry_update(pgd, 1, ent); + xen_l4_entry_update(pgd, ent); else *__user_pgd(pgd) = *pgd = ent; } Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_64.h =================================================================== --- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-12-01 12:13:13.000000000 +0100 +++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_64.h 2008-12-01 12:13:27.000000000 +0100 @@ -131,18 +131,25 @@ static inline void xen_set_pud(pud_t *pu : (void)(*__pudp = xen_make_pud(0)); \ }) -#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) +static inline pgd_t *__user_pgd(pgd_t *pgd) +{ + if (unlikely(((unsigned long)pgd & PAGE_MASK) + == (unsigned long)init_level4_pgt)) + return NULL; + return (pgd_t *)(virt_to_page(pgd)->index + + ((unsigned long)pgd & ~PAGE_MASK)); +} static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd) { - xen_l4_entry_update(pgdp, 0, pgd); + xen_l4_entry_update(pgdp, pgd); } #define xen_pgd_clear(pgd) \ ({ \ pgd_t *__pgdp = (pgd); \ PagePinned(virt_to_page(__pgdp)) \ - ? xen_l4_entry_update(__pgdp, 1, xen_make_pgd(0)) \ + ? xen_l4_entry_update(__pgdp, xen_make_pgd(0)) \ : (void)(*__user_pgd(__pgdp) = *__pgdp = xen_make_pgd(0)); \ })