diff --git a/src/patches/suse-2.6.27.31/patches.xen/xen-x86_64-note-init-p2m b/src/patches/suse-2.6.27.31/patches.xen/xen-x86_64-note-init-p2m
new file mode 100644
index 0000000..b6954e6
--- /dev/null
+++ b/src/patches/suse-2.6.27.31/patches.xen/xen-x86_64-note-init-p2m
@@ -0,0 +1,384 @@
+From: jbeulich@novell.com
+Subject: eliminate scalability issues from initial mapping setup
+Patch-mainline: obsolete
+References: bnc#417417
+
+Direct Xen to place the initial P->M table outside of the initial
+mapping, as otherwise the 1G (implementation) / 2G (theoretical)
+restriction on the size of the initial mapping limits the amount
+of memory a domain can be handed initially.
+
+Note that the flags passed to HYPERVISOR_update_va_mapping() from
+__make_page_writable() and make_lowmem_page_writable() intentionally
+do not include UVMF_ALL. This is meant to be an optimal trade-off
+between the overhead of a potential spurious page fault (as remote
+CPUs may still have read-only translations in their TLBs) and the
+overhead of cross-processor flushes. Flushing on the local CPU
+shouldn't be as expensive (and hence can be viewed as an optimization
+avoiding the spurious page fault on the local CPU), but is required
+when the functions are used before the page fault handler gets set up.
+
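+A rough sizing sketch (assuming the usual x86-64 parameters of 8-byte
+P->M entries and 4 KiB pages; illustrative only, not part of the
+patched code):
+
+	/* The initial P->M table is a flat array of machine frame
+	 * numbers indexed by pseudo-physical frame number; one 8-byte
+	 * entry per 4 KiB page means the table grows by 2 MiB for
+	 * every GiB of domain memory. */
+	static unsigned long p2m_bytes(unsigned long nr_pages)
+	{
+		return nr_pages * sizeof(unsigned long);
+	}
+
+Together with the kernel image, initrd and bootstrap page tables this
+table has to fit in the initial mapping, which is why the 1G/2G
+ceiling bites for large domains unless the table is placed elsewhere.
+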
+--- sle11-2009-05-14.orig/arch/x86/kernel/head64-xen.c 2009-02-16 16:49:32.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/head64-xen.c      2009-03-16 16:40:54.000000000 +0100
+@@ -171,6 +171,14 @@ void __init x86_64_start_reservations(ch
+                     + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
+                     "Xen provided");
+
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              xen_start_info->mfn_list = ~0UL;
++      else if (xen_start_info->mfn_list < __START_KERNEL_map)
++              reserve_early(xen_start_info->first_p2m_pfn << PAGE_SHIFT,
++                            (xen_start_info->first_p2m_pfn
++                             + xen_start_info->nr_p2m_frames) << PAGE_SHIFT,
++                            "INITP2M");
++
+       /*
+        * At this point everything still needed from the boot loader
+        * or BIOS or kernel text should be early reserved or marked not
+--- sle11-2009-05-14.orig/arch/x86/kernel/head_64-xen.S        2009-03-16 16:40:52.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/head_64-xen.S     2009-03-16 16:40:54.000000000 +0100
+@@ -18,6 +18,7 @@
+ #include <asm/desc.h>
+ #include <asm/segment.h>
+ #include <asm/page.h>
++#include <asm/pgtable.h>
+ #include <asm/msr.h>
+ #include <asm/cache.h>
+ #include <asm/dwarf2.h>
+@@ -135,6 +136,7 @@ ENTRY(empty_zero_page)
+       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad startup_64)
+       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
+       ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .quad _PAGE_PRESENT, _PAGE_PRESENT)
++      ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad VMEMMAP_START)
+       ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
+       ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
+       ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
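+
+The note's payload, VMEMMAP_START, lies below __START_KERNEL_map, and
+the rest of the patch relies on that property to detect whether the
+hypervisor honored the note. A minimal sketch of the test (hypothetical
+helper; the patch open-codes the comparison at each site):
+
+	static inline int p2m_is_relocated(void)
+	{
+		/* A relocated P->M table lives at VMEMMAP_START, below
+		 * the kernel mapping; the legacy placement inside the
+		 * initial mapping is above __START_KERNEL_map. */
+		return !xen_feature(XENFEAT_auto_translated_physmap)
+		       && xen_start_info->mfn_list < __START_KERNEL_map;
+	}
+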
+--- sle11-2009-05-14.orig/arch/x86/kernel/setup-xen.c  2009-02-16 17:05:16.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/setup-xen.c       2009-03-16 16:40:54.000000000 +0100
+@@ -1022,7 +1022,7 @@ void __init setup_arch(char **cmdline_p)
+               difference = xen_start_info->nr_pages - max_pfn;
+               set_xen_guest_handle(reservation.extent_start,
+-                                   ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
++                                   phys_to_machine_mapping + max_pfn);
+               reservation.nr_extents = difference;
+               ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+                                          &reservation);
+@@ -1039,14 +1039,86 @@ void __init setup_arch(char **cmdline_p)
+               phys_to_machine_mapping = alloc_bootmem_pages(
+                       max_pfn * sizeof(unsigned long));
+               memcpy(phys_to_machine_mapping,
+-                     (unsigned long *)xen_start_info->mfn_list,
++                     __va(__pa(xen_start_info->mfn_list)),
+                      p2m_pages * sizeof(unsigned long));
+               memset(phys_to_machine_mapping + p2m_pages, ~0,
+                      (max_pfn - p2m_pages) * sizeof(unsigned long));
+-              free_bootmem(
+-                      __pa(xen_start_info->mfn_list),
+-                      PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+-                                      sizeof(unsigned long))));
++
++#ifdef CONFIG_X86_64
++              if (xen_start_info->mfn_list == VMEMMAP_START) {
++                      /*
++                       * Since it is well isolated we can (and since it is
++                       * perhaps large we should) also free the page tables
++                       * mapping the initial P->M table.
++                       */
++                      unsigned long va = VMEMMAP_START, pa;
++                      pgd_t *pgd = pgd_offset_k(va);
++                      pud_t *pud_page = pud_offset(pgd, 0);
++
++                      BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
++                      xen_l4_entry_update(pgd, __pgd(0));
++                      for (;;) {
++                              pud_t *pud = pud_page + pud_index(va);
++
++                              if (pud_none(*pud))
++                                      va += PUD_SIZE;
++                              else if (pud_large(*pud)) {
++                                      pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
++                                      make_pages_writable(__va(pa),
++                                              PUD_SIZE >> PAGE_SHIFT,
++                                              XENFEAT_writable_page_tables);
++                                      free_bootmem(pa, PUD_SIZE);
++                                      va += PUD_SIZE;
++                              } else {
++                                      pmd_t *pmd = pmd_offset(pud, va);
++
++                                      if (pmd_large(*pmd)) {
++                                              pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
++                                              make_pages_writable(__va(pa),
++                                                      PMD_SIZE >> PAGE_SHIFT,
++                                                      XENFEAT_writable_page_tables);
++                                              free_bootmem(pa, PMD_SIZE);
++                                      } else if (!pmd_none(*pmd)) {
++                                              pte_t *pte = pte_offset_kernel(pmd, va);
++
++                                              for (i = 0; i < PTRS_PER_PTE; ++i) {
++                                                      if (pte_none(pte[i]))
++                                                              break;
++                                                      pa = pte_pfn(pte[i]) << PAGE_SHIFT;
++                                                      make_page_writable(__va(pa),
++                                                              XENFEAT_writable_page_tables);
++                                                      free_bootmem(pa, PAGE_SIZE);
++                                              }
++                                              ClearPagePinned(virt_to_page(pte));
++                                              make_page_writable(pte,
++                                                      XENFEAT_writable_page_tables);
++                                              free_bootmem(__pa(pte), PAGE_SIZE);
++                                      }
++                                      va += PMD_SIZE;
++                                      if (pmd_index(va))
++                                              continue;
++                                      ClearPagePinned(virt_to_page(pmd));
++                                      make_page_writable(pmd,
++                                              XENFEAT_writable_page_tables);
++                                      free_bootmem(__pa((unsigned long)pmd
++                                                        & PAGE_MASK),
++                                              PAGE_SIZE);
++                              }
++                              if (!pud_index(va))
++                                      break;
++                      }
++                      ClearPagePinned(virt_to_page(pud_page));
++                      make_page_writable(pud_page,
++                              XENFEAT_writable_page_tables);
++                      free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
++                              PAGE_SIZE);
++              } else if (!WARN_ON(xen_start_info->mfn_list
++                                  < __START_KERNEL_map))
++#endif
++                      free_bootmem(__pa(xen_start_info->mfn_list),
++                              PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
++                                              sizeof(unsigned long))));
++
+               /*
+        *  Initialise the list of the frames that specify the list of
+        *  frames that make up the p2m table. Used by save/restore.
+--- sle11-2009-05-14.orig/arch/x86/mm/init_64-xen.c    2009-03-16 16:40:52.000000000 +0100
++++ sle11-2009-05-14/arch/x86/mm/init_64-xen.c 2009-03-16 16:40:54.000000000 +0100
+@@ -157,6 +157,17 @@ static unsigned long __meminitdata table
+ static unsigned long __meminitdata table_cur;
+ static unsigned long __meminitdata table_top;
++static __init unsigned long get_table_cur(void)
++{
++      BUG_ON(!table_cur);
++      if (xen_start_info->mfn_list < __START_KERNEL_map
++          && table_cur == xen_start_info->first_p2m_pfn) {
++              table_cur += xen_start_info->nr_p2m_frames;
++              table_top += xen_start_info->nr_p2m_frames;
++      }
++      return table_cur++;
++}
++
+ /*
+  * NOTE: This function is marked __ref because it calls __init function
+  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+@@ -168,8 +179,7 @@ static __ref void *spp_getpage(void)
+       if (after_bootmem)
+               ptr = (void *) get_zeroed_page(GFP_ATOMIC);
+       else if (table_cur < table_top) {
+-              ptr = __va(table_cur << PAGE_SHIFT);
+-              table_cur++;
++              ptr = __va(get_table_cur() << PAGE_SHIFT);
+               memset(ptr, 0, PAGE_SIZE);
+       } else
+               ptr = alloc_bootmem_pages(PAGE_SIZE);
+@@ -334,8 +344,7 @@ static __ref void *alloc_low_page(unsign
+               return adr;
+       }
+-      BUG_ON(!table_cur);
+-      pfn = table_cur++;
++      pfn = get_table_cur();
+       if (pfn >= table_top)
+               panic("alloc_low_page: ran out of memory");
+@@ -361,14 +370,29 @@ static inline int __meminit make_readonl
+       /* Make new page tables read-only on the first pass. */
+       if (!xen_feature(XENFEAT_writable_page_tables)
+           && !max_pfn_mapped
+-          && (paddr >= (table_start << PAGE_SHIFT))
+-          && (paddr < (table_top << PAGE_SHIFT)))
+-              readonly = 1;
++          && (paddr >= (table_start << PAGE_SHIFT))) {
++              unsigned long top = table_top;
++
++              /* Account for the range get_table_cur() skips. */
++              if (xen_start_info->mfn_list < __START_KERNEL_map
++                  && table_cur <= xen_start_info->first_p2m_pfn
++                  && top > xen_start_info->first_p2m_pfn)
++                      top += xen_start_info->nr_p2m_frames;
++              if (paddr < (top << PAGE_SHIFT))
++                      readonly = 1;
++      }
+       /* Make old page tables read-only. */
+       if (!xen_feature(XENFEAT_writable_page_tables)
+           && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
+           && (paddr < (table_cur << PAGE_SHIFT)))
+               readonly = 1;
++      /* Make P->M table (and its page tables) read-only. */
++      if (!xen_feature(XENFEAT_writable_page_tables)
++          && xen_start_info->mfn_list < __START_KERNEL_map
++          && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
++          && paddr < (xen_start_info->first_p2m_pfn
++                      + xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
++              readonly = 1;
+       /*
+        * No need for writable mapping of kernel image. This also ensures that
+@@ -616,6 +640,12 @@ void __init xen_init_pt(void)
+               __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
+       memcpy(level2_kernel_pgt, page, PAGE_SIZE);
+
++      /* Copy the initial P->M table mappings if necessary. */
++      addr = pgd_index(xen_start_info->mfn_list);
++      if (addr < pgd_index(__START_KERNEL_map))
++              init_level4_pgt[addr] =
++                      ((pgd_t *)xen_start_info->pt_base)[addr];
++
+       /* Do an early initialization of the fixmap area. */
+       addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
+       level3_kernel_pgt[pud_index(addr)] =
+@@ -676,22 +706,28 @@ static void __init find_early_table_spac
+ static void __init xen_finish_init_mapping(void)
+ {
+       unsigned long i, start, end;
++      struct mmuext_op mmuext;
+
+       /* Re-vector virtual addresses pointing into the initial
+          mapping to the just-established permanent ones. */
+       xen_start_info = __va(__pa(xen_start_info));
+       xen_start_info->pt_base = (unsigned long)
+               __va(__pa(xen_start_info->pt_base));
+-      if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++      if (!xen_feature(XENFEAT_auto_translated_physmap)
++          && xen_start_info->mfn_list >= __START_KERNEL_map)
+               phys_to_machine_mapping =
+                       __va(__pa(xen_start_info->mfn_list));
+-              xen_start_info->mfn_list = (unsigned long)
+-                      phys_to_machine_mapping;
+-      }
+       if (xen_start_info->mod_start)
+               xen_start_info->mod_start = (unsigned long)
+                       __va(__pa(xen_start_info->mod_start));
+
++      /* Unpin the no longer used Xen provided page tables. */
++      mmuext.cmd = MMUEXT_UNPIN_TABLE;
++      mmuext.arg1.mfn = pfn_to_mfn(__pa(xen_start_info->pt_base)
++                                   >> PAGE_SHIFT);
++      if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
++              BUG();
++
+       /* Destroy the Xen-created mappings beyond the kernel image. */
+       start = PAGE_ALIGN((unsigned long)_end);
+       end   = __START_KERNEL_map + (table_start << PAGE_SHIFT);
+@@ -948,9 +984,20 @@ unsigned long __init_refok init_memory_m
+       __flush_tlb_all();
+-      if (!after_bootmem && table_top > table_start)
++      if (!after_bootmem && table_top > table_start) {
++              if (xen_start_info->mfn_list < __START_KERNEL_map
++                  && table_start <= xen_start_info->first_p2m_pfn
++                  && table_top > xen_start_info->first_p2m_pfn) {
++                      reserve_early(table_start << PAGE_SHIFT,
++                                    xen_start_info->first_p2m_pfn
++                                    << PAGE_SHIFT,
++                                    "PGTABLE");
++                      table_start = xen_start_info->first_p2m_pfn
++                                  + xen_start_info->nr_p2m_frames;
++              }
+               reserve_early(table_start << PAGE_SHIFT,
+                             table_top << PAGE_SHIFT, "PGTABLE");
++      }
+       printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
+                        last_map_addr, end);
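+
+A worked example of the skip in get_table_cur() (numbers invented for
+illustration): with first_p2m_pfn == 0x2000 and nr_p2m_frames == 0x40,
+an allocation about to hand out pfn 0x2000 instead advances table_cur
+and table_top by 0x40 and returns pfn 0x2040, so the P->M frames are
+never recycled as page-table pages while the pool keeps its capacity.
+The make_readonly() and init_memory_mapping() changes above account
+for the same skipped range.
+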
+--- sle11-2009-05-14.orig/arch/x86/mm/pageattr-xen.c   2009-03-16 16:40:01.000000000 +0100
++++ sle11-2009-05-14/arch/x86/mm/pageattr-xen.c        2009-03-16 16:40:54.000000000 +0100
+@@ -1251,7 +1251,7 @@ static void __make_page_writable(unsigne
+       pte = lookup_address(va, &level);
+       BUG_ON(!pte || level != PG_LEVEL_4K);
+-      if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
++      if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
+               BUG();
+       if (in_secondary_range(va)) {
+               unsigned long pfn = pte_pfn(*pte);
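+
+For reference, the UVMF_* flags used above come from the public Xen
+interface (values as in xen/interface/xen.h; quoted here as a sketch,
+worth double-checking against the tree in use):
+
+	#define UVMF_NONE       (0UL<<0) /* no flushing at all   */
+	#define UVMF_TLB_FLUSH  (1UL<<0) /* flush entire TLB(s)  */
+	#define UVMF_INVLPG     (2UL<<0) /* flush only one entry */
+	#define UVMF_LOCAL      (0UL<<2) /* ... on local vCPU    */
+	#define UVMF_ALL        (1UL<<2) /* ... on all vCPUs     */
+
+UVMF_INVLPG alone therefore flushes just the touched entry on the
+local CPU; as argued in the description above, UVMF_ALL is
+deliberately not OR-ed in, trading a possible spurious fault on
+remote CPUs against the cost of a cross-processor flush.
+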
+--- sle11-2009-05-14.orig/arch/x86/mm/pgtable-xen.c    2009-04-09 14:54:18.000000000 +0200
++++ sle11-2009-05-14/arch/x86/mm/pgtable-xen.c 2009-03-16 16:40:54.000000000 +0100
+@@ -323,7 +323,7 @@ void __init xen_init_pgd_pin(void)
+               if (PTRS_PER_PUD > 1) /* not folded */
+                       SetPagePinned(virt_to_page(pud));
+               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+-                      if (!pud_present(*pud))
++                      if (!pud_present(*pud) || pud_large(*pud))
+                               continue;
+                       pmd = pmd_offset(pud, 0);
+                       if (PTRS_PER_PMD > 1) /* not folded */
+@@ -334,7 +334,7 @@ void __init xen_init_pgd_pin(void)
+                                   && m >= pmd_index(HYPERVISOR_VIRT_START))
+                                       continue;
+ #endif
+-                              if (!pmd_present(*pmd))
++                              if (!pmd_present(*pmd) || pmd_large(*pmd))
+                                       continue;
+                               SetPagePinned(pmd_page(*pmd));
+                       }
+--- sle11-2009-05-14.orig/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:38:16.000000000 +0100
++++ sle11-2009-05-14/arch/x86/mm/pgtable_32-xen.c      2009-03-16 16:40:54.000000000 +0100
+@@ -188,6 +188,6 @@ void make_lowmem_page_writable(void *va,
+       pte = lookup_address((unsigned long)va, &level);
+       BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
+       rc = HYPERVISOR_update_va_mapping(
+-              (unsigned long)va, pte_mkwrite(*pte), 0);
++              (unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
+       BUG_ON(rc);
+ }
+--- sle11-2009-05-14.orig/include/xen/interface/elfnote.h      2008-11-25 12:35:56.000000000 +0100
++++ sle11-2009-05-14/include/xen/interface/elfnote.h   2009-03-16 16:40:54.000000000 +0100
+@@ -162,9 +162,20 @@
+ #define XEN_ELFNOTE_SUSPEND_CANCEL 14
+
+ /*
++ * The (non-default) location the initial phys-to-machine map should be
++ * placed at by the hypervisor (Dom0) or the tools (DomU).
++ * The kernel must be prepared for this mapping to be established using
++ * large pages, even though large pages are otherwise unavailable to guests.
++ * The kernel must also be prepared for the page table pages used for
++ * this mapping to be inaccessible through the initial mapping.
++ * (Only x86-64 supports this at present.)
++ */
++#define XEN_ELFNOTE_INIT_P2M      15
++
++/*
+  * The number of the highest elfnote defined.
+  */
+-#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL
++#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M
+
+ /*
+  * System information exported through crash notes.
+--- sle11-2009-05-14.orig/include/xen/interface/xen.h  2009-05-14 11:17:48.000000000 +0200
++++ sle11-2009-05-14/include/xen/interface/xen.h       2009-03-16 16:40:54.000000000 +0100
+@@ -536,6 +536,7 @@ typedef struct shared_info shared_info_t
+  *      a. relocated kernel image
+  *      b. initial ram disk              [mod_start, mod_len]
+  *      c. list of allocated page frames [mfn_list, nr_pages]
++ *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
+  *      d. start_info_t structure        [register ESI (x86)]
+  *      e. bootstrap page tables         [pt_base, CR3 (x86)]
+  *      f. bootstrap stack               [register ESP (x86)]
+@@ -577,6 +578,9 @@ struct start_info {
+     unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
+     unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
+     int8_t cmd_line[MAX_GUEST_CMDLINE];
++    /* The pfn range here covers both page table and p->m table frames.   */
++    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
++    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
+ };
+ typedef struct start_info start_info_t;