From: www.kernel.org
Subject: Linux 2.6.20
Patch-mainline: 2.6.20

Automatically created from "patches.kernel.org/patch-2.6.20" by xen-port-patches.py

Acked-by: jbeulich@novell.com

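The bulk of this patch backports the mainline 2.6.20 i386 per-CPU data area (PDA) rework into the Xen-patched tree: the per-CPU 16-bit stack is dropped and each CPU instead gets a small i386_pda structure reached through %gs (see the common-xen.c hunks below, which split cpu_init() into init_gdt()/cpu_set_gdt()/_cpu_init(), and the entry_32-xen.S hunks, which make SAVE_ALL/RESTORE_REGS save and reload %gs with __KERNEL_PDA). As a reading aid only, not part of the patch, here is a minimal C sketch of the access pattern; the struct fields match the boot_pda initializer below, while the asm form follows the 2.6.20-era include/asm-i386/pda.h and should be treated as an assumption:

	#include <linux/stddef.h>	/* offsetof */

	struct task_struct;

	/* One PDA per CPU; the GDT_ENTRY_PDA descriptor makes %gs point at it. */
	struct i386_pda {
		struct i386_pda *_pda;		/* self pointer */
		int cpu_number;			/* backs smp_processor_id() */
		struct task_struct *pcurrent;	/* backs "current" */
	};

	/* Read a PDA field relative to %gs, e.g. this CPU's number. */
	static inline int pda_cpu_number(void)
	{
		int cpu;
		asm("movl %%gs:%c1, %0"
		    : "=r" (cpu)
		    : "i" (offsetof(struct i386_pda, cpu_number)));
		return cpu;
	}
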
9Index: head-2008-12-01/arch/x86/Kconfig
10===================================================================
11--- head-2008-12-01.orig/arch/x86/Kconfig 2008-12-01 11:29:05.000000000 +0100
12+++ head-2008-12-01/arch/x86/Kconfig 2008-12-01 11:32:38.000000000 +0100
13@@ -1427,7 +1427,7 @@ config PHYSICAL_START
14
15 config RELOCATABLE
16 bool "Build a relocatable kernel (EXPERIMENTAL)"
17- depends on EXPERIMENTAL
18+ depends on EXPERIMENTAL && !X86_XEN
19 help
20 This builds a kernel image that retains relocation information
21 so it can be loaded someplace besides the default 1MB.
22Index: head-2008-12-01/arch/x86/kernel/asm-offsets_32.c
23===================================================================
24--- head-2008-12-01.orig/arch/x86/kernel/asm-offsets_32.c 2008-12-01 11:21:02.000000000 +0100
25+++ head-2008-12-01/arch/x86/kernel/asm-offsets_32.c 2008-12-01 11:32:38.000000000 +0100
26@@ -54,6 +54,7 @@ void foo(void)
27 OFFSET(TI_exec_domain, thread_info, exec_domain);
28 OFFSET(TI_flags, thread_info, flags);
29 OFFSET(TI_status, thread_info, status);
30+ OFFSET(TI_cpu, thread_info, cpu);
31 OFFSET(TI_preempt_count, thread_info, preempt_count);
32 OFFSET(TI_addr_limit, thread_info, addr_limit);
33 OFFSET(TI_restart_block, thread_info, restart_block);
34@@ -108,6 +109,11 @@ void foo(void)
35
36 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
37
38+#ifdef CONFIG_XEN
39+ BLANK();
40+ OFFSET(XEN_START_mfn_list, start_info, mfn_list);
41+#endif
42+
43 #ifdef CONFIG_PARAVIRT
44 BLANK();
45 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
46Index: head-2008-12-01/arch/x86/kernel/cpu/common-xen.c
47===================================================================
48--- head-2008-12-01.orig/arch/x86/kernel/cpu/common-xen.c 2008-12-01 11:29:05.000000000 +0100
49+++ head-2008-12-01/arch/x86/kernel/cpu/common-xen.c 2008-12-01 11:32:38.000000000 +0100
50@@ -22,6 +22,7 @@
51 #define phys_pkg_id(a,b) a
52 #endif
53 #endif
54+#include <asm/pda.h>
55 #include <asm/hypervisor.h>
56
57 #include "cpu.h"
58@@ -29,10 +30,8 @@
59 DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
60 EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
61
62-#ifndef CONFIG_XEN
63-DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
64-EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
65-#endif
66+struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
67+EXPORT_SYMBOL(_cpu_pda);
68
69 static int cachesize_override __cpuinitdata = -1;
70 static int disable_x86_fxsr __cpuinitdata;
71@@ -60,7 +59,7 @@ static struct cpu_dev __cpuinitdata defa
72 .c_init = default_init,
73 .c_vendor = "Unknown",
74 };
75-static struct cpu_dev * this_cpu = &default_cpu;
76+static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
77
78 static int __init cachesize_setup(char *str)
79 {
80@@ -242,29 +241,14 @@ static int __cpuinit have_cpuid_p(void)
81 return flag_is_changeable_p(X86_EFLAGS_ID);
82 }
83
84-/* Do minimum CPU detection early.
85- Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
86- The others are not touched to avoid unwanted side effects.
87-
88- WARNING: this function is only called on the BP. Don't add code here
89- that is supposed to run on all CPUs. */
90-static void __init early_cpu_detect(void)
91+void __init cpu_detect(struct cpuinfo_x86 *c)
92 {
93- struct cpuinfo_x86 *c = &boot_cpu_data;
94-
95- c->x86_cache_alignment = 32;
96-
97- if (!have_cpuid_p())
98- return;
99-
100 /* Get vendor name */
101 cpuid(0x00000000, &c->cpuid_level,
102 (int *)&c->x86_vendor_id[0],
103 (int *)&c->x86_vendor_id[8],
104 (int *)&c->x86_vendor_id[4]);
105
106- get_cpu_vendor(c, 1);
107-
108 c->x86 = 4;
109 if (c->cpuid_level >= 0x00000001) {
110 u32 junk, tfms, cap0, misc;
111@@ -281,6 +265,26 @@ static void __init early_cpu_detect(void
112 }
113 }
114
115+/* Do minimum CPU detection early.
116+ Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
117+ The others are not touched to avoid unwanted side effects.
118+
119+ WARNING: this function is only called on the BP. Don't add code here
120+ that is supposed to run on all CPUs. */
121+static void __init early_cpu_detect(void)
122+{
123+ struct cpuinfo_x86 *c = &boot_cpu_data;
124+
125+ c->x86_cache_alignment = 32;
126+
127+ if (!have_cpuid_p())
128+ return;
129+
130+ cpu_detect(c);
131+
132+ get_cpu_vendor(c, 1);
133+}
134+
135 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
136 {
137 u32 tfms, xlvl;
138@@ -315,6 +319,8 @@ static void __cpuinit generic_identify(s
139 #else
140 c->apicid = (ebx >> 24) & 0xFF;
141 #endif
142+ if (c->x86_capability[0] & (1<<19))
143+ c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
144 } else {
145 /* Have CPUID level 0 only - unheard of */
146 c->x86 = 4;
147@@ -379,6 +385,7 @@ void __cpuinit identify_cpu(struct cpuin
148 c->x86_vendor_id[0] = '\0'; /* Unset */
149 c->x86_model_id[0] = '\0'; /* Unset */
150 c->x86_max_cores = 1;
151+ c->x86_clflush_size = 32;
152 memset(&c->x86_capability, 0, sizeof c->x86_capability);
153
154 if (!have_cpuid_p()) {
155@@ -599,61 +606,23 @@ void __init early_cpu_init(void)
156 #endif
157 }
158
159-static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr)
160+/* Make sure %gs is initialized properly in idle threads */
161+struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
162 {
163- unsigned long frames[16];
164- unsigned long va;
165- int f;
166-
167- for (va = gdt_descr->address, f = 0;
168- va < gdt_descr->address + gdt_descr->size;
169- va += PAGE_SIZE, f++) {
170- frames[f] = virt_to_mfn(va);
171- make_lowmem_page_readonly(
172- (void *)va, XENFEAT_writable_descriptor_tables);
173- }
174- if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8))
175- BUG();
176+ memset(regs, 0, sizeof(struct pt_regs));
177+ regs->xgs = __KERNEL_PDA;
178+ return regs;
179 }
180
181-/*
182- * cpu_init() initializes state that is per-CPU. Some data is already
183- * initialized (naturally) in the bootstrap process, such as the GDT
184- * and IDT. We reload them nevertheless, this function acts as a
185- * 'CPU state barrier', nothing should get across.
186- */
187-void __cpuinit cpu_init(void)
188+static __cpuinit int alloc_gdt(int cpu)
189 {
190- int cpu = smp_processor_id();
191-#ifndef CONFIG_X86_NO_TSS
192- struct tss_struct * t = &per_cpu(init_tss, cpu);
193-#endif
194- struct thread_struct *thread = &current->thread;
195- struct desc_struct *gdt;
196 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
197+ struct desc_struct *gdt;
198+ struct i386_pda *pda;
199
200- if (cpu_test_and_set(cpu, cpu_initialized)) {
201- printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
202- for (;;) local_irq_enable();
203- }
204- printk(KERN_INFO "Initializing CPU#%d\n", cpu);
205-
206- if (cpu_has_vme || cpu_has_de)
207- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
208- if (tsc_disable && cpu_has_tsc) {
209- printk(KERN_NOTICE "Disabling TSC...\n");
210- /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
211- clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
212- set_in_cr4(X86_CR4_TSD);
213- }
214+ gdt = (struct desc_struct *)cpu_gdt_descr->address;
215+ pda = cpu_pda(cpu);
216
217-#ifndef CONFIG_XEN
218- /* The CPU hotplug case */
219- if (cpu_gdt_descr->address) {
220- gdt = (struct desc_struct *)cpu_gdt_descr->address;
221- memset(gdt, 0, PAGE_SIZE);
222- goto old_gdt;
223- }
224 /*
225 * This is a horrible hack to allocate the GDT. The problem
226 * is that cpu_init() is called really early for the boot CPU
227@@ -661,54 +630,141 @@ void __cpuinit cpu_init(void)
228 * CPUs, when bootmem will have gone away
229 */
230 if (NODE_DATA(0)->bdata->node_bootmem_map) {
231- gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
232- /* alloc_bootmem_pages panics on failure, so no check */
233+ BUG_ON(gdt != NULL || pda != NULL);
234+
235+ gdt = alloc_bootmem_pages(PAGE_SIZE);
236+ pda = alloc_bootmem(sizeof(*pda));
237+ /* alloc_bootmem(_pages) panics on failure, so no check */
238+
239 memset(gdt, 0, PAGE_SIZE);
240+ memset(pda, 0, sizeof(*pda));
241 } else {
242- gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
243- if (unlikely(!gdt)) {
244- printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
245- for (;;)
246- local_irq_enable();
247+ /* GDT and PDA might already have been allocated if
248+ this is a CPU hotplug re-insertion. */
249+ if (gdt == NULL)
250+ gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
251+
252+ if (pda == NULL)
253+ pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
254+
255+ if (unlikely(!gdt || !pda)) {
256+ free_pages((unsigned long)gdt, 0);
257+ kfree(pda);
258+ return 0;
259 }
260 }
261-old_gdt:
262+
263+ cpu_gdt_descr->address = (unsigned long)gdt;
264+ cpu_pda(cpu) = pda;
265+
266+ return 1;
267+}
268+
269+/* Initial PDA used by boot CPU */
270+struct i386_pda boot_pda = {
271+ ._pda = &boot_pda,
272+ .cpu_number = 0,
273+ .pcurrent = &init_task,
274+};
275+
276+static inline void set_kernel_gs(void)
277+{
278+ /* Set %gs for this CPU's PDA. Memory clobber is to create a
279+ barrier with respect to any PDA operations, so the compiler
280+ doesn't move any before here. */
281+ asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
282+}
283+
284+/* Initialize the CPU's GDT and PDA. The boot CPU does this for
285+ itself, but secondaries find this done for them. */
286+__cpuinit int init_gdt(int cpu, struct task_struct *idle)
287+{
288+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
289+ struct desc_struct *gdt;
290+ struct i386_pda *pda;
291+
292+ /* For non-boot CPUs, the GDT and PDA should already have been
293+ allocated. */
294+ if (!alloc_gdt(cpu)) {
295+ printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
296+ return 0;
297+ }
298+
299+ gdt = (struct desc_struct *)cpu_gdt_descr->address;
300+ pda = cpu_pda(cpu);
301+
302+ BUG_ON(gdt == NULL || pda == NULL);
303+
304 /*
305 * Initialize the per-CPU GDT with the boot GDT,
306 * and set up the GDT descriptor:
307 */
308 memcpy(gdt, cpu_gdt_table, GDT_SIZE);
309+ cpu_gdt_descr->size = GDT_SIZE - 1;
310
311- /* Set up GDT entry for 16bit stack */
312- *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
313- ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
314- ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
315- (CPU_16BIT_STACK_SIZE - 1);
316+ pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
317+ (u32 *)&gdt[GDT_ENTRY_PDA].b,
318+ (unsigned long)pda, sizeof(*pda) - 1,
319+ 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
320+
321+ memset(pda, 0, sizeof(*pda));
322+ pda->_pda = pda;
323+ pda->cpu_number = cpu;
324+ pda->pcurrent = idle;
325
326- cpu_gdt_descr->size = GDT_SIZE - 1;
327- cpu_gdt_descr->address = (unsigned long)gdt;
328-#else
329- if (cpu == 0 && cpu_gdt_descr->address == 0) {
330- gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
331- /* alloc_bootmem_pages panics on failure, so no check */
332- memset(gdt, 0, PAGE_SIZE);
333+ return 1;
334+}
335
336- memcpy(gdt, cpu_gdt_table, GDT_SIZE);
337-
338- cpu_gdt_descr->size = GDT_SIZE;
339- cpu_gdt_descr->address = (unsigned long)gdt;
340+void __cpuinit cpu_set_gdt(int cpu)
341+{
342+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
343+ unsigned long va, frames[16];
344+ int f;
345+
346+ for (va = cpu_gdt_descr->address, f = 0;
347+ va < cpu_gdt_descr->address + cpu_gdt_descr->size;
348+ va += PAGE_SIZE, f++) {
349+ frames[f] = virt_to_mfn(va);
350+ make_lowmem_page_readonly(
351+ (void *)va, XENFEAT_writable_descriptor_tables);
352 }
353+ BUG_ON(HYPERVISOR_set_gdt(frames, (cpu_gdt_descr->size + 1) / 8));
354+
355+ set_kernel_gs();
356+}
357+
358+/* Common CPU init for both boot and secondary CPUs */
359+static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
360+{
361+#ifndef CONFIG_X86_NO_TSS
362+ struct tss_struct * t = &per_cpu(init_tss, cpu);
363 #endif
364+ struct thread_struct *thread = &curr->thread;
365+
366+ if (cpu_test_and_set(cpu, cpu_initialized)) {
367+ printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
368+ for (;;) local_irq_enable();
369+ }
370
371- cpu_gdt_init(cpu_gdt_descr);
372+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
373+
374+ if (cpu_has_vme || cpu_has_de)
375+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
376+ if (tsc_disable && cpu_has_tsc) {
377+ printk(KERN_NOTICE "Disabling TSC...\n");
378+ /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
379+ clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
380+ set_in_cr4(X86_CR4_TSD);
381+ }
382
383 /*
384 * Set up and load the per-CPU TSS and LDT
385 */
386 atomic_inc(&init_mm.mm_count);
387- current->active_mm = &init_mm;
388- BUG_ON(current->mm);
389- enter_lazy_tlb(&init_mm, current);
390+ curr->active_mm = &init_mm;
391+ if (curr->mm)
392+ BUG();
393+ enter_lazy_tlb(&init_mm, curr);
394
395 load_esp0(t, thread);
396
397@@ -719,8 +775,8 @@ old_gdt:
398 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
399 #endif
400
401- /* Clear %fs and %gs. */
402- asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
403+ /* Clear %fs. */
404+ asm volatile ("mov %0, %%fs" : : "r" (0));
405
406 /* Clear all 6 debug registers: */
407 set_debugreg(0, 0);
408@@ -738,6 +794,38 @@ old_gdt:
409 mxcsr_feature_mask_init();
410 }
411
412+/* Entrypoint to initialize secondary CPU */
413+void __cpuinit secondary_cpu_init(void)
414+{
415+ int cpu = smp_processor_id();
416+ struct task_struct *curr = current;
417+
418+ _cpu_init(cpu, curr);
419+}
420+
421+/*
422+ * cpu_init() initializes state that is per-CPU. Some data is already
423+ * initialized (naturally) in the bootstrap process, such as the GDT
424+ * and IDT. We reload them nevertheless, this function acts as a
425+ * 'CPU state barrier', nothing should get across.
426+ */
427+void __cpuinit cpu_init(void)
428+{
429+ int cpu = smp_processor_id();
430+ struct task_struct *curr = current;
431+
432+ /* Set up the real GDT and PDA, so we can transition from the
433+ boot versions. */
434+ if (!init_gdt(cpu, curr)) {
435+ /* failed to allocate something; not much we can do... */
436+ for (;;)
437+ local_irq_enable();
438+ }
439+
440+ cpu_set_gdt(cpu);
441+ _cpu_init(cpu, curr);
442+}
443+
444 #ifdef CONFIG_HOTPLUG_CPU
445 void __cpuinit cpu_uninit(void)
446 {
447Index: head-2008-12-01/arch/x86/kernel/cpu/mtrr/main-xen.c
448===================================================================
449--- head-2008-12-01.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2008-01-28 12:24:18.000000000 +0100
450+++ head-2008-12-01/arch/x86/kernel/cpu/mtrr/main-xen.c 2008-12-01 11:32:38.000000000 +0100
451@@ -12,7 +12,7 @@
452 static DEFINE_MUTEX(mtrr_mutex);
453
454 void generic_get_mtrr(unsigned int reg, unsigned long *base,
455- unsigned int *size, mtrr_type * type)
456+ unsigned long *size, mtrr_type * type)
457 {
458 struct xen_platform_op op;
459
460@@ -115,8 +115,7 @@ int mtrr_del_page(int reg, unsigned long
461 {
462 unsigned i;
463 mtrr_type ltype;
464- unsigned long lbase;
465- unsigned int lsize;
466+ unsigned long lbase, lsize;
467 int error = -EINVAL;
468 struct xen_platform_op op;
469
470Index: head-2008-12-01/arch/x86/kernel/e820_32-xen.c
471===================================================================
472--- /dev/null 1970-01-01 00:00:00.000000000 +0000
473+++ head-2008-12-01/arch/x86/kernel/e820_32-xen.c 2008-12-01 11:32:38.000000000 +0100
474@@ -0,0 +1,1015 @@
475+#include <linux/kernel.h>
476+#include <linux/types.h>
477+#include <linux/init.h>
478+#include <linux/bootmem.h>
479+#include <linux/ioport.h>
480+#include <linux/string.h>
481+#include <linux/kexec.h>
482+#include <linux/module.h>
483+#include <linux/mm.h>
484+#include <linux/efi.h>
485+#include <linux/pfn.h>
486+#include <linux/uaccess.h>
487+
488+#include <asm/pgtable.h>
489+#include <asm/page.h>
490+#include <asm/e820.h>
491+#include <xen/interface/memory.h>
492+
493+#ifdef CONFIG_EFI
494+int efi_enabled = 0;
495+EXPORT_SYMBOL(efi_enabled);
496+#endif
497+
498+struct e820map e820;
499+struct change_member {
500+ struct e820entry *pbios; /* pointer to original bios entry */
501+ unsigned long long addr; /* address for this change point */
502+};
503+static struct change_member change_point_list[2*E820MAX] __initdata;
504+static struct change_member *change_point[2*E820MAX] __initdata;
505+static struct e820entry *overlap_list[E820MAX] __initdata;
506+static struct e820entry new_bios[E820MAX] __initdata;
507+/* For PCI or other memory-mapped resources */
508+unsigned long pci_mem_start = 0x10000000;
509+#ifdef CONFIG_PCI
510+EXPORT_SYMBOL(pci_mem_start);
511+#endif
512+extern int user_defined_memmap;
513+struct resource data_resource = {
514+ .name = "Kernel data",
515+ .start = 0,
516+ .end = 0,
517+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
518+};
519+
520+struct resource code_resource = {
521+ .name = "Kernel code",
522+ .start = 0,
523+ .end = 0,
524+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
525+};
526+
527+static struct resource system_rom_resource = {
528+ .name = "System ROM",
529+ .start = 0xf0000,
530+ .end = 0xfffff,
531+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
532+};
533+
534+static struct resource extension_rom_resource = {
535+ .name = "Extension ROM",
536+ .start = 0xe0000,
537+ .end = 0xeffff,
538+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
539+};
540+
541+static struct resource adapter_rom_resources[] = { {
542+ .name = "Adapter ROM",
543+ .start = 0xc8000,
544+ .end = 0,
545+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
546+}, {
547+ .name = "Adapter ROM",
548+ .start = 0,
549+ .end = 0,
550+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
551+}, {
552+ .name = "Adapter ROM",
553+ .start = 0,
554+ .end = 0,
555+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
556+}, {
557+ .name = "Adapter ROM",
558+ .start = 0,
559+ .end = 0,
560+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
561+}, {
562+ .name = "Adapter ROM",
563+ .start = 0,
564+ .end = 0,
565+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
566+}, {
567+ .name = "Adapter ROM",
568+ .start = 0,
569+ .end = 0,
570+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
571+} };
572+
573+static struct resource video_rom_resource = {
574+ .name = "Video ROM",
575+ .start = 0xc0000,
576+ .end = 0xc7fff,
577+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
578+};
579+
580+static struct resource video_ram_resource = {
581+ .name = "Video RAM area",
582+ .start = 0xa0000,
583+ .end = 0xbffff,
584+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
585+};
586+
587+static struct resource standard_io_resources[] = { {
588+ .name = "dma1",
589+ .start = 0x0000,
590+ .end = 0x001f,
591+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
592+}, {
593+ .name = "pic1",
594+ .start = 0x0020,
595+ .end = 0x0021,
596+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
597+}, {
598+ .name = "timer0",
599+ .start = 0x0040,
600+ .end = 0x0043,
601+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
602+}, {
603+ .name = "timer1",
604+ .start = 0x0050,
605+ .end = 0x0053,
606+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
607+}, {
608+ .name = "keyboard",
609+ .start = 0x0060,
610+ .end = 0x006f,
611+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
612+}, {
613+ .name = "dma page reg",
614+ .start = 0x0080,
615+ .end = 0x008f,
616+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
617+}, {
618+ .name = "pic2",
619+ .start = 0x00a0,
620+ .end = 0x00a1,
621+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
622+}, {
623+ .name = "dma2",
624+ .start = 0x00c0,
625+ .end = 0x00df,
626+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
627+}, {
628+ .name = "fpu",
629+ .start = 0x00f0,
630+ .end = 0x00ff,
631+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
632+} };
633+
634+static int romsignature(const unsigned char *x)
635+{
636+ unsigned short sig;
637+ int ret = 0;
638+ if (probe_kernel_address((const unsigned short *)x, sig) == 0)
639+ ret = (sig == 0xaa55);
640+ return ret;
641+}
642+
643+static int __init romchecksum(unsigned char *rom, unsigned long length)
644+{
645+ unsigned char *p, sum = 0;
646+
647+ for (p = rom; p < rom + length; p++)
648+ sum += *p;
649+ return sum == 0;
650+}
651+
652+static void __init probe_roms(void)
653+{
654+ unsigned long start, length, upper;
655+ unsigned char *rom;
656+ int i;
657+
658+#ifdef CONFIG_XEN
659+ /* Nothing to do if not running in dom0. */
660+ if (!is_initial_xendomain())
661+ return;
662+#endif
663+
664+ /* video rom */
665+ upper = adapter_rom_resources[0].start;
666+ for (start = video_rom_resource.start; start < upper; start += 2048) {
667+ rom = isa_bus_to_virt(start);
668+ if (!romsignature(rom))
669+ continue;
670+
671+ video_rom_resource.start = start;
672+
673+ /* 0 < length <= 0x7f * 512, historically */
674+ length = rom[2] * 512;
675+
676+ /* if checksum okay, trust length byte */
677+ if (length && romchecksum(rom, length))
678+ video_rom_resource.end = start + length - 1;
679+
680+ request_resource(&iomem_resource, &video_rom_resource);
681+ break;
682+ }
683+
684+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
685+ if (start < upper)
686+ start = upper;
687+
688+ /* system rom */
689+ request_resource(&iomem_resource, &system_rom_resource);
690+ upper = system_rom_resource.start;
691+
692+ /* check for extension rom (ignore length byte!) */
693+ rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
694+ if (romsignature(rom)) {
695+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
696+ if (romchecksum(rom, length)) {
697+ request_resource(&iomem_resource, &extension_rom_resource);
698+ upper = extension_rom_resource.start;
699+ }
700+ }
701+
702+ /* check for adapter roms on 2k boundaries */
703+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
704+ rom = isa_bus_to_virt(start);
705+ if (!romsignature(rom))
706+ continue;
707+
708+ /* 0 < length <= 0x7f * 512, historically */
709+ length = rom[2] * 512;
710+
711+ /* but accept any length that fits if checksum okay */
712+ if (!length || start + length > upper || !romchecksum(rom, length))
713+ continue;
714+
715+ adapter_rom_resources[i].start = start;
716+ adapter_rom_resources[i].end = start + length - 1;
717+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
718+
719+ start = adapter_rom_resources[i++].end & ~2047UL;
720+ }
721+}
722+
723+#ifdef CONFIG_XEN
724+static struct e820map machine_e820 __initdata;
725+#define e820 machine_e820
726+#endif
727+
728+/*
729+ * Request address space for all standard RAM and ROM resources
730+ * and also for regions reported as reserved by the e820.
731+ */
732+static void __init
733+legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
734+{
735+ int i;
736+
737+ probe_roms();
738+ for (i = 0; i < e820.nr_map; i++) {
739+ struct resource *res;
740+#ifndef CONFIG_RESOURCES_64BIT
741+ if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
742+ continue;
743+#endif
744+ res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
745+ switch (e820.map[i].type) {
746+ case E820_RAM: res->name = "System RAM"; break;
747+ case E820_ACPI: res->name = "ACPI Tables"; break;
748+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
749+ default: res->name = "reserved";
750+ }
751+ res->start = e820.map[i].addr;
752+ res->end = res->start + e820.map[i].size - 1;
753+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
754+ if (request_resource(&iomem_resource, res)) {
755+ kfree(res);
756+ continue;
757+ }
758+ if (e820.map[i].type == E820_RAM) {
759+ /*
760+ * We don't know which RAM region contains kernel data,
761+ * so we try it repeatedly and let the resource manager
762+ * test it.
763+ */
764+#ifndef CONFIG_XEN
765+ request_resource(res, code_resource);
766+ request_resource(res, data_resource);
767+#endif
768+#ifdef CONFIG_KEXEC
769+ request_resource(res, &crashk_res);
770+#ifdef CONFIG_XEN
771+ xen_machine_kexec_register_resources(res);
772+#endif
773+#endif
774+ }
775+ }
776+}
777+
778+#undef e820
779+
780+/*
781+ * Request address space for all standard resources
782+ *
783+ * This is called just before pcibios_init(), which is also a
784+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
785+ */
786+static int __init request_standard_resources(void)
787+{
788+ int i;
789+
790+ /* Nothing to do if not running in dom0. */
791+ if (!is_initial_xendomain())
792+ return 0;
793+
794+ printk("Setting up standard PCI resources\n");
795+ if (efi_enabled)
796+ efi_initialize_iomem_resources(&code_resource, &data_resource);
797+ else
798+ legacy_init_iomem_resources(&code_resource, &data_resource);
799+
800+ /* EFI systems may still have VGA */
801+ request_resource(&iomem_resource, &video_ram_resource);
802+
803+ /* request I/O space for devices used on all i[345]86 PCs */
804+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
805+ request_resource(&ioport_resource, &standard_io_resources[i]);
806+ return 0;
807+}
808+
809+subsys_initcall(request_standard_resources);
810+
811+void __init add_memory_region(unsigned long long start,
812+ unsigned long long size, int type)
813+{
814+ int x;
815+
816+ if (!efi_enabled) {
817+ x = e820.nr_map;
818+
819+ if (x == E820MAX) {
820+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
821+ return;
822+ }
823+
824+ e820.map[x].addr = start;
825+ e820.map[x].size = size;
826+ e820.map[x].type = type;
827+ e820.nr_map++;
828+ }
829+} /* add_memory_region */
830+
831+/*
832+ * Sanitize the BIOS e820 map.
833+ *
834+ * Some e820 responses include overlapping entries. The following
835+ * replaces the original e820 map with a new one, removing overlaps.
836+ *
837+ */
838+int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
839+{
840+ struct change_member *change_tmp;
841+ unsigned long current_type, last_type;
842+ unsigned long long last_addr;
843+ int chgidx, still_changing;
844+ int overlap_entries;
845+ int new_bios_entry;
846+ int old_nr, new_nr, chg_nr;
847+ int i;
848+
849+ /*
850+ Visually we're performing the following (1,2,3,4 = memory types)...
851+
852+ Sample memory map (w/overlaps):
853+ ____22__________________
854+ ______________________4_
855+ ____1111________________
856+ _44_____________________
857+ 11111111________________
858+ ____________________33__
859+ ___________44___________
860+ __________33333_________
861+ ______________22________
862+ ___________________2222_
863+ _________111111111______
864+ _____________________11_
865+ _________________4______
866+
867+ Sanitized equivalent (no overlap):
868+ 1_______________________
869+ _44_____________________
870+ ___1____________________
871+ ____22__________________
872+ ______11________________
873+ _________1______________
874+ __________3_____________
875+ ___________44___________
876+ _____________33_________
877+ _______________2________
878+ ________________1_______
879+ _________________4______
880+ ___________________2____
881+ ____________________33__
882+ ______________________4_
883+ */
884+ printk("sanitize start\n");
885+ /* if there's only one memory region, don't bother */
886+ if (*pnr_map < 2) {
887+ printk("sanitize bail 0\n");
888+ return -1;
889+ }
890+
891+ old_nr = *pnr_map;
892+
893+ /* bail out if we find any unreasonable addresses in bios map */
894+ for (i=0; i<old_nr; i++)
895+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
896+ printk("sanitize bail 1\n");
897+ return -1;
898+ }
899+
900+ /* create pointers for initial change-point information (for sorting) */
901+ for (i=0; i < 2*old_nr; i++)
902+ change_point[i] = &change_point_list[i];
903+
904+ /* record all known change-points (starting and ending addresses),
905+ omitting those that are for empty memory regions */
906+ chgidx = 0;
907+ for (i=0; i < old_nr; i++) {
908+ if (biosmap[i].size != 0) {
909+ change_point[chgidx]->addr = biosmap[i].addr;
910+ change_point[chgidx++]->pbios = &biosmap[i];
911+ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
912+ change_point[chgidx++]->pbios = &biosmap[i];
913+ }
914+ }
915+ chg_nr = chgidx; /* true number of change-points */
916+
917+ /* sort change-point list by memory addresses (low -> high) */
918+ still_changing = 1;
919+ while (still_changing) {
920+ still_changing = 0;
921+ for (i=1; i < chg_nr; i++) {
922+ /* if <current_addr> > <last_addr>, swap */
923+ /* or, if current=<start_addr> & last=<end_addr>, swap */
924+ if ((change_point[i]->addr < change_point[i-1]->addr) ||
925+ ((change_point[i]->addr == change_point[i-1]->addr) &&
926+ (change_point[i]->addr == change_point[i]->pbios->addr) &&
927+ (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
928+ )
929+ {
930+ change_tmp = change_point[i];
931+ change_point[i] = change_point[i-1];
932+ change_point[i-1] = change_tmp;
933+ still_changing=1;
934+ }
935+ }
936+ }
937+
938+ /* create a new bios memory map, removing overlaps */
939+ overlap_entries=0; /* number of entries in the overlap table */
940+ new_bios_entry=0; /* index for creating new bios map entries */
941+ last_type = 0; /* start with undefined memory type */
942+ last_addr = 0; /* start with 0 as last starting address */
943+ /* loop through change-points, determining affect on the new bios map */
944+ for (chgidx=0; chgidx < chg_nr; chgidx++)
945+ {
946+ /* keep track of all overlapping bios entries */
947+ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
948+ {
949+ /* add map entry to overlap list (> 1 entry implies an overlap) */
950+ overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
951+ }
952+ else
953+ {
954+ /* remove entry from list (order independent, so swap with last) */
955+ for (i=0; i<overlap_entries; i++)
956+ {
957+ if (overlap_list[i] == change_point[chgidx]->pbios)
958+ overlap_list[i] = overlap_list[overlap_entries-1];
959+ }
960+ overlap_entries--;
961+ }
962+ /* if there are overlapping entries, decide which "type" to use */
963+ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
964+ current_type = 0;
965+ for (i=0; i<overlap_entries; i++)
966+ if (overlap_list[i]->type > current_type)
967+ current_type = overlap_list[i]->type;
968+ /* continue building up new bios map based on this information */
969+ if (current_type != last_type) {
970+ if (last_type != 0) {
971+ new_bios[new_bios_entry].size =
972+ change_point[chgidx]->addr - last_addr;
973+ /* move forward only if the new size was non-zero */
974+ if (new_bios[new_bios_entry].size != 0)
975+ if (++new_bios_entry >= E820MAX)
976+ break; /* no more space left for new bios entries */
977+ }
978+ if (current_type != 0) {
979+ new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
980+ new_bios[new_bios_entry].type = current_type;
981+ last_addr=change_point[chgidx]->addr;
982+ }
983+ last_type = current_type;
984+ }
985+ }
986+ new_nr = new_bios_entry; /* retain count for new bios entries */
987+
988+ /* copy new bios mapping into original location */
989+ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
990+ *pnr_map = new_nr;
991+
992+ printk("sanitize end\n");
993+ return 0;
994+}
995+
996+/*
997+ * Copy the BIOS e820 map into a safe place.
998+ *
999+ * Sanity-check it while we're at it..
1000+ *
1001+ * If we're lucky and live on a modern system, the setup code
1002+ * will have given us a memory map that we can use to properly
1003+ * set up memory. If we aren't, we'll fake a memory map.
1004+ *
1005+ * We check to see that the memory map contains at least 2 elements
1006+ * before we'll use it, because the detection code in setup.S may
1007+ * not be perfect and most every PC known to man has two memory
1008+ * regions: one from 0 to 640k, and one from 1mb up. (The IBM
1009+ * thinkpad 560x, for example, does not cooperate with the memory
1010+ * detection code.)
1011+ */
1012+int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
1013+{
1014+#ifndef CONFIG_XEN
1015+ /* Only one memory region (or negative)? Ignore it */
1016+ if (nr_map < 2)
1017+ return -1;
1018+#else
1019+ BUG_ON(nr_map < 1);
1020+#endif
1021+
1022+ do {
1023+ unsigned long long start = biosmap->addr;
1024+ unsigned long long size = biosmap->size;
1025+ unsigned long long end = start + size;
1026+ unsigned long type = biosmap->type;
1027+ printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
1028+
1029+ /* Overflow in 64 bits? Ignore the memory map. */
1030+ if (start > end)
1031+ return -1;
1032+
1033+#ifndef CONFIG_XEN
1034+ /*
1035+ * Some BIOSes claim RAM in the 640k - 1M region.
1036+ * Not right. Fix it up.
1037+ */
1038+ if (type == E820_RAM) {
1039+ printk("copy_e820_map() type is E820_RAM\n");
1040+ if (start < 0x100000ULL && end > 0xA0000ULL) {
1041+ printk("copy_e820_map() lies in range...\n");
1042+ if (start < 0xA0000ULL) {
1043+ printk("copy_e820_map() start < 0xA0000ULL\n");
1044+ add_memory_region(start, 0xA0000ULL-start, type);
1045+ }
1046+ if (end <= 0x100000ULL) {
1047+ printk("copy_e820_map() end <= 0x100000ULL\n");
1048+ continue;
1049+ }
1050+ start = 0x100000ULL;
1051+ size = end - start;
1052+ }
1053+ }
1054+#endif
1055+ add_memory_region(start, size, type);
1056+ } while (biosmap++,--nr_map);
1057+
1058+#ifdef CONFIG_XEN
1059+ if (is_initial_xendomain()) {
1060+ struct xen_memory_map memmap;
1061+
1062+ memmap.nr_entries = E820MAX;
1063+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
1064+
1065+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
1066+ BUG();
1067+ machine_e820.nr_map = memmap.nr_entries;
1068+ } else
1069+ machine_e820 = e820;
1070+#endif
1071+
1072+ return 0;
1073+}
1074+
1075+/*
1076+ * Callback for efi_memory_walk.
1077+ */
1078+static int __init
1079+efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
1080+{
1081+ unsigned long *max_pfn = arg, pfn;
1082+
1083+ if (start < end) {
1084+ pfn = PFN_UP(end -1);
1085+ if (pfn > *max_pfn)
1086+ *max_pfn = pfn;
1087+ }
1088+ return 0;
1089+}
1090+
1091+static int __init
1092+efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
1093+{
1094+ memory_present(0, PFN_UP(start), PFN_DOWN(end));
1095+ return 0;
1096+}
1097+
1098+/*
1099+ * Find the highest page frame number we have available
1100+ */
1101+void __init find_max_pfn(void)
1102+{
1103+ int i;
1104+
1105+ max_pfn = 0;
1106+ if (efi_enabled) {
1107+ efi_memmap_walk(efi_find_max_pfn, &max_pfn);
1108+ efi_memmap_walk(efi_memory_present_wrapper, NULL);
1109+ return;
1110+ }
1111+
1112+ for (i = 0; i < e820.nr_map; i++) {
1113+ unsigned long start, end;
1114+ /* RAM? */
1115+ if (e820.map[i].type != E820_RAM)
1116+ continue;
1117+ start = PFN_UP(e820.map[i].addr);
1118+ end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1119+ if (start >= end)
1120+ continue;
1121+ if (end > max_pfn)
1122+ max_pfn = end;
1123+ memory_present(0, start, end);
1124+ }
1125+}
1126+
1127+/*
1128+ * Free all available memory for boot time allocation. Used
1129+ * as a callback function by efi_memory_walk()
1130+ */
1131+
1132+static int __init
1133+free_available_memory(unsigned long start, unsigned long end, void *arg)
1134+{
1135+ /* check max_low_pfn */
1136+ if (start >= (max_low_pfn << PAGE_SHIFT))
1137+ return 0;
1138+ if (end >= (max_low_pfn << PAGE_SHIFT))
1139+ end = max_low_pfn << PAGE_SHIFT;
1140+ if (start < end)
1141+ free_bootmem(start, end - start);
1142+
1143+ return 0;
1144+}
1145+/*
1146+ * Register fully available low RAM pages with the bootmem allocator.
1147+ */
1148+void __init register_bootmem_low_pages(unsigned long max_low_pfn)
1149+{
1150+ int i;
1151+
1152+ if (efi_enabled) {
1153+ efi_memmap_walk(free_available_memory, NULL);
1154+ return;
1155+ }
1156+ for (i = 0; i < e820.nr_map; i++) {
1157+ unsigned long curr_pfn, last_pfn, size;
1158+ /*
1159+ * Reserve usable low memory
1160+ */
1161+ if (e820.map[i].type != E820_RAM)
1162+ continue;
1163+ /*
1164+ * We are rounding up the start address of usable memory:
1165+ */
1166+ curr_pfn = PFN_UP(e820.map[i].addr);
1167+ if (curr_pfn >= max_low_pfn)
1168+ continue;
1169+ /*
1170+ * ... and at the end of the usable range downwards:
1171+ */
1172+ last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1173+
1174+#ifdef CONFIG_XEN
1175+ /*
1176+ * Truncate to the number of actual pages currently
1177+ * present.
1178+ */
1179+ if (last_pfn > xen_start_info->nr_pages)
1180+ last_pfn = xen_start_info->nr_pages;
1181+#endif
1182+
1183+ if (last_pfn > max_low_pfn)
1184+ last_pfn = max_low_pfn;
1185+
1186+ /*
1187+ * .. finally, did all the rounding and playing
1188+ * around just make the area go away?
1189+ */
1190+ if (last_pfn <= curr_pfn)
1191+ continue;
1192+
1193+ size = last_pfn - curr_pfn;
1194+ free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
1195+ }
1196+}
1197+
1198+void __init e820_register_memory(void)
1199+{
1200+ unsigned long gapstart, gapsize, round;
1201+ unsigned long long last;
1202+ int i;
1203+
1204+#ifdef CONFIG_XEN
1205+ if (is_initial_xendomain()) {
1206+ struct xen_memory_map memmap;
1207+
1208+ memmap.nr_entries = E820MAX;
1209+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
1210+
1211+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
1212+ BUG();
1213+ machine_e820.nr_map = memmap.nr_entries;
1214+ }
1215+ else
1216+ machine_e820 = e820;
1217+#define e820 machine_e820
1218+#endif
1219+
1220+ /*
1221+ * Search for the bigest gap in the low 32 bits of the e820
1222+ * memory space.
1223+ */
1224+ last = 0x100000000ull;
1225+ gapstart = 0x10000000;
1226+ gapsize = 0x400000;
1227+ i = e820.nr_map;
1228+ while (--i >= 0) {
1229+ unsigned long long start = e820.map[i].addr;
1230+ unsigned long long end = start + e820.map[i].size;
1231+
1232+ /*
1233+ * Since "last" is at most 4GB, we know we'll
1234+ * fit in 32 bits if this condition is true
1235+ */
1236+ if (last > end) {
1237+ unsigned long gap = last - end;
1238+
1239+ if (gap > gapsize) {
1240+ gapsize = gap;
1241+ gapstart = end;
1242+ }
1243+ }
1244+ if (start < last)
1245+ last = start;
1246+ }
1247+#undef e820
1248+
1249+ /*
1250+ * See how much we want to round up: start off with
1251+ * rounding to the next 1MB area.
1252+ */
1253+ round = 0x100000;
1254+ while ((gapsize >> 4) > round)
1255+ round += round;
1256+ /* Fun with two's complement */
1257+ pci_mem_start = (gapstart + round) & -round;
1258+
1259+ printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
1260+ pci_mem_start, gapstart, gapsize);
1261+}
1262+
1263+void __init print_memory_map(char *who)
1264+{
1265+ int i;
1266+
1267+ for (i = 0; i < e820.nr_map; i++) {
1268+ printk(" %s: %016Lx - %016Lx ", who,
1269+ e820.map[i].addr,
1270+ e820.map[i].addr + e820.map[i].size);
1271+ switch (e820.map[i].type) {
1272+ case E820_RAM: printk("(usable)\n");
1273+ break;
1274+ case E820_RESERVED:
1275+ printk("(reserved)\n");
1276+ break;
1277+ case E820_ACPI:
1278+ printk("(ACPI data)\n");
1279+ break;
1280+ case E820_NVS:
1281+ printk("(ACPI NVS)\n");
1282+ break;
1283+ default: printk("type %lu\n", e820.map[i].type);
1284+ break;
1285+ }
1286+ }
1287+}
1288+
1289+static __init __always_inline void efi_limit_regions(unsigned long long size)
1290+{
1291+ unsigned long long current_addr = 0;
1292+ efi_memory_desc_t *md, *next_md;
1293+ void *p, *p1;
1294+ int i, j;
1295+
1296+ j = 0;
1297+ p1 = memmap.map;
1298+ for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
1299+ md = p;
1300+ next_md = p1;
1301+ current_addr = md->phys_addr +
1302+ PFN_PHYS(md->num_pages);
1303+ if (is_available_memory(md)) {
1304+ if (md->phys_addr >= size) continue;
1305+ memcpy(next_md, md, memmap.desc_size);
1306+ if (current_addr >= size) {
1307+ next_md->num_pages -=
1308+ PFN_UP(current_addr-size);
1309+ }
1310+ p1 += memmap.desc_size;
1311+ next_md = p1;
1312+ j++;
1313+ } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
1314+ EFI_MEMORY_RUNTIME) {
1315+ /* In order to make runtime services
1316+ * available we have to include runtime
1317+ * memory regions in memory map */
1318+ memcpy(next_md, md, memmap.desc_size);
1319+ p1 += memmap.desc_size;
1320+ next_md = p1;
1321+ j++;
1322+ }
1323+ }
1324+ memmap.nr_map = j;
1325+ memmap.map_end = memmap.map +
1326+ (memmap.nr_map * memmap.desc_size);
1327+}
1328+
1329+void __init limit_regions(unsigned long long size)
1330+{
1331+ unsigned long long current_addr = 0;
1332+ int i;
1333+
1334+ print_memory_map("limit_regions start");
1335+ if (efi_enabled) {
1336+ efi_limit_regions(size);
1337+ return;
1338+ }
1339+ for (i = 0; i < e820.nr_map; i++) {
1340+ current_addr = e820.map[i].addr + e820.map[i].size;
1341+ if (current_addr < size)
1342+ continue;
1343+
1344+ if (e820.map[i].type != E820_RAM)
1345+ continue;
1346+
1347+ if (e820.map[i].addr >= size) {
1348+ /*
1349+ * This region starts past the end of the
1350+ * requested size, skip it completely.
1351+ */
1352+ e820.nr_map = i;
1353+ } else {
1354+ e820.nr_map = i + 1;
1355+ e820.map[i].size -= current_addr - size;
1356+ }
1357+ print_memory_map("limit_regions endfor");
1358+ return;
1359+ }
1360+#ifdef CONFIG_XEN
1361+ if (current_addr < size) {
1362+ /*
1363+ * The e820 map finished before our requested size so
1364+ * extend the final entry to the requested address.
1365+ */
1366+ --i;
1367+ if (e820.map[i].type == E820_RAM)
1368+ e820.map[i].size -= current_addr - size;
1369+ else
1370+ add_memory_region(current_addr, size - current_addr, E820_RAM);
1371+ }
1372+#endif
1373+ print_memory_map("limit_regions endfunc");
1374+}
1375+
1376+/*
1377+ * This function checks if any part of the range <start,end> is mapped
1378+ * with type.
1379+ */
1380+int
1381+e820_any_mapped(u64 start, u64 end, unsigned type)
1382+{
1383+ int i;
1384+
1385+#ifndef CONFIG_XEN
1386+ for (i = 0; i < e820.nr_map; i++) {
1387+ const struct e820entry *ei = &e820.map[i];
1388+#else
1389+ if (!is_initial_xendomain())
1390+ return 0;
1391+ for (i = 0; i < machine_e820.nr_map; ++i) {
1392+ const struct e820entry *ei = &machine_e820.map[i];
1393+#endif
1394+
1395+ if (type && ei->type != type)
1396+ continue;
1397+ if (ei->addr >= end || ei->addr + ei->size <= start)
1398+ continue;
1399+ return 1;
1400+ }
1401+ return 0;
1402+}
1403+EXPORT_SYMBOL_GPL(e820_any_mapped);
1404+
1405+ /*
1406+ * This function checks if the entire range <start,end> is mapped with type.
1407+ *
1408+ * Note: this function only works correct if the e820 table is sorted and
1409+ * not-overlapping, which is the case
1410+ */
1411+int __init
1412+e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
1413+{
1414+ u64 start = s;
1415+ u64 end = e;
1416+ int i;
1417+
1418+#ifndef CONFIG_XEN
1419+ for (i = 0; i < e820.nr_map; i++) {
1420+ struct e820entry *ei = &e820.map[i];
1421+#else
1422+ if (!is_initial_xendomain())
1423+ return 0;
1424+ for (i = 0; i < machine_e820.nr_map; ++i) {
1425+ const struct e820entry *ei = &machine_e820.map[i];
1426+#endif
1427+
1428+ if (type && ei->type != type)
1429+ continue;
1430+ /* is the region (part) in overlap with the current region ?*/
1431+ if (ei->addr >= end || ei->addr + ei->size <= start)
1432+ continue;
1433+ /* if the region is at the beginning of <start,end> we move
1434+ * start to the end of the region since it's ok until there
1435+ */
1436+ if (ei->addr <= start)
1437+ start = ei->addr + ei->size;
1438+ /* if start is now at or beyond end, we're done, full
1439+ * coverage */
1440+ if (start >= end)
1441+ return 1; /* we're done */
1442+ }
1443+ return 0;
1444+}
1445+
1446+static int __init parse_memmap(char *arg)
1447+{
1448+ if (!arg)
1449+ return -EINVAL;
1450+
1451+ if (strcmp(arg, "exactmap") == 0) {
1452+#ifdef CONFIG_CRASH_DUMP
1453+ /* If we are doing a crash dump, we
1454+ * still need to know the real mem
1455+ * size before original memory map is
1456+ * reset.
1457+ */
1458+ find_max_pfn();
1459+ saved_max_pfn = max_pfn;
1460+#endif
1461+ e820.nr_map = 0;
1462+ user_defined_memmap = 1;
1463+ } else {
1464+ /* If the user specifies memory size, we
1465+ * limit the BIOS-provided memory map to
1466+ * that size. exactmap can be used to specify
1467+ * the exact map. mem=number can be used to
1468+ * trim the existing memory map.
1469+ */
1470+ unsigned long long start_at, mem_size;
1471+
1472+ mem_size = memparse(arg, &arg);
1473+ if (*arg == '@') {
1474+ start_at = memparse(arg+1, &arg);
1475+ add_memory_region(start_at, mem_size, E820_RAM);
1476+ } else if (*arg == '#') {
1477+ start_at = memparse(arg+1, &arg);
1478+ add_memory_region(start_at, mem_size, E820_ACPI);
1479+ } else if (*arg == '$') {
1480+ start_at = memparse(arg+1, &arg);
1481+ add_memory_region(start_at, mem_size, E820_RESERVED);
1482+ } else {
1483+ limit_regions(mem_size);
1484+ user_defined_memmap = 1;
1485+ }
1486+ }
1487+ return 0;
1488+}
1489+early_param("memmap", parse_memmap);
1490Index: head-2008-12-01/arch/x86/kernel/entry_32-xen.S
1491===================================================================
1492--- head-2008-12-01.orig/arch/x86/kernel/entry_32-xen.S 2008-12-01 11:29:05.000000000 +0100
1493+++ head-2008-12-01/arch/x86/kernel/entry_32-xen.S 2008-12-01 11:32:38.000000000 +0100
1494@@ -30,12 +30,13 @@
1495 * 18(%esp) - %eax
1496 * 1C(%esp) - %ds
1497 * 20(%esp) - %es
1498- * 24(%esp) - orig_eax
1499- * 28(%esp) - %eip
1500- * 2C(%esp) - %cs
1501- * 30(%esp) - %eflags
1502- * 34(%esp) - %oldesp
1503- * 38(%esp) - %oldss
1504+ * 24(%esp) - %gs
1505+ * 28(%esp) - orig_eax
1506+ * 2C(%esp) - %eip
1507+ * 30(%esp) - %cs
1508+ * 34(%esp) - %eflags
1509+ * 38(%esp) - %oldesp
1510+ * 3C(%esp) - %oldss
1511 *
1512 * "current" is in register %ebx during any slow entries.
1513 */
1514@@ -48,27 +49,25 @@
1515 #include <asm/smp.h>
1516 #include <asm/page.h>
1517 #include <asm/desc.h>
1518+#include <asm/percpu.h>
1519 #include <asm/dwarf2.h>
1520 #include "irq_vectors.h"
1521 #include <xen/interface/xen.h>
1522
1523-#define nr_syscalls ((syscall_table_size)/4)
1524+/*
1525+ * We use macros for low-level operations which need to be overridden
1526+ * for paravirtualization. The following will never clobber any registers:
1527+ * INTERRUPT_RETURN (aka. "iret")
1528+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
1529+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
1530+ *
1531+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
1532+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
1533+ * Allowing a register to be clobbered can shrink the paravirt replacement
1534+ * enough to patch inline, increasing performance.
1535+ */
1536
1537-EBX = 0x00
1538-ECX = 0x04
1539-EDX = 0x08
1540-ESI = 0x0C
1541-EDI = 0x10
1542-EBP = 0x14
1543-EAX = 0x18
1544-DS = 0x1C
1545-ES = 0x20
1546-ORIG_EAX = 0x24
1547-EIP = 0x28
1548-CS = 0x2C
1549-EFLAGS = 0x30
1550-OLDESP = 0x34
1551-OLDSS = 0x38
1552+#define nr_syscalls ((syscall_table_size)/4)
1553
1554 CF_MASK = 0x00000001
1555 TF_MASK = 0x00000100
1556@@ -79,61 +78,16 @@ VM_MASK = 0x00020000
1557 /* Pseudo-eflags. */
1558 NMI_MASK = 0x80000000
1559
1560-#ifndef CONFIG_XEN
1561-/* These are replaces for paravirtualization */
1562-#define DISABLE_INTERRUPTS cli
1563-#define ENABLE_INTERRUPTS sti
1564-#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
1565-#define INTERRUPT_RETURN iret
1566-#define GET_CR0_INTO_EAX movl %cr0, %eax
1567-#else
1568-/* Offsets into shared_info_t. */
1569-#define evtchn_upcall_pending /* 0 */
1570-#define evtchn_upcall_mask 1
1571-
1572-#define sizeof_vcpu_shift 6
1573-
1574-#ifdef CONFIG_SMP
1575-#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
1576- shl $sizeof_vcpu_shift,%esi ; \
1577- addl HYPERVISOR_shared_info,%esi
1578-#else
1579-#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
1580-#endif
1581-
1582-#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
1583-#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
1584-#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
1585-#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
1586- __DISABLE_INTERRUPTS
1587-#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
1588- __ENABLE_INTERRUPTS
1589-#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
1590-sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
1591- __TEST_PENDING ; \
1592- jnz 14f # process more events if necessary... ; \
1593- movl ESI(%esp), %esi ; \
1594- sysexit ; \
1595-14: __DISABLE_INTERRUPTS ; \
1596- TRACE_IRQS_OFF ; \
1597-sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
1598- push %esp ; \
1599- call evtchn_do_upcall ; \
1600- add $4,%esp ; \
1601- jmp ret_from_intr
1602-#define INTERRUPT_RETURN iret
1603-#endif
1604-
1605 #ifdef CONFIG_PREEMPT
1606-#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
1607+#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
1608 #else
1609-#define preempt_stop
1610+#define preempt_stop(clobbers)
1611 #define resume_kernel restore_nocheck
1612 #endif
1613
1614 .macro TRACE_IRQS_IRET
1615 #ifdef CONFIG_TRACE_IRQFLAGS
1616- testl $IF_MASK,EFLAGS(%esp) # interrupts off?
1617+ testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
1618 jz 1f
1619 TRACE_IRQS_ON
1620 1:
1621@@ -148,6 +102,9 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
1622
1623 #define SAVE_ALL \
1624 cld; \
1625+ pushl %gs; \
1626+ CFI_ADJUST_CFA_OFFSET 4;\
1627+ /*CFI_REL_OFFSET gs, 0;*/\
1628 pushl %es; \
1629 CFI_ADJUST_CFA_OFFSET 4;\
1630 /*CFI_REL_OFFSET es, 0;*/\
1631@@ -177,7 +134,9 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
1632 CFI_REL_OFFSET ebx, 0;\
1633 movl $(__USER_DS), %edx; \
1634 movl %edx, %ds; \
1635- movl %edx, %es;
1636+ movl %edx, %es; \
1637+ movl $(__KERNEL_PDA), %edx; \
1638+ movl %edx, %gs
1639
1640 #define RESTORE_INT_REGS \
1641 popl %ebx; \
1642@@ -210,17 +169,22 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
1643 2: popl %es; \
1644 CFI_ADJUST_CFA_OFFSET -4;\
1645 /*CFI_RESTORE es;*/\
1646-.section .fixup,"ax"; \
1647-3: movl $0,(%esp); \
1648- jmp 1b; \
1649+3: popl %gs; \
1650+ CFI_ADJUST_CFA_OFFSET -4;\
1651+ /*CFI_RESTORE gs;*/\
1652+.pushsection .fixup,"ax"; \
1653 4: movl $0,(%esp); \
1654+ jmp 1b; \
1655+5: movl $0,(%esp); \
1656 jmp 2b; \
1657-.previous; \
1658+6: movl $0,(%esp); \
1659+ jmp 3b; \
1660 .section __ex_table,"a";\
1661 .align 4; \
1662- .long 1b,3b; \
1663- .long 2b,4b; \
1664-.previous
1665+ .long 1b,4b; \
1666+ .long 2b,5b; \
1667+ .long 3b,6b; \
1668+.popsection
1669
1670 #define RING0_INT_FRAME \
1671 CFI_STARTPROC simple;\
1672@@ -239,18 +203,18 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
1673 #define RING0_PTREGS_FRAME \
1674 CFI_STARTPROC simple;\
1675 CFI_SIGNAL_FRAME;\
1676- CFI_DEF_CFA esp, OLDESP-EBX;\
1677- /*CFI_OFFSET cs, CS-OLDESP;*/\
1678- CFI_OFFSET eip, EIP-OLDESP;\
1679- /*CFI_OFFSET es, ES-OLDESP;*/\
1680- /*CFI_OFFSET ds, DS-OLDESP;*/\
1681- CFI_OFFSET eax, EAX-OLDESP;\
1682- CFI_OFFSET ebp, EBP-OLDESP;\
1683- CFI_OFFSET edi, EDI-OLDESP;\
1684- CFI_OFFSET esi, ESI-OLDESP;\
1685- CFI_OFFSET edx, EDX-OLDESP;\
1686- CFI_OFFSET ecx, ECX-OLDESP;\
1687- CFI_OFFSET ebx, EBX-OLDESP
1688+ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
1689+ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
1690+ CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
1691+ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
1692+ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
1693+ CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
1694+ CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
1695+ CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
1696+ CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
1697+ CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
1698+ CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
1699+ CFI_OFFSET ebx, PT_EBX-PT_OLDESP
1700
1701 ENTRY(ret_from_fork)
1702 CFI_STARTPROC
1703@@ -278,17 +242,18 @@ ENTRY(ret_from_fork)
1704 ALIGN
1705 RING0_PTREGS_FRAME
1706 ret_from_exception:
1707- preempt_stop
1708+ preempt_stop(CLBR_ANY)
1709 ret_from_intr:
1710 GET_THREAD_INFO(%ebp)
1711 check_userspace:
1712- movl EFLAGS(%esp), %eax # mix EFLAGS and CS
1713- movb CS(%esp), %al
1714+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
1715+ movb PT_CS(%esp), %al
1716 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
1717 cmpl $USER_RPL, %eax
1718 jb resume_kernel # not returning to v8086 or userspace
1719+
1720 ENTRY(resume_userspace)
1721- DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1722+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1723 # setting need_resched or sigpending
1724 # between sampling and the iret
1725 movl TI_flags(%ebp), %ecx
1726@@ -299,14 +264,14 @@ ENTRY(resume_userspace)
1727
1728 #ifdef CONFIG_PREEMPT
1729 ENTRY(resume_kernel)
1730- DISABLE_INTERRUPTS
1731+ DISABLE_INTERRUPTS(CLBR_ANY)
1732 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
1733 jnz restore_nocheck
1734 need_resched:
1735 movl TI_flags(%ebp), %ecx # need_resched set ?
1736 testb $_TIF_NEED_RESCHED, %cl
1737 jz restore_all
1738- testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
1739+ testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
1740 jz restore_all
1741 call preempt_schedule_irq
1742 jmp need_resched
1743@@ -328,7 +293,7 @@ sysenter_past_esp:
1744 * No need to follow this irqs on/off section: the syscall
1745 * disabled irqs and here we enable it straight after entry:
1746 */
1747- ENABLE_INTERRUPTS
1748+ ENABLE_INTERRUPTS(CLBR_NONE)
1749 pushl $(__USER_DS)
1750 CFI_ADJUST_CFA_OFFSET 4
1751 /*CFI_REL_OFFSET ss, 0*/
1752@@ -340,12 +305,16 @@ sysenter_past_esp:
1753 pushl $(__USER_CS)
1754 CFI_ADJUST_CFA_OFFSET 4
1755 /*CFI_REL_OFFSET cs, 0*/
1756+#ifndef CONFIG_COMPAT_VDSO
1757 /*
1758 * Push current_thread_info()->sysenter_return to the stack.
1759 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
1760 * pushed above; +8 corresponds to copy_thread's esp0 setting.
1761 */
1762 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1763+#else
1764+ pushl $SYSENTER_RETURN
1765+#endif
1766 CFI_ADJUST_CFA_OFFSET 4
1767 CFI_REL_OFFSET eip, 0
1768
1769@@ -372,19 +341,27 @@ sysenter_past_esp:
1770 cmpl $(nr_syscalls), %eax
1771 jae syscall_badsys
1772 call *sys_call_table(,%eax,4)
1773- movl %eax,EAX(%esp)
1774- DISABLE_INTERRUPTS
1775+ movl %eax,PT_EAX(%esp)
1776+ DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
1777 TRACE_IRQS_OFF
1778 movl TI_flags(%ebp), %ecx
1779 testw $_TIF_ALLWORK_MASK, %cx
1780 jne syscall_exit_work
1781 /* if something modifies registers it must also disable sysexit */
1782- movl EIP(%esp), %edx
1783- movl OLDESP(%esp), %ecx
1784+ movl PT_EIP(%esp), %edx
1785+ movl PT_OLDESP(%esp), %ecx
1786 xorl %ebp,%ebp
1787 TRACE_IRQS_ON
1788+1: mov PT_GS(%esp), %gs
1789 ENABLE_INTERRUPTS_SYSEXIT
1790 CFI_ENDPROC
1791+.pushsection .fixup,"ax"
1792+2: movl $0,PT_GS(%esp)
1793+ jmp 1b
1794+.section __ex_table,"a"
1795+ .align 4
1796+ .long 1b,2b
1797+.popsection
1798
1799 # pv sysenter call handler stub
1800 ENTRY(sysenter_entry_pv)
1801@@ -419,7 +396,7 @@ ENTRY(system_call)
1802 CFI_ADJUST_CFA_OFFSET 4
1803 SAVE_ALL
1804 GET_THREAD_INFO(%ebp)
1805- testl $TF_MASK,EFLAGS(%esp)
1806+ testl $TF_MASK,PT_EFLAGS(%esp)
1807 jz no_singlestep
1808 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1809 no_singlestep:
1810@@ -431,9 +408,9 @@ no_singlestep:
1811 jae syscall_badsys
1812 syscall_call:
1813 call *sys_call_table(,%eax,4)
1814- movl %eax,EAX(%esp) # store the return value
1815+ movl %eax,PT_EAX(%esp) # store the return value
1816 syscall_exit:
1817- DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1818+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1819 # setting need_resched or sigpending
1820 # between sampling and the iret
1821 TRACE_IRQS_OFF
1822@@ -443,12 +420,12 @@ syscall_exit:
1823
1824 restore_all:
1825 #ifndef CONFIG_XEN
1826- movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1827- # Warning: OLDSS(%esp) contains the wrong/random values if we
1828+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1829+ # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
1830 # are returning to the kernel.
1831 # See comments in process.c:copy_thread() for details.
1832- movb OLDSS(%esp), %ah
1833- movb CS(%esp), %al
1834+ movb PT_OLDSS(%esp), %ah
1835+ movb PT_CS(%esp), %al
1836 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1837 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
1838 CFI_REMEMBER_STATE
1839@@ -456,7 +433,7 @@ restore_all:
1840 restore_nocheck:
1841 #else
1842 restore_nocheck:
1843- movl EFLAGS(%esp), %eax
1844+ movl PT_EFLAGS(%esp), %eax
1845 testl $(VM_MASK|NMI_MASK), %eax
1846 CFI_REMEMBER_STATE
1847 jnz hypervisor_iret
1848@@ -470,13 +447,13 @@ restore_nocheck:
1849 TRACE_IRQS_IRET
1850 restore_nocheck_notrace:
1851 RESTORE_REGS
1852- addl $4, %esp
1853+ addl $4, %esp # skip orig_eax/error_code
1854 CFI_ADJUST_CFA_OFFSET -4
1855 1: INTERRUPT_RETURN
1856 .section .fixup,"ax"
1857 iret_exc:
1858 #ifndef CONFIG_XEN
1859- ENABLE_INTERRUPTS
1860+ ENABLE_INTERRUPTS(CLBR_NONE)
1861 #endif
1862 pushl $0 # no error code
1863 pushl $do_iret_error
1864@@ -490,33 +467,42 @@ iret_exc:
1865 CFI_RESTORE_STATE
1866 #ifndef CONFIG_XEN
1867 ldt_ss:
1868- larl OLDSS(%esp), %eax
1869+ larl PT_OLDSS(%esp), %eax
1870 jnz restore_nocheck
1871 testl $0x00400000, %eax # returning to 32bit stack?
1872 jnz restore_nocheck # all right, normal return
1873+
1874+#ifdef CONFIG_PARAVIRT
1875+ /*
1876+ * The kernel can't run on a non-flat stack if paravirt mode
1877+ * is active. Rather than try to fix up the high bits of
1878+ * ESP, bypass this code entirely. This may break DOSemu
1879+ * and/or Wine support in a paravirt VM, although the option
1880+ * is still available to implement the setting of the high
1881+ * 16 bits in the INTERRUPT_RETURN paravirt-op.
1882+ */
1883+ cmpl $0, paravirt_ops+PARAVIRT_enabled
1884+ jne restore_nocheck
1885+#endif
1886+
1887 /* If returning to userspace with 16bit stack,
1888 * try to fix the higher word of ESP, as the CPU
1889 * won't restore it.
1890 * This is an "official" bug of all the x86-compatible
1891 * CPUs, which we can try to work around to make
1892 * dosemu and wine happy. */
1893- subl $8, %esp # reserve space for switch16 pointer
1894- CFI_ADJUST_CFA_OFFSET 8
1895- DISABLE_INTERRUPTS
1896+ movl PT_OLDESP(%esp), %eax
1897+ movl %esp, %edx
1898+ call patch_espfix_desc
1899+ pushl $__ESPFIX_SS
1900+ CFI_ADJUST_CFA_OFFSET 4
1901+ pushl %eax
1902+ CFI_ADJUST_CFA_OFFSET 4
1903+ DISABLE_INTERRUPTS(CLBR_EAX)
1904 TRACE_IRQS_OFF
1905- movl %esp, %eax
1906- /* Set up the 16bit stack frame with switch32 pointer on top,
1907- * and a switch16 pointer on top of the current frame. */
1908- call setup_x86_bogus_stack
1909- CFI_ADJUST_CFA_OFFSET -8 # frame has moved
1910- TRACE_IRQS_IRET
1911- RESTORE_REGS
1912- lss 20+4(%esp), %esp # switch to 16bit stack
1913-1: INTERRUPT_RETURN
1914-.section __ex_table,"a"
1915- .align 4
1916- .long 1b,iret_exc
1917-.previous
1918+ lss (%esp), %esp
1919+ CFI_ADJUST_CFA_OFFSET -8
1920+ jmp restore_nocheck
1921 #else
1922 ALIGN
1923 restore_all_enable_events:
1924@@ -540,7 +526,7 @@ ecrit: /**** END OF CRITICAL REGION ***
1925
1926 CFI_RESTORE_STATE
1927 hypervisor_iret:
1928- andl $~NMI_MASK, EFLAGS(%esp)
1929+ andl $~NMI_MASK, PT_EFLAGS(%esp)
1930 RESTORE_REGS
1931 addl $4, %esp
1932 CFI_ADJUST_CFA_OFFSET -4
1933@@ -556,7 +542,7 @@ work_pending:
1934 jz work_notifysig
1935 work_resched:
1936 call schedule
1937- DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1938+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1939 # setting need_resched or sigpending
1940 # between sampling and the iret
1941 TRACE_IRQS_OFF
1942@@ -569,7 +555,8 @@ work_resched:
1943
1944 work_notifysig: # deal with pending signals and
1945 # notify-resume requests
1946- testl $VM_MASK, EFLAGS(%esp)
1947+#ifdef CONFIG_VM86
1948+ testl $VM_MASK, PT_EFLAGS(%esp)
1949 movl %esp, %eax
1950 jne work_notifysig_v86 # returning to kernel-space or
1951 # vm86-space
1952@@ -579,29 +566,30 @@ work_notifysig: # deal with pending s
1953
1954 ALIGN
1955 work_notifysig_v86:
1956-#ifdef CONFIG_VM86
1957 pushl %ecx # save ti_flags for do_notify_resume
1958 CFI_ADJUST_CFA_OFFSET 4
1959 call save_v86_state # %eax contains pt_regs pointer
1960 popl %ecx
1961 CFI_ADJUST_CFA_OFFSET -4
1962 movl %eax, %esp
1963+#else
1964+ movl %esp, %eax
1965+#endif
1966 xorl %edx, %edx
1967 call do_notify_resume
1968 jmp resume_userspace_sig
1969-#endif
1970
1971 # perform syscall exit tracing
1972 ALIGN
1973 syscall_trace_entry:
1974- movl $-ENOSYS,EAX(%esp)
1975+ movl $-ENOSYS,PT_EAX(%esp)
1976 movl %esp, %eax
1977 xorl %edx,%edx
1978 call do_syscall_trace
1979 cmpl $0, %eax
1980 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
1981 # so must skip actual syscall
1982- movl ORIG_EAX(%esp), %eax
1983+ movl PT_ORIG_EAX(%esp), %eax
1984 cmpl $(nr_syscalls), %eax
1985 jnae syscall_call
1986 jmp syscall_exit
1987@@ -612,7 +600,7 @@ syscall_exit_work:
1988 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
1989 jz work_pending
1990 TRACE_IRQS_ON
1991- ENABLE_INTERRUPTS # could let do_syscall_trace() call
1992+ ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
1993 # schedule() instead
1994 movl %esp, %eax
1995 movl $1, %edx
1996@@ -626,40 +614,39 @@ syscall_fault:
1997 CFI_ADJUST_CFA_OFFSET 4
1998 SAVE_ALL
1999 GET_THREAD_INFO(%ebp)
2000- movl $-EFAULT,EAX(%esp)
2001+ movl $-EFAULT,PT_EAX(%esp)
2002 jmp resume_userspace
2003
2004 syscall_badsys:
2005- movl $-ENOSYS,EAX(%esp)
2006+ movl $-ENOSYS,PT_EAX(%esp)
2007 jmp resume_userspace
2008 CFI_ENDPROC
2009
2010 #ifndef CONFIG_XEN
2011 #define FIXUP_ESPFIX_STACK \
2012- movl %esp, %eax; \
2013- /* switch to 32bit stack using the pointer on top of 16bit stack */ \
2014- lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
2015- /* copy data from 16bit stack to 32bit stack */ \
2016- call fixup_x86_bogus_stack; \
2017- /* put ESP to the proper location */ \
2018- movl %eax, %esp;
2019-#define UNWIND_ESPFIX_STACK \
2020+ /* since we are on the wrong stack, we can't do this in C :( */ \
2021+ movl %gs:PDA_cpu, %ebx; \
2022+ PER_CPU(cpu_gdt_descr, %ebx); \
2023+ movl GDS_address(%ebx), %ebx; \
2024+ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
2025+ addl %esp, %eax; \
2026+ pushl $__KERNEL_DS; \
2027+ CFI_ADJUST_CFA_OFFSET 4; \
2028 pushl %eax; \
2029 CFI_ADJUST_CFA_OFFSET 4; \
2030+ lss (%esp), %esp; \
2031+ CFI_ADJUST_CFA_OFFSET -8;
2032+#define UNWIND_ESPFIX_STACK \
2033 movl %ss, %eax; \
2034- /* see if on 16bit stack */ \
2035+ /* see if on espfix stack */ \
2036 cmpw $__ESPFIX_SS, %ax; \
2037- je 28f; \
2038-27: popl %eax; \
2039- CFI_ADJUST_CFA_OFFSET -4; \
2040-.section .fixup,"ax"; \
2041-28: movl $__KERNEL_DS, %eax; \
2042+ jne 27f; \
2043+ movl $__KERNEL_DS, %eax; \
2044 movl %eax, %ds; \
2045 movl %eax, %es; \
2046- /* switch to 32bit stack */ \
2047+ /* switch to normal stack */ \
2048 FIXUP_ESPFIX_STACK; \
2049- jmp 27b; \
2050-.previous
2051+27:;
2052
2053 /*
2054 * Build the entry stubs and pointer table with
2055@@ -723,13 +710,16 @@ KPROBE_ENTRY(page_fault)
2056 CFI_ADJUST_CFA_OFFSET 4
2057 ALIGN
2058 error_code:
2059+ /* the function address is in %gs's slot on the stack */
2060+ pushl %es
2061+ CFI_ADJUST_CFA_OFFSET 4
2062+ /*CFI_REL_OFFSET es, 0*/
2063 pushl %ds
2064 CFI_ADJUST_CFA_OFFSET 4
2065 /*CFI_REL_OFFSET ds, 0*/
2066 pushl %eax
2067 CFI_ADJUST_CFA_OFFSET 4
2068 CFI_REL_OFFSET eax, 0
2069- xorl %eax, %eax
2070 pushl %ebp
2071 CFI_ADJUST_CFA_OFFSET 4
2072 CFI_REL_OFFSET ebp, 0
2073@@ -742,7 +732,6 @@ error_code:
2074 pushl %edx
2075 CFI_ADJUST_CFA_OFFSET 4
2076 CFI_REL_OFFSET edx, 0
2077- decl %eax # eax = -1
2078 pushl %ecx
2079 CFI_ADJUST_CFA_OFFSET 4
2080 CFI_REL_OFFSET ecx, 0
2081@@ -750,18 +739,20 @@ error_code:
2082 CFI_ADJUST_CFA_OFFSET 4
2083 CFI_REL_OFFSET ebx, 0
2084 cld
2085- pushl %es
2086+ pushl %gs
2087 CFI_ADJUST_CFA_OFFSET 4
2088- /*CFI_REL_OFFSET es, 0*/
2089+ /*CFI_REL_OFFSET gs, 0*/
2090+ movl $(__KERNEL_PDA), %ecx
2091+ movl %ecx, %gs
2092 UNWIND_ESPFIX_STACK
2093 popl %ecx
2094 CFI_ADJUST_CFA_OFFSET -4
2095 /*CFI_REGISTER es, ecx*/
2096- movl ES(%esp), %edi # get the function address
2097- movl ORIG_EAX(%esp), %edx # get the error code
2098- movl %eax, ORIG_EAX(%esp)
2099- movl %ecx, ES(%esp)
2100- /*CFI_REL_OFFSET es, ES*/
2101+ movl PT_GS(%esp), %edi # get the function address
2102+ movl PT_ORIG_EAX(%esp), %edx # get the error code
2103+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
2104+ mov %ecx, PT_GS(%esp)
2105+ /*CFI_REL_OFFSET gs, ES*/
2106 movl $(__USER_DS), %ecx
2107 movl %ecx, %ds
2108 movl %ecx, %es
2109@@ -793,7 +784,7 @@ ENTRY(hypervisor_callback)
2110 pushl %eax
2111 CFI_ADJUST_CFA_OFFSET 4
2112 SAVE_ALL
2113- movl EIP(%esp),%eax
2114+ movl PT_EIP(%esp),%eax
2115 cmpl $scrit,%eax
2116 jb 11f
2117 cmpl $ecrit,%eax
2118@@ -802,7 +793,7 @@ ENTRY(hypervisor_callback)
2119 jb 11f
2120 cmpl $sysexit_ecrit,%eax
2121 ja 11f
2122- addl $OLDESP,%esp # Remove eflags...ebx from stack frame.
2123+ addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame.
2124 11: push %esp
2125 CFI_ADJUST_CFA_OFFSET 4
2126 call evtchn_do_upcall
2127@@ -824,7 +815,7 @@ critical_region_fixup:
2128 jne 15f
2129 xorl %ecx,%ecx
2130 15: leal (%esp,%ecx),%esi # %esi points at end of src region
2131- leal OLDESP(%esp),%edi # %edi points at end of dst region
2132+ leal PT_OLDESP(%esp),%edi # %edi points at end of dst region
2133 shrl $2,%ecx # convert words to bytes
2134 je 17f # skip loop if nothing to copy
2135 16: subl $4,%esi # pre-decrementing copy loop
2136@@ -848,8 +839,9 @@ critical_fixup_table:
2137 .byte 0x18 # pop %eax
2138 .byte 0x1c # pop %ds
2139 .byte 0x20 # pop %es
2140- .byte 0x24,0x24,0x24 # add $4,%esp
2141- .byte 0x28 # iret
2142+ .byte 0x24,0x24 # pop %gs
2143+ .byte 0x28,0x28,0x28 # add $4,%esp
2144+ .byte 0x2c # iret
2145 .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
2146 .byte 0x00,0x00 # jmp 11b
2147 .previous
2148@@ -940,7 +932,7 @@ ENTRY(device_not_available)
2149 jmp ret_from_exception
2150 device_available_emulate:
2151 #endif
2152- preempt_stop
2153+ preempt_stop(CLBR_ANY)
2154 call math_state_restore
2155 jmp ret_from_exception
2156 CFI_ENDPROC
2157@@ -1010,7 +1002,7 @@ KPROBE_ENTRY(nmi)
2158 cmpw $__ESPFIX_SS, %ax
2159 popl %eax
2160 CFI_ADJUST_CFA_OFFSET -4
2161- je nmi_16bit_stack
2162+ je nmi_espfix_stack
2163 cmpl $sysenter_entry,(%esp)
2164 je nmi_stack_fixup
2165 pushl %eax
2166@@ -1053,7 +1045,7 @@ nmi_debug_stack_check:
2167 FIX_STACK(24,nmi_stack_correct, 1)
2168 jmp nmi_stack_correct
2169
2170-nmi_16bit_stack:
2171+nmi_espfix_stack:
2172 /* We have a RING0_INT_FRAME here.
2173 *
2174 * create the pointer to lss back
2175@@ -1062,7 +1054,6 @@ nmi_16bit_stack:
2176 CFI_ADJUST_CFA_OFFSET 4
2177 pushl %esp
2178 CFI_ADJUST_CFA_OFFSET 4
2179- movzwl %sp, %esp
2180 addw $4, (%esp)
2181 /* copy the iret frame of 12 bytes */
2182 .rept 3
2183@@ -1073,11 +1064,11 @@ nmi_16bit_stack:
2184 CFI_ADJUST_CFA_OFFSET 4
2185 SAVE_ALL
2186 FIXUP_ESPFIX_STACK # %eax == %esp
2187- CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
2188 xorl %edx,%edx # zero error code
2189 call do_nmi
2190 RESTORE_REGS
2191- lss 12+4(%esp), %esp # back to 16bit stack
2192+ lss 12+4(%esp), %esp # back to espfix stack
2193+ CFI_ADJUST_CFA_OFFSET -24
2194 1: INTERRUPT_RETURN
2195 CFI_ENDPROC
2196 .section __ex_table,"a"
2197@@ -1093,12 +1084,25 @@ KPROBE_ENTRY(nmi)
2198 xorl %edx,%edx # zero error code
2199 movl %esp,%eax # pt_regs pointer
2200 call do_nmi
2201- orl $NMI_MASK, EFLAGS(%esp)
2202+ orl $NMI_MASK, PT_EFLAGS(%esp)
2203 jmp restore_all
2204 CFI_ENDPROC
2205 #endif
2206 KPROBE_END(nmi)
2207
2208+#ifdef CONFIG_PARAVIRT
2209+ENTRY(native_iret)
2210+1: iret
2211+.section __ex_table,"a"
2212+ .align 4
2213+ .long 1b,iret_exc
2214+.previous
2215+
2216+ENTRY(native_irq_enable_sysexit)
2217+ sti
2218+ sysexit
2219+#endif
2220+
2221 KPROBE_ENTRY(int3)
2222 RING0_INT_FRAME
2223 pushl $-1 # mark this as an int
2224@@ -1214,37 +1218,6 @@ ENTRY(spurious_interrupt_bug)
2225 CFI_ENDPROC
2226 #endif /* !CONFIG_XEN */
2227
2228-#ifdef CONFIG_STACK_UNWIND
2229-ENTRY(arch_unwind_init_running)
2230- CFI_STARTPROC
2231- movl 4(%esp), %edx
2232- movl (%esp), %ecx
2233- leal 4(%esp), %eax
2234- movl %ebx, EBX(%edx)
2235- xorl %ebx, %ebx
2236- movl %ebx, ECX(%edx)
2237- movl %ebx, EDX(%edx)
2238- movl %esi, ESI(%edx)
2239- movl %edi, EDI(%edx)
2240- movl %ebp, EBP(%edx)
2241- movl %ebx, EAX(%edx)
2242- movl $__USER_DS, DS(%edx)
2243- movl $__USER_DS, ES(%edx)
2244- movl %ebx, ORIG_EAX(%edx)
2245- movl %ecx, EIP(%edx)
2246- movl 12(%esp), %ecx
2247- movl $__KERNEL_CS, CS(%edx)
2248- movl %ebx, EFLAGS(%edx)
2249- movl %eax, OLDESP(%edx)
2250- movl 8(%esp), %eax
2251- movl %ecx, 8(%esp)
2252- movl EBX(%edx), %ebx
2253- movl $__KERNEL_DS, OLDSS(%edx)
2254- jmpl *%eax
2255- CFI_ENDPROC
2256-ENDPROC(arch_unwind_init_running)
2257-#endif
2258-
2259 ENTRY(fixup_4gb_segment)
2260 RING0_EC_FRAME
2261 pushl $do_fixup_4gb_segment
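
Taken together, the entry.S changes above retire the bare EFLAGS/EAX/OLDESP pt_regs offsets in favour of PT_-prefixed names, thread clobber annotations through ENABLE_INTERRUPTS/DISABLE_INTERRUPTS for paravirt, and pin %gs to __KERNEL_PDA on kernel entry so per-CPU data is one segment-relative load away (see the %gs:PDA_cpu use in FIXUP_ESPFIX_STACK). A minimal sketch of that PDA access pattern, with an illustrative layout rather than the real asm/pda.h:

	/* Sketch of 2.6.20-era i386 PDA access via %gs; the struct layout
	 * and macro are illustrative, and work for 4-byte fields only. */
	#include <linux/stddef.h>	/* offsetof */

	struct i386_pda {
		struct task_struct *pcurrent;	/* current task on this CPU */
		int cpu_number;			/* which CPU owns this PDA  */
	};

	#define read_pda(field)						\
	({								\
		typeof(((struct i386_pda *)0)->field) ret__;		\
		asm("movl %%gs:%c1, %0"					\
		    : "=r" (ret__)					\
		    : "i" (offsetof(struct i386_pda, field)));		\
		ret__;							\
	})

With something of this shape, current can become read_pda(pcurrent) and smp_processor_id() can come from read_pda(cpu_number), with no stack-masking arithmetic on the hot path.
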
2262Index: head-2008-12-01/arch/x86/kernel/head_32-xen.S
2263===================================================================
2264--- head-2008-12-01.orig/arch/x86/kernel/head_32-xen.S 2008-12-01 11:29:05.000000000 +0100
2265+++ head-2008-12-01/arch/x86/kernel/head_32-xen.S 2008-12-01 11:32:38.000000000 +0100
2266@@ -9,6 +9,7 @@
2267 #include <asm/cache.h>
2268 #include <asm/thread_info.h>
2269 #include <asm/asm-offsets.h>
2270+#include <asm/boot.h>
2271 #include <asm/dwarf2.h>
2272 #include <xen/interface/xen.h>
2273 #include <xen/interface/elfnote.h>
2274@@ -35,6 +36,8 @@ ENTRY(startup_32)
2275 /* Set up the stack pointer */
2276 movl $(init_thread_union+THREAD_SIZE),%esp
2277
2278+ call setup_pda
2279+
2280 /* get vendor info */
2281 xorl %eax,%eax # call CPUID with 0 -> return vendor ID
2282 XEN_CPUID
2283@@ -57,14 +60,58 @@ ENTRY(startup_32)
2284
2285 movb $1,X86_HARD_MATH
2286
2287- xorl %eax,%eax # Clear FS/GS and LDT
2288+ xorl %eax,%eax # Clear FS
2289 movl %eax,%fs
2290- movl %eax,%gs
2291+
2292+ movl $(__KERNEL_PDA),%eax
2293+ mov %eax,%gs
2294+
2295 cld # gcc2 wants the direction flag cleared at all times
2296
2297 pushl $0 # fake return address for unwinder
2298 jmp start_kernel
2299
2300+/*
2301+ * Point the GDT at this CPU's PDA. This will be
2302+ * cpu_gdt_table and boot_pda.
2303+ */
2304+setup_pda:
2305+ /* get the PDA pointer */
2306+ movl $boot_pda, %eax
2307+
2308+ /* slot the PDA address into the GDT */
2309+ mov $cpu_gdt_table, %ecx
2310+ mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
2311+ shr $16, %eax
2312+ mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
2313+ mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
2314+
2315+ # %esi still points to start_info, and no registers
2316+ # need to be preserved.
2317+
2318+ movl XEN_START_mfn_list(%esi), %ebx
2319+ movl $(cpu_gdt_table - __PAGE_OFFSET), %eax
2320+ shrl $PAGE_SHIFT, %eax
2321+ movl (%ebx,%eax,4), %ecx
2322+ pushl %ecx # frame number for set_gdt below
2323+
2324+ xorl %esi, %esi
2325+ xorl %edx, %edx
2326+ shldl $PAGE_SHIFT, %ecx, %edx
2327+ shll $PAGE_SHIFT, %ecx
2328+ orl $0x61, %ecx
2329+ movl $cpu_gdt_table, %ebx
2330+ movl $__HYPERVISOR_update_va_mapping, %eax
2331+ int $0x82
2332+
2333+ movl $(PAGE_SIZE_asm / 8), %ecx
2334+ movl %esp, %ebx
2335+ movl $__HYPERVISOR_set_gdt, %eax
2336+ int $0x82
2337+
2338+ popl %ecx
2339+ ret
2340+
2341 #define HYPERCALL_PAGE_OFFSET 0x1000
2342 .org HYPERCALL_PAGE_OFFSET
2343 ENTRY(hypercall_page)
2344@@ -93,7 +140,8 @@ ENTRY(empty_zero_page)
2345 /*
2346 * The Global Descriptor Table contains 28 quadwords, per-CPU.
2347 */
2348- .align L1_CACHE_BYTES
2349+ .section .data.page_aligned, "aw"
2350+ .align PAGE_SIZE_asm
2351 ENTRY(cpu_gdt_table)
2352 .quad 0x0000000000000000 /* NULL descriptor */
2353 .quad 0x0000000000000000 /* 0x0b reserved */
2354@@ -135,12 +183,13 @@ ENTRY(cpu_gdt_table)
2355 .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
2356 .quad 0x0000000000000000 /* 0xc8 APM DS data */
2357
2358- .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */
2359- .quad 0x0000000000000000 /* 0xd8 - unused */
2360+ .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */
2361+ .quad 0x00cf92000000ffff /* 0xd8 - PDA */
2362 .quad 0x0000000000000000 /* 0xe0 - unused */
2363 .quad 0x0000000000000000 /* 0xe8 - unused */
2364 .quad 0x0000000000000000 /* 0xf0 - unused */
2365 .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
2366+ .align PAGE_SIZE_asm
2367
2368 #if CONFIG_XEN_COMPAT <= 0x030002
2369 /*
2370@@ -165,9 +214,9 @@ ENTRY(cpu_gdt_table)
2371 .ascii ",ELF_PADDR_OFFSET=0x"
2372 utoa __PAGE_OFFSET
2373 .ascii ",VIRT_ENTRY=0x"
2374- utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
2375+ utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET)
2376 .ascii ",HYPERCALL_PAGE=0x"
2377- utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
2378+ utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
2379 .ascii ",FEATURES=writable_page_tables"
2380 .ascii "|writable_descriptor_tables"
2381 .ascii "|auto_translated_physmap"
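
setup_pda above has to satisfy two Xen invariants before %gs can be loaded: the GDT frame must not be writable (hence update_va_mapping with PTE flags 0x61, i.e. present+accessed+dirty but read-only), and the frame list must then be handed to set_gdt. In rough C, assuming a direct mfn_list lookup as used via XEN_START_mfn_list, and with all error handling omitted:

	/* C sketch of the setup_pda hypercall sequence (no error handling). */
	unsigned long gdt_mfn = mfn_list[__pa(cpu_gdt_table) >> PAGE_SHIFT];

	/* Xen refuses to load a GDT from a writable frame: remap it RO. */
	HYPERVISOR_update_va_mapping((unsigned long)cpu_gdt_table,
				     pfn_pte_ma(gdt_mfn, PAGE_KERNEL_RO), 0);

	/* Register the frame; PAGE_SIZE / 8 is the descriptor count. */
	HYPERVISOR_set_gdt(&gdt_mfn, PAGE_SIZE / 8);
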
2382Index: head-2008-12-01/arch/x86/kernel/io_apic_32-xen.c
2383===================================================================
2384--- head-2008-12-01.orig/arch/x86/kernel/io_apic_32-xen.c 2008-12-01 11:29:05.000000000 +0100
2385+++ head-2008-12-01/arch/x86/kernel/io_apic_32-xen.c 2008-12-01 11:32:38.000000000 +0100
2386@@ -34,6 +34,7 @@
2387 #include <linux/pci.h>
2388 #include <linux/msi.h>
2389 #include <linux/htirq.h>
2390+#include <linux/freezer.h>
2391
2392 #include <asm/io.h>
2393 #include <asm/smp.h>
2394@@ -199,14 +200,20 @@ static struct IO_APIC_route_entry ioapic
2395 * the interrupt, and we need to make sure the entry is fully populated
2396 * before that happens.
2397 */
2398-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2399+static void
2400+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2401 {
2402- unsigned long flags;
2403 union entry_union eu;
2404 eu.entry = e;
2405- spin_lock_irqsave(&ioapic_lock, flags);
2406 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2407 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2408+}
2409+
2410+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2411+{
2412+ unsigned long flags;
2413+ spin_lock_irqsave(&ioapic_lock, flags);
2414+ __ioapic_write_entry(apic, pin, e);
2415 spin_unlock_irqrestore(&ioapic_lock, flags);
2416 }
2417
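
This is the usual locked/unlocked split: __ioapic_write_entry() assumes ioapic_lock is already held, while ioapic_write_entry() remains the self-locking wrapper. Later hunks in this file exploit the split so the route-entry write and set_native_irq_info() land in one critical section instead of two. Reduced to its essentials (names hypothetical):

	/* Generic shape of the lock-split applied to ioapic_write_entry(). */
	struct thing { int val; };
	static struct thing table[16];
	static DEFINE_SPINLOCK(table_lock);

	static void __write_thing(int idx, struct thing t)	/* lock held by caller */
	{
		table[idx] = t;
	}

	static void write_thing(int idx, struct thing t)	/* self-locking wrapper */
	{
		unsigned long flags;

		spin_lock_irqsave(&table_lock, flags);
		__write_thing(idx, t);
		spin_unlock_irqrestore(&table_lock, flags);
	}
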
2418@@ -889,8 +896,7 @@ static int __init find_isa_irq_pin(int i
2419
2420 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
2421 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
2422- mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
2423- mp_bus_id_to_type[lbus] == MP_BUS_NEC98
2424+ mp_bus_id_to_type[lbus] == MP_BUS_MCA
2425 ) &&
2426 (mp_irqs[i].mpc_irqtype == type) &&
2427 (mp_irqs[i].mpc_srcbusirq == irq))
2428@@ -909,8 +915,7 @@ static int __init find_isa_irq_apic(int
2429
2430 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
2431 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
2432- mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
2433- mp_bus_id_to_type[lbus] == MP_BUS_NEC98
2434+ mp_bus_id_to_type[lbus] == MP_BUS_MCA
2435 ) &&
2436 (mp_irqs[i].mpc_irqtype == type) &&
2437 (mp_irqs[i].mpc_srcbusirq == irq))
2438@@ -1043,12 +1048,6 @@ static int EISA_ELCR(unsigned int irq)
2439 #define default_MCA_trigger(idx) (1)
2440 #define default_MCA_polarity(idx) (0)
2441
2442-/* NEC98 interrupts are always polarity zero edge triggered,
2443- * when listed as conforming in the MP table. */
2444-
2445-#define default_NEC98_trigger(idx) (0)
2446-#define default_NEC98_polarity(idx) (0)
2447-
2448 static int __init MPBIOS_polarity(int idx)
2449 {
2450 int bus = mp_irqs[idx].mpc_srcbus;
2451@@ -1083,11 +1082,6 @@ static int __init MPBIOS_polarity(int id
2452 polarity = default_MCA_polarity(idx);
2453 break;
2454 }
2455- case MP_BUS_NEC98: /* NEC 98 pin */
2456- {
2457- polarity = default_NEC98_polarity(idx);
2458- break;
2459- }
2460 default:
2461 {
2462 printk(KERN_WARNING "broken BIOS!!\n");
2463@@ -1157,11 +1151,6 @@ static int MPBIOS_trigger(int idx)
2464 trigger = default_MCA_trigger(idx);
2465 break;
2466 }
2467- case MP_BUS_NEC98: /* NEC 98 pin */
2468- {
2469- trigger = default_NEC98_trigger(idx);
2470- break;
2471- }
2472 default:
2473 {
2474 printk(KERN_WARNING "broken BIOS!!\n");
2475@@ -1223,7 +1212,6 @@ static int pin_2_irq(int idx, int apic,
2476 case MP_BUS_ISA: /* ISA pin */
2477 case MP_BUS_EISA:
2478 case MP_BUS_MCA:
2479- case MP_BUS_NEC98:
2480 {
2481 irq = mp_irqs[idx].mpc_srcbusirq;
2482 break;
2483@@ -1291,7 +1279,7 @@ static inline int IO_APIC_irq_trigger(in
2484 }
2485
2486 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
2487-u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
2488+static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
2489
2490 static int __assign_irq_vector(int irq)
2491 {
2492@@ -1417,8 +1405,8 @@ static void __init setup_IO_APIC_irqs(vo
2493 if (!apic && (irq < 16))
2494 disable_8259A_irq(irq);
2495 }
2496- ioapic_write_entry(apic, pin, entry);
2497 spin_lock_irqsave(&ioapic_lock, flags);
2498+ __ioapic_write_entry(apic, pin, entry);
2499 set_native_irq_info(irq, TARGET_CPUS);
2500 spin_unlock_irqrestore(&ioapic_lock, flags);
2501 }
2502@@ -1988,6 +1976,15 @@ static void __init setup_ioapic_ids_from
2503 #endif
2504
2505 #ifndef CONFIG_XEN
2506+static int no_timer_check __initdata;
2507+
2508+static int __init notimercheck(char *s)
2509+{
2510+ no_timer_check = 1;
2511+ return 1;
2512+}
2513+__setup("no_timer_check", notimercheck);
2514+
2515 /*
2516 * There is a nasty bug in some older SMP boards, their mptable lies
2517 * about the timer IRQ. We do the following to work around the situation:
2518@@ -1996,10 +1993,13 @@ static void __init setup_ioapic_ids_from
2519 * - if this function detects that timer IRQs are defunct, then we fall
2520 * back to ISA timer IRQs
2521 */
2522-static int __init timer_irq_works(void)
2523+int __init timer_irq_works(void)
2524 {
2525 unsigned long t1 = jiffies;
2526
2527+ if (no_timer_check)
2528+ return 1;
2529+
2530 local_irq_enable();
2531 /* Let ten ticks pass... */
2532 mdelay((10 * 1000) / HZ);
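
timer_irq_works() loses its static qualifier so other code can reach it, and gains an escape hatch: booting with no_timer_check skips the ten-tick probe entirely. The parameter is wired up with the classic __setup() hook, whose shape is (hypothetical option name):

	/* Minimal __setup() boot-parameter hook; "skip_foo" is hypothetical. */
	static int skip_foo __initdata;

	static int __init skip_foo_setup(char *s)
	{
		skip_foo = 1;
		return 1;	/* nonzero: option consumed */
	}
	__setup("skip_foo", skip_foo_setup);
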
2533@@ -2226,9 +2226,15 @@ static inline void unlock_ExtINT_logic(v
2534 unsigned char save_control, save_freq_select;
2535
2536 pin = find_isa_irq_pin(8, mp_INT);
2537+ if (pin == -1) {
2538+ WARN_ON_ONCE(1);
2539+ return;
2540+ }
2541 apic = find_isa_irq_apic(8, mp_INT);
2542- if (pin == -1)
2543+ if (apic == -1) {
2544+ WARN_ON_ONCE(1);
2545 return;
2546+ }
2547
2548 entry0 = ioapic_read_entry(apic, pin);
2549 clear_IO_APIC_pin(apic, pin);
2550@@ -2273,7 +2279,7 @@ int timer_uses_ioapic_pin_0;
2551 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2552 * fanatically on his truly buggy board.
2553 */
2554-static inline void check_timer(void)
2555+static inline void __init check_timer(void)
2556 {
2557 int apic1, pin1, apic2, pin2;
2558 int vector;
2559@@ -2558,7 +2564,7 @@ device_initcall(ioapic_init_sysfs);
2560 int create_irq(void)
2561 {
2562 /* Allocate an unused irq */
2563- int irq, new, vector;
2564+ int irq, new, vector = 0;
2565 unsigned long flags;
2566
2567 irq = -ENOSPC;
2568@@ -2939,8 +2945,8 @@ int io_apic_set_pci_routing (int ioapic,
2569 if (!ioapic && (irq < 16))
2570 disable_8259A_irq(irq);
2571
2572- ioapic_write_entry(ioapic, pin, entry);
2573 spin_lock_irqsave(&ioapic_lock, flags);
2574+ __ioapic_write_entry(ioapic, pin, entry);
2575 set_native_irq_info(irq, TARGET_CPUS);
2576 spin_unlock_irqrestore(&ioapic_lock, flags);
2577
2578Index: head-2008-12-01/arch/x86/kernel/ldt_32-xen.c
2579===================================================================
2580--- head-2008-12-01.orig/arch/x86/kernel/ldt_32-xen.c 2008-12-01 11:29:05.000000000 +0100
2581+++ head-2008-12-01/arch/x86/kernel/ldt_32-xen.c 2008-12-01 11:32:38.000000000 +0100
2582@@ -177,16 +177,14 @@ static int read_default_ldt(void __user
2583 {
2584 int err;
2585 unsigned long size;
2586- void *address;
2587
2588 err = 0;
2589- address = &default_ldt[0];
2590 size = 5*sizeof(struct desc_struct);
2591 if (size > bytecount)
2592 size = bytecount;
2593
2594 err = size;
2595- if (copy_to_user(ptr, address, size))
2596+ if (clear_user(ptr, size))
2597 err = -EFAULT;
2598
2599 return err;
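
read_default_ldt() stops dereferencing default_ldt and simply zero-fills the user buffer; since the default LDT exposes no usable descriptors, clear_user() is equivalent to the old copy and avoids needing a kernel mapping of the table. The contract relied on, as a sketch: clear_user() returns the number of bytes it could not clear, so nonzero means a partially unwritable buffer.

	/* clear_user() contract as used above (hypothetical wrapper). */
	static int zero_to_user(void __user *ptr, unsigned long size)
	{
		/* returns bytes NOT cleared; 0 means full success */
		return clear_user(ptr, size) ? -EFAULT : 0;
	}
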
2600Index: head-2008-12-01/arch/x86/kernel/microcode-xen.c
2601===================================================================
2602--- head-2008-12-01.orig/arch/x86/kernel/microcode-xen.c 2008-12-01 11:29:05.000000000 +0100
2603+++ head-2008-12-01/arch/x86/kernel/microcode-xen.c 2008-12-01 11:32:38.000000000 +0100
2604@@ -1,7 +1,7 @@
2605 /*
2606 * Intel CPU Microcode Update Driver for Linux
2607 *
2608- * Copyright (C) 2000-2004 Tigran Aivazian
2609+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
2610 * 2006 Shaohua Li <shaohua.li@intel.com>
2611 *
2612 * This driver allows to upgrade microcode on Intel processors
2613@@ -43,7 +43,7 @@
2614 #include <asm/processor.h>
2615
2616 MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
2617-MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
2618+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
2619 MODULE_LICENSE("GPL");
2620
2621 static int verbose;
2622@@ -195,7 +195,7 @@ static int __init microcode_init (void)
2623 request_microcode();
2624
2625 printk(KERN_INFO
2626- "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
2627+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
2628 return 0;
2629 }
2630
2631Index: head-2008-12-01/arch/x86/kernel/mpparse_32-xen.c
2632===================================================================
2633--- head-2008-12-01.orig/arch/x86/kernel/mpparse_32-xen.c 2008-12-01 11:29:05.000000000 +0100
2634+++ head-2008-12-01/arch/x86/kernel/mpparse_32-xen.c 2008-12-01 11:32:38.000000000 +0100
2635@@ -36,7 +36,7 @@
2636
2637 /* Have we found an MP table */
2638 int smp_found_config;
2639-unsigned int __initdata maxcpus = NR_CPUS;
2640+unsigned int __cpuinitdata maxcpus = NR_CPUS;
2641
2642 /*
2643 * Various Linux-internal data structures created from the
2644@@ -102,10 +102,10 @@ static int __init mpf_checksum(unsigned
2645 */
2646
2647 static int mpc_record;
2648-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
2649+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
2650
2651 #ifndef CONFIG_XEN
2652-static void __devinit MP_processor_info (struct mpc_config_processor *m)
2653+static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
2654 {
2655 int ver, apicid;
2656 physid_mask_t phys_cpu;
2657@@ -221,7 +221,7 @@ static void __devinit MP_processor_info
2658 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
2659 }
2660 #else
2661-void __init MP_processor_info (struct mpc_config_processor *m)
2662+static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
2663 {
2664 num_processors++;
2665 }
2666@@ -256,8 +256,6 @@ static void __init MP_bus_info (struct m
2667 mp_current_pci_id++;
2668 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
2669 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
2670- } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
2671- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
2672 } else {
2673 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
2674 }
2675@@ -842,7 +840,7 @@ void __init mp_register_lapic_address(u6
2676 #endif
2677 }
2678
2679-void __devinit mp_register_lapic (u8 id, u8 enabled)
2680+void __cpuinit mp_register_lapic (u8 id, u8 enabled)
2681 {
2682 struct mpc_config_processor processor;
2683 int boot_cpu = 0;
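
The annotation moves from __initdata/__devinit to __cpuinitdata/__cpuinit because plain init sections are discarded when boot finishes, yet MP_processor_info() and mp_register_lapic() can run again at CPU hot-add; __cpuinit* data and text survive whenever CPU hotplug is configured. Illustratively (hypothetical names):

	static int boot_only __initdata;	/* freed after boot                 */
	static int hotplug_ok __cpuinitdata;	/* retained with CONFIG_HOTPLUG_CPU */

	static void __cpuinit on_cpu_up(void)	/* may run long after boot */
	{
		hotplug_ok++;			/* safe: data still mapped  */
	}
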
2684Index: head-2008-12-01/arch/x86/kernel/pci-dma-xen.c
2685===================================================================
2686--- head-2008-12-01.orig/arch/x86/kernel/pci-dma-xen.c 2008-12-01 11:29:05.000000000 +0100
2687+++ head-2008-12-01/arch/x86/kernel/pci-dma-xen.c 2008-12-01 11:32:38.000000000 +0100
2688@@ -276,7 +276,7 @@ EXPORT_SYMBOL(dma_free_coherent);
2689 int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
2690 dma_addr_t device_addr, size_t size, int flags)
2691 {
2692- void __iomem *mem_base;
2693+ void __iomem *mem_base = NULL;
2694 int pages = size >> PAGE_SHIFT;
2695 int bitmap_size = (pages + 31)/32;
2696
2697@@ -293,14 +293,12 @@ int dma_declare_coherent_memory(struct d
2698 if (!mem_base)
2699 goto out;
2700
2701- dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
2702+ dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
2703 if (!dev->dma_mem)
2704 goto out;
2705- memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
2706- dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
2707+ dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
2708 if (!dev->dma_mem->bitmap)
2709 goto free1_out;
2710- memset(dev->dma_mem->bitmap, 0, bitmap_size);
2711
2712 dev->dma_mem->virt_base = mem_base;
2713 dev->dma_mem->device_base = device_addr;
2714@@ -315,6 +313,8 @@ int dma_declare_coherent_memory(struct d
2715 free1_out:
2716 kfree(dev->dma_mem->bitmap);
2717 out:
2718+ if (mem_base)
2719+ iounmap(mem_base);
2720 return 0;
2721 }
2722 EXPORT_SYMBOL(dma_declare_coherent_memory);
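
Two fixes ride together in this hunk: the kmalloc()+memset() pairs collapse into kzalloc(), and the failure path finally iounmap()s the mapping, which is why mem_base is now initialised to NULL so the cleanup can run from any exit. The corrected unwind shape, as a sketch with hypothetical names:

	/* Sketch of the corrected error unwind (names hypothetical). */
	static void __iomem *setup_mapping(unsigned long bus_addr, size_t size,
					   struct dma_coherent_mem **out)
	{
		void __iomem *base = NULL;

		base = ioremap(bus_addr, size);
		if (!base)
			goto err;
		*out = kzalloc(sizeof(**out), GFP_KERNEL); /* alloc+zero in one */
		if (!*out)
			goto err;
		return base;			/* success: caller owns both */
	err:
		if (base)
			iounmap(base);		/* this unmap used to be leaked */
		return NULL;
	}
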
2723Index: head-2008-12-01/arch/x86/kernel/process_32-xen.c
2724===================================================================
2725--- head-2008-12-01.orig/arch/x86/kernel/process_32-xen.c 2008-12-01 11:29:05.000000000 +0100
2726+++ head-2008-12-01/arch/x86/kernel/process_32-xen.c 2008-12-01 11:32:38.000000000 +0100
2727@@ -60,6 +60,7 @@
2728
2729 #include <asm/tlbflush.h>
2730 #include <asm/cpu.h>
2731+#include <asm/pda.h>
2732
2733 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
2734
2735@@ -104,28 +105,24 @@ EXPORT_SYMBOL(enable_hlt);
2736 */
2737 static void poll_idle (void)
2738 {
2739- local_irq_enable();
2740-
2741- asm volatile(
2742- "2:"
2743- "testl %0, %1;"
2744- "rep; nop;"
2745- "je 2b;"
2746- : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
2747+ cpu_relax();
2748 }
2749
2750 static void xen_idle(void)
2751 {
2752- local_irq_disable();
2753+ current_thread_info()->status &= ~TS_POLLING;
2754+ /*
2755+ * TS_POLLING-cleared state must be visible before we
2756+ * test NEED_RESCHED:
2757+ */
2758+ smp_mb();
2759
2760- if (need_resched())
2761+ local_irq_disable();
2762+ if (!need_resched())
2763+ safe_halt(); /* enables interrupts racelessly */
2764+ else
2765 local_irq_enable();
2766- else {
2767- current_thread_info()->status &= ~TS_POLLING;
2768- smp_mb__after_clear_bit();
2769- safe_halt();
2770- current_thread_info()->status |= TS_POLLING;
2771- }
2772+ current_thread_info()->status |= TS_POLLING;
2773 }
2774 #ifdef CONFIG_APM_MODULE
2775 EXPORT_SYMBOL(default_idle);
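
The rewritten xen_idle() pairs with the scheduler's wakeup path: a waker that still observes TS_POLLING set may skip the reschedule IPI, on the assumption that a polling idle loop will notice TIF_NEED_RESCHED by itself. Clearing TS_POLLING must therefore be globally visible before need_resched() is sampled, and smp_mb() is what enforces that ordering. The idle side of the protocol, condensed from the hunk above:

	/* Idle side of the TS_POLLING handshake. */
	current_thread_info()->status &= ~TS_POLLING;	/* "an IPI is now required" */
	smp_mb();	/* order the store before the need_resched() load */
	local_irq_disable();
	if (!need_resched())
		safe_halt();	/* sti;hlt: re-enables interrupts without a window */
	else
		local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
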
2776@@ -250,8 +247,8 @@ void show_regs(struct pt_regs * regs)
2777 regs->eax,regs->ebx,regs->ecx,regs->edx);
2778 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
2779 regs->esi, regs->edi, regs->ebp);
2780- printk(" DS: %04x ES: %04x\n",
2781- 0xffff & regs->xds,0xffff & regs->xes);
2782+ printk(" DS: %04x ES: %04x GS: %04x\n",
2783+ 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
2784
2785 cr0 = read_cr0();
2786 cr2 = read_cr2();
2787@@ -282,6 +279,7 @@ int kernel_thread(int (*fn)(void *), voi
2788
2789 regs.xds = __USER_DS;
2790 regs.xes = __USER_DS;
2791+ regs.xgs = __KERNEL_PDA;
2792 regs.orig_eax = -1;
2793 regs.eip = (unsigned long) kernel_thread_helper;
2794 regs.xcs = __KERNEL_CS | get_kernel_rpl();
2795@@ -359,7 +357,6 @@ int copy_thread(int nr, unsigned long cl
2796 p->thread.eip = (unsigned long) ret_from_fork;
2797
2798 savesegment(fs,p->thread.fs);
2799- savesegment(gs,p->thread.gs);
2800
2801 tsk = current;
2802 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
2803@@ -438,7 +435,7 @@ void dump_thread(struct pt_regs * regs,
2804 dump->regs.ds = regs->xds;
2805 dump->regs.es = regs->xes;
2806 savesegment(fs,dump->regs.fs);
2807- savesegment(gs,dump->regs.gs);
2808+ dump->regs.gs = regs->xgs;
2809 dump->regs.orig_eax = regs->orig_eax;
2810 dump->regs.eip = regs->eip;
2811 dump->regs.cs = regs->xcs;
2812@@ -635,17 +632,19 @@ struct task_struct fastcall * __switch_t
2813 if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
2814 BUG();
2815
2816+ /* we're going to use this soon, after a few expensive things */
2817+ if (next_p->fpu_counter > 5)
2818+ prefetch(&next->i387.fxsave);
2819+
2820 /*
2821- * Restore %fs and %gs if needed.
2822+ * Restore %fs if needed.
2823 *
2824- * Glibc normally makes %fs be zero, and %gs is one of
2825- * the TLS segments.
2826+ * Glibc normally makes %fs be zero.
2827 */
2828 if (unlikely(next->fs))
2829 loadsegment(fs, next->fs);
2830
2831- if (next->gs)
2832- loadsegment(gs, next->gs);
2833+ write_pda(pcurrent, next_p);
2834
2835 /*
2836 * Now maybe handle debug registers
2837@@ -655,6 +654,13 @@ struct task_struct fastcall * __switch_t
2838
2839 disable_tsc(prev_p, next_p);
2840
2841+ /* If the task has used fpu the last 5 timeslices, just do a full
2842+ * restore of the math state immediately to avoid the trap; the
2843+ * chances of needing FPU soon are obviously high now
2844+ */
2845+ if (next_p->fpu_counter > 5)
2846+ math_state_restore();
2847+
2848 return prev_p;
2849 }
2850
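
The fpu_counter logic added to __switch_to() implements eager FPU restore: a task that used the FPU in each of its last few timeslices will almost certainly use it again, so its fxsave area is prefetched early and math_state_restore() is called directly, saving the device-not-available (#NM) trap that lazy switching would otherwise take. Condensed from the two hunks above (next is next_p->thread, as in __switch_to()):

	/* Eager-FPU heuristic, condensed. */
	if (next_p->fpu_counter > 5) {
		prefetch(&next->i387.fxsave);	/* start the cache fill early     */
		math_state_restore();		/* restore now, skip the #NM trap */
	}
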
2851Index: head-2008-12-01/arch/x86/kernel/quirks-xen.c
2852===================================================================
2853--- head-2008-12-01.orig/arch/x86/kernel/quirks-xen.c 2008-01-28 12:24:19.000000000 +0100
2854+++ head-2008-12-01/arch/x86/kernel/quirks-xen.c 2008-12-01 11:32:38.000000000 +0100
2855@@ -3,10 +3,12 @@
2856 */
2857 #include <linux/pci.h>
2858 #include <linux/irq.h>
2859+#include <asm/pci-direct.h>
2860+#include <asm/genapic.h>
2861+#include <asm/cpu.h>
2862
2863 #if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
2864-
2865-static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
2866+static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
2867 {
2868 u8 config, rev;
2869 u32 word;
2870@@ -14,14 +16,12 @@ static void __devinit quirk_intel_irqbal
2871 /* BIOS may enable hardware IRQ balancing for
2872 * E7520/E7320/E7525(revision ID 0x9 and below)
2873 * based platforms.
2874- * Disable SW irqbalance/affinity on those platforms.
2875+ * For those platforms, make sure that the genapic is set to 'flat'
2876 */
2877 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
2878 if (rev > 0x9)
2879 return;
2880
2881- printk(KERN_INFO "Intel E7520/7320/7525 detected.");
2882-
2883 /* enable access to config space*/
2884 pci_read_config_byte(dev, 0xf4, &config);
2885 pci_write_config_byte(dev, 0xf4, config|0x2);
2886@@ -30,6 +30,46 @@ static void __devinit quirk_intel_irqbal
2887 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
2888
2889 if (!(word & (1 << 13))) {
2890+#ifndef CONFIG_XEN
2891+#ifdef CONFIG_X86_64
2892+ if (genapic != &apic_flat)
2893+ panic("APIC mode must be flat on this system\n");
2894+#elif defined(CONFIG_X86_GENERICARCH)
2895+ if (genapic != &apic_default)
2896+ panic("APIC mode must be default(flat) on this system. Use apic=default\n");
2897+#endif
2898+#endif
2899+ }
2900+
2901+ /* put back the original value for config space*/
2902+ if (!(config & 0x2))
2903+ pci_write_config_byte(dev, 0xf4, config);
2904+}
2905+
2906+void __init quirk_intel_irqbalance(void)
2907+{
2908+ u8 config, rev;
2909+ u32 word;
2910+
2911+ /* BIOS may enable hardware IRQ balancing for
2912+ * E7520/E7320/E7525(revision ID 0x9 and below)
2913+ * based platforms.
2914+ * Disable SW irqbalance/affinity on those platforms.
2915+ */
2916+ rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
2917+ if (rev > 0x9)
2918+ return;
2919+
2920+ printk(KERN_INFO "Intel E7520/7320/7525 detected.");
2921+
2922+ /* enable access to config space */
2923+ config = read_pci_config_byte(0, 0, 0, 0xf4);
2924+ write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
2925+
2926+ /* read xTPR register */
2927+ word = read_pci_config_16(0, 0, 0x40, 0x4c);
2928+
2929+ if (!(word & (1 << 13))) {
2930 struct xen_platform_op op;
2931 printk(KERN_INFO "Disabling irq balancing and affinity\n");
2932 op.cmd = XENPF_platform_quirk;
2933@@ -37,11 +77,12 @@ static void __devinit quirk_intel_irqbal
2934 WARN_ON(HYPERVISOR_platform_op(&op));
2935 }
2936
2937- /* put back the original value for config space*/
2938+ /* put back the original value for config space */
2939 if (!(config & 0x2))
2940- pci_write_config_byte(dev, 0xf4, config);
2941+ write_pci_config_byte(0, 0, 0, 0xf4, config);
2942 }
2943-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
2944-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
2945-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
2946+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
2947+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
2948+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
2949+
2950 #endif
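
The quirk is split in two because the real work must now happen before the PCI core initialises: quirk_intel_irqbalance() is meant to run from early setup and talks to the chipset through asm/pci-direct.h, while the remaining PCI fixup only verifies that the genapic mode matches. The direct helpers are the classic type 1 mechanism on ports 0xCF8/0xCFC; roughly (a sketch, not the verbatim implementation):

	/* Type-1 PCI config-space byte read, usable before the PCI core is up. */
	static u8 direct_cfg_read_byte(u8 bus, u8 slot, u8 func, u8 offset)
	{
		outl(0x80000000 | (bus << 16) | (slot << 11) |
		     (func << 8) | (offset & ~3), 0xCF8);	/* select the dword */
		return inb(0xCFC + (offset & 3));		/* pick out the byte */
	}
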
2951Index: head-2008-12-01/arch/x86/kernel/setup_32-xen.c
2952===================================================================
2953--- head-2008-12-01.orig/arch/x86/kernel/setup_32-xen.c 2008-12-01 11:29:05.000000000 +0100
2954+++ head-2008-12-01/arch/x86/kernel/setup_32-xen.c 2008-12-01 11:32:38.000000000 +0100
2955@@ -76,9 +76,6 @@
2956 #include <xen/interface/kexec.h>
2957 #endif
2958
2959-/* Forward Declaration. */
2960-void __init find_max_pfn(void);
2961-
2962 static int xen_panic_event(struct notifier_block *, unsigned long, void *);
2963 static struct notifier_block xen_panic_block = {
2964 xen_panic_event, NULL, 0 /* try to go last */
2965@@ -89,14 +86,11 @@ int disable_pse __devinitdata = 0;
2966 /*
2967 * Machine setup..
2968 */
2969-
2970-#ifdef CONFIG_EFI
2971-int efi_enabled = 0;
2972-EXPORT_SYMBOL(efi_enabled);
2973-#endif
2974+extern struct resource code_resource;
2975+extern struct resource data_resource;
2976
2977 /* cpu data as detected by the assembly code in head.S */
2978-struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
2979+struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
2980 /* common cpu data for all cpus */
2981 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
2982 EXPORT_SYMBOL(boot_cpu_data);
2983@@ -112,12 +106,6 @@ unsigned int machine_submodel_id;
2984 unsigned int BIOS_revision;
2985 unsigned int mca_pentium_flag;
2986
2987-/* For PCI or other memory-mapped resources */
2988-unsigned long pci_mem_start = 0x10000000;
2989-#ifdef CONFIG_PCI
2990-EXPORT_SYMBOL(pci_mem_start);
2991-#endif
2992-
2993 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
2994 int bootloader_type;
2995
2996@@ -150,10 +138,6 @@ struct ist_info ist_info;
2997 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
2998 EXPORT_SYMBOL(ist_info);
2999 #endif
3000-struct e820map e820;
3001-#ifdef CONFIG_XEN
3002-struct e820map machine_e820;
3003-#endif
3004
3005 extern void early_cpu_init(void);
3006 extern int root_mountflags;
3007@@ -168,209 +152,6 @@ static char command_line[COMMAND_LINE_SI
3008
3009 unsigned char __initdata boot_params[PARAM_SIZE];
3010
3011-static struct resource data_resource = {
3012- .name = "Kernel data",
3013- .start = 0,
3014- .end = 0,
3015- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
3016-};
3017-
3018-static struct resource code_resource = {
3019- .name = "Kernel code",
3020- .start = 0,
3021- .end = 0,
3022- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
3023-};
3024-
3025-static struct resource system_rom_resource = {
3026- .name = "System ROM",
3027- .start = 0xf0000,
3028- .end = 0xfffff,
3029- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3030-};
3031-
3032-static struct resource extension_rom_resource = {
3033- .name = "Extension ROM",
3034- .start = 0xe0000,
3035- .end = 0xeffff,
3036- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3037-};
3038-
3039-static struct resource adapter_rom_resources[] = { {
3040- .name = "Adapter ROM",
3041- .start = 0xc8000,
3042- .end = 0,
3043- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3044-}, {
3045- .name = "Adapter ROM",
3046- .start = 0,
3047- .end = 0,
3048- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3049-}, {
3050- .name = "Adapter ROM",
3051- .start = 0,
3052- .end = 0,
3053- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3054-}, {
3055- .name = "Adapter ROM",
3056- .start = 0,
3057- .end = 0,
3058- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3059-}, {
3060- .name = "Adapter ROM",
3061- .start = 0,
3062- .end = 0,
3063- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3064-}, {
3065- .name = "Adapter ROM",
3066- .start = 0,
3067- .end = 0,
3068- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3069-} };
3070-
3071-static struct resource video_rom_resource = {
3072- .name = "Video ROM",
3073- .start = 0xc0000,
3074- .end = 0xc7fff,
3075- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3076-};
3077-
3078-static struct resource video_ram_resource = {
3079- .name = "Video RAM area",
3080- .start = 0xa0000,
3081- .end = 0xbffff,
3082- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
3083-};
3084-
3085-static struct resource standard_io_resources[] = { {
3086- .name = "dma1",
3087- .start = 0x0000,
3088- .end = 0x001f,
3089- .flags = IORESOURCE_BUSY | IORESOURCE_IO
3090-}, {
3091- .name = "pic1",
3092- .start = 0x0020,
3093- .end = 0x0021,
3094- .flags = IORESOURCE_BUSY | IORESOURCE_IO
3095-}, {
3096- .name = "timer0",
3097- .start = 0x0040,
3098- .end = 0x0043,
3099- .flags = IORESOURCE_BUSY | IORESOURCE_IO
3100-}, {
3101- .name = "timer1",
3102- .start = 0x0050,
3103- .end = 0x0053,
3104- .flags = IORESOURCE_BUSY | IORESOURCE_IO
3105-}, {
3106- .name = "keyboard",
3107- .start = 0x0060,
3108- .end = 0x006f,
3109- .flags = IORESOURCE_BUSY | IORESOURCE_IO
3110-}, {
3111- .name = "dma page reg",
3112- .start = 0x0080,
3113- .end = 0x008f,
3114- .flags = IORESOURCE_BUSY | IORESOURCE_IO
3115-}, {
3116- .name = "pic2",
3117- .start = 0x00a0,
3118- .end = 0x00a1,
3119- .flags = IORESOURCE_BUSY | IORESOURCE_IO
3120-}, {
3121- .name = "dma2",
3122- .start = 0x00c0,
3123- .end = 0x00df,
3124- .flags = IORESOURCE_BUSY | IORESOURCE_IO
3125-}, {
3126- .name = "fpu",
3127- .start = 0x00f0,
3128- .end = 0x00ff,
3129- .flags = IORESOURCE_BUSY | IORESOURCE_IO
3130-} };
3131-
3132-#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
3133-
3134-static int __init romchecksum(unsigned char *rom, unsigned long length)
3135-{
3136- unsigned char *p, sum = 0;
3137-
3138- for (p = rom; p < rom + length; p++)
3139- sum += *p;
3140- return sum == 0;
3141-}
3142-
3143-static void __init probe_roms(void)
3144-{
3145- unsigned long start, length, upper;
3146- unsigned char *rom;
3147- int i;
3148-
3149-#ifdef CONFIG_XEN
3150- /* Nothing to do if not running in dom0. */
3151- if (!is_initial_xendomain())
3152- return;
3153-#endif
3154-
3155- /* video rom */
3156- upper = adapter_rom_resources[0].start;
3157- for (start = video_rom_resource.start; start < upper; start += 2048) {
3158- rom = isa_bus_to_virt(start);
3159- if (!romsignature(rom))
3160- continue;
3161-
3162- video_rom_resource.start = start;
3163-
3164- /* 0 < length <= 0x7f * 512, historically */
3165- length = rom[2] * 512;
3166-
3167- /* if checksum okay, trust length byte */
3168- if (length && romchecksum(rom, length))
3169- video_rom_resource.end = start + length - 1;
3170-
3171- request_resource(&iomem_resource, &video_rom_resource);
3172- break;
3173- }
3174-
3175- start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3176- if (start < upper)
3177- start = upper;
3178-
3179- /* system rom */
3180- request_resource(&iomem_resource, &system_rom_resource);
3181- upper = system_rom_resource.start;
3182-
3183- /* check for extension rom (ignore length byte!) */
3184- rom = isa_bus_to_virt(extension_rom_resource.start);
3185- if (romsignature(rom)) {
3186- length = extension_rom_resource.end - extension_rom_resource.start + 1;
3187- if (romchecksum(rom, length)) {
3188- request_resource(&iomem_resource, &extension_rom_resource);
3189- upper = extension_rom_resource.start;
3190- }
3191- }
3192-
3193- /* check for adapter roms on 2k boundaries */
3194- for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3195- rom = isa_bus_to_virt(start);
3196- if (!romsignature(rom))
3197- continue;
3198-
3199- /* 0 < length <= 0x7f * 512, historically */
3200- length = rom[2] * 512;
3201-
3202- /* but accept any length that fits if checksum okay */
3203- if (!length || start + length > upper || !romchecksum(rom, length))
3204- continue;
3205-
3206- adapter_rom_resources[i].start = start;
3207- adapter_rom_resources[i].end = start + length - 1;
3208- request_resource(&iomem_resource, &adapter_rom_resources[i]);
3209-
3210- start = adapter_rom_resources[i++].end & ~2047UL;
3211- }
3212-}
3213-
3214 /*
3215 * Point at the empty zero page to start with. We map the real shared_info
3216 * page as soon as fixmap is up and running.
3217@@ -386,353 +167,6 @@ EXPORT_SYMBOL(phys_to_machine_mapping);
3218 start_info_t *xen_start_info;
3219 EXPORT_SYMBOL(xen_start_info);
3220
3221-void __init add_memory_region(unsigned long long start,
3222- unsigned long long size, int type)
3223-{
3224- int x;
3225-
3226- if (!efi_enabled) {
3227- x = e820.nr_map;
3228-
3229- if (x == E820MAX) {
3230- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3231- return;
3232- }
3233-
3234- e820.map[x].addr = start;
3235- e820.map[x].size = size;
3236- e820.map[x].type = type;
3237- e820.nr_map++;
3238- }
3239-} /* add_memory_region */
3240-
3241-static void __init limit_regions(unsigned long long size)
3242-{
3243- unsigned long long current_addr = 0;
3244- int i;
3245-
3246- if (efi_enabled) {
3247- efi_memory_desc_t *md;
3248- void *p;
3249-
3250- for (p = memmap.map, i = 0; p < memmap.map_end;
3251- p += memmap.desc_size, i++) {
3252- md = p;
3253- current_addr = md->phys_addr + (md->num_pages << 12);
3254- if (md->type == EFI_CONVENTIONAL_MEMORY) {
3255- if (current_addr >= size) {
3256- md->num_pages -=
3257- (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
3258- memmap.nr_map = i + 1;
3259- return;
3260- }
3261- }
3262- }
3263- }
3264- for (i = 0; i < e820.nr_map; i++) {
3265- current_addr = e820.map[i].addr + e820.map[i].size;
3266- if (current_addr < size)
3267- continue;
3268-
3269- if (e820.map[i].type != E820_RAM)
3270- continue;
3271-
3272- if (e820.map[i].addr >= size) {
3273- /*
3274- * This region starts past the end of the
3275- * requested size, skip it completely.
3276- */
3277- e820.nr_map = i;
3278- } else {
3279- e820.nr_map = i + 1;
3280- e820.map[i].size -= current_addr - size;
3281- }
3282- return;
3283- }
3284-#ifdef CONFIG_XEN
3285- if (i==e820.nr_map && current_addr < size) {
3286- /*
3287- * The e820 map finished before our requested size so
3288- * extend the final entry to the requested address.
3289- */
3290- --i;
3291- if (e820.map[i].type == E820_RAM)
3292- e820.map[i].size -= current_addr - size;
3293- else
3294- add_memory_region(current_addr, size - current_addr, E820_RAM);
3295- }
3296-#endif
3297-}
3298-
3299-#define E820_DEBUG 1
3300-
3301-static void __init print_memory_map(char *who)
3302-{
3303- int i;
3304-
3305- for (i = 0; i < e820.nr_map; i++) {
3306- printk(" %s: %016Lx - %016Lx ", who,
3307- e820.map[i].addr,
3308- e820.map[i].addr + e820.map[i].size);
3309- switch (e820.map[i].type) {
3310- case E820_RAM: printk("(usable)\n");
3311- break;
3312- case E820_RESERVED:
3313- printk("(reserved)\n");
3314- break;
3315- case E820_ACPI:
3316- printk("(ACPI data)\n");
3317- break;
3318- case E820_NVS:
3319- printk("(ACPI NVS)\n");
3320- break;
3321- default: printk("type %lu\n", e820.map[i].type);
3322- break;
3323- }
3324- }
3325-}
3326-
3327-/*
3328- * Sanitize the BIOS e820 map.
3329- *
3330- * Some e820 responses include overlapping entries. The following
3331- * replaces the original e820 map with a new one, removing overlaps.
3332- *
3333- */
3334-struct change_member {
3335- struct e820entry *pbios; /* pointer to original bios entry */
3336- unsigned long long addr; /* address for this change point */
3337-};
3338-static struct change_member change_point_list[2*E820MAX] __initdata;
3339-static struct change_member *change_point[2*E820MAX] __initdata;
3340-static struct e820entry *overlap_list[E820MAX] __initdata;
3341-static struct e820entry new_bios[E820MAX] __initdata;
3342-
3343-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3344-{
3345- struct change_member *change_tmp;
3346- unsigned long current_type, last_type;
3347- unsigned long long last_addr;
3348- int chgidx, still_changing;
3349- int overlap_entries;
3350- int new_bios_entry;
3351- int old_nr, new_nr, chg_nr;
3352- int i;
3353-
3354- /*
3355- Visually we're performing the following (1,2,3,4 = memory types)...
3356-
3357- Sample memory map (w/overlaps):
3358- ____22__________________
3359- ______________________4_
3360- ____1111________________
3361- _44_____________________
3362- 11111111________________
3363- ____________________33__
3364- ___________44___________
3365- __________33333_________
3366- ______________22________
3367- ___________________2222_
3368- _________111111111______
3369- _____________________11_
3370- _________________4______
3371-
3372- Sanitized equivalent (no overlap):
3373- 1_______________________
3374- _44_____________________
3375- ___1____________________
3376- ____22__________________
3377- ______11________________
3378- _________1______________
3379- __________3_____________
3380- ___________44___________
3381- _____________33_________
3382- _______________2________
3383- ________________1_______
3384- _________________4______
3385- ___________________2____
3386- ____________________33__
3387- ______________________4_
3388- */
3389-
3390- /* if there's only one memory region, don't bother */
3391- if (*pnr_map < 2)
3392- return -1;
3393-
3394- old_nr = *pnr_map;
3395-
3396- /* bail out if we find any unreasonable addresses in bios map */
3397- for (i=0; i<old_nr; i++)
3398- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
3399- return -1;
3400-
3401- /* create pointers for initial change-point information (for sorting) */
3402- for (i=0; i < 2*old_nr; i++)
3403- change_point[i] = &change_point_list[i];
3404-
3405- /* record all known change-points (starting and ending addresses),
3406- omitting those that are for empty memory regions */
3407- chgidx = 0;
3408- for (i=0; i < old_nr; i++) {
3409- if (biosmap[i].size != 0) {
3410- change_point[chgidx]->addr = biosmap[i].addr;
3411- change_point[chgidx++]->pbios = &biosmap[i];
3412- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3413- change_point[chgidx++]->pbios = &biosmap[i];
3414- }
3415- }
3416- chg_nr = chgidx; /* true number of change-points */
3417-
3418- /* sort change-point list by memory addresses (low -> high) */
3419- still_changing = 1;
3420- while (still_changing) {
3421- still_changing = 0;
3422- for (i=1; i < chg_nr; i++) {
3423- /* if <current_addr> > <last_addr>, swap */
3424- /* or, if current=<start_addr> & last=<end_addr>, swap */
3425- if ((change_point[i]->addr < change_point[i-1]->addr) ||
3426- ((change_point[i]->addr == change_point[i-1]->addr) &&
3427- (change_point[i]->addr == change_point[i]->pbios->addr) &&
3428- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3429- )
3430- {
3431- change_tmp = change_point[i];
3432- change_point[i] = change_point[i-1];
3433- change_point[i-1] = change_tmp;
3434- still_changing=1;
3435- }
3436- }
3437- }
3438-
3439- /* create a new bios memory map, removing overlaps */
3440- overlap_entries=0; /* number of entries in the overlap table */
3441- new_bios_entry=0; /* index for creating new bios map entries */
3442- last_type = 0; /* start with undefined memory type */
3443- last_addr = 0; /* start with 0 as last starting address */
3444- /* loop through change-points, determining affect on the new bios map */
3445- for (chgidx=0; chgidx < chg_nr; chgidx++)
3446- {
3447- /* keep track of all overlapping bios entries */
3448- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3449- {
3450- /* add map entry to overlap list (> 1 entry implies an overlap) */
3451- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3452- }
3453- else
3454- {
3455- /* remove entry from list (order independent, so swap with last) */
3456- for (i=0; i<overlap_entries; i++)
3457- {
3458- if (overlap_list[i] == change_point[chgidx]->pbios)
3459- overlap_list[i] = overlap_list[overlap_entries-1];
3460- }
3461- overlap_entries--;
3462- }
3463- /* if there are overlapping entries, decide which "type" to use */
3464- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3465- current_type = 0;
3466- for (i=0; i<overlap_entries; i++)
3467- if (overlap_list[i]->type > current_type)
3468- current_type = overlap_list[i]->type;
3469- /* continue building up new bios map based on this information */
3470- if (current_type != last_type) {
3471- if (last_type != 0) {
3472- new_bios[new_bios_entry].size =
3473- change_point[chgidx]->addr - last_addr;
3474- /* move forward only if the new size was non-zero */
3475- if (new_bios[new_bios_entry].size != 0)
3476- if (++new_bios_entry >= E820MAX)
3477- break; /* no more space left for new bios entries */
3478- }
3479- if (current_type != 0) {
3480- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3481- new_bios[new_bios_entry].type = current_type;
3482- last_addr=change_point[chgidx]->addr;
3483- }
3484- last_type = current_type;
3485- }
3486- }
3487- new_nr = new_bios_entry; /* retain count for new bios entries */
3488-
3489- /* copy new bios mapping into original location */
3490- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3491- *pnr_map = new_nr;
3492-
3493- return 0;
3494-}
3495-
3496-/*
3497- * Copy the BIOS e820 map into a safe place.
3498- *
3499- * Sanity-check it while we're at it..
3500- *
3501- * If we're lucky and live on a modern system, the setup code
3502- * will have given us a memory map that we can use to properly
3503- * set up memory. If we aren't, we'll fake a memory map.
3504- *
3505- * We check to see that the memory map contains at least 2 elements
3506- * before we'll use it, because the detection code in setup.S may
3507- * not be perfect and most every PC known to man has two memory
3508- * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3509- * thinkpad 560x, for example, does not cooperate with the memory
3510- * detection code.)
3511- */
3512-int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
3513-{
3514-#ifndef CONFIG_XEN
3515- /* Only one memory region (or negative)? Ignore it */
3516- if (nr_map < 2)
3517- return -1;
3518-#else
3519- BUG_ON(nr_map < 1);
3520-#endif
3521-
3522- do {
3523- unsigned long long start = biosmap->addr;
3524- unsigned long long size = biosmap->size;
3525- unsigned long long end = start + size;
3526- unsigned long type = biosmap->type;
3527-
3528- /* Overflow in 64 bits? Ignore the memory map. */
3529- if (start > end)
3530- return -1;
3531-
3532-#ifndef CONFIG_XEN
3533- /*
3534- * Some BIOSes claim RAM in the 640k - 1M region.
3535- * Not right. Fix it up.
3536- */
3537- if (type == E820_RAM) {
3538- if (start < 0x100000ULL && end > 0xA0000ULL) {
3539- if (start < 0xA0000ULL)
3540- add_memory_region(start, 0xA0000ULL-start, type);
3541- if (end <= 0x100000ULL)
3542- continue;
3543- start = 0x100000ULL;
3544- size = end - start;
3545- }
3546- }
3547-#endif
3548- add_memory_region(start, size, type);
3549- } while (biosmap++,--nr_map);
3550-
3551-#ifdef CONFIG_XEN
3552- if (is_initial_xendomain()) {
3553- struct xen_memory_map memmap;
3554-
3555- memmap.nr_entries = E820MAX;
3556- set_xen_guest_handle(memmap.buffer, machine_e820.map);
3557-
3558- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3559- BUG();
3560- machine_e820.nr_map = memmap.nr_entries;
3561- } else
3562- machine_e820 = e820;
3563-#endif
3564-
3565- return 0;
3566-}
3567-
3568 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
3569 struct edd edd;
3570 #ifdef CONFIG_EDD_MODULE
3571@@ -758,7 +192,7 @@ static inline void copy_edd(void)
3572 }
3573 #endif
3574
3575-static int __initdata user_defined_memmap = 0;
3576+int __initdata user_defined_memmap = 0;
3577
3578 /*
3579 * "mem=nopentium" disables the 4MB page tables.
3580@@ -795,51 +229,6 @@ static int __init parse_mem(char *arg)
3581 }
3582 early_param("mem", parse_mem);
3583
3584-static int __init parse_memmap(char *arg)
3585-{
3586- if (!arg)
3587- return -EINVAL;
3588-
3589- if (strcmp(arg, "exactmap") == 0) {
3590-#ifdef CONFIG_CRASH_DUMP
3591- /* If we are doing a crash dump, we
3592- * still need to know the real mem
3593- * size before original memory map is
3594- * reset.
3595- */
3596- find_max_pfn();
3597- saved_max_pfn = max_pfn;
3598-#endif
3599- e820.nr_map = 0;
3600- user_defined_memmap = 1;
3601- } else {
3602- /* If the user specifies memory size, we
3603- * limit the BIOS-provided memory map to
3604- * that size. exactmap can be used to specify
3605- * the exact map. mem=number can be used to
3606- * trim the existing memory map.
3607- */
3608- unsigned long long start_at, mem_size;
3609-
3610- mem_size = memparse(arg, &arg);
3611- if (*arg == '@') {
3612- start_at = memparse(arg+1, &arg);
3613- add_memory_region(start_at, mem_size, E820_RAM);
3614- } else if (*arg == '#') {
3615- start_at = memparse(arg+1, &arg);
3616- add_memory_region(start_at, mem_size, E820_ACPI);
3617- } else if (*arg == '$') {
3618- start_at = memparse(arg+1, &arg);
3619- add_memory_region(start_at, mem_size, E820_RESERVED);
3620- } else {
3621- limit_regions(mem_size);
3622- user_defined_memmap = 1;
3623- }
3624- }
3625- return 0;
3626-}
3627-early_param("memmap", parse_memmap);
3628-
3629 #ifdef CONFIG_PROC_VMCORE
3630 /* elfcorehdr= specifies the location of elf core header
3631 * stored by the crashed kernel.
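parse_memmap(), deleted here as it moves into the shared e820 code in 2.6.20, encodes a small grammar: "memmap=exactmap" resets the table, "size@addr", "size#addr" and "size$addr" add RAM, ACPI and reserved regions, and a bare "size" trims the map. A rough userspace sketch of that suffix parsing, with a simplified memparse() (the real one handles more via simple_strtoull(); lowercase suffixes are omitted here):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* simplified memparse(): number with optional K/M/G suffix */
static unsigned long long memparse(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': v <<= 10;	/* fall through */
	case 'M': v <<= 10;	/* fall through */
	case 'K': v <<= 10; (*end)++;
	}
	return v;
}

static void parse_memmap(const char *arg)
{
	unsigned long long size, start;
	char *p;

	if (!strcmp(arg, "exactmap")) {
		puts("reset e820, use only user entries");
		return;
	}
	size = memparse(arg, &p);
	if (*p == '@' || *p == '#' || *p == '$') {
		const char *type = *p == '@' ? "RAM" :
				   *p == '#' ? "ACPI" : "reserved";
		start = memparse(p + 1, &p);
		printf("add %#llx+%#llx as %s\n", start, size, type);
	} else {
		printf("limit memory to %#llx\n", size);
	}
}

int main(void)
{
	parse_memmap("exactmap");
	parse_memmap("64M@1G");	/* 64MB of RAM at the 1GB mark */
	parse_memmap("512M");	/* trim the map to 512MB */
	return 0;
}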
3632@@ -906,127 +295,6 @@ early_param("reservetop", parse_reservet
3633 #endif
3634
3635 /*
3636- * Callback for efi_memory_walk.
3637- */
3638-static int __init
3639-efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
3640-{
3641- unsigned long *max_pfn = arg, pfn;
3642-
3643- if (start < end) {
3644- pfn = PFN_UP(end -1);
3645- if (pfn > *max_pfn)
3646- *max_pfn = pfn;
3647- }
3648- return 0;
3649-}
3650-
3651-static int __init
3652-efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
3653-{
3654- memory_present(0, PFN_UP(start), PFN_DOWN(end));
3655- return 0;
3656-}
3657-
3658-/*
3659- * This function checks if any part of the range <start,end> is mapped
3660- * with type.
3661- */
3662-int
3663-e820_any_mapped(u64 start, u64 end, unsigned type)
3664-{
3665- int i;
3666-
3667-#ifndef CONFIG_XEN
3668- for (i = 0; i < e820.nr_map; i++) {
3669- const struct e820entry *ei = &e820.map[i];
3670-#else
3671- if (!is_initial_xendomain())
3672- return 0;
3673- for (i = 0; i < machine_e820.nr_map; ++i) {
3674- const struct e820entry *ei = &machine_e820.map[i];
3675-#endif
3676-
3677- if (type && ei->type != type)
3678- continue;
3679- if (ei->addr >= end || ei->addr + ei->size <= start)
3680- continue;
3681- return 1;
3682- }
3683- return 0;
3684-}
3685-EXPORT_SYMBOL_GPL(e820_any_mapped);
3686-
3687- /*
3688- * This function checks if the entire range <start,end> is mapped with type.
3689- *
3690- * Note: this function only works correctly if the e820 table is sorted and
3691- * non-overlapping, which is the case
3692- */
3693-int __init
3694-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
3695-{
3696- u64 start = s;
3697- u64 end = e;
3698- int i;
3699-
3700-#ifndef CONFIG_XEN
3701- for (i = 0; i < e820.nr_map; i++) {
3702- struct e820entry *ei = &e820.map[i];
3703-#else
3704- if (!is_initial_xendomain())
3705- return 0;
3706- for (i = 0; i < machine_e820.nr_map; ++i) {
3707- const struct e820entry *ei = &machine_e820.map[i];
3708-#endif
3709- if (type && ei->type != type)
3710- continue;
3711- /* does the region (at least in part) overlap the current region? */
3712- if (ei->addr >= end || ei->addr + ei->size <= start)
3713- continue;
3714- /* if the region is at the beginning of <start,end> we move
3715- * start to the end of the region since it's ok until there
3716- */
3717- if (ei->addr <= start)
3718- start = ei->addr + ei->size;
3719- /* if start is now at or beyond end, we're done, full
3720- * coverage */
3721- if (start >= end)
3722- return 1; /* we're done */
3723- }
3724- return 0;
3725-}
3726-
3727-/*
3728- * Find the highest page frame number we have available
3729- */
3730-void __init find_max_pfn(void)
3731-{
3732- int i;
3733-
3734- max_pfn = 0;
3735- if (efi_enabled) {
3736- efi_memmap_walk(efi_find_max_pfn, &max_pfn);
3737- efi_memmap_walk(efi_memory_present_wrapper, NULL);
3738- return;
3739- }
3740-
3741- for (i = 0; i < e820.nr_map; i++) {
3742- unsigned long start, end;
3743- /* RAM? */
3744- if (e820.map[i].type != E820_RAM)
3745- continue;
3746- start = PFN_UP(e820.map[i].addr);
3747- end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3748- if (start >= end)
3749- continue;
3750- if (end > max_pfn)
3751- max_pfn = end;
3752- memory_present(0, start, end);
3753- }
3754-}
3755-
3756-/*
3757 * Determine low and high memory ranges:
3758 */
3759 unsigned long __init find_max_low_pfn(void)
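e820_all_mapped(), removed above, is a linear coverage check: walk the (sorted, non-overlapping) table, and whenever a matching region covers the current start of the query window, advance start past it; full coverage is proven the moment start reaches end. A self-contained sketch of the same scan (the struct name is local to the example):

#include <stdio.h>

typedef unsigned long long u64;

struct range { u64 addr, size; int type; };

/* 1 if [start,end) is fully covered by entries of 'type';
 * assumes the table is sorted and non-overlapping, like e820 */
static int all_mapped(const struct range *map, int n,
		      u64 start, u64 end, int type)
{
	int i;

	for (i = 0; i < n; i++) {
		if (map[i].type != type)
			continue;
		if (map[i].addr >= end || map[i].addr + map[i].size <= start)
			continue;	/* no overlap with the window */
		if (map[i].addr <= start)
			start = map[i].addr + map[i].size;
		if (start >= end)
			return 1;	/* window fully consumed */
	}
	return 0;
}

int main(void)
{
	struct range map[] = {
		{ 0x0,      0xa0000,  1 },
		{ 0x100000, 0x700000, 1 },
	};
	printf("%d\n", all_mapped(map, 2, 0x100000, 0x400000, 1)); /* 1 */
	printf("%d\n", all_mapped(map, 2, 0x90000,  0x110000, 1)); /* 0: hole */
	return 0;
}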
3760@@ -1085,77 +353,6 @@ unsigned long __init find_max_low_pfn(vo
3761 return max_low_pfn;
3762 }
3763
3764-/*
3765- * Free all available memory for boot time allocation. Used
3766- * as a callback function by efi_memory_walk()
3767- */
3768-
3769-static int __init
3770-free_available_memory(unsigned long start, unsigned long end, void *arg)
3771-{
3772- /* check max_low_pfn */
3773- if (start >= (max_low_pfn << PAGE_SHIFT))
3774- return 0;
3775- if (end >= (max_low_pfn << PAGE_SHIFT))
3776- end = max_low_pfn << PAGE_SHIFT;
3777- if (start < end)
3778- free_bootmem(start, end - start);
3779-
3780- return 0;
3781-}
3782-/*
3783- * Register fully available low RAM pages with the bootmem allocator.
3784- */
3785-static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
3786-{
3787- int i;
3788-
3789- if (efi_enabled) {
3790- efi_memmap_walk(free_available_memory, NULL);
3791- return;
3792- }
3793- for (i = 0; i < e820.nr_map; i++) {
3794- unsigned long curr_pfn, last_pfn, size;
3795- /*
3796- * Reserve usable low memory
3797- */
3798- if (e820.map[i].type != E820_RAM)
3799- continue;
3800- /*
3801- * We are rounding up the start address of usable memory:
3802- */
3803- curr_pfn = PFN_UP(e820.map[i].addr);
3804- if (curr_pfn >= max_low_pfn)
3805- continue;
3806- /*
3807- * ... and at the end of the usable range downwards:
3808- */
3809- last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3810-
3811-#ifdef CONFIG_XEN
3812- /*
3813- * Truncate to the number of actual pages currently
3814- * present.
3815- */
3816- if (last_pfn > xen_start_info->nr_pages)
3817- last_pfn = xen_start_info->nr_pages;
3818-#endif
3819-
3820- if (last_pfn > max_low_pfn)
3821- last_pfn = max_low_pfn;
3822-
3823- /*
3824- * .. finally, did all the rounding and playing
3825- * around just make the area go away?
3826- */
3827- if (last_pfn <= curr_pfn)
3828- continue;
3829-
3830- size = last_pfn - curr_pfn;
3831- free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
3832- }
3833-}
3834-
3835 #ifndef CONFIG_XEN
3836 /*
3837 * workaround for Dell systems that neglect to reserve EBDA
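register_bootmem_low_pages(), dropped in the hunk above, keeps only pages that lie entirely inside a RAM entry: the start is rounded up to a page boundary with PFN_UP() and the end rounded down with PFN_DOWN(), so a region that covers no whole page simply vanishes. A minimal sketch of that rounding (PAGE_SHIFT fixed at 12 for the example):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

typedef unsigned long long u64;

#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

static void usable_pages(u64 addr, u64 size)
{
	u64 first = PFN_UP(addr);		/* round start up */
	u64 last = PFN_DOWN(addr + size);	/* round end down */

	if (last <= first)
		printf("%#llx+%#llx: no whole page\n", addr, size);
	else
		printf("%#llx+%#llx: pfns %llu..%llu\n",
		       addr, size, first, last - 1);
}

int main(void)
{
	usable_pages(0x1000, 0x3000);	/* already aligned: pfns 1..3 */
	usable_pages(0x1800, 0x1000);	/* spans two pages, owns neither */
	return 0;
}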
3838@@ -1245,8 +442,8 @@ void __init setup_bootmem_allocator(void
3839 * the (very unlikely) case of us accidentally initializing the
3840 * bootmem allocator with an invalid RAM area.
3841 */
3842- reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
3843- bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
3844+ reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
3845+ bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
3846
3847 #ifndef CONFIG_XEN
3848 /*
3849@@ -1328,160 +525,6 @@ void __init remapped_pgdat_init(void)
3850 }
3851 }
3852
3853-/*
3854- * Request address space for all standard RAM and ROM resources
3855- * and also for regions reported as reserved by the e820.
3856- */
3857-static void __init
3858-legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
3859- struct resource *code_resource,
3860- struct resource *data_resource)
3861-{
3862- int i;
3863-
3864- probe_roms();
3865-
3866- for (i = 0; i < nr_map; i++) {
3867- struct resource *res;
3868-#ifndef CONFIG_RESOURCES_64BIT
3869- if (e820[i].addr + e820[i].size > 0x100000000ULL)
3870- continue;
3871-#endif
3872- res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
3873- switch (e820[i].type) {
3874- case E820_RAM: res->name = "System RAM"; break;
3875- case E820_ACPI: res->name = "ACPI Tables"; break;
3876- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
3877- default: res->name = "reserved";
3878- }
3879- res->start = e820[i].addr;
3880- res->end = res->start + e820[i].size - 1;
3881- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3882- if (request_resource(&iomem_resource, res)) {
3883- kfree(res);
3884- continue;
3885- }
3886- if (e820[i].type == E820_RAM) {
3887- /*
3888- * We don't know which RAM region contains kernel data,
3889- * so we try it repeatedly and let the resource manager
3890- * test it.
3891- */
3892-#ifndef CONFIG_XEN
3893- request_resource(res, code_resource);
3894- request_resource(res, data_resource);
3895-#endif
3896-#ifdef CONFIG_KEXEC
3897- if (crashk_res.start != crashk_res.end)
3898- request_resource(res, &crashk_res);
3899-#ifdef CONFIG_XEN
3900- xen_machine_kexec_register_resources(res);
3901-#endif
3902-#endif
3903- }
3904- }
3905-}
3906-
3907-/*
3908- * Locate an unused range of the physical address space below 4G which
3909- * can be used for PCI mappings.
3910- */
3911-static void __init
3912-e820_setup_gap(struct e820entry *e820, int nr_map)
3913-{
3914- unsigned long gapstart, gapsize, round;
3915- unsigned long long last;
3916- int i;
3917-
3918- /*
3919- * Search for the biggest gap in the low 32 bits of the e820
3920- * memory space.
3921- */
3922- last = 0x100000000ull;
3923- gapstart = 0x10000000;
3924- gapsize = 0x400000;
3925- i = nr_map;
3926- while (--i >= 0) {
3927- unsigned long long start = e820[i].addr;
3928- unsigned long long end = start + e820[i].size;
3929-
3930- /*
3931- * Since "last" is at most 4GB, we know we'll
3932- * fit in 32 bits if this condition is true
3933- */
3934- if (last > end) {
3935- unsigned long gap = last - end;
3936-
3937- if (gap > gapsize) {
3938- gapsize = gap;
3939- gapstart = end;
3940- }
3941- }
3942- if (start < last)
3943- last = start;
3944- }
3945-
3946- /*
3947- * See how much we want to round up: start off with
3948- * rounding to the next 1MB area.
3949- */
3950- round = 0x100000;
3951- while ((gapsize >> 4) > round)
3952- round += round;
3953- /* Fun with two's complement */
3954- pci_mem_start = (gapstart + round) & -round;
3955-
3956- printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
3957- pci_mem_start, gapstart, gapsize);
3958-}
3959-
3960-/*
3961- * Request address space for all standard resources
3962- *
3963- * This is called just before pcibios_init(), which is also a
3964- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
3965- */
3966-static int __init request_standard_resources(void)
3967-{
3968- int i;
3969-
3970- /* Nothing to do if not running in dom0. */
3971- if (!is_initial_xendomain())
3972- return 0;
3973-
3974- printk("Setting up standard PCI resources\n");
3975-#ifdef CONFIG_XEN
3976- legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map,
3977- &code_resource, &data_resource);
3978-#else
3979- if (efi_enabled)
3980- efi_initialize_iomem_resources(&code_resource, &data_resource);
3981- else
3982- legacy_init_iomem_resources(e820.map, e820.nr_map,
3983- &code_resource, &data_resource);
3984-#endif
3985-
3986- /* EFI systems may still have VGA */
3987- request_resource(&iomem_resource, &video_ram_resource);
3988-
3989- /* request I/O space for devices used on all i[345]86 PCs */
3990- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
3991- request_resource(&ioport_resource, &standard_io_resources[i]);
3992- return 0;
3993-}
3994-
3995-subsys_initcall(request_standard_resources);
3996-
3997-static void __init register_memory(void)
3998-{
3999-#ifdef CONFIG_XEN
4000- if (is_initial_xendomain())
4001- e820_setup_gap(machine_e820.map, machine_e820.nr_map);
4002- else
4003-#endif
4004- e820_setup_gap(e820.map, e820.nr_map);
4005-}
4006-
4007 #ifdef CONFIG_MCA
4008 static void set_mca_bus(int x)
4009 {
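e820_setup_gap(), also removed above in favour of the shared e820 code, scans the map backwards from 4G tracking the largest hole, then rounds the PCI window start up to a boundary sized relative to the gap. The "(gapstart + round) & -round" idiom works because -round in two's complement is a mask of everything at and above the round bit. A compact sketch of both steps, with an illustrative map:

#include <stdio.h>

typedef unsigned long long u64;

struct range { u64 addr, size; };

static unsigned long find_gap_start(const struct range *map, int n)
{
	u64 last = 0x100000000ULL;		/* scan down from 4G */
	unsigned long gapstart = 0x10000000, gapsize = 0x400000, round;
	int i;

	for (i = n - 1; i >= 0; i--) {
		u64 end = map[i].addr + map[i].size;

		if (last > end && last - end > gapsize) {
			gapsize = last - end;	/* new biggest hole */
			gapstart = end;
		}
		if (map[i].addr < last)
			last = map[i].addr;
	}

	/* start at 1MB, grow while the gap is more than 16x the rounding */
	round = 0x100000;
	while ((gapsize >> 4) > round)
		round += round;
	return (gapstart + round) & -round;	/* align up to 'round' */
}

int main(void)
{
	struct range map[] = {			/* must be sorted by addr */
		{ 0x0,        0xa0000 },
		{ 0x100000,   0x7ff00000 },	/* RAM up to 2G */
		{ 0xfec00000, 0x1400000 },	/* chipset at top of 4G */
	};
	printf("PCI window at %#lx\n", find_gap_start(map, 3));
	return 0;
}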
4010@@ -1491,6 +534,12 @@ static void set_mca_bus(int x)
4011 static void set_mca_bus(int x) { }
4012 #endif
4013
4014+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
4015+char * __init __attribute__((weak)) memory_setup(void)
4016+{
4017+ return machine_specific_memory_setup();
4018+}
4019+
4020 /*
4021 * Determine if we were loaded by an EFI loader. If so, then we have also been
4022 * passed the efi memmap, systab, etc., so we should use these data structures
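The memory_setup() added above uses the weak-symbol override pattern: the generic file provides an __attribute__((weak)) default, and paravirt.c can supply a strong definition that wins at link time with no #ifdef at the call site. A two-file sketch of the mechanism (GCC/ELF semantics assumed; the strings are illustrative):

/* default.c */
#include <stdio.h>

char * __attribute__((weak)) memory_setup(void)
{
	return "machine_specific_memory_setup";	/* generic fallback */
}

int main(void)
{
	printf("%s\n", memory_setup());
	return 0;
}

/* override.c -- link this in and its strong symbol replaces the weak one:
 *   cc default.c override.c && ./a.out   ->   "paravirt memory_setup"
 */
char *memory_setup(void)
{
	return "paravirt memory_setup";
}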
4023@@ -1578,7 +627,7 @@ void __init setup_arch(char **cmdline_p)
4024 efi_init();
4025 else {
4026 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4027- print_memory_map(machine_specific_memory_setup());
4028+ print_memory_map(memory_setup());
4029 }
4030
4031 copy_edd();
4032@@ -1757,7 +806,7 @@ void __init setup_arch(char **cmdline_p)
4033 get_smp_config();
4034 #endif
4035
4036- register_memory();
4037+ e820_register_memory();
4038
4039 if (is_initial_xendomain()) {
4040 #ifdef CONFIG_VT
4041Index: head-2008-12-01/arch/x86/kernel/smp_32-xen.c
4042===================================================================
4043--- head-2008-12-01.orig/arch/x86/kernel/smp_32-xen.c 2008-12-01 11:29:05.000000000 +0100
4044+++ head-2008-12-01/arch/x86/kernel/smp_32-xen.c 2008-12-01 11:32:38.000000000 +0100
4045@@ -659,6 +659,10 @@ int smp_call_function_single(int cpu, vo
4046 put_cpu();
4047 return -EBUSY;
4048 }
4049+
4050+ /* Can deadlock when called with interrupts disabled */
4051+ WARN_ON(irqs_disabled());
4052+
4053 spin_lock_bh(&call_lock);
4054 __smp_call_function_single(cpu, func, info, nonatomic, wait);
4055 spin_unlock_bh(&call_lock);
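The WARN_ON(irqs_disabled()) added above guards a real deadlock: a synchronous cross-CPU call spins until the target acknowledges, and a caller that waits with interrupts off can no longer service the target's own call IPI, so two CPUs calling each other this way spin forever. A toy sketch of the precondition-guard shape (the primitives are stand-ins, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for the kernel primitives, for illustration only */
static bool irqs_disabled(void) { return false; }

static void warn_on(bool cond, const char *what)
{
	if (cond)
		fprintf(stderr, "WARNING: %s\n", what);
}

/* sketch: check the dangerous precondition up front, before any
 * spinning on the remote CPU's acknowledgement could begin */
static int call_on_cpu(int cpu, void (*func)(void *), void *info)
{
	warn_on(irqs_disabled(), "sync IPI with interrupts disabled");
	func(info);	/* pretend the target CPU ran it and acked */
	return 0;
}

static void hello(void *arg) { puts(arg); }

int main(void)
{
	return call_on_cpu(1, hello, "ran on cpu 1");
}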
4056Index: head-2008-12-01/arch/x86/kernel/time_32-xen.c
4057===================================================================
4058--- head-2008-12-01.orig/arch/x86/kernel/time_32-xen.c 2008-12-01 11:29:05.000000000 +0100
4059+++ head-2008-12-01/arch/x86/kernel/time_32-xen.c 2008-12-01 11:32:38.000000000 +0100
4060@@ -61,6 +61,7 @@
4061 #include <asm/uaccess.h>
4062 #include <asm/processor.h>
4063 #include <asm/timer.h>
4064+#include <asm/time.h>
4065 #include <asm/sections.h>
4066
4067 #include "mach_time.h"
4068@@ -129,11 +130,11 @@ static DEFINE_PER_CPU(struct vcpu_runsta
4069 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
4070 #define NS_PER_TICK (1000000000LL/HZ)
4071
4072-static void __clock_was_set(void *unused)
4073+static void __clock_was_set(struct work_struct *unused)
4074 {
4075 clock_was_set();
4076 }
4077-static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL);
4078+static DECLARE_WORK(clock_was_set_work, __clock_was_set);
4079
4080 /*
4081 * GCC 4.3 can turn loops over an induction variable into division. We do
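The __clock_was_set() change above tracks the 2.6.20 workqueue rework: callbacks now receive the struct work_struct itself instead of an opaque void pointer, and per-item context is recovered with container_of() from an embedding structure. A userspace sketch of the new convention (the rtc_sync struct is illustrative):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {
	void (*func)(struct work_struct *);
};

/* context now lives in a structure that embeds the work item */
struct rtc_sync {
	struct work_struct work;
	long delta_ns;
};

static void do_sync(struct work_struct *w)
{
	struct rtc_sync *s = container_of(w, struct rtc_sync, work);

	printf("syncing, delta=%ld ns\n", s->delta_ns);
}

int main(void)
{
	struct rtc_sync s = { .work = { .func = do_sync }, .delta_ns = 42 };

	s.work.func(&s.work);	/* what the workqueue thread would do */
	return 0;
}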
4082@@ -544,10 +545,7 @@ static int set_rtc_mmss(unsigned long no
4083 /* gets recalled with irq locally disabled */
4084 /* XXX - does irqsave resolve this? -johnstul */
4085 spin_lock_irqsave(&rtc_lock, flags);
4086- if (efi_enabled)
4087- retval = efi_set_rtc_mmss(nowtime);
4088- else
4089- retval = mach_set_rtc_mmss(nowtime);
4090+ retval = set_wallclock(nowtime);
4091 spin_unlock_irqrestore(&rtc_lock, flags);
4092
4093 return retval;
4094@@ -874,10 +872,7 @@ unsigned long get_cmos_time(void)
4095
4096 spin_lock_irqsave(&rtc_lock, flags);
4097
4098- if (efi_enabled)
4099- retval = efi_get_time();
4100- else
4101- retval = mach_get_cmos_time();
4102+ retval = get_wallclock();
4103
4104 spin_unlock_irqrestore(&rtc_lock, flags);
4105
4106@@ -979,7 +974,7 @@ static void __init hpet_time_init(void)
4107 printk("Using HPET for base-timer\n");
4108 }
4109
4110- time_init_hook();
4111+ do_time_init();
4112 }
4113 #endif
4114
4115Index: head-2008-12-01/arch/x86/kernel/traps_32-xen.c
4116===================================================================
4117--- head-2008-12-01.orig/arch/x86/kernel/traps_32-xen.c 2008-12-01 11:29:05.000000000 +0100
4118+++ head-2008-12-01/arch/x86/kernel/traps_32-xen.c 2008-12-01 11:32:38.000000000 +0100
4119@@ -29,6 +29,8 @@
4120 #include <linux/kexec.h>
4121 #include <linux/unwind.h>
4122 #include <linux/uaccess.h>
4123+#include <linux/nmi.h>
4124+#include <linux/bug.h>
4125
4126 #ifdef CONFIG_EISA
4127 #include <linux/ioport.h>
4128@@ -61,9 +63,6 @@ int panic_on_unrecovered_nmi;
4129
4130 asmlinkage int system_call(void);
4131
4132-struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
4133- { 0, 0 }, { 0, 0 } };
4134-
4135 /* Do we ignore FPU interrupts ? */
4136 char ignore_fpu_irq = 0;
4137
4138@@ -100,12 +99,7 @@ asmlinkage void fixup_4gb_segment(void);
4139 #endif
4140 asmlinkage void machine_check(void);
4141
4142-static int kstack_depth_to_print = 24;
4143-#ifdef CONFIG_STACK_UNWIND
4144-static int call_trace = 1;
4145-#else
4146-#define call_trace (-1)
4147-#endif
4148+int kstack_depth_to_print = 24;
4149 ATOMIC_NOTIFIER_HEAD(i386die_chain);
4150
4151 int register_die_notifier(struct notifier_block *nb)
4152@@ -159,25 +153,7 @@ static inline unsigned long print_contex
4153 return ebp;
4154 }
4155
4156-struct ops_and_data {
4157- struct stacktrace_ops *ops;
4158- void *data;
4159-};
4160-
4161-static asmlinkage int
4162-dump_trace_unwind(struct unwind_frame_info *info, void *data)
4163-{
4164- struct ops_and_data *oad = (struct ops_and_data *)data;
4165- int n = 0;
4166-
4167- while (unwind(info) == 0 && UNW_PC(info)) {
4168- n++;
4169- oad->ops->address(oad->data, UNW_PC(info));
4170- if (arch_unw_user_mode(info))
4171- break;
4172- }
4173- return n;
4174-}
4175+#define MSG(msg) ops->warning(data, msg)
4176
4177 void dump_trace(struct task_struct *task, struct pt_regs *regs,
4178 unsigned long *stack,
4179@@ -188,39 +164,6 @@ void dump_trace(struct task_struct *task
4180 if (!task)
4181 task = current;
4182
4183- if (call_trace >= 0) {
4184- int unw_ret = 0;
4185- struct unwind_frame_info info;
4186- struct ops_and_data oad = { .ops = ops, .data = data };
4187-
4188- if (regs) {
4189- if (unwind_init_frame_info(&info, task, regs) == 0)
4190- unw_ret = dump_trace_unwind(&info, &oad);
4191- } else if (task == current)
4192- unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
4193- else {
4194- if (unwind_init_blocked(&info, task) == 0)
4195- unw_ret = dump_trace_unwind(&info, &oad);
4196- }
4197- if (unw_ret > 0) {
4198- if (call_trace == 1 && !arch_unw_user_mode(&info)) {
4199- ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
4200- UNW_PC(&info));
4201- if (UNW_SP(&info) >= PAGE_OFFSET) {
4202- ops->warning(data, "Leftover inexact backtrace:\n");
4203- stack = (void *)UNW_SP(&info);
4204- if (!stack)
4205- return;
4206- ebp = UNW_FP(&info);
4207- } else
4208- ops->warning(data, "Full inexact backtrace again:\n");
4209- } else if (call_trace >= 1)
4210- return;
4211- else
4212- ops->warning(data, "Full inexact backtrace again:\n");
4213- } else
4214- ops->warning(data, "Inexact backtrace:\n");
4215- }
4216 if (!stack) {
4217 unsigned long dummy;
4218 stack = &dummy;
4219@@ -253,6 +196,7 @@ void dump_trace(struct task_struct *task
4220 stack = (unsigned long*)context->previous_esp;
4221 if (!stack)
4222 break;
4223+ touch_nmi_watchdog();
4224 }
4225 }
4226 EXPORT_SYMBOL(dump_trace);
4227@@ -385,7 +329,7 @@ void show_registers(struct pt_regs *regs
4228 * time of the fault..
4229 */
4230 if (in_kernel) {
4231- u8 __user *eip;
4232+ u8 *eip;
4233 int code_bytes = 64;
4234 unsigned char c;
4235
4236@@ -394,18 +338,20 @@ void show_registers(struct pt_regs *regs
4237
4238 printk(KERN_EMERG "Code: ");
4239
4240- eip = (u8 __user *)regs->eip - 43;
4241- if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
4242+ eip = (u8 *)regs->eip - 43;
4243+ if (eip < (u8 *)PAGE_OFFSET ||
4244+ probe_kernel_address(eip, c)) {
4245 /* try starting at EIP */
4246- eip = (u8 __user *)regs->eip;
4247+ eip = (u8 *)regs->eip;
4248 code_bytes = 32;
4249 }
4250 for (i = 0; i < code_bytes; i++, eip++) {
4251- if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
4252+ if (eip < (u8 *)PAGE_OFFSET ||
4253+ probe_kernel_address(eip, c)) {
4254 printk(" Bad EIP value.");
4255 break;
4256 }
4257- if (eip == (u8 __user *)regs->eip)
4258+ if (eip == (u8 *)regs->eip)
4259 printk("<%02x> ", c);
4260 else
4261 printk("%02x ", c);
4262@@ -414,43 +360,22 @@ void show_registers(struct pt_regs *regs
4263 printk("\n");
4264 }
4265
4266-static void handle_BUG(struct pt_regs *regs)
4267+int is_valid_bugaddr(unsigned long eip)
4268 {
4269- unsigned long eip = regs->eip;
4270 unsigned short ud2;
4271
4272 if (eip < PAGE_OFFSET)
4273- return;
4274- if (probe_kernel_address((unsigned short __user *)eip, ud2))
4275- return;
4276- if (ud2 != 0x0b0f)
4277- return;
4278+ return 0;
4279+ if (probe_kernel_address((unsigned short *)eip, ud2))
4280+ return 0;
4281
4282- printk(KERN_EMERG "------------[ cut here ]------------\n");
4283-
4284-#ifdef CONFIG_DEBUG_BUGVERBOSE
4285- do {
4286- unsigned short line;
4287- char *file;
4288- char c;
4289-
4290- if (probe_kernel_address((unsigned short __user *)(eip + 2),
4291- line))
4292- break;
4293- if (__get_user(file, (char * __user *)(eip + 4)) ||
4294- (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
4295- file = "<bad filename>";
4296-
4297- printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
4298- return;
4299- } while (0);
4300-#endif
4301- printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
4302+ return ud2 == 0x0b0f;
4303 }
4304
4305-/* This is gone through when something in the kernel
4306- * has done something bad and is about to be terminated.
4307-*/
4308+/*
4309+ * This is gone through when something in the kernel has done something bad and
4310+ * is about to be terminated.
4311+ */
4312 void die(const char * str, struct pt_regs * regs, long err)
4313 {
4314 static struct {
4315@@ -458,7 +383,7 @@ void die(const char * str, struct pt_reg
4316 u32 lock_owner;
4317 int lock_owner_depth;
4318 } die = {
4319- .lock = SPIN_LOCK_UNLOCKED,
4320+ .lock = __SPIN_LOCK_UNLOCKED(die.lock),
4321 .lock_owner = -1,
4322 .lock_owner_depth = 0
4323 };
4324@@ -482,7 +407,8 @@ void die(const char * str, struct pt_reg
4325 unsigned long esp;
4326 unsigned short ss;
4327
4328- handle_BUG(regs);
4329+ report_bug(regs->eip);
4330+
4331 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
4332 #ifdef CONFIG_PREEMPT
4333 printk(KERN_EMERG "PREEMPT ");
4334@@ -682,8 +608,7 @@ mem_parity_error(unsigned char reason, s
4335 {
4336 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
4337 "CPU %d.\n", reason, smp_processor_id());
4338- printk(KERN_EMERG "You probably have a hardware problem with your RAM "
4339- "chips\n");
4340+ printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
4341 if (panic_on_unrecovered_nmi)
4342 panic("NMI: Not continuing");
4343
4344@@ -741,7 +666,6 @@ void __kprobes die_nmi(struct pt_regs *r
4345 printk(" on CPU%d, eip %08lx, registers:\n",
4346 smp_processor_id(), regs->eip);
4347 show_registers(regs);
4348- printk(KERN_EMERG "console shuts up ...\n");
4349 console_silent();
4350 spin_unlock(&nmi_print_lock);
4351 bust_spinlocks(0);
4352@@ -1057,49 +981,24 @@ fastcall void do_spurious_interrupt_bug(
4353 #endif
4354 }
4355
4356-fastcall void setup_x86_bogus_stack(unsigned char * stk)
4357+fastcall unsigned long patch_espfix_desc(unsigned long uesp,
4358+ unsigned long kesp)
4359 {
4360- unsigned long *switch16_ptr, *switch32_ptr;
4361- struct pt_regs *regs;
4362- unsigned long stack_top, stack_bot;
4363- unsigned short iret_frame16_off;
4364- int cpu = smp_processor_id();
4365- /* reserve the space on 32bit stack for the magic switch16 pointer */
4366- memmove(stk, stk + 8, sizeof(struct pt_regs));
4367- switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
4368- regs = (struct pt_regs *)stk;
4369- /* now the switch32 on 16bit stack */
4370- stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
4371- stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
4372- switch32_ptr = (unsigned long *)(stack_top - 8);
4373- iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
4374- /* copy iret frame on 16bit stack */
4375- memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
4376- /* fill in the switch pointers */
4377- switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
4378- switch16_ptr[1] = __ESPFIX_SS;
4379- switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
4380- 8 - CPU_16BIT_STACK_SIZE;
4381- switch32_ptr[1] = __KERNEL_DS;
4382-}
4383-
4384-fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
4385-{
4386- unsigned long *switch32_ptr;
4387- unsigned char *stack16, *stack32;
4388- unsigned long stack_top, stack_bot;
4389- int len;
4390 int cpu = smp_processor_id();
4391- stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
4392- stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
4393- switch32_ptr = (unsigned long *)(stack_top - 8);
4394- /* copy the data from 16bit stack to 32bit stack */
4395- len = CPU_16BIT_STACK_SIZE - 8 - sp;
4396- stack16 = (unsigned char *)(stack_bot + sp);
4397- stack32 = (unsigned char *)
4398- (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
4399- memcpy(stack32, stack16, len);
4400- return stack32;
4401+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4402+ struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
4403+ unsigned long base = (kesp - uesp) & -THREAD_SIZE;
4404+ unsigned long new_kesp = kesp - base;
4405+ unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
4406+ __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
4407+ /* Set up base for espfix segment */
4408+ desc &= 0x00f0ff0000000000ULL;
4409+ desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
4410+ ((((__u64)base) << 32) & 0xff00000000000000ULL) |
4411+ ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
4412+ (lim_pages & 0xffff);
4413+ *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
4414+ return new_kesp;
4415 }
4416 #endif
4417
4418@@ -1113,7 +1012,7 @@ fastcall unsigned char * fixup_x86_bogus
4419 * Must be called with kernel preemption disabled (in this case,
4420 * local interrupts are disabled at the call-site in entry.S).
4421 */
4422-asmlinkage void math_state_restore(struct pt_regs regs)
4423+asmlinkage void math_state_restore(void)
4424 {
4425 struct thread_info *thread = current_thread_info();
4426 struct task_struct *tsk = thread->task;
4427@@ -1123,6 +1022,7 @@ asmlinkage void math_state_restore(struc
4428 init_fpu(tsk);
4429 restore_fpu(tsk);
4430 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
4431+ tsk->fpu_counter++;
4432 }
4433
4434 #ifndef CONFIG_MATH_EMULATION
4435@@ -1234,19 +1134,3 @@ static int __init kstack_setup(char *s)
4436 return 1;
4437 }
4438 __setup("kstack=", kstack_setup);
4439-
4440-#ifdef CONFIG_STACK_UNWIND
4441-static int __init call_trace_setup(char *s)
4442-{
4443- if (strcmp(s, "old") == 0)
4444- call_trace = -1;
4445- else if (strcmp(s, "both") == 0)
4446- call_trace = 0;
4447- else if (strcmp(s, "newfallback") == 0)
4448- call_trace = 1;
4449- else if (strcmp(s, "new") == 2)
4450- call_trace = 2;
4451- return 1;
4452-}
4453-__setup("call_trace=", call_trace_setup);
4454-#endif
4455Index: head-2008-12-01/arch/x86/kernel/vmlinux_32.lds.S
4456===================================================================
4457--- head-2008-12-01.orig/arch/x86/kernel/vmlinux_32.lds.S 2008-12-01 10:53:14.000000000 +0100
4458+++ head-2008-12-01/arch/x86/kernel/vmlinux_32.lds.S 2008-12-01 11:32:38.000000000 +0100
4459@@ -29,6 +29,12 @@ PHDRS {
4460 SECTIONS
4461 {
4462 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
4463+
4464+#if defined(CONFIG_XEN) && CONFIG_XEN_COMPAT <= 0x030002
4465+#undef LOAD_OFFSET
4466+#define LOAD_OFFSET 0
4467+#endif
4468+
4469 phys_startup_32 = startup_32 - LOAD_OFFSET;
4470
4471 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
4472Index: head-2008-12-01/arch/x86/kvm/Kconfig
4473===================================================================
4474--- head-2008-12-01.orig/arch/x86/kvm/Kconfig 2008-12-01 10:53:14.000000000 +0100
4475+++ head-2008-12-01/arch/x86/kvm/Kconfig 2008-12-01 11:32:38.000000000 +0100
4476@@ -7,6 +7,7 @@ config HAVE_KVM
4477 menuconfig VIRTUALIZATION
4478 bool "Virtualization"
4479 depends on HAVE_KVM || X86
4480+ depends on !XEN
4481 default y
4482 ---help---
4483 Say Y here to get to see options for using your Linux host to run other
4484Index: head-2008-12-01/arch/x86/mm/fault_32-xen.c
4485===================================================================
4486--- head-2008-12-01.orig/arch/x86/mm/fault_32-xen.c 2008-12-01 11:29:05.000000000 +0100
4487+++ head-2008-12-01/arch/x86/mm/fault_32-xen.c 2008-12-01 11:32:38.000000000 +0100
4488@@ -22,9 +22,9 @@
4489 #include <linux/highmem.h>
4490 #include <linux/module.h>
4491 #include <linux/kprobes.h>
4492+#include <linux/uaccess.h>
4493
4494 #include <asm/system.h>
4495-#include <asm/uaccess.h>
4496 #include <asm/desc.h>
4497 #include <asm/kdebug.h>
4498 #include <asm/segment.h>
4499@@ -167,7 +167,7 @@ static inline unsigned long get_segment_
4500 static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
4501 {
4502 unsigned long limit;
4503- unsigned long instr = get_segment_eip (regs, &limit);
4504+ unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
4505 int scan_more = 1;
4506 int prefetch = 0;
4507 int i;
4508@@ -177,9 +177,9 @@ static int __is_prefetch(struct pt_regs
4509 unsigned char instr_hi;
4510 unsigned char instr_lo;
4511
4512- if (instr > limit)
4513+ if (instr > (unsigned char *)limit)
4514 break;
4515- if (__get_user(opcode, (unsigned char __user *) instr))
4516+ if (probe_kernel_address(instr, opcode))
4517 break;
4518
4519 instr_hi = opcode & 0xf0;
4520@@ -204,9 +204,9 @@ static int __is_prefetch(struct pt_regs
4521 case 0x00:
4522 /* Prefetch instruction is 0x0F0D or 0x0F18 */
4523 scan_more = 0;
4524- if (instr > limit)
4525+ if (instr > (unsigned char *)limit)
4526 break;
4527- if (__get_user(opcode, (unsigned char __user *) instr))
4528+ if (probe_kernel_address(instr, opcode))
4529 break;
4530 prefetch = (instr_lo == 0xF) &&
4531 (opcode == 0x0D || opcode == 0x18);
4532Index: head-2008-12-01/arch/x86/mm/highmem_32-xen.c
4533===================================================================
4534--- head-2008-12-01.orig/arch/x86/mm/highmem_32-xen.c 2008-12-01 11:29:05.000000000 +0100
4535+++ head-2008-12-01/arch/x86/mm/highmem_32-xen.c 2008-12-01 11:32:38.000000000 +0100
4536@@ -32,7 +32,7 @@ static void *__kmap_atomic(struct page *
4537 unsigned long vaddr;
4538
4539 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
4540- inc_preempt_count();
4541+ pagefault_disable();
4542 if (!PageHighMem(page))
4543 return page_address(page);
4544
4545@@ -63,26 +63,22 @@ void kunmap_atomic(void *kvaddr, enum km
4546 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
4547 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
4548
4549-#ifdef CONFIG_DEBUG_HIGHMEM
4550- if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
4551- dec_preempt_count();
4552- preempt_check_resched();
4553- return;
4554- }
4555-
4556- if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
4557- BUG();
4558-#endif
4559 /*
4560 * Force other mappings to Oops if they'll try to access this pte
4561 * without first remap it. Keeping stale mappings around is a bad idea
4562 * also, in case the page changes cacheability attributes or becomes
4563 * a protected page in a hypervisor.
4564 */
4565- kpte_clear_flush(kmap_pte-idx, vaddr);
4566+ if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
4567+ kpte_clear_flush(kmap_pte-idx, vaddr);
4568+ else {
4569+#ifdef CONFIG_DEBUG_HIGHMEM
4570+ BUG_ON(vaddr < PAGE_OFFSET);
4571+ BUG_ON(vaddr >= (unsigned long)high_memory);
4572+#endif
4573+ }
4574
4575- dec_preempt_count();
4576- preempt_check_resched();
4577+ pagefault_enable();
4578 }
4579
4580 /* This is the same as kmap_atomic() but can map memory that doesn't
4581@@ -93,7 +89,7 @@ void *kmap_atomic_pfn(unsigned long pfn,
4582 enum fixed_addresses idx;
4583 unsigned long vaddr;
4584
4585- inc_preempt_count();
4586+ pagefault_disable();
4587
4588 idx = type + KM_TYPE_NR*smp_processor_id();
4589 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
4590Index: head-2008-12-01/arch/x86/mm/init_32-xen.c
4591===================================================================
4592--- head-2008-12-01.orig/arch/x86/mm/init_32-xen.c 2008-12-01 11:29:05.000000000 +0100
4593+++ head-2008-12-01/arch/x86/mm/init_32-xen.c 2008-12-01 11:32:38.000000000 +0100
4594@@ -233,8 +233,6 @@ static inline int page_kills_ppro(unsign
4595
4596 #endif
4597
4598-extern int is_available_memory(efi_memory_desc_t *);
4599-
4600 int page_is_ram(unsigned long pagenr)
4601 {
4602 int i;
4603@@ -327,7 +325,7 @@ void __init add_one_highpage_init(struct
4604 SetPageReserved(page);
4605 }
4606
4607-static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
4608+static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
4609 {
4610 free_new_highpage(page, pfn);
4611 totalram_pages++;
4612@@ -344,7 +342,7 @@ static int add_one_highpage_hotplug(stru
4613 * has been added dynamically that would be
4614 * onlined here is in HIGHMEM
4615 */
4616-void online_page(struct page *page)
4617+void __meminit online_page(struct page *page)
4618 {
4619 ClearPageReserved(page);
4620 add_one_highpage_hotplug(page, page_to_pfn(page));
4621@@ -732,16 +730,10 @@ void __init mem_init(void)
4622 set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
4623 }
4624
4625-/*
4626- * this is for the non-NUMA, single node SMP system case.
4627- * Specifically, in the case of x86, we will always add
4628- * memory to the highmem for now.
4629- */
4630 #ifdef CONFIG_MEMORY_HOTPLUG
4631-#ifndef CONFIG_NEED_MULTIPLE_NODES
4632 int arch_add_memory(int nid, u64 start, u64 size)
4633 {
4634- struct pglist_data *pgdata = &contig_page_data;
4635+ struct pglist_data *pgdata = NODE_DATA(nid);
4636 struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
4637 unsigned long start_pfn = start >> PAGE_SHIFT;
4638 unsigned long nr_pages = size >> PAGE_SHIFT;
4639@@ -753,11 +745,11 @@ int remove_memory(u64 start, u64 size)
4640 {
4641 return -EINVAL;
4642 }
4643-#endif
4644+EXPORT_SYMBOL_GPL(remove_memory);
4645 #endif
4646
4647-kmem_cache_t *pgd_cache;
4648-kmem_cache_t *pmd_cache;
4649+struct kmem_cache *pgd_cache;
4650+struct kmem_cache *pmd_cache;
4651
4652 void __init pgtable_cache_init(void)
4653 {
4654Index: head-2008-12-01/arch/x86/mm/pgtable_32-xen.c
4655===================================================================
4656--- head-2008-12-01.orig/arch/x86/mm/pgtable_32-xen.c 2008-12-01 11:29:05.000000000 +0100
4657+++ head-2008-12-01/arch/x86/mm/pgtable_32-xen.c 2008-12-01 11:32:38.000000000 +0100
4658@@ -203,7 +203,7 @@ void pte_free(struct page *pte)
4659 __free_page(pte);
4660 }
4661
4662-void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
4663+void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
4664 {
4665 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
4666 }
4667@@ -243,7 +243,7 @@ static inline void pgd_list_del(pgd_t *p
4668 set_page_private(next, (unsigned long)pprev);
4669 }
4670
4671-void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
4672+void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
4673 {
4674 unsigned long flags;
4675
4676@@ -264,7 +264,7 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
4677 }
4678
4679 /* never called when PTRS_PER_PMD > 1 */
4680-void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
4681+void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
4682 {
4683 unsigned long flags; /* can be called from interrupt context */
4684
4685Index: head-2008-12-01/arch/x86/pci/irq-xen.c
4686===================================================================
4687--- head-2008-12-01.orig/arch/x86/pci/irq-xen.c 2008-12-01 11:29:05.000000000 +0100
4688+++ head-2008-12-01/arch/x86/pci/irq-xen.c 2008-12-01 11:32:38.000000000 +0100
4689@@ -768,7 +768,7 @@ static void __init pirq_find_router(stru
4690 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
4691 rt->rtr_vendor, rt->rtr_device);
4692
4693- pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
4694+ pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
4695 if (!pirq_router_dev) {
4696 DBG(KERN_DEBUG "PCI: Interrupt router not found at "
4697 "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
4698@@ -788,6 +788,8 @@ static void __init pirq_find_router(stru
4699 pirq_router_dev->vendor,
4700 pirq_router_dev->device,
4701 pci_name(pirq_router_dev));
4702+
4703+ /* The device remains referenced for the kernel lifetime */
4704 }
4705
4706 static struct irq_info *pirq_get_info(struct pci_dev *dev)
4707Index: head-2008-12-01/arch/x86/kernel/entry_64-xen.S
4708===================================================================
4709--- head-2008-12-01.orig/arch/x86/kernel/entry_64-xen.S 2008-12-01 11:29:05.000000000 +0100
4710+++ head-2008-12-01/arch/x86/kernel/entry_64-xen.S 2008-12-01 11:32:38.000000000 +0100
4711@@ -261,7 +261,6 @@ ENTRY(system_call)
4712 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
4713 GET_THREAD_INFO(%rcx)
4714 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
4715- CFI_REMEMBER_STATE
4716 jnz tracesys
4717 cmpq $__NR_syscall_max,%rax
4718 ja badsys
4719@@ -272,7 +271,6 @@ ENTRY(system_call)
4720 * Syscall return path ending with SYSRET (fast path)
4721 * Has incomplete stack frame and undefined top of stack.
4722 */
4723- .globl ret_from_sys_call
4724 ret_from_sys_call:
4725 movl $_TIF_ALLWORK_MASK,%edi
4726 /* edi: flagmask */
4727@@ -282,8 +280,8 @@ sysret_check:
4728 TRACE_IRQS_OFF
4729 movl threadinfo_flags(%rcx),%edx
4730 andl %edi,%edx
4731- CFI_REMEMBER_STATE
4732 jnz sysret_careful
4733+ CFI_REMEMBER_STATE
4734 /*
4735 * sysretq will re-enable interrupts:
4736 */
4737@@ -292,10 +290,10 @@ sysret_check:
4738 RESTORE_ARGS 0,8,0
4739 HYPERVISOR_IRET VGCF_IN_SYSCALL
4740
4741+ CFI_RESTORE_STATE
4742 /* Handle reschedules */
4743 /* edx: work, edi: workmask */
4744 sysret_careful:
4745- CFI_RESTORE_STATE
4746 bt $TIF_NEED_RESCHED,%edx
4747 jnc sysret_signal
4748 TRACE_IRQS_ON
4749@@ -334,7 +332,6 @@ badsys:
4750
4751 /* Do syscall tracing */
4752 tracesys:
4753- CFI_RESTORE_STATE
4754 SAVE_REST
4755 movq $-ENOSYS,RAX(%rsp)
4756 FIXUP_TOP_OF_STACK %rdi
4757@@ -350,32 +347,13 @@ tracesys:
4758 call *sys_call_table(,%rax,8)
4759 1: movq %rax,RAX-ARGOFFSET(%rsp)
4760 /* Use IRET because user could have changed frame */
4761- jmp int_ret_from_sys_call
4762- CFI_ENDPROC
4763-END(system_call)
4764
4765 /*
4766 * Syscall return path ending with IRET.
4767 * Has correct top of stack, but partial stack frame.
4768- */
4769-ENTRY(int_ret_from_sys_call)
4770- CFI_STARTPROC simple
4771- CFI_SIGNAL_FRAME
4772- CFI_DEF_CFA rsp,SS+8-ARGOFFSET
4773- /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
4774- CFI_REL_OFFSET rsp,RSP-ARGOFFSET
4775- /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
4776- /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/
4777- CFI_REL_OFFSET rip,RIP-ARGOFFSET
4778- CFI_REL_OFFSET rdx,RDX-ARGOFFSET
4779- CFI_REL_OFFSET rcx,RCX-ARGOFFSET
4780- CFI_REL_OFFSET rax,RAX-ARGOFFSET
4781- CFI_REL_OFFSET rdi,RDI-ARGOFFSET
4782- CFI_REL_OFFSET rsi,RSI-ARGOFFSET
4783- CFI_REL_OFFSET r8,R8-ARGOFFSET
4784- CFI_REL_OFFSET r9,R9-ARGOFFSET
4785- CFI_REL_OFFSET r10,R10-ARGOFFSET
4786- CFI_REL_OFFSET r11,R11-ARGOFFSET
4787+ */
4788+ .globl int_ret_from_sys_call
4789+int_ret_from_sys_call:
4790 XEN_BLOCK_EVENTS(%rsi)
4791 TRACE_IRQS_OFF
4792 testb $3,CS-ARGOFFSET(%rsp)
4793@@ -428,8 +406,6 @@ int_very_careful:
4794 popq %rdi
4795 CFI_ADJUST_CFA_OFFSET -8
4796 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
4797- XEN_BLOCK_EVENTS(%rsi)
4798- TRACE_IRQS_OFF
4799 jmp int_restore_rest
4800
4801 int_signal:
4802@@ -445,7 +421,7 @@ int_restore_rest:
4803 TRACE_IRQS_OFF
4804 jmp int_with_check
4805 CFI_ENDPROC
4806-END(int_ret_from_sys_call)
4807+END(system_call)
4808
4809 /*
4810 * Certain special system calls that need to save a complete full stack frame.
4811@@ -1275,36 +1251,3 @@ ENTRY(call_softirq)
4812 ret
4813 CFI_ENDPROC
4814 ENDPROC(call_softirq)
4815-
4816-#ifdef CONFIG_STACK_UNWIND
4817-ENTRY(arch_unwind_init_running)
4818- CFI_STARTPROC
4819- movq %r15, R15(%rdi)
4820- movq %r14, R14(%rdi)
4821- xchgq %rsi, %rdx
4822- movq %r13, R13(%rdi)
4823- movq %r12, R12(%rdi)
4824- xorl %eax, %eax
4825- movq %rbp, RBP(%rdi)
4826- movq %rbx, RBX(%rdi)
4827- movq (%rsp), %rcx
4828- movq %rax, R11(%rdi)
4829- movq %rax, R10(%rdi)
4830- movq %rax, R9(%rdi)
4831- movq %rax, R8(%rdi)
4832- movq %rax, RAX(%rdi)
4833- movq %rax, RCX(%rdi)
4834- movq %rax, RDX(%rdi)
4835- movq %rax, RSI(%rdi)
4836- movq %rax, RDI(%rdi)
4837- movq %rax, ORIG_RAX(%rdi)
4838- movq %rcx, RIP(%rdi)
4839- leaq 8(%rsp), %rcx
4840- movq $__KERNEL_CS, CS(%rdi)
4841- movq %rax, EFLAGS(%rdi)
4842- movq %rcx, RSP(%rdi)
4843- movq $__KERNEL_DS, SS(%rdi)
4844- jmpq *%rdx
4845- CFI_ENDPROC
4846-ENDPROC(arch_unwind_init_running)
4847-#endif
4848Index: head-2008-12-01/arch/x86/kernel/genapic_64-xen.c
4849===================================================================
4850--- head-2008-12-01.orig/arch/x86/kernel/genapic_64-xen.c 2007-06-12 13:13:01.000000000 +0200
4851+++ head-2008-12-01/arch/x86/kernel/genapic_64-xen.c 2008-12-01 11:32:38.000000000 +0100
4852@@ -34,6 +34,7 @@ extern struct genapic apic_physflat;
4853
4854 #ifndef CONFIG_XEN
4855 struct genapic *genapic = &apic_flat;
4856+struct genapic *genapic_force;
4857 #else
4858 extern struct genapic apic_xen;
4859 struct genapic *genapic = &apic_xen;
4860@@ -52,6 +53,13 @@ void __init clustered_apic_check(void)
4861 u8 cluster_cnt[NUM_APIC_CLUSTERS];
4862 int max_apic = 0;
4863
4864+ /* genapic selection can be forced because of certain quirks.
4865+ */
4866+ if (genapic_force) {
4867+ genapic = genapic_force;
4868+ goto print;
4869+ }
4870+
4871 #if defined(CONFIG_ACPI)
4872 /*
4873 * Some x86_64 machines use physical APIC mode regardless of how many
4874Index: head-2008-12-01/arch/x86/kernel/head64-xen.c
4875===================================================================
4876--- head-2008-12-01.orig/arch/x86/kernel/head64-xen.c 2008-12-01 11:29:05.000000000 +0100
4877+++ head-2008-12-01/arch/x86/kernel/head64-xen.c 2008-12-01 11:32:38.000000000 +0100
4878@@ -104,7 +104,10 @@ void __init x86_64_start_kernel(char * r
4879 machine_to_phys_order++;
4880
4881 #if 0
4882- for (i = 0; i < 256; i++)
4883+ /* clear bss before set_intr_gate with early_idt_handler */
4884+ clear_bss();
4885+
4886+ for (i = 0; i < IDT_ENTRIES; i++)
4887 set_intr_gate(i, early_idt_handler);
4888 asm volatile("lidt %0" :: "m" (idt_descr));
4889 #endif
4890Index: head-2008-12-01/arch/x86/kernel/io_apic_64-xen.c
4891===================================================================
4892--- head-2008-12-01.orig/arch/x86/kernel/io_apic_64-xen.c 2008-12-01 11:29:05.000000000 +0100
4893+++ head-2008-12-01/arch/x86/kernel/io_apic_64-xen.c 2008-12-01 11:32:38.000000000 +0100
4894@@ -202,14 +202,20 @@ static struct IO_APIC_route_entry ioapic
4895 * the interrupt, and we need to make sure the entry is fully populated
4896 * before that happens.
4897 */
4898-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
4899+static void
4900+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
4901 {
4902- unsigned long flags;
4903 union entry_union eu;
4904 eu.entry = e;
4905- spin_lock_irqsave(&ioapic_lock, flags);
4906 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
4907 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
4908+}
4909+
4910+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
4911+{
4912+ unsigned long flags;
4913+ spin_lock_irqsave(&ioapic_lock, flags);
4914+ __ioapic_write_entry(apic, pin, e);
4915 spin_unlock_irqrestore(&ioapic_lock, flags);
4916 }
4917
4918@@ -720,6 +726,22 @@ static int assign_irq_vector(int irq, cp
4919 }
4920
4921 #ifndef CONFIG_XEN
4922+static void __clear_irq_vector(int irq)
4923+{
4924+ cpumask_t mask;
4925+ int cpu, vector;
4926+
4927+ BUG_ON(!irq_vector[irq]);
4928+
4929+ vector = irq_vector[irq];
4930+ cpus_and(mask, irq_domain[irq], cpu_online_map);
4931+ for_each_cpu_mask(cpu, mask)
4932+ per_cpu(vector_irq, cpu)[vector] = -1;
4933+
4934+ irq_vector[irq] = 0;
4935+ irq_domain[irq] = CPU_MASK_NONE;
4936+}
4937+
4938 void __setup_vector_irq(int cpu)
4939 {
4940 /* Initialize vector_irq on a new cpu */
4941@@ -767,26 +789,65 @@ static void ioapic_register_intr(int irq
4942 #define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq)
4943 #endif /* !CONFIG_XEN */
4944
4945-static void __init setup_IO_APIC_irqs(void)
4946+static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
4947 {
4948 struct IO_APIC_route_entry entry;
4949- int apic, pin, idx, irq, first_notcon = 1, vector;
4950+ int vector;
4951 unsigned long flags;
4952
4953- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
4954
4955- for (apic = 0; apic < nr_ioapics; apic++) {
4956- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
4957+ /*
4958+ * add it to the IO-APIC irq-routing table:
4959+ */
4960+ memset(&entry,0,sizeof(entry));
4961
4962- /*
4963- * add it to the IO-APIC irq-routing table:
4964- */
4965- memset(&entry,0,sizeof(entry));
4966+ entry.delivery_mode = INT_DELIVERY_MODE;
4967+ entry.dest_mode = INT_DEST_MODE;
4968+ entry.mask = 0; /* enable IRQ */
4969+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
4970
4971- entry.delivery_mode = INT_DELIVERY_MODE;
4972- entry.dest_mode = INT_DEST_MODE;
4973- entry.mask = 0; /* enable IRQ */
4974+ entry.trigger = irq_trigger(idx);
4975+ entry.polarity = irq_polarity(idx);
4976+
4977+ if (irq_trigger(idx)) {
4978+ entry.trigger = 1;
4979+ entry.mask = 1;
4980 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
4981+ }
4982+
4983+ if (/* !apic && */ !IO_APIC_IRQ(irq))
4984+ return;
4985+
4986+ if (IO_APIC_IRQ(irq)) {
4987+ cpumask_t mask;
4988+ vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
4989+ if (vector < 0)
4990+ return;
4991+
4992+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
4993+ entry.vector = vector;
4994+
4995+ ioapic_register_intr(irq, vector, IOAPIC_AUTO);
4996+ if (!apic && (irq < 16))
4997+ disable_8259A_irq(irq);
4998+ }
4999+
5000+ ioapic_write_entry(apic, pin, entry);
5001+
5002+ spin_lock_irqsave(&ioapic_lock, flags);
5003+ set_native_irq_info(irq, TARGET_CPUS);
5004+ spin_unlock_irqrestore(&ioapic_lock, flags);
5005+
5006+}
5007+
5008+static void __init setup_IO_APIC_irqs(void)
5009+{
5010+ int apic, pin, idx, irq, first_notcon = 1;
5011+
5012+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
5013+
5014+ for (apic = 0; apic < nr_ioapics; apic++) {
5015+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5016
5017 idx = find_irq_entry(apic,pin,mp_INT);
5018 if (idx == -1) {
5019@@ -798,39 +859,11 @@ static void __init setup_IO_APIC_irqs(vo
5020 continue;
5021 }
5022
5023- entry.trigger = irq_trigger(idx);
5024- entry.polarity = irq_polarity(idx);
5025-
5026- if (irq_trigger(idx)) {
5027- entry.trigger = 1;
5028- entry.mask = 1;
5029- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
5030- }
5031-
5032 irq = pin_2_irq(idx, apic, pin);
5033 add_pin_to_irq(irq, apic, pin);
5034
5035- if (/* !apic && */ !IO_APIC_IRQ(irq))
5036- continue;
5037-
5038- if (IO_APIC_IRQ(irq)) {
5039- cpumask_t mask;
5040- vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
5041- if (vector < 0)
5042- continue;
5043-
5044- entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
5045- entry.vector = vector;
5046+ setup_IO_APIC_irq(apic, pin, idx, irq);
5047
5048- ioapic_register_intr(irq, vector, IOAPIC_AUTO);
5049- if (!apic && (irq < 16))
5050- disable_8259A_irq(irq);
5051- }
5052- ioapic_write_entry(apic, pin, entry);
5053-
5054- spin_lock_irqsave(&ioapic_lock, flags);
5055- set_native_irq_info(irq, TARGET_CPUS);
5056- spin_unlock_irqrestore(&ioapic_lock, flags);
5057 }
5058 }
5059
5060@@ -1826,7 +1859,7 @@ void destroy_irq(unsigned int irq)
5061 dynamic_irq_cleanup(irq);
5062
5063 spin_lock_irqsave(&vector_lock, flags);
5064- irq_vector[irq] = 0;
5065+ __clear_irq_vector(irq);
5066 spin_unlock_irqrestore(&vector_lock, flags);
5067 }
5068
5069@@ -2131,7 +2164,15 @@ void __init setup_ioapic_dest(void)
5070 if (irq_entry == -1)
5071 continue;
5072 irq = pin_2_irq(irq_entry, ioapic, pin);
5073- set_ioapic_affinity_irq(irq, TARGET_CPUS);
5074+
5075+ /* setup_IO_APIC_irqs() could fail to get a vector for some devices
5076+ * when there are too many devices, because at that time only the
5077+ * boot CPU is online.
5078+ */
5079+ if(!irq_vector[irq])
5080+ setup_IO_APIC_irq(ioapic, pin, irq_entry, irq);
5081+ else
5082+ set_ioapic_affinity_irq(irq, TARGET_CPUS);
5083 }
5084
5085 }
5086Index: head-2008-12-01/arch/x86/kernel/irq_64-xen.c
5087===================================================================
5088--- head-2008-12-01.orig/arch/x86/kernel/irq_64-xen.c 2008-12-01 11:29:05.000000000 +0100
5089+++ head-2008-12-01/arch/x86/kernel/irq_64-xen.c 2008-12-01 11:32:38.000000000 +0100
5090@@ -120,7 +120,7 @@ asmlinkage unsigned int do_IRQ(struct pt
5091
5092 if (likely(irq < NR_IRQS))
5093 generic_handle_irq(irq);
5094- else
5095+ else if (printk_ratelimit())
5096 printk(KERN_EMERG "%s: %d.%d No irq handler for irq\n",
5097 __func__, smp_processor_id(), irq);
5098
5099Index: head-2008-12-01/arch/x86/kernel/mpparse_64-xen.c
5100===================================================================
5101--- head-2008-12-01.orig/arch/x86/kernel/mpparse_64-xen.c 2008-12-01 11:29:05.000000000 +0100
5102+++ head-2008-12-01/arch/x86/kernel/mpparse_64-xen.c 2008-12-01 11:32:38.000000000 +0100
5103@@ -35,8 +35,6 @@
5104 int smp_found_config;
5105 unsigned int __initdata maxcpus = NR_CPUS;
5106
5107-int acpi_found_madt;
5108-
5109 /*
5110 * Various Linux-internal data structures created from the
5111 * MP-table.
5112Index: head-2008-12-01/arch/x86/kernel/process_64-xen.c
5113===================================================================
5114--- head-2008-12-01.orig/arch/x86/kernel/process_64-xen.c 2008-12-01 11:29:05.000000000 +0100
5115+++ head-2008-12-01/arch/x86/kernel/process_64-xen.c 2008-12-01 11:32:38.000000000 +0100
5116@@ -119,29 +119,23 @@ void exit_idle(void)
5117 static void poll_idle (void)
5118 {
5119 local_irq_enable();
5120-
5121- asm volatile(
5122- "2:"
5123- "testl %0,%1;"
5124- "rep; nop;"
5125- "je 2b;"
5126- : :
5127- "i" (_TIF_NEED_RESCHED),
5128- "m" (current_thread_info()->flags));
5129+ cpu_relax();
5130 }
5131
5132 static void xen_idle(void)
5133 {
5134+ current_thread_info()->status &= ~TS_POLLING;
5135+ /*
5136+ * TS_POLLING-cleared state must be visible before we
5137+ * test NEED_RESCHED:
5138+ */
5139+ smp_mb();
5140 local_irq_disable();
5141-
5142- if (need_resched())
5143- local_irq_enable();
5144- else {
5145- current_thread_info()->status &= ~TS_POLLING;
5146- smp_mb__after_clear_bit();
5147+ if (!need_resched())
5148 safe_halt();
5149- current_thread_info()->status |= TS_POLLING;
5150- }
5151+ else
5152+ local_irq_enable();
5153+ current_thread_info()->status |= TS_POLLING;
5154 }
5155
5156 #ifdef CONFIG_HOTPLUG_CPU
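The reworked xen_idle() above is a classic sleep/wake handshake: the idler clears its TS_POLLING bit, executes a full barrier, and only then tests need_resched before halting; a waker sets need_resched and then checks TS_POLLING to decide whether an IPI is needed. Without the barrier the flag-clear and the flag-test could be reordered and a wakeup lost. A C11-atomics sketch of the two sides (single-threaded here, just to show the ordering):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool polling = true;	/* TS_POLLING stand-in */
static atomic_bool need_resched;

static void idle_side(void)
{
	atomic_store(&polling, false);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() */
	if (!atomic_load(&need_resched))
		puts("halt until interrupt");		/* safe_halt() */
	atomic_store(&polling, true);
}

static void waker_side(void)
{
	atomic_store(&need_resched, true);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load(&polling))
		puts("polling idler will notice; no IPI");
	else
		puts("send reschedule IPI");
}

int main(void)
{
	waker_side();	/* runs before the idler clears TS_POLLING */
	idle_side();	/* sees need_resched set: skips the halt */
	return 0;
}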
5157@@ -181,6 +175,12 @@ void cpu_idle (void)
5158 idle = xen_idle; /* no alternatives */
5159 if (cpu_is_offline(smp_processor_id()))
5160 play_dead();
5161+ /*
5162+ * Idle routines should keep interrupts disabled
5163+ * from here on, until they go to idle.
5164+ * Otherwise, idle callbacks can misfire.
5165+ */
5166+ local_irq_disable();
5167 enter_idle();
5168 idle();
5169 /* In many cases the interrupt that ended idle
5170Index: head-2008-12-01/arch/x86/kernel/setup_64-xen.c
5171===================================================================
5172--- head-2008-12-01.orig/arch/x86/kernel/setup_64-xen.c 2008-12-01 11:29:05.000000000 +0100
5173+++ head-2008-12-01/arch/x86/kernel/setup_64-xen.c 2008-12-01 11:32:38.000000000 +0100
5174@@ -573,8 +573,7 @@ void __init setup_arch(char **cmdline_p)
5175 if (LOADER_TYPE && INITRD_START) {
5176 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
5177 reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
5178- initrd_start =
5179- INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
5180+ initrd_start = INITRD_START + PAGE_OFFSET;
5181 initrd_end = initrd_start+INITRD_SIZE;
5182 }
5183 else {
5184@@ -990,11 +989,8 @@ static void __cpuinit init_amd(struct cp
5185 /* Fix cpuid4 emulation for more */
5186 num_cache_leaves = 3;
5187
5188- /* When there is only one core no need to synchronize RDTSC */
5189- if (num_possible_cpus() == 1)
5190- set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
5191- else
5192- clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
5193+ /* RDTSC can be speculated around */
5194+ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
5195 }
5196
5197 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
5198@@ -1093,6 +1089,15 @@ static void __cpuinit init_intel(struct
5199 set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
5200 }
5201
5202+ if (cpu_has_ds) {
5203+ unsigned int l1, l2;
5204+ rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
5205+ if (!(l1 & (1<<11)))
5206+ set_bit(X86_FEATURE_BTS, c->x86_capability);
5207+ if (!(l1 & (1<<12)))
5208+ set_bit(X86_FEATURE_PEBS, c->x86_capability);
5209+ }
5210+
5211 n = c->extended_cpuid_level;
5212 if (n >= 0x80000008) {
5213 unsigned eax = cpuid_eax(0x80000008);
5214@@ -1112,7 +1117,10 @@ static void __cpuinit init_intel(struct
5215 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
5216 if (c->x86 == 6)
5217 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
5218- set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
5219+ if (c->x86 == 15)
5220+ set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
5221+ else
5222+ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
5223 c->x86_max_cores = intel_num_cpu_cores(c);
5224
5225 srat_detect_node();
5226Index: head-2008-12-01/arch/x86/kernel/smp_64-xen.c
5227===================================================================
5228--- head-2008-12-01.orig/arch/x86/kernel/smp_64-xen.c 2008-12-01 11:29:05.000000000 +0100
5229+++ head-2008-12-01/arch/x86/kernel/smp_64-xen.c 2008-12-01 11:32:38.000000000 +0100
5230@@ -384,12 +384,17 @@ int smp_call_function_single (int cpu, v
5231 put_cpu();
5232 return 0;
5233 }
5234+
5235+ /* Can deadlock when called with interrupts disabled */
5236+ WARN_ON(irqs_disabled());
5237+
5238 spin_lock_bh(&call_lock);
5239 __smp_call_function_single(cpu, func, info, nonatomic, wait);
5240 spin_unlock_bh(&call_lock);
5241 put_cpu();
5242 return 0;
5243 }
5244+EXPORT_SYMBOL(smp_call_function_single);
5245
5246 /*
5247 * this function sends a 'generic call function' IPI to all other CPUs
5248Index: head-2008-12-01/arch/x86/kernel/traps_64-xen.c
5249===================================================================
5250--- head-2008-12-01.orig/arch/x86/kernel/traps_64-xen.c 2008-12-01 11:29:05.000000000 +0100
5251+++ head-2008-12-01/arch/x86/kernel/traps_64-xen.c 2008-12-01 11:32:38.000000000 +0100
5252@@ -30,9 +30,10 @@
5253 #include <linux/kprobes.h>
5254 #include <linux/kexec.h>
5255 #include <linux/unwind.h>
5256+#include <linux/uaccess.h>
5257+#include <linux/bug.h>
5258
5259 #include <asm/system.h>
5260-#include <asm/uaccess.h>
5261 #include <asm/io.h>
5262 #include <asm/atomic.h>
5263 #include <asm/debugreg.h>
5264@@ -108,12 +109,7 @@ static inline void preempt_conditional_c
5265 preempt_enable_no_resched();
5266 }
5267
5268-static int kstack_depth_to_print = 12;
5269-#ifdef CONFIG_STACK_UNWIND
5270-static int call_trace = 1;
5271-#else
5272-#define call_trace (-1)
5273-#endif
5274+int kstack_depth_to_print = 12;
5275
5276 #ifdef CONFIG_KALLSYMS
5277 void printk_address(unsigned long address)
5278@@ -218,24 +214,7 @@ static unsigned long *in_exception_stack
5279 return NULL;
5280 }
5281
5282-struct ops_and_data {
5283- struct stacktrace_ops *ops;
5284- void *data;
5285-};
5286-
5287-static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
5288-{
5289- struct ops_and_data *oad = (struct ops_and_data *)context;
5290- int n = 0;
5291-
5292- while (unwind(info) == 0 && UNW_PC(info)) {
5293- n++;
5294- oad->ops->address(oad->data, UNW_PC(info));
5295- if (arch_unw_user_mode(info))
5296- break;
5297- }
5298- return n;
5299-}
5300+#define MSG(txt) ops->warning(data, txt)
5301
5302 /*
5303 * x86-64 can have up to three kernel stacks:
5304@@ -250,61 +229,24 @@ static inline int valid_stack_ptr(struct
5305 return p > t && p < t + THREAD_SIZE - 3;
5306 }
5307
5308-void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
5309+void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
5310+ unsigned long *stack,
5311 struct stacktrace_ops *ops, void *data)
5312 {
5313- const unsigned cpu = smp_processor_id();
5314- unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
5315+ const unsigned cpu = get_cpu();
5316+ unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
5317 unsigned used = 0;
5318 struct thread_info *tinfo;
5319
5320 if (!tsk)
5321 tsk = current;
5322
5323- if (call_trace >= 0) {
5324- int unw_ret = 0;
5325- struct unwind_frame_info info;
5326- struct ops_and_data oad = { .ops = ops, .data = data };
5327-
5328- if (regs) {
5329- if (unwind_init_frame_info(&info, tsk, regs) == 0)
5330- unw_ret = dump_trace_unwind(&info, &oad);
5331- } else if (tsk == current)
5332- unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
5333- else {
5334- if (unwind_init_blocked(&info, tsk) == 0)
5335- unw_ret = dump_trace_unwind(&info, &oad);
5336- }
5337- if (unw_ret > 0) {
5338- if (call_trace == 1 && !arch_unw_user_mode(&info)) {
5339- ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
5340- UNW_PC(&info));
5341- if ((long)UNW_SP(&info) < 0) {
5342- ops->warning(data, "Leftover inexact backtrace:\n");
5343- stack = (unsigned long *)UNW_SP(&info);
5344- if (!stack)
5345- return;
5346- } else
5347- ops->warning(data, "Full inexact backtrace again:\n");
5348- } else if (call_trace >= 1)
5349- return;
5350- else
5351- ops->warning(data, "Full inexact backtrace again:\n");
5352- } else
5353- ops->warning(data, "Inexact backtrace:\n");
5354- }
5355 if (!stack) {
5356 unsigned long dummy;
5357 stack = &dummy;
5358 if (tsk && tsk != current)
5359 stack = (unsigned long *)tsk->thread.rsp;
5360 }
5361- /*
5362- * Align the stack pointer on word boundary, later loops
5363- * rely on that (and corruption / debug info bugs can cause
5364- * unaligned values here):
5365- */
5366- stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
5367
5368 /*
5369 * Print function call entries within a stack. 'cond' is the
5370@@ -314,9 +256,9 @@ void dump_trace(struct task_struct *tsk,
5371 #define HANDLE_STACK(cond) \
5372 do while (cond) { \
5373 unsigned long addr = *stack++; \
5374- if (oops_in_progress ? \
5375- __kernel_text_address(addr) : \
5376- kernel_text_address(addr)) { \
5377+ /* Use unlocked access here because except for NMIs \
5378+ we should already be protected against module unloads */ \
5379+ if (__kernel_text_address(addr)) { \
5380 /* \
5381 * If the address is either in the text segment of the \
5382 * kernel, or in the region which contains vmalloc'ed \
5383@@ -379,9 +321,10 @@ void dump_trace(struct task_struct *tsk,
5384 /*
5385 * This handles the process stack:
5386 */
5387- tinfo = current_thread_info();
5388+ tinfo = task_thread_info(tsk);
5389 HANDLE_STACK (valid_stack_ptr(tinfo, stack));
5390 #undef HANDLE_STACK
5391+ put_cpu();
5392 }
5393 EXPORT_SYMBOL(dump_trace);
5394
5395@@ -518,30 +461,15 @@ bad:
5396 printk("\n");
5397 }
5398
5399-void handle_BUG(struct pt_regs *regs)
5400-{
5401- struct bug_frame f;
5402- long len;
5403- const char *prefix = "";
5404+int is_valid_bugaddr(unsigned long rip)
5405+{
5406+ unsigned short ud2;
5407
5408- if (user_mode(regs))
5409- return;
5410- if (__copy_from_user(&f, (const void __user *) regs->rip,
5411- sizeof(struct bug_frame)))
5412- return;
5413- if (f.filename >= 0 ||
5414- f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
5415- return;
5416- len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
5417- if (len < 0 || len >= PATH_MAX)
5418- f.filename = (int)(long)"unmapped filename";
5419- else if (len > 50) {
5420- f.filename += len - 50;
5421- prefix = "...";
5422- }
5423- printk("----------- [cut here ] --------- [please bite here ] ---------\n");
5424- printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
5425-}
5426+ if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
5427+ return 0;
5428+
5429+ return ud2 == 0x0b0f;
5430+}
5431
5432 #ifdef CONFIG_BUG
5433 void out_of_line_bug(void)
5434@@ -621,7 +549,9 @@ void die(const char * str, struct pt_reg
5435 {
5436 unsigned long flags = oops_begin();
5437
5438- handle_BUG(regs);
5439+ if (!user_mode(regs))
5440+ report_bug(regs->rip);
5441+
5442 __die(str, regs, err);
5443 oops_end(flags);
5444 do_exit(SIGSEGV);
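
The open-coded bug_frame parser is gone; the arch now only answers is_valid_bugaddr(), i.e. "do the two bytes at the faulting RIP encode UD2 (0x0f 0x0b)?", and the generic report_bug() does the rest. A userspace sketch of the check — memcpy stands in for __copy_from_user(), and the 0x0b0f comparison assumes little-endian x86:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* On little-endian x86, the UD2 opcode bytes 0x0f 0x0b read back
     * as the 16-bit value 0x0b0f. */
    static int is_valid_bugaddr(const void *rip)
    {
        uint16_t ud2;

        /* the kernel uses __copy_from_user() so a bad rip fails gracefully */
        memcpy(&ud2, rip, sizeof(ud2));
        return ud2 == 0x0b0f;
    }

    int main(void)
    {
        unsigned char text[2] = { 0x0f, 0x0b };  /* a fake UD2 instruction */
        printf("valid bug address: %d\n", is_valid_bugaddr(text));
        return 0;
    }
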
5445@@ -790,8 +720,7 @@ mem_parity_error(unsigned char reason, s
5446 {
5447 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
5448 reason);
5449- printk(KERN_EMERG "You probably have a hardware problem with your "
5450- "RAM chips\n");
5451+ printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
5452
5453 if (panic_on_unrecovered_nmi)
5454 panic("NMI: Not continuing");
5455@@ -1227,21 +1156,3 @@ static int __init kstack_setup(char *s)
5456 return 0;
5457 }
5458 early_param("kstack", kstack_setup);
5459-
5460-#ifdef CONFIG_STACK_UNWIND
5461-static int __init call_trace_setup(char *s)
5462-{
5463- if (!s)
5464- return -EINVAL;
5465- if (strcmp(s, "old") == 0)
5466- call_trace = -1;
5467- else if (strcmp(s, "both") == 0)
5468- call_trace = 0;
5469- else if (strcmp(s, "newfallback") == 0)
5470- call_trace = 1;
5471- else if (strcmp(s, "new") == 0)
5472- call_trace = 2;
5473- return 0;
5474-}
5475-early_param("call_trace", call_trace_setup);
5476-#endif
5477Index: head-2008-12-01/arch/x86/kernel/vsyscall_64-xen.c
5478===================================================================
5479--- head-2008-12-01.orig/arch/x86/kernel/vsyscall_64-xen.c 2008-12-01 11:29:05.000000000 +0100
5480+++ head-2008-12-01/arch/x86/kernel/vsyscall_64-xen.c 2008-12-01 11:32:38.000000000 +0100
5481@@ -42,6 +42,7 @@
5482 #include <asm/topology.h>
5483
5484 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
5485+#define __syscall_clobber "r11","rcx","memory"
5486
5487 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
5488 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
5489@@ -224,8 +225,7 @@ out:
5490
5491 static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
5492 void __user *oldval, size_t __user *oldlenp,
5493- void __user *newval, size_t newlen,
5494- void **context)
5495+ void __user *newval, size_t newlen)
5496 {
5497 return -ENOSYS;
5498 }
5499@@ -277,7 +277,6 @@ static void __cpuinit cpu_vsyscall_init(
5500 vsyscall_set_cpu(raw_smp_processor_id());
5501 }
5502
5503-#ifdef CONFIG_HOTPLUG_CPU
5504 static int __cpuinit
5505 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
5506 {
5507@@ -286,13 +285,13 @@ cpu_vsyscall_notifier(struct notifier_bl
5508 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
5509 return NOTIFY_DONE;
5510 }
5511-#endif
5512
5513 static void __init map_vsyscall(void)
5514 {
5515 extern char __vsyscall_0;
5516 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
5517
5518+ /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
5519 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
5520 }
5521
5522Index: head-2008-12-01/arch/x86/mm/fault_64-xen.c
5523===================================================================
5524--- head-2008-12-01.orig/arch/x86/mm/fault_64-xen.c 2008-12-01 11:29:05.000000000 +0100
5525+++ head-2008-12-01/arch/x86/mm/fault_64-xen.c 2008-12-01 11:32:38.000000000 +0100
5526@@ -23,9 +23,9 @@
5527 #include <linux/compiler.h>
5528 #include <linux/module.h>
5529 #include <linux/kprobes.h>
5530+#include <linux/uaccess.h>
5531
5532 #include <asm/system.h>
5533-#include <asm/uaccess.h>
5534 #include <asm/pgalloc.h>
5535 #include <asm/smp.h>
5536 #include <asm/tlbflush.h>
5537@@ -96,7 +96,7 @@ void bust_spinlocks(int yes)
5538 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
5539 unsigned long error_code)
5540 {
5541- unsigned char __user *instr;
5542+ unsigned char *instr;
5543 int scan_more = 1;
5544 int prefetch = 0;
5545 unsigned char *max_instr;
5546@@ -116,7 +116,7 @@ static noinline int is_prefetch(struct p
5547 unsigned char instr_hi;
5548 unsigned char instr_lo;
5549
5550- if (__get_user(opcode, (char __user *)instr))
5551+ if (probe_kernel_address(instr, opcode))
5552 break;
5553
5554 instr_hi = opcode & 0xf0;
5555@@ -154,7 +154,7 @@ static noinline int is_prefetch(struct p
5556 case 0x00:
5557 /* Prefetch instruction is 0x0F0D or 0x0F18 */
5558 scan_more = 0;
5559- if (__get_user(opcode, (char __user *)instr))
5560+ if (probe_kernel_address(instr, opcode))
5561 break;
5562 prefetch = (instr_lo == 0xF) &&
5563 (opcode == 0x0D || opcode == 0x18);
5564@@ -170,7 +170,7 @@ static noinline int is_prefetch(struct p
5565 static int bad_address(void *p)
5566 {
5567 unsigned long dummy;
5568- return __get_user(dummy, (unsigned long __user *)p);
5569+ return probe_kernel_address((unsigned long *)p, dummy);
5570 }
5571
5572 void dump_pagetable(unsigned long address)
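
Both reads in is_prefetch() and bad_address() target kernel addresses, so the user-pointer accessor __get_user() gives way to probe_kernel_address(), which brackets the copy with pagefault disabling and returns nonzero instead of oopsing. A sketch of the calling pattern, with a trivial always-succeeds macro standing in for the kernel helper:

    #include <stdio.h>
    #include <string.h>

    /* Stand-in for the kernel helper: read *addr into retval and report
     * failure instead of faulting.  In the kernel this wraps a
     * __copy_from_user_inatomic() in pagefault_disable()/enable(). */
    #define probe_kernel_address(addr, retval) \
        (memcpy(&(retval), (addr), sizeof(retval)), 0)

    int main(void)
    {
        unsigned long word = 0xdeadbeefUL, dummy = 0;

        if (probe_kernel_address(&word, dummy))
            printf("bad address\n");
        else
            printf("read back %#lx\n", dummy);
        return 0;
    }
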
5573Index: head-2008-12-01/arch/x86/mm/init_64-xen.c
5574===================================================================
5575--- head-2008-12-01.orig/arch/x86/mm/init_64-xen.c 2008-12-01 11:29:05.000000000 +0100
5576+++ head-2008-12-01/arch/x86/mm/init_64-xen.c 2008-12-01 11:32:38.000000000 +0100
5577@@ -1129,14 +1129,15 @@ static __init int x8664_sysctl_init(void
5578 __initcall(x8664_sysctl_init);
5579 #endif
5580
5581-/* A pseudo VMAs to allow ptrace access for the vsyscall page. This only
5582+/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
5583 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
5584 not need special handling anymore. */
5585
5586 static struct vm_area_struct gate_vma = {
5587 .vm_start = VSYSCALL_START,
5588- .vm_end = VSYSCALL_END,
5589- .vm_page_prot = PAGE_READONLY
5590+ .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
5591+ .vm_page_prot = PAGE_READONLY_EXEC,
5592+ .vm_flags = VM_READ | VM_EXEC
5593 };
5594
5595 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
5596Index: head-2008-12-01/arch/x86/mm/pageattr_64-xen.c
5597===================================================================
5598--- head-2008-12-01.orig/arch/x86/mm/pageattr_64-xen.c 2008-12-01 11:29:05.000000000 +0100
5599+++ head-2008-12-01/arch/x86/mm/pageattr_64-xen.c 2008-12-01 11:32:38.000000000 +0100
5600@@ -330,34 +330,40 @@ static struct page *split_large_page(uns
5601 return base;
5602 }
5603
5604-
5605-static void flush_kernel_map(void *address)
5606+static void cache_flush_page(void *adr)
5607 {
5608- if (0 && address && cpu_has_clflush) {
5609- /* is this worth it? */
5610- int i;
5611- for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
5612- asm volatile("clflush (%0)" :: "r" (address + i));
5613- } else
5614- asm volatile("wbinvd":::"memory");
5615- if (address)
5616- __flush_tlb_one(address);
5617- else
5618- __flush_tlb_all();
5619+ int i;
5620+ for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
5621+ asm volatile("clflush (%0)" :: "r" (adr + i));
5622 }
5623
5624+static void flush_kernel_map(void *arg)
5625+{
5626+ struct list_head *l = (struct list_head *)arg;
5627+ struct page *pg;
5628
5629-static inline void flush_map(unsigned long address)
5630+ /* When clflush is available, always use it because it is
5631+ much cheaper than WBINVD */
5632+ if (!cpu_has_clflush)
5633+ asm volatile("wbinvd" ::: "memory");
5634+ list_for_each_entry(pg, l, lru) {
5635+ void *adr = page_address(pg);
5636+ if (cpu_has_clflush)
5637+ cache_flush_page(adr);
5638+ __flush_tlb_one(adr);
5639+ }
5640+}
5641+
5642+static inline void flush_map(struct list_head *l)
5643 {
5644- on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
5645+ on_each_cpu(flush_kernel_map, l, 1, 1);
5646 }
5647
5648-static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
5649+static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
5650
5651 static inline void save_page(struct page *fpage)
5652 {
5653- fpage->lru.next = (struct list_head *)deferred_pages;
5654- deferred_pages = fpage;
5655+ list_add(&fpage->lru, &deferred_pages);
5656 }
5657
5658 /*
5659@@ -487,18 +493,18 @@ int change_page_attr(struct page *page,
5660
5661 void global_flush_tlb(void)
5662 {
5663- struct page *dpage;
5664+ struct page *pg, *next;
5665+ struct list_head l;
5666
5667 down_read(&init_mm.mmap_sem);
5668- dpage = xchg(&deferred_pages, NULL);
5669+ list_replace_init(&deferred_pages, &l);
5670 up_read(&init_mm.mmap_sem);
5671
5672- flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
5673- while (dpage) {
5674- struct page *tmp = dpage;
5675- dpage = (struct page *)dpage->lru.next;
5676- ClearPagePrivate(tmp);
5677- __free_page(tmp);
5678+ flush_map(&l);
5679+
5680+ list_for_each_entry_safe(pg, next, &l, lru) {
5681+ ClearPagePrivate(pg);
5682+ __free_page(pg);
5683 }
5684 }
5685
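
flush_kernel_map() now walks a list of deferred pages, flushing each with CLFLUSH one cache line at a time and falling back to WBINVD only when CLFLUSH is unavailable. A sketch of the per-page loop; it is x86-only, and the 64-byte line size is an assumption in place of boot_cpu_data.x86_clflush_size:

    #define _POSIX_C_SOURCE 200112L
    #include <stdlib.h>

    #define PAGE_SIZE    4096
    #define CLFLUSH_SIZE 64     /* x86_clflush_size on most CPUs */

    /* Flush one page from the caches, one cache line at a time.
     * CLFLUSH is much cheaper than WBINVD, which purges everything. */
    static void cache_flush_page(void *adr)
    {
        int i;
        for (i = 0; i < PAGE_SIZE; i += CLFLUSH_SIZE)
            __asm__ volatile("clflush (%0)" :: "r"((char *)adr + i));
    }

    int main(void)
    {
        void *page;
        if (posix_memalign(&page, PAGE_SIZE, PAGE_SIZE))
            return 1;
        cache_flush_page(page);
        free(page);
        return 0;
    }
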
5686Index: head-2008-12-01/drivers/pci/msi-xen.c
5687===================================================================
5688--- head-2008-12-01.orig/drivers/pci/msi-xen.c 2008-12-01 11:29:05.000000000 +0100
5689+++ head-2008-12-01/drivers/pci/msi-xen.c 2008-12-01 11:32:38.000000000 +0100
5690@@ -263,10 +263,8 @@ void disable_msi_mode(struct pci_dev *de
5691 pci_write_config_word(dev, msi_control_reg(pos), control);
5692 dev->msix_enabled = 0;
5693 }
5694- if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
5695- /* PCI Express Endpoint device detected */
5696- pci_intx(dev, 1); /* enable intx */
5697- }
5698+
5699+ pci_intx(dev, 1); /* enable intx */
5700 }
5701
5702 static void enable_msi_mode(struct pci_dev *dev, int pos, int type)
5703@@ -284,10 +282,8 @@ static void enable_msi_mode(struct pci_d
5704 pci_write_config_word(dev, msi_control_reg(pos), control);
5705 dev->msix_enabled = 1;
5706 }
5707- if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
5708- /* PCI Express Endpoint device detected */
5709- pci_intx(dev, 0); /* disable intx */
5710- }
5711+
5712+ pci_intx(dev, 0); /* disable intx */
5713 }
5714
5715 #ifdef CONFIG_PM
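
Dropping the PCI-Express test means INTx is now toggled for every device when MSI/MSI-X is switched, not just for PCIe endpoints. pci_intx() itself is a read-modify-write of the INTX_DISABLE bit in the config-space command word, roughly as modeled below (the register value is made up):

    #include <stdint.h>
    #include <stdio.h>

    #define PCI_COMMAND_INTX_DISABLE 0x400  /* bit 10 of the command register */

    /* Model of pci_intx(): enable=1 clears INTX_DISABLE, enable=0 sets it. */
    static void pci_intx_model(uint16_t *command, int enable)
    {
        if (enable)
            *command &= ~PCI_COMMAND_INTX_DISABLE;
        else
            *command |= PCI_COMMAND_INTX_DISABLE;
    }

    int main(void)
    {
        uint16_t command = 0x0006;      /* memory + bus-master enabled */

        pci_intx_model(&command, 0);    /* MSI on: mask legacy INTx */
        printf("command=%#06x\n", command);
        pci_intx_model(&command, 1);    /* MSI off: restore INTx */
        printf("command=%#06x\n", command);
        return 0;
    }
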
5716Index: head-2008-12-01/drivers/xen/balloon/balloon.c
5717===================================================================
5718--- head-2008-12-01.orig/drivers/xen/balloon/balloon.c 2008-12-01 11:29:05.000000000 +0100
5719+++ head-2008-12-01/drivers/xen/balloon/balloon.c 2008-12-01 11:32:38.000000000 +0100
5720@@ -97,8 +97,8 @@ extern unsigned long totalhigh_pages;
5721 static LIST_HEAD(ballooned_pages);
5722
5723 /* Main work function, always executed in process context. */
5724-static void balloon_process(void *unused);
5725-static DECLARE_WORK(balloon_worker, balloon_process, NULL);
5726+static void balloon_process(struct work_struct *unused);
5727+static DECLARE_WORK(balloon_worker, balloon_process);
5728 static struct timer_list balloon_timer;
5729
5730 /* When ballooning out (allocating memory to return to Xen) we don't really
5731@@ -389,7 +389,7 @@ static int decrease_reservation(unsigned
5732 * by the balloon lock), or with changes to the Xen hard limit, but we will
5733 * recover from these in time.
5734 */
5735-static void balloon_process(void *unused)
5736+static void balloon_process(struct work_struct *unused)
5737 {
5738 int need_sleep = 0;
5739 long credit;
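
This is the 2.6.20 workqueue API change that recurs throughout the rest of the patch: callbacks now receive the work_struct itself rather than an opaque pointer, and handlers that need their enclosing object recover it with container_of() (as blkfront, pciback and xenbus do below; balloon_process() carries no payload, so only its prototype changes). A self-contained sketch of the pattern with local model types:

    #include <stddef.h>
    #include <stdio.h>

    /* Local copy of the kernel's container_of(): map a pointer to an
     * embedded member back to the structure that contains it. */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct work_struct { int pending; };

    struct blkfront_info_model {
        int connected;
        struct work_struct work;  /* embedded, as in struct blkfront_info */
    };

    static void restart_queue(struct work_struct *arg)
    {
        struct blkfront_info_model *info =
            container_of(arg, struct blkfront_info_model, work);
        printf("connected=%d\n", info->connected);
    }

    int main(void)
    {
        struct blkfront_info_model info = { .connected = 1 };
        restart_queue(&info.work);  /* the workqueue core passes &info->work */
        return 0;
    }
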
5740Index: head-2008-12-01/drivers/xen/blkback/blkback.c
5741===================================================================
5742--- head-2008-12-01.orig/drivers/xen/blkback/blkback.c 2008-12-01 11:29:05.000000000 +0100
5743+++ head-2008-12-01/drivers/xen/blkback/blkback.c 2008-12-01 11:32:38.000000000 +0100
5744@@ -37,6 +37,7 @@
5745
5746 #include <linux/spinlock.h>
5747 #include <linux/kthread.h>
5748+#include <linux/freezer.h>
5749 #include <linux/list.h>
5750 #include <linux/delay.h>
5751 #include <xen/balloon.h>
5752Index: head-2008-12-01/drivers/xen/blkback/interface.c
5753===================================================================
5754--- head-2008-12-01.orig/drivers/xen/blkback/interface.c 2007-06-12 13:13:44.000000000 +0200
5755+++ head-2008-12-01/drivers/xen/blkback/interface.c 2008-12-01 11:32:38.000000000 +0100
5756@@ -34,7 +34,7 @@
5757 #include <xen/evtchn.h>
5758 #include <linux/kthread.h>
5759
5760-static kmem_cache_t *blkif_cachep;
5761+static struct kmem_cache *blkif_cachep;
5762
5763 blkif_t *blkif_alloc(domid_t domid)
5764 {
5765Index: head-2008-12-01/drivers/xen/blkfront/blkfront.c
5766===================================================================
5767--- head-2008-12-01.orig/drivers/xen/blkfront/blkfront.c 2008-12-01 11:29:05.000000000 +0100
5768+++ head-2008-12-01/drivers/xen/blkfront/blkfront.c 2008-12-01 11:32:38.000000000 +0100
5769@@ -70,7 +70,7 @@ static int setup_blkring(struct xenbus_d
5770 static void kick_pending_request_queues(struct blkfront_info *);
5771
5772 static irqreturn_t blkif_int(int irq, void *dev_id);
5773-static void blkif_restart_queue(void *arg);
5774+static void blkif_restart_queue(struct work_struct *arg);
5775 static void blkif_recover(struct blkfront_info *);
5776 static void blkif_completion(struct blk_shadow *);
5777 static void blkif_free(struct blkfront_info *, int);
5778@@ -110,7 +110,7 @@ static int blkfront_probe(struct xenbus_
5779 info->xbdev = dev;
5780 info->vdevice = vdevice;
5781 info->connected = BLKIF_STATE_DISCONNECTED;
5782- INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
5783+ INIT_WORK(&info->work, blkif_restart_queue);
5784
5785 for (i = 0; i < BLK_RING_SIZE; i++)
5786 info->shadow[i].req.id = i+1;
5787@@ -459,9 +459,9 @@ static void kick_pending_request_queues(
5788 }
5789 }
5790
5791-static void blkif_restart_queue(void *arg)
5792+static void blkif_restart_queue(struct work_struct *arg)
5793 {
5794- struct blkfront_info *info = (struct blkfront_info *)arg;
5795+ struct blkfront_info *info = container_of(arg, struct blkfront_info, work);
5796 spin_lock_irq(&blkif_io_lock);
5797 if (info->connected == BLKIF_STATE_CONNECTED)
5798 kick_pending_request_queues(info);
5799Index: head-2008-12-01/drivers/xen/blktap/blktap.c
5800===================================================================
5801--- head-2008-12-01.orig/drivers/xen/blktap/blktap.c 2008-12-01 11:29:05.000000000 +0100
5802+++ head-2008-12-01/drivers/xen/blktap/blktap.c 2008-12-01 11:32:38.000000000 +0100
5803@@ -40,6 +40,7 @@
5804
5805 #include <linux/spinlock.h>
5806 #include <linux/kthread.h>
5807+#include <linux/freezer.h>
5808 #include <linux/list.h>
5809 #include <asm/hypervisor.h>
5810 #include "common.h"
5811Index: head-2008-12-01/drivers/xen/blktap/interface.c
5812===================================================================
5813--- head-2008-12-01.orig/drivers/xen/blktap/interface.c 2008-09-15 13:40:15.000000000 +0200
5814+++ head-2008-12-01/drivers/xen/blktap/interface.c 2008-12-01 11:32:38.000000000 +0100
5815@@ -34,7 +34,7 @@
5816 #include "common.h"
5817 #include <xen/evtchn.h>
5818
5819-static kmem_cache_t *blkif_cachep;
5820+static struct kmem_cache *blkif_cachep;
5821
5822 blkif_t *tap_alloc_blkif(domid_t domid)
5823 {
5824Index: head-2008-12-01/drivers/xen/char/mem.c
5825===================================================================
5826--- head-2008-12-01.orig/drivers/xen/char/mem.c 2007-08-06 15:10:49.000000000 +0200
5827+++ head-2008-12-01/drivers/xen/char/mem.c 2008-12-01 11:32:38.000000000 +0100
5828@@ -157,7 +157,7 @@ static loff_t memory_lseek(struct file *
5829 {
5830 loff_t ret;
5831
5832- mutex_lock(&file->f_dentry->d_inode->i_mutex);
5833+ mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
5834 switch (orig) {
5835 case 0:
5836 file->f_pos = offset;
5837@@ -172,7 +172,7 @@ static loff_t memory_lseek(struct file *
5838 default:
5839 ret = -EINVAL;
5840 }
5841- mutex_unlock(&file->f_dentry->d_inode->i_mutex);
5842+ mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
5843 return ret;
5844 }
5845
5846Index: head-2008-12-01/drivers/xen/console/console.c
5847===================================================================
5848--- head-2008-12-01.orig/drivers/xen/console/console.c 2008-12-01 11:29:05.000000000 +0100
5849+++ head-2008-12-01/drivers/xen/console/console.c 2008-12-01 11:32:38.000000000 +0100
5850@@ -80,11 +80,6 @@ static int xc_num = -1;
5851 #define XEN_XVC_MAJOR 204
5852 #define XEN_XVC_MINOR 191
5853
5854-#ifdef CONFIG_MAGIC_SYSRQ
5855-static unsigned long sysrq_requested;
5856-extern int sysrq_enabled;
5857-#endif
5858-
5859 static int __init xencons_setup(char *str)
5860 {
5861 char *q;
5862@@ -339,8 +334,8 @@ void __init dom0_init_screen_info(const
5863 #define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) && \
5864 ((_tty)->index != (xc_num - 1)))
5865
5866-static struct termios *xencons_termios[MAX_NR_CONSOLES];
5867-static struct termios *xencons_termios_locked[MAX_NR_CONSOLES];
5868+static struct ktermios *xencons_termios[MAX_NR_CONSOLES];
5869+static struct ktermios *xencons_termios_locked[MAX_NR_CONSOLES];
5870 static struct tty_struct *xencons_tty;
5871 static int xencons_priv_irq;
5872 static char x_char;
5873@@ -356,7 +351,9 @@ void xencons_rx(char *buf, unsigned len)
5874
5875 for (i = 0; i < len; i++) {
5876 #ifdef CONFIG_MAGIC_SYSRQ
5877- if (sysrq_enabled) {
5878+ if (sysrq_on()) {
5879+ static unsigned long sysrq_requested;
5880+
5881 if (buf[i] == '\x0f') { /* ^O */
5882 if (!sysrq_requested) {
5883 sysrq_requested = jiffies;
5884Index: head-2008-12-01/drivers/xen/core/reboot.c
5885===================================================================
5886--- head-2008-12-01.orig/drivers/xen/core/reboot.c 2008-12-01 11:29:05.000000000 +0100
5887+++ head-2008-12-01/drivers/xen/core/reboot.c 2008-12-01 11:32:38.000000000 +0100
5888@@ -34,8 +34,8 @@ static int suspend_cancelled;
5889 /* Can we leave APs online when we suspend? */
5890 static int fast_suspend;
5891
5892-static void __shutdown_handler(void *unused);
5893-static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
5894+static void __shutdown_handler(struct work_struct *unused);
5895+static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
5896
5897 static int setup_suspend_evtchn(void);
5898
5899@@ -105,7 +105,7 @@ static int xen_suspend(void *__unused)
5900 case SHUTDOWN_RESUMING:
5901 break;
5902 default:
5903- schedule_work(&shutdown_work);
5904+ schedule_delayed_work(&shutdown_work, 0);
5905 break;
5906 }
5907
5908@@ -137,12 +137,12 @@ static void switch_shutdown_state(int ne
5909
5910 /* Either we kick off the work, or we leave it to xen_suspend(). */
5911 if (old_state == SHUTDOWN_INVALID)
5912- schedule_work(&shutdown_work);
5913+ schedule_delayed_work(&shutdown_work, 0);
5914 else
5915 BUG_ON(old_state != SHUTDOWN_RESUMING);
5916 }
5917
5918-static void __shutdown_handler(void *unused)
5919+static void __shutdown_handler(struct work_struct *unused)
5920 {
5921 int err;
5922
5923Index: head-2008-12-01/drivers/xen/core/smpboot.c
5924===================================================================
5925--- head-2008-12-01.orig/drivers/xen/core/smpboot.c 2008-12-01 11:29:05.000000000 +0100
5926+++ head-2008-12-01/drivers/xen/core/smpboot.c 2008-12-01 11:32:38.000000000 +0100
5927@@ -165,7 +165,12 @@ static void xen_smp_intr_exit(unsigned i
5928
5929 void __cpuinit cpu_bringup(void)
5930 {
5931+#ifdef __i386__
5932+ cpu_set_gdt(current_thread_info()->cpu);
5933+ secondary_cpu_init();
5934+#else
5935 cpu_init();
5936+#endif
5937 identify_cpu(cpu_data + smp_processor_id());
5938 touch_softlockup_watchdog();
5939 preempt_disable();
5940@@ -304,11 +309,12 @@ void __init smp_prepare_cpus(unsigned in
5941 if (cpu == 0)
5942 continue;
5943
5944+ idle = fork_idle(cpu);
5945+ if (IS_ERR(idle))
5946+ panic("failed fork for CPU %d", cpu);
5947+
5948 #ifdef __x86_64__
5949 gdt_descr = &cpu_gdt_descr[cpu];
5950-#else
5951- gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
5952-#endif
5953 gdt_descr->address = get_zeroed_page(GFP_KERNEL);
5954 if (unlikely(!gdt_descr->address)) {
5955 printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
5956@@ -317,6 +323,11 @@ void __init smp_prepare_cpus(unsigned in
5957 }
5958 gdt_descr->size = GDT_SIZE;
5959 memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
5960+#else
5961+ if (unlikely(!init_gdt(cpu, idle)))
5962+ continue;
5963+ gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
5964+#endif
5965 make_page_readonly(
5966 (void *)gdt_descr->address,
5967 XENFEAT_writable_descriptor_tables);
5968@@ -336,10 +347,6 @@ void __init smp_prepare_cpus(unsigned in
5969 cpu_2_logical_apicid[cpu] = apicid;
5970 x86_cpu_to_apicid[cpu] = apicid;
5971
5972- idle = fork_idle(cpu);
5973- if (IS_ERR(idle))
5974- panic("failed fork for CPU %d", cpu);
5975-
5976 #ifdef __x86_64__
5977 cpu_pda(cpu)->pcurrent = idle;
5978 cpu_pda(cpu)->cpunumber = cpu;
5979Index: head-2008-12-01/drivers/xen/fbfront/xenfb.c
5980===================================================================
5981--- head-2008-12-01.orig/drivers/xen/fbfront/xenfb.c 2008-12-01 11:29:05.000000000 +0100
5982+++ head-2008-12-01/drivers/xen/fbfront/xenfb.c 2008-12-01 11:32:38.000000000 +0100
5983@@ -25,6 +25,7 @@
5984 #include <linux/vmalloc.h>
5985 #include <linux/mm.h>
5986 #include <linux/mutex.h>
5987+#include <linux/freezer.h>
5988 #include <asm/hypervisor.h>
5989 #include <xen/evtchn.h>
5990 #include <xen/interface/io/fbif.h>
5991Index: head-2008-12-01/drivers/xen/netback/loopback.c
5992===================================================================
5993--- head-2008-12-01.orig/drivers/xen/netback/loopback.c 2008-12-01 11:29:05.000000000 +0100
5994+++ head-2008-12-01/drivers/xen/netback/loopback.c 2008-12-01 11:32:38.000000000 +0100
5995@@ -54,6 +54,7 @@
5996 #include <net/dst.h>
5997 #include <net/xfrm.h> /* secpath_reset() */
5998 #include <asm/hypervisor.h> /* is_initial_xendomain() */
5999+#include <../net/core/kmap_skb.h> /* k{,un}map_skb_frag() */
6000
6001 static int nloopbacks = -1;
6002 module_param(nloopbacks, int, 0);
6003Index: head-2008-12-01/drivers/xen/pciback/conf_space_header.c
6004===================================================================
6005--- head-2008-12-01.orig/drivers/xen/pciback/conf_space_header.c 2008-10-29 09:55:56.000000000 +0100
6006+++ head-2008-12-01/drivers/xen/pciback/conf_space_header.c 2008-12-01 11:32:38.000000000 +0100
6007@@ -22,14 +22,14 @@ static int command_write(struct pci_dev
6008 {
6009 int err;
6010
6011- if (!dev->is_enabled && is_enable_cmd(value)) {
6012+ if (!atomic_read(&dev->enable_cnt) && is_enable_cmd(value)) {
6013 if (unlikely(verbose_request))
6014 printk(KERN_DEBUG "pciback: %s: enable\n",
6015 pci_name(dev));
6016 err = pci_enable_device(dev);
6017 if (err)
6018 return err;
6019- } else if (dev->is_enabled && !is_enable_cmd(value)) {
6020+ } else if (atomic_read(&dev->enable_cnt) && !is_enable_cmd(value)) {
6021 if (unlikely(verbose_request))
6022 printk(KERN_DEBUG "pciback: %s: disable\n",
6023 pci_name(dev));
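
pci_dev lost its is_enabled flag in 2.6.20 in favour of the atomic reference count enable_cnt, incremented by pci_enable_device(), so pciback now tests the counter instead. A rough C11 model of the same idea, with stand-in enable/disable helpers:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Model of the 2.6.20 pci_dev::enable_cnt reference count. */
    static atomic_int enable_cnt;

    static void pci_enable_device_model(void)  { atomic_fetch_add(&enable_cnt, 1); }
    static void pci_disable_device_model(void) { atomic_fetch_sub(&enable_cnt, 1); }

    int main(void)
    {
        /* "!atomic_read(&dev->enable_cnt)" in the patch: not yet enabled? */
        if (!atomic_load(&enable_cnt))
            pci_enable_device_model();

        printf("enabled=%d\n", atomic_load(&enable_cnt) != 0);
        pci_disable_device_model();
        return 0;
    }
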
6024Index: head-2008-12-01/drivers/xen/pciback/pciback.h
6025===================================================================
6026--- head-2008-12-01.orig/drivers/xen/pciback/pciback.h 2008-12-01 11:29:05.000000000 +0100
6027+++ head-2008-12-01/drivers/xen/pciback/pciback.h 2008-12-01 11:32:38.000000000 +0100
6028@@ -88,7 +88,7 @@ void pciback_release_devices(struct pcib
6029
6030 /* Handles events from front-end */
6031 irqreturn_t pciback_handle_event(int irq, void *dev_id);
6032-void pciback_do_op(void *data);
6033+void pciback_do_op(struct work_struct *work);
6034
6035 int pciback_xenbus_register(void);
6036 void pciback_xenbus_unregister(void);
6037Index: head-2008-12-01/drivers/xen/pciback/pciback_ops.c
6038===================================================================
6039--- head-2008-12-01.orig/drivers/xen/pciback/pciback_ops.c 2008-12-01 11:29:05.000000000 +0100
6040+++ head-2008-12-01/drivers/xen/pciback/pciback_ops.c 2008-12-01 11:32:38.000000000 +0100
6041@@ -25,7 +25,7 @@ void pciback_reset_device(struct pci_dev
6042
6043 pci_write_config_word(dev, PCI_COMMAND, 0);
6044
6045- dev->is_enabled = 0;
6046+ atomic_set(&dev->enable_cnt, 0);
6047 dev->is_busmaster = 0;
6048 } else {
6049 pci_read_config_word(dev, PCI_COMMAND, &cmd);
6050@@ -51,9 +51,9 @@ static inline void test_and_schedule_op(
6051 * context because some of the pci_* functions can sleep (mostly due to ACPI
6052 * use of semaphores). This function is intended to be called from a work
6053 * queue in process context taking a struct pciback_device as a parameter */
6054-void pciback_do_op(void *data)
6055+void pciback_do_op(struct work_struct *work)
6056 {
6057- struct pciback_device *pdev = data;
6058+ struct pciback_device *pdev = container_of(work, struct pciback_device, op_work);
6059 struct pci_dev *dev;
6060 struct xen_pci_op *op = &pdev->sh_info->op;
6061
6062Index: head-2008-12-01/drivers/xen/pciback/xenbus.c
6063===================================================================
6064--- head-2008-12-01.orig/drivers/xen/pciback/xenbus.c 2008-07-21 11:00:33.000000000 +0200
6065+++ head-2008-12-01/drivers/xen/pciback/xenbus.c 2008-12-01 11:32:38.000000000 +0100
6066@@ -32,7 +32,7 @@ static struct pciback_device *alloc_pdev
6067 pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
6068 pdev->be_watching = 0;
6069
6070- INIT_WORK(&pdev->op_work, pciback_do_op, pdev);
6071+ INIT_WORK(&pdev->op_work, pciback_do_op);
6072
6073 if (pciback_init_devices(pdev)) {
6074 kfree(pdev);
6075@@ -54,7 +54,6 @@ static void pciback_disconnect(struct pc
6076
6077 /* If the driver domain started an op, make sure we complete it or
6078 * delete it before releasing the shared memory */
6079- cancel_delayed_work(&pdev->op_work);
6080 flush_scheduled_work();
6081
6082 if (pdev->sh_info != NULL) {
6083Index: head-2008-12-01/drivers/xen/scsiback/interface.c
6084===================================================================
6085--- head-2008-12-01.orig/drivers/xen/scsiback/interface.c 2008-07-21 11:00:33.000000000 +0200
6086+++ head-2008-12-01/drivers/xen/scsiback/interface.c 2008-12-01 11:32:38.000000000 +0100
6087@@ -39,7 +39,7 @@
6088 #include <linux/kthread.h>
6089
6090
6091-static kmem_cache_t *scsiback_cachep;
6092+static struct kmem_cache *scsiback_cachep;
6093
6094 struct vscsibk_info *vscsibk_info_alloc(domid_t domid)
6095 {
6096Index: head-2008-12-01/drivers/xen/scsiback/scsiback.c
6097===================================================================
6098--- head-2008-12-01.orig/drivers/xen/scsiback/scsiback.c 2008-12-01 11:29:05.000000000 +0100
6099+++ head-2008-12-01/drivers/xen/scsiback/scsiback.c 2008-12-01 11:32:38.000000000 +0100
6100@@ -322,13 +322,11 @@ static int scsiback_merge_bio(struct req
6101
6102 if (!rq->bio)
6103 blk_rq_bio_prep(q, rq, bio);
6104- else if (!q->back_merge_fn(q, rq, bio))
6105+ else if (!ll_back_merge_fn(q, rq, bio))
6106 return -EINVAL;
6107 else {
6108 rq->biotail->bi_next = bio;
6109 rq->biotail = bio;
6110- rq->hard_nr_sectors += bio_sectors(bio);
6111- rq->nr_sectors = rq->hard_nr_sectors;
6112 }
6113
6114 return 0;
6115Index: head-2008-12-01/drivers/xen/sfc_netfront/accel_vi.c
6116===================================================================
6117--- head-2008-12-01.orig/drivers/xen/sfc_netfront/accel_vi.c 2008-12-01 11:29:05.000000000 +0100
6118+++ head-2008-12-01/drivers/xen/sfc_netfront/accel_vi.c 2008-12-01 11:32:38.000000000 +0100
6119@@ -463,7 +463,7 @@ netfront_accel_enqueue_skb_multi(netfron
6120
6121 if (skb->ip_summed == CHECKSUM_PARTIAL) {
6122 /* Set to zero to encourage falcon to work it out for us */
6123- *(u16*)(skb->h.raw + skb->csum) = 0;
6124+ *(u16*)(skb->h.raw + skb->csum_offset) = 0;
6125 }
6126
6127 if (multi_post_start_new_buffer(vnic, &state)) {
6128@@ -582,7 +582,7 @@ netfront_accel_enqueue_skb_single(netfro
6129
6130 if (skb->ip_summed == CHECKSUM_PARTIAL) {
6131 /* Set to zero to encourage falcon to work it out for us */
6132- *(u16*)(skb->h.raw + skb->csum) = 0;
6133+ *(u16*)(skb->h.raw + skb->csum_offset) = 0;
6134 }
6135 NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
6136 (skb, idx, frag_data, frag_len, {
6137Index: head-2008-12-01/drivers/xen/tpmback/interface.c
6138===================================================================
6139--- head-2008-12-01.orig/drivers/xen/tpmback/interface.c 2008-01-21 11:15:26.000000000 +0100
6140+++ head-2008-12-01/drivers/xen/tpmback/interface.c 2008-12-01 11:32:38.000000000 +0100
6141@@ -15,7 +15,7 @@
6142 #include <xen/balloon.h>
6143 #include <xen/gnttab.h>
6144
6145-static kmem_cache_t *tpmif_cachep;
6146+static struct kmem_cache *tpmif_cachep;
6147 int num_frontends = 0;
6148
6149 LIST_HEAD(tpmif_list);
6150Index: head-2008-12-01/drivers/xen/xenbus/xenbus_comms.c
6151===================================================================
6152--- head-2008-12-01.orig/drivers/xen/xenbus/xenbus_comms.c 2008-12-01 11:29:05.000000000 +0100
6153+++ head-2008-12-01/drivers/xen/xenbus/xenbus_comms.c 2008-12-01 11:32:38.000000000 +0100
6154@@ -49,9 +49,9 @@
6155
6156 static int xenbus_irq;
6157
6158-extern void xenbus_probe(void *);
6159+extern void xenbus_probe(struct work_struct *);
6160 extern int xenstored_ready;
6161-static DECLARE_WORK(probe_work, xenbus_probe, NULL);
6162+static DECLARE_WORK(probe_work, xenbus_probe);
6163
6164 static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
6165
6166Index: head-2008-12-01/drivers/xen/xenbus/xenbus_probe.c
6167===================================================================
6168--- head-2008-12-01.orig/drivers/xen/xenbus/xenbus_probe.c 2008-12-01 11:22:58.000000000 +0100
6169+++ head-2008-12-01/drivers/xen/xenbus/xenbus_probe.c 2008-12-01 11:32:38.000000000 +0100
6170@@ -843,7 +843,7 @@ void unregister_xenstore_notifier(struct
6171 EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
6172
6173
6174-void xenbus_probe(void *unused)
6175+void xenbus_probe(struct work_struct *unused)
6176 {
6177 BUG_ON((xenstored_ready <= 0));
6178
6179Index: head-2008-12-01/include/asm-x86/mach-xen/asm/desc_32.h
6180===================================================================
6181--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/desc_32.h 2008-12-01 11:29:05.000000000 +0100
6182+++ head-2008-12-01/include/asm-x86/mach-xen/asm/desc_32.h 2008-12-01 11:32:38.000000000 +0100
6183@@ -4,8 +4,6 @@
6184 #include <asm/ldt.h>
6185 #include <asm/segment.h>
6186
6187-#define CPU_16BIT_STACK_SIZE 1024
6188-
6189 #ifndef __ASSEMBLY__
6190
6191 #include <linux/preempt.h>
6192@@ -15,8 +13,6 @@
6193
6194 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
6195
6196-DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
6197-
6198 struct Xgt_desc_struct {
6199 unsigned short size;
6200 unsigned long address __attribute__((packed));
6201@@ -32,11 +28,6 @@ static inline struct desc_struct *get_cp
6202 return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
6203 }
6204
6205-/*
6206- * This is the ldt that every process will get unless we need
6207- * something other than this.
6208- */
6209-extern struct desc_struct default_ldt[];
6210 extern struct desc_struct idt_table[];
6211 extern void set_intr_gate(unsigned int irq, void * addr);
6212
6213@@ -63,8 +54,8 @@ static inline void pack_gate(__u32 *a, _
6214 #define DESCTYPE_DPL3 0x60 /* DPL-3 */
6215 #define DESCTYPE_S 0x10 /* !system */
6216
6217+#ifndef CONFIG_XEN
6218 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
6219-#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
6220
6221 #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
6222 #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
6223@@ -75,6 +66,7 @@ static inline void pack_gate(__u32 *a, _
6224 #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
6225 #define store_tr(tr) __asm__ ("str %0":"=m" (tr))
6226 #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
6227+#endif
6228
6229 #if TLS_SIZE != 24
6230 # error update this code.
6231@@ -90,22 +82,43 @@ static inline void load_TLS(struct threa
6232 }
6233
6234 #ifndef CONFIG_XEN
6235+#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6236+#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6237+#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6238+
6239 static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
6240 {
6241 __u32 *lp = (__u32 *)((char *)dt + entry*8);
6242 *lp = entry_a;
6243 *(lp+1) = entry_b;
6244 }
6245-
6246-#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6247-#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6248+#define set_ldt native_set_ldt
6249 #else
6250 extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
6251 extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
6252+#define set_ldt xen_set_ldt
6253+#endif
6254+
6255+#ifndef CONFIG_XEN
6256+static inline fastcall void native_set_ldt(const void *addr,
6257+ unsigned int entries)
6258+{
6259+ if (likely(entries == 0))
6260+ __asm__ __volatile__("lldt %w0"::"q" (0));
6261+ else {
6262+ unsigned cpu = smp_processor_id();
6263+ __u32 a, b;
6264+
6265+ pack_descriptor(&a, &b, (unsigned long)addr,
6266+ entries * sizeof(struct desc_struct) - 1,
6267+ DESCTYPE_LDT, 0);
6268+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
6269+ __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
6270+ }
6271+}
6272 #endif
6273-#ifndef CONFIG_X86_NO_IDT
6274-#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6275
6276+#ifndef CONFIG_X86_NO_IDT
6277 static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
6278 {
6279 __u32 a, b;
6280@@ -125,14 +138,6 @@ static inline void __set_tss_desc(unsign
6281 }
6282 #endif
6283
6284-static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
6285-{
6286- __u32 a, b;
6287- pack_descriptor(&a, &b, (unsigned long)addr,
6288- entries * sizeof(struct desc_struct) - 1,
6289- DESCTYPE_LDT, 0);
6290- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
6291-}
6292
6293 #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
6294
6295@@ -163,36 +168,22 @@ static inline void set_ldt_desc(unsigned
6296
6297 static inline void clear_LDT(void)
6298 {
6299- int cpu = get_cpu();
6300-
6301- /*
6302- * NB. We load the default_ldt for lcall7/27 handling on demand, as
6303- * it slows down context switching. Noone uses it anyway.
6304- */
6305- cpu = cpu; /* XXX avoid compiler warning */
6306- xen_set_ldt(NULL, 0);
6307- put_cpu();
6308+ set_ldt(NULL, 0);
6309 }
6310
6311 /*
6312 * load one particular LDT into the current CPU
6313 */
6314-static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
6315+static inline void load_LDT_nolock(mm_context_t *pc)
6316 {
6317- void *segments = pc->ldt;
6318- int count = pc->size;
6319-
6320- if (likely(!count))
6321- segments = NULL;
6322-
6323- xen_set_ldt(segments, count);
6324+ set_ldt(pc->ldt, pc->size);
6325 }
6326
6327 static inline void load_LDT(mm_context_t *pc)
6328 {
6329- int cpu = get_cpu();
6330- load_LDT_nolock(pc, cpu);
6331- put_cpu();
6332+ preempt_disable();
6333+ load_LDT_nolock(pc);
6334+ preempt_enable();
6335 }
6336
6337 static inline unsigned long get_desc_base(unsigned long *desc)
6338@@ -204,6 +195,29 @@ static inline unsigned long get_desc_bas
6339 return base;
6340 }
6341
6342+#else /* __ASSEMBLY__ */
6343+
6344+/*
6345+ * GET_DESC_BASE reads the descriptor base of the specified segment.
6346+ *
6347+ * Args:
6348+ * idx - descriptor index
6349+ * gdt - GDT pointer
6350+ * base - 32bit register to which the base will be written
6351+ * lo_w - lo word of the "base" register
6352+ * lo_b - lo byte of the "base" register
6353+ * hi_b - hi byte of the low word of the "base" register
6354+ *
6355+ * Example:
6356+ * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
6357+ * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
6358+ */
6359+#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
6360+ movb idx*8+4(gdt), lo_b; \
6361+ movb idx*8+7(gdt), hi_b; \
6362+ shll $16, base; \
6363+ movw idx*8+2(gdt), lo_w;
6364+
6365 #endif /* !__ASSEMBLY__ */
6366
6367 #endif
6368Index: head-2008-12-01/include/asm-x86/mach-xen/asm/dma-mapping_32.h
6369===================================================================
6370--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2008-04-02 12:34:02.000000000 +0200
6371+++ head-2008-12-01/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2008-12-01 11:32:38.000000000 +0100
6372@@ -127,10 +127,10 @@ dma_get_cache_alignment(void)
6373 return (1 << INTERNODE_CACHE_SHIFT);
6374 }
6375
6376-#define dma_is_consistent(d) (1)
6377+#define dma_is_consistent(d, h) (1)
6378
6379 static inline void
6380-dma_cache_sync(void *vaddr, size_t size,
6381+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
6382 enum dma_data_direction direction)
6383 {
6384 flush_write_buffers();
6385Index: head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_32.h
6386===================================================================
6387--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2008-12-01 11:29:05.000000000 +0100
6388+++ head-2008-12-01/include/asm-x86/mach-xen/asm/fixmap_32.h 2008-12-01 11:32:38.000000000 +0100
6389@@ -13,13 +13,16 @@
6390 #ifndef _ASM_FIXMAP_H
6391 #define _ASM_FIXMAP_H
6392
6393-
6394 /* used by vmalloc.c, vsyscall.lds.S.
6395 *
6396 * Leave one empty page between vmalloc'ed areas and
6397 * the start of the fixmap.
6398 */
6399 extern unsigned long __FIXADDR_TOP;
6400+#ifdef CONFIG_COMPAT_VDSO
6401+#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
6402+#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
6403+#endif
6404
6405 #ifndef __ASSEMBLY__
6406 #include <linux/kernel.h>
6407Index: head-2008-12-01/include/asm-x86/mach-xen/asm/highmem.h
6408===================================================================
6409--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/highmem.h 2008-10-29 09:55:56.000000000 +0100
6410+++ head-2008-12-01/include/asm-x86/mach-xen/asm/highmem.h 2008-12-01 11:32:38.000000000 +0100
6411@@ -85,7 +85,7 @@ static inline void clear_user_highpage(s
6412
6413 void copy_highpage(struct page *to, struct page *from);
6414 static inline void copy_user_highpage(struct page *to, struct page *from,
6415- unsigned long vaddr)
6416+ unsigned long vaddr, struct vm_area_struct *vma)
6417 {
6418 copy_highpage(to, from);
6419 }
6420Index: head-2008-12-01/include/asm-x86/mach-xen/asm/hypervisor.h
6421===================================================================
6422--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2008-12-01 11:29:05.000000000 +0100
6423+++ head-2008-12-01/include/asm-x86/mach-xen/asm/hypervisor.h 2008-12-01 11:32:38.000000000 +0100
6424@@ -46,15 +46,6 @@
6425 #include <asm/percpu.h>
6426 #include <asm/ptrace.h>
6427 #include <asm/page.h>
6428-#if defined(__i386__)
6429-# ifdef CONFIG_X86_PAE
6430-# include <asm-generic/pgtable-nopud.h>
6431-# else
6432-# include <asm-generic/pgtable-nopmd.h>
6433-# endif
6434-#elif defined(__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
6435-# include <asm-generic/pgtable-nopud.h>
6436-#endif
6437
6438 extern shared_info_t *HYPERVISOR_shared_info;
6439
6440Index: head-2008-12-01/include/asm-x86/mach-xen/asm/io_32.h
6441===================================================================
6442--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/io_32.h 2008-12-01 11:29:05.000000000 +0100
6443+++ head-2008-12-01/include/asm-x86/mach-xen/asm/io_32.h 2008-12-01 11:32:38.000000000 +0100
6444@@ -269,11 +269,7 @@ static inline void flush_write_buffers(v
6445
6446 #endif /* __KERNEL__ */
6447
6448-#ifdef SLOW_IO_BY_JUMPING
6449-#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
6450-#else
6451 #define __SLOW_DOWN_IO "outb %%al,$0x80;"
6452-#endif
6453
6454 static inline void slow_down_io(void) {
6455 __asm__ __volatile__(
6456Index: head-2008-12-01/include/asm-x86/mach-xen/asm/irqflags_32.h
6457===================================================================
6458--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/irqflags_32.h 2007-06-12 13:14:02.000000000 +0200
6459+++ head-2008-12-01/include/asm-x86/mach-xen/asm/irqflags_32.h 2008-12-01 11:32:38.000000000 +0100
6460@@ -22,9 +22,6 @@
6461
6462 #define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
6463
6464-#define raw_local_save_flags(flags) \
6465- do { (flags) = __raw_local_save_flags(); } while (0)
6466-
6467 #define raw_local_irq_restore(x) \
6468 do { \
6469 vcpu_info_t *_vcpu; \
6470@@ -66,18 +63,6 @@ void raw_safe_halt(void);
6471 */
6472 void halt(void);
6473
6474-static inline int raw_irqs_disabled_flags(unsigned long flags)
6475-{
6476- return (flags != 0);
6477-}
6478-
6479-#define raw_irqs_disabled() \
6480-({ \
6481- unsigned long flags = __raw_local_save_flags(); \
6482- \
6483- raw_irqs_disabled_flags(flags); \
6484-})
6485-
6486 /*
6487 * For spinlocks, etc:
6488 */
6489@@ -90,9 +75,62 @@ static inline int raw_irqs_disabled_flag
6490 flags; \
6491 })
6492
6493+#else
6494+/* Offsets into shared_info_t. */
6495+#define evtchn_upcall_pending /* 0 */
6496+#define evtchn_upcall_mask 1
6497+
6498+#define sizeof_vcpu_shift 6
6499+
6500+#ifdef CONFIG_SMP
6501+#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
6502+ shl $sizeof_vcpu_shift,%esi ; \
6503+ addl HYPERVISOR_shared_info,%esi
6504+#else
6505+#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
6506+#endif
6507+
6508+#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
6509+#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
6510+#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
6511+#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
6512+ __DISABLE_INTERRUPTS
6513+#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
6514+ __ENABLE_INTERRUPTS
6515+#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
6516+sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
6517+ __TEST_PENDING ; \
6518+ jnz 14f /* process more events if necessary... */ ; \
6519+ movl PT_ESI(%esp), %esi ; \
6520+ sysexit ; \
6521+14: __DISABLE_INTERRUPTS ; \
6522+ TRACE_IRQS_OFF ; \
6523+sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
6524+ push %esp ; \
6525+ call evtchn_do_upcall ; \
6526+ add $4,%esp ; \
6527+ jmp ret_from_intr
6528+#define INTERRUPT_RETURN iret
6529+#endif /* __ASSEMBLY__ */
6530+
6531+#ifndef __ASSEMBLY__
6532+#define raw_local_save_flags(flags) \
6533+ do { (flags) = __raw_local_save_flags(); } while (0)
6534+
6535 #define raw_local_irq_save(flags) \
6536 do { (flags) = __raw_local_irq_save(); } while (0)
6537
6538+static inline int raw_irqs_disabled_flags(unsigned long flags)
6539+{
6540+ return (flags != 0);
6541+}
6542+
6543+#define raw_irqs_disabled() \
6544+({ \
6545+ unsigned long flags = __raw_local_save_flags(); \
6546+ \
6547+ raw_irqs_disabled_flags(flags); \
6548+})
6549 #endif /* __ASSEMBLY__ */
6550
6551 /*
6552Index: head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_32.h
6553===================================================================
6554--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2007-06-12 13:14:02.000000000 +0200
6555+++ head-2008-12-01/include/asm-x86/mach-xen/asm/mmu_context_32.h 2008-12-01 11:32:38.000000000 +0100
6556@@ -27,14 +27,13 @@ static inline void enter_lazy_tlb(struct
6557 static inline void __prepare_arch_switch(void)
6558 {
6559 /*
6560- * Save away %fs and %gs. No need to save %es and %ds, as those
6561- * are always kernel segments while inside the kernel. Must
6562- * happen before reload of cr3/ldt (i.e., not in __switch_to).
6563+ * Save away %fs. No need to save %gs, as it was saved on the
6564+ * stack on entry. No need to save %es and %ds, as those are
6565+ * always kernel segments while inside the kernel.
6566 */
6567- asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
6568- : "=m" (current->thread.fs),
6569- "=m" (current->thread.gs));
6570- asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
6571+ asm volatile ( "mov %%fs,%0"
6572+ : "=m" (current->thread.fs));
6573+ asm volatile ( "movl %0,%%fs"
6574 : : "r" (0) );
6575 }
6576
6577@@ -89,14 +88,14 @@ static inline void switch_mm(struct mm_s
6578 * tlb flush IPI delivery. We must reload %cr3.
6579 */
6580 load_cr3(next->pgd);
6581- load_LDT_nolock(&next->context, cpu);
6582+ load_LDT_nolock(&next->context);
6583 }
6584 }
6585 #endif
6586 }
6587
6588-#define deactivate_mm(tsk, mm) \
6589- asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
6590+#define deactivate_mm(tsk, mm) \
6591+ asm("movl %0,%%fs": :"r" (0));
6592
6593 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
6594 {
6595Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable-3level.h
6596===================================================================
6597--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-12-01 11:29:05.000000000 +0100
6598+++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable-3level.h 2008-12-01 11:32:38.000000000 +0100
6599@@ -1,8 +1,6 @@
6600 #ifndef _I386_PGTABLE_3LEVEL_H
6601 #define _I386_PGTABLE_3LEVEL_H
6602
6603-#include <asm-generic/pgtable-nopud.h>
6604-
6605 /*
6606 * Intel Physical Address Extension (PAE) Mode - three-level page
6607 * tables on PPro+ CPUs.
6608@@ -75,6 +73,23 @@ static inline void set_pte(pte_t *ptep,
6609 xen_l3_entry_update((pudptr), (pudval))
6610
6611 /*
6612+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
6613+ * entry, so clear the bottom half first and enforce ordering with a compiler
6614+ * barrier.
6615+ */
6616+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6617+{
6618+ if ((mm != current->mm && mm != &init_mm)
6619+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
6620+ ptep->pte_low = 0;
6621+ smp_wmb();
6622+ ptep->pte_high = 0;
6623+ }
6624+}
6625+
6626+#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
6627+
6628+/*
6629 * Pentium-II erratum A13: in PAE mode we explicitly have to flush
6630 * the TLB via cr3 if the top-level pgd is changed...
6631 * We do not let the generic code free and clear pgd entries due to
6632@@ -93,45 +108,16 @@ static inline void pud_clear (pud_t * pu
6633 #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
6634 pmd_index(address))
6635
6636-static inline int pte_none(pte_t pte)
6637-{
6638- return !(pte.pte_low | pte.pte_high);
6639-}
6640-
6641-/*
6642- * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
6643- * entry, so clear the bottom half first and enforce ordering with a compiler
6644- * barrier.
6645- */
6646-static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6647+static inline pte_t raw_ptep_get_and_clear(pte_t *ptep, pte_t res)
6648 {
6649- if ((mm != current->mm && mm != &init_mm)
6650- || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
6651- ptep->pte_low = 0;
6652- smp_wmb();
6653+ uint64_t val = __pte_val(res);
6654+ if (__cmpxchg64(ptep, val, 0) != val) {
6655+ /* xchg acts as a barrier before the setting of the high bits */
6656+ res.pte_low = xchg(&ptep->pte_low, 0);
6657+ res.pte_high = ptep->pte_high;
6658 ptep->pte_high = 0;
6659 }
6660-}
6661-
6662-#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
6663-
6664-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
6665-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6666-{
6667- pte_t pte = *ptep;
6668- if (!pte_none(pte)) {
6669- if ((mm != &init_mm) ||
6670- HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
6671- uint64_t val = __pte_val(pte);
6672- if (__cmpxchg64(ptep, val, 0) != val) {
6673- /* xchg acts as a barrier before the setting of the high bits */
6674- pte.pte_low = xchg(&ptep->pte_low, 0);
6675- pte.pte_high = ptep->pte_high;
6676- ptep->pte_high = 0;
6677- }
6678- }
6679- }
6680- return pte;
6681+ return res;
6682 }
6683
6684 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
6685@@ -160,6 +146,11 @@ static inline int pte_same(pte_t a, pte_
6686
6687 #define pte_page(x) pfn_to_page(pte_pfn(x))
6688
6689+static inline int pte_none(pte_t pte)
6690+{
6691+ return !(pte.pte_low | pte.pte_high);
6692+}
6693+
6694 #define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
6695 ((_pte).pte_high << (32-PAGE_SHIFT)))
6696 #define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
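
The relocated pte_clear() comment states the key PAE invariant: a PTE is two 32-bit halves and the hardware walker may look between the stores, so the low word (which holds the Present bit) must be cleared first, ordered by smp_wmb(). A sketch of that ordering; on x86 smp_wmb() reduces to a compiler barrier, as modeled here:

    #include <stdint.h>
    #include <stdio.h>

    /* PAE page-table entry: two 32-bit halves; bit 0 of pte_low is Present. */
    struct pae_pte { volatile uint32_t pte_low, pte_high; };

    /* x86 keeps stores ordered, so a compiler barrier suffices. */
    #define smp_wmb() __asm__ volatile("" ::: "memory")

    static void pte_clear_model(struct pae_pte *ptep)
    {
        /* Clear the Present bit (whole low word) first, so a concurrent
         * walk never pairs the old high half with a new low half. */
        ptep->pte_low = 0;
        smp_wmb();
        ptep->pte_high = 0;
    }

    int main(void)
    {
        struct pae_pte pte = { 0x00000067u, 0x00000001u };
        pte_clear_model(&pte);
        printf("pte=%08x%08x\n", (unsigned)pte.pte_high, (unsigned)pte.pte_low);
        return 0;
    }
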
6697Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_32.h
6698===================================================================
6699--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-12-01 11:29:05.000000000 +0100
6700+++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_32.h 2008-12-01 11:32:38.000000000 +0100
6701@@ -38,14 +38,14 @@ struct vm_area_struct;
6702 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
6703 extern unsigned long empty_zero_page[1024];
6704 extern pgd_t *swapper_pg_dir;
6705-extern kmem_cache_t *pgd_cache;
6706-extern kmem_cache_t *pmd_cache;
6707+extern struct kmem_cache *pgd_cache;
6708+extern struct kmem_cache *pmd_cache;
6709 extern spinlock_t pgd_lock;
6710 extern struct page *pgd_list;
6711
6712-void pmd_ctor(void *, kmem_cache_t *, unsigned long);
6713-void pgd_ctor(void *, kmem_cache_t *, unsigned long);
6714-void pgd_dtor(void *, kmem_cache_t *, unsigned long);
6715+void pmd_ctor(void *, struct kmem_cache *, unsigned long);
6716+void pgd_ctor(void *, struct kmem_cache *, unsigned long);
6717+void pgd_dtor(void *, struct kmem_cache *, unsigned long);
6718 void pgtable_cache_init(void);
6719 void paging_init(void);
6720
6721@@ -276,7 +276,6 @@ static inline pte_t pte_mkhuge(pte_t pte
6722 #define pte_update(mm, addr, ptep) do { } while (0)
6723 #define pte_update_defer(mm, addr, ptep) do { } while (0)
6724
6725-
6726 /*
6727 * We only update the dirty/accessed state if we set
6728 * the dirty bit by hand in the kernel, since the hardware
6729@@ -342,6 +341,19 @@ do { \
6730 __young; \
6731 })
6732
6733+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
6734+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6735+{
6736+ pte_t pte = *ptep;
6737+ if (!pte_none(pte)
6738+ && (mm != &init_mm
6739+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
6740+ pte = raw_ptep_get_and_clear(ptep, pte);
6741+ pte_update(mm, addr, ptep);
6742+ }
6743+ return pte;
6744+}
6745+
6746 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
6747 #define ptep_get_and_clear_full(mm, addr, ptep, full) \
6748 ((full) ? ({ \
Index: head-2008-12-01/include/asm-x86/mach-xen/asm/processor_32.h
===================================================================
--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/processor_32.h	2008-12-01 11:29:05.000000000 +0100
+++ head-2008-12-01/include/asm-x86/mach-xen/asm/processor_32.h	2008-12-01 11:32:38.000000000 +0100
@@ -20,6 +20,7 @@
 #include <linux/threads.h>
 #include <asm/percpu.h>
 #include <linux/cpumask.h>
+#include <linux/init.h>
 #include <xen/interface/physdev.h>

 /* flag for disabling the tsc */
@@ -73,6 +74,7 @@ struct cpuinfo_x86 {
 #endif
 	unsigned char x86_max_cores;	/* cpuid returned max cores value */
 	unsigned char apicid;
+	unsigned short x86_clflush_size;
 #ifdef CONFIG_SMP
 	unsigned char booted_cores;	/* number of cores as seen by OS */
 	__u8 phys_proc_id; 		/* Physical processor id. */
@@ -114,6 +116,8 @@ extern struct cpuinfo_x86 cpu_data[];
 extern int cpu_llc_id[NR_CPUS];
 extern char ignore_fpu_irq;

+void __init cpu_detect(struct cpuinfo_x86 *c);
+
 extern void identify_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
@@ -146,8 +150,8 @@ static inline void detect_ht(struct cpui
 #define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
 #define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */

-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
-			   unsigned int *ecx, unsigned int *edx)
+static inline fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+				      unsigned int *ecx, unsigned int *edx)
 {
 	/* ecx is often an input as well as an output. */
 	__asm__(XEN_CPUID
@@ -158,59 +162,6 @@ static inline void __cpuid(unsigned int
 		: "0" (*eax), "2" (*ecx));
 }

-/*
- * Generic CPUID function
- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
- * resulting in stale register contents being returned.
- */
-static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
-{
-	*eax = op;
-	*ecx = 0;
-	__cpuid(eax, ebx, ecx, edx);
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
-			       int *edx)
-{
-	*eax = op;
-	*ecx = count;
-	__cpuid(eax, ebx, ecx, edx);
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return eax;
-}
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return ebx;
-}
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return ecx;
-}
-static inline unsigned int cpuid_edx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return edx;
-}
-
 #define load_cr3(pgdir) write_cr3(__pa(pgdir))

 /*
@@ -480,9 +431,9 @@ struct thread_struct {
 	.vm86_info	= NULL,						\
 	.sysenter_cs	= __KERNEL_CS,					\
 	.io_bitmap_ptr	= NULL,						\
+	.gs		= __KERNEL_PDA,					\
 }

-#ifndef CONFIG_X86_NO_TSS
 /*
  * Note that the .io_bitmap member must be extra-big. This is because
  * the CPU will access an additional byte beyond the end of the IO
@@ -497,26 +448,9 @@ struct thread_struct {
 	.io_bitmap	= { [ 0 ... IO_BITMAP_LONGS] = ~0 },		\
 }

-static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
-{
-	tss->esp0 = thread->esp0;
-	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
-	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
-		tss->ss1 = thread->sysenter_cs;
-		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-	}
-}
-#define load_esp0(tss, thread) \
-	__load_esp0(tss, thread)
-#else
-#define load_esp0(tss, thread) do { \
-	if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
-		BUG(); \
-} while (0)
-#endif
-
 #define start_thread(regs, new_eip, new_esp) do {		\
-	__asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0));	\
+	__asm__("movl %0,%%fs": :"r" (0));			\
+	regs->xgs = 0;						\
 	set_fs(USER_DS);					\
 	regs->xds = __USER_DS;					\
 	regs->xes = __USER_DS;					\
@@ -526,26 +460,6 @@ static inline void __load_esp0(struct ts
 	regs->esp = new_esp;					\
 } while (0)

-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register) \
-	(var) = HYPERVISOR_get_debugreg((register))
-#define set_debugreg(value, register) \
-	WARN_ON(HYPERVISOR_set_debugreg((register), (value)))
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void set_iopl_mask(unsigned mask)
-{
-	struct physdev_set_iopl set_iopl;
-
-	/* Force the change at ring 0. */
-	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
-	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
-}
-
 /* Forward declaration, a strange C thing */
 struct task_struct;
 struct mm_struct;
@@ -637,6 +551,105 @@ static inline void rep_nop(void)

 #define cpu_relax()	rep_nop()

+#define paravirt_enabled() 0
+#define __cpuid xen_cpuid
+
+#ifndef CONFIG_X86_NO_TSS
+static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
+{
+	tss->esp0 = thread->esp0;
+	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
+	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+		tss->ss1 = thread->sysenter_cs;
+		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+	}
+}
+#define load_esp0(tss, thread) \
+	__load_esp0(tss, thread)
+#else
+#define load_esp0(tss, thread) do { \
+	if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
+		BUG(); \
+} while (0)
+#endif
+
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+	(var) = HYPERVISOR_get_debugreg(register)
+#define set_debugreg(value, register) \
+	WARN_ON(HYPERVISOR_set_debugreg(register, value))
+
+#define set_iopl_mask xen_set_iopl_mask
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void xen_set_iopl_mask(unsigned mask)
+{
+	struct physdev_set_iopl set_iopl;
+
+	/* Force the change at ring 0. */
+	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+}
+
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
+{
+	*eax = op;
+	*ecx = 0;
+	__cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
+			       int *edx)
+{
+	*eax = op;
+	*ecx = count;
+	__cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return eax;
+}
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return ebx;
+}
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return ecx;
+}
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return edx;
+}
+
 /* generic versions from gas */
 #define GENERIC_NOP1	".byte 0x90\n"
 #define GENERIC_NOP2	".byte 0x89,0xf6\n"
@@ -736,4 +749,8 @@ extern unsigned long boot_option_idle_ov
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);

+extern int init_gdt(int cpu, struct task_struct *idle);
+extern void cpu_set_gdt(int);
+extern void secondary_cpu_init(void);
+
 #endif /* __ASM_I386_PROCESSOR_H */
Index: head-2008-12-01/include/asm-x86/mach-xen/asm/segment_32.h
===================================================================
--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/segment_32.h	2008-12-01 11:29:05.000000000 +0100
+++ head-2008-12-01/include/asm-x86/mach-xen/asm/segment_32.h	2008-12-01 11:32:38.000000000 +0100
@@ -39,7 +39,7 @@
  *  25 - APM BIOS support
  *
  *  26 - ESPFIX small SS
- *  27 - unused
+ *  27 - PDA				[ per-cpu private data area ]
  *  28 - unused
  *  29 - unused
  *  30 - unused
@@ -74,6 +74,9 @@
 #define GDT_ENTRY_ESPFIX_SS		(GDT_ENTRY_KERNEL_BASE + 14)
 #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)

+#define GDT_ENTRY_PDA			(GDT_ENTRY_KERNEL_BASE + 15)
+#define __KERNEL_PDA (GDT_ENTRY_PDA * 8)
+
 #define GDT_ENTRY_DOUBLEFAULT_TSS	31

 /*
Index: head-2008-12-01/include/asm-x86/mach-xen/asm/smp_32.h
===================================================================
--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/smp_32.h	2008-12-01 11:29:05.000000000 +0100
+++ head-2008-12-01/include/asm-x86/mach-xen/asm/smp_32.h	2008-12-01 11:32:38.000000000 +0100
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
+#include <asm/pda.h>
 #endif

 #ifdef CONFIG_X86_LOCAL_APIC
@@ -56,7 +57,7 @@ extern void cpu_uninit(void);
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define raw_smp_processor_id() (current_thread_info()->cpu)
+#define raw_smp_processor_id() (read_pda(cpu_number))

 extern cpumask_t cpu_possible_map;
 #define cpu_callin_map cpu_possible_map
Index: head-2008-12-01/include/asm-x86/mach-xen/asm/system_32.h
===================================================================
--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/system_32.h	2008-12-01 11:29:05.000000000 +0100
+++ head-2008-12-01/include/asm-x86/mach-xen/asm/system_32.h	2008-12-01 11:32:38.000000000 +0100
@@ -139,17 +139,17 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 #define write_cr4(x) \
 	__asm__ __volatile__("movl %0,%%cr4": :"r" (x))

-/*
- * Clear and set 'TS' bit respectively
- */
+#define wbinvd() \
+	__asm__ __volatile__ ("wbinvd": : :"memory")
+
+/* Clear the 'TS' bit */
 #define clts() (HYPERVISOR_fpu_taskswitch(0))
+
+/* Set the 'TS' bit */
 #define stts() (HYPERVISOR_fpu_taskswitch(1))

 #endif	/* __KERNEL__ */

-#define wbinvd() \
-	__asm__ __volatile__ ("wbinvd": : :"memory")
-
 static inline unsigned long get_limit(unsigned long segment)
 {
 	unsigned long __limit;
Index: head-2008-12-01/include/asm-x86/mach-xen/asm/desc_64.h
===================================================================
--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/desc_64.h	2008-01-28 12:24:19.000000000 +0100
+++ head-2008-12-01/include/asm-x86/mach-xen/asm/desc_64.h	2008-12-01 11:32:38.000000000 +0100
@@ -9,62 +9,11 @@

 #include <linux/string.h>
 #include <linux/smp.h>
+#include <asm/desc_defs.h>

 #include <asm/segment.h>
 #include <asm/mmu.h>

-// 8 byte segment descriptor
-struct desc_struct {
-	u16 limit0;
-	u16 base0;
-	unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
-	unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
-} __attribute__((packed));
-
-struct n_desc_struct {
-	unsigned int a,b;
-};
-
-enum {
-	GATE_INTERRUPT = 0xE,
-	GATE_TRAP = 0xF,
-	GATE_CALL = 0xC,
-};
-
-// 16byte gate
-struct gate_struct {
-	u16 offset_low;
-	u16 segment;
-	unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
-	u16 offset_middle;
-	u32 offset_high;
-	u32 zero1;
-} __attribute__((packed));
-
-#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
-#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
-#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
-
-enum {
-	DESC_TSS = 0x9,
-	DESC_LDT = 0x2,
-};
-
-// LDT or TSS descriptor in the GDT. 16 bytes.
-struct ldttss_desc {
-	u16 limit0;
-	u16 base0;
-	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
-	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
-	u32 base3;
-	u32 zero1;
-} __attribute__((packed));
-
-struct desc_ptr {
-	unsigned short size;
-	unsigned long address;
-} __attribute__((packed)) ;
-
 extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];

 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
Index: head-2008-12-01/include/asm-x86/mach-xen/asm/dma-mapping_64.h
===================================================================
--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/dma-mapping_64.h	2008-12-01 11:29:05.000000000 +0100
+++ head-2008-12-01/include/asm-x86/mach-xen/asm/dma-mapping_64.h	2008-12-01 11:32:38.000000000 +0100
@@ -64,6 +64,9 @@ static inline int dma_mapping_error(dma_
 	return (dma_addr == bad_dma_address);
 }

+#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+
 extern void *dma_alloc_coherent(struct device *dev, size_t size,
 				dma_addr_t *dma_handle, gfp_t gfp);
 extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
@@ -181,12 +184,13 @@ static inline int dma_get_cache_alignmen
 	return boot_cpu_data.x86_clflush_size;
 }

-#define dma_is_consistent(h) 1
+#define dma_is_consistent(d, h) 1

 extern int dma_set_mask(struct device *dev, u64 mask);

 static inline void
-dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+	       enum dma_data_direction dir)
 {
 	flush_write_buffers();
 }
Index: head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_64.h
===================================================================
--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/pgtable_64.h	2008-12-01 11:29:05.000000000 +0100
+++ head-2008-12-01/include/asm-x86/mach-xen/asm/pgtable_64.h	2008-12-01 11:32:38.000000000 +0100
@@ -235,19 +235,18 @@ extern unsigned int __kernel_page_user;

 static inline unsigned long pgd_bad(pgd_t pgd)
 {
-	unsigned long val = __pgd_val(pgd);
-	val &= ~PTE_MASK;
-	val &= ~(_PAGE_USER | _PAGE_DIRTY);
-	return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
+	return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
 }

-static inline unsigned long pud_bad(pud_t pud)
-{
-	unsigned long val = __pud_val(pud);
-	val &= ~PTE_MASK;
-	val &= ~(_PAGE_USER | _PAGE_DIRTY);
-	return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
-}
+static inline unsigned long pud_bad(pud_t pud)
+{
+	return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+}
+
+static inline unsigned long pmd_bad(pmd_t pmd)
+{
+	return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+}

 #define set_pte_at(_mm,addr,ptep,pteval) do {				\
 	if (((_mm) != current->mm && (_mm) != &init_mm) ||		\
@@ -402,8 +401,6 @@ static inline int pmd_large(pmd_t pte) {
 #define pmd_present(x)	(__pmd_val(x) & _PAGE_PRESENT)
 #endif
 #define	pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
-#define pmd_bad(x) ((__pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
-		    != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
 #define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)

Index: head-2008-12-01/include/asm-x86/mach-xen/asm/processor_64.h
===================================================================
--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/processor_64.h	2008-12-01 11:29:05.000000000 +0100
+++ head-2008-12-01/include/asm-x86/mach-xen/asm/processor_64.h	2008-12-01 11:32:38.000000000 +0100
@@ -484,6 +484,14 @@ static inline void __mwait(unsigned long
 		: :"a" (eax), "c" (ecx));
 }

+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax,%ecx;" */
+	asm volatile(
+		"sti; .byte 0x0f,0x01,0xc9;"
+		: :"a" (eax), "c" (ecx));
+}
+
 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);

 #define stack_current() \
Index: head-2008-12-01/include/asm-x86/mach-xen/asm/smp_64.h
===================================================================
--- head-2008-12-01.orig/include/asm-x86/mach-xen/asm/smp_64.h	2008-12-01 11:29:05.000000000 +0100
+++ head-2008-12-01/include/asm-x86/mach-xen/asm/smp_64.h	2008-12-01 11:32:38.000000000 +0100
@@ -88,11 +88,6 @@ extern u8 x86_cpu_to_log_apicid[NR_CPUS]
 extern u8 bios_cpu_apicid[];

 #ifdef CONFIG_X86_LOCAL_APIC
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
-{
-	return cpus_addr(cpumask)[0];
-}
-
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < NR_CPUS)
@@ -127,13 +122,6 @@ static __inline int logical_smp_processo
 #define cpu_physical_id(cpu)		x86_cpu_to_apicid[cpu]
 #else
 #define cpu_physical_id(cpu)		boot_cpu_id
-static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
-					   void *info, int retry, int wait)
-{
-	/* Disable interrupts here? */
-	func(info);
-	return 0;
-}
 #endif /* !CONFIG_SMP */
 #endif

Index: head-2008-12-01/kernel/kexec.c
===================================================================
--- head-2008-12-01.orig/kernel/kexec.c	2008-12-01 11:23:01.000000000 +0100
+++ head-2008-12-01/kernel/kexec.c	2008-12-01 11:32:38.000000000 +0100
@@ -372,7 +372,7 @@ static struct page *kimage_alloc_pages(g
 	if (limit == ~0UL)
 		address_bits = BITS_PER_LONG;
 	else
-		address_bits = long_log2(limit);
+		address_bits = ilog2(limit);

 	if (xen_limit_pages_to_max_mfn(pages, order, address_bits) < 0) {
 		__free_pages(pages, order);
Index: head-2008-12-01/net/core/dev.c
===================================================================
--- head-2008-12-01.orig/net/core/dev.c	2008-12-01 11:29:05.000000000 +0100
+++ head-2008-12-01/net/core/dev.c	2008-12-01 11:32:38.000000000 +0100
@@ -1751,10 +1751,10 @@ inline int skb_checksum_setup(struct sk_
 			goto out;
 		switch (skb->nh.iph->protocol) {
 		case IPPROTO_TCP:
-			skb->csum = offsetof(struct tcphdr, check);
+			skb->csum_offset = offsetof(struct tcphdr, check);
 			break;
 		case IPPROTO_UDP:
-			skb->csum = offsetof(struct udphdr, check);
+			skb->csum_offset = offsetof(struct udphdr, check);
 			break;
 		default:
 			if (net_ratelimit())
@@ -1763,7 +1763,7 @@ inline int skb_checksum_setup(struct sk_
 				       " %d packet", skb->nh.iph->protocol);
 			goto out;
 		}
-		if ((skb->h.raw + skb->csum + 2) > skb->tail)
+		if ((skb->h.raw + skb->csum_offset + 2) > skb->tail)
 			goto out;
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->proto_csum_blank = 0;